diff --git a/planning2ics.py b/planning2ics.py index 8383894..1a119c6 100644 --- a/planning2ics.py +++ b/planning2ics.py @@ -514,12 +514,18 @@ def main(): # 1. Extraire tous les événements des PDFs all_events = [] + seen = set() pdf_files = sorted(PDF_DIR.glob("*.pdf")) print(f"PDFs trouvés : {len(pdf_files)}") for pdf_path in pdf_files: events = extract_events_from_pdf(pdf_path) - print(f" {pdf_path.name}: {len(events)} événements") - all_events.extend(events) + before = len(all_events) + for evt in events: + key = (evt['date'], evt['start_time'], evt['titre'], evt['note']) + if key not in seen: + seen.add(key) + all_events.append(evt) + print(f" {pdf_path.name}: {len(events)} extraits, {len(all_events)-before} ajoutés") print(f"Total : {len(all_events)} événements") # 2. Scraper le site web (avec cache) diff --git a/webapp/core.py b/webapp/core.py index c046bc0..de223ae 100644 --- a/webapp/core.py +++ b/webapp/core.py @@ -431,10 +431,15 @@ def process_pdfs(pdf_paths: list, config: dict, data_dir: Path, if log: log(f"Extraction de {len(pdf_paths)} PDF(s)...") all_events = [] + seen = set() for i, pdf_path in enumerate(pdf_paths): if log: log(f"Extraction {i+1}/{len(pdf_paths)} : {pdf_path.name}") - all_events.extend(extract_events_from_pdf(pdf_path)) + for evt in extract_events_from_pdf(pdf_path): + key = (evt['date'], evt['start_time'], evt['titre'], evt['note']) + if key not in seen: + seen.add(key) + all_events.append(evt) if log: log(f"{len(all_events)} événements extraits au total")