"""
core.py - Business logic for the planning2ics web app.

Adapted from planning2ics.py for web use (injectable configuration,
progress callback).
"""

import json
import re
import time as time_module
import unicodedata
from datetime import date, datetime, time, timedelta, timezone
from pathlib import Path
from typing import Callable, Optional
from uuid import uuid4

import pdfplumber
import requests
from bs4 import BeautifulSoup
from icalendar import Calendar, Event

# French month names / abbreviations (upper-case) -> month number.
MONTH_MAP = {
    "JANV": 1, "JAN": 1, "JANVIER": 1,
    "FEV": 2, "FEVR": 2, "FEVRIER": 2,
    "MARS": 3, "MAR": 3,
    "AVRIL": 4, "AVR": 4,
    "MAI": 5,
    "JUIN": 6,
    "JUIL": 7, "JUILLET": 7,
    "AOUT": 8, "AOÛT": 8,
    "SEPT": 9, "SEP": 9, "SEPTEMBRE": 9,
    "OCT": 10, "OCTOBRE": 10,
    "NOV": 11, "NOVEMBRE": 11,
    "DEC": 12, "DÉC": 12, "DECEMBRE": 12, "DÉCEMBRE": 12,
}

# Lower-case fragments; a planning title containing any of them makes
# is_public_event() return True.
CONCERT_KEYWORDS = {
    'concert', 'représentation', 'générale publique',
    'raccord', 'italienne', 'scène orch'
}

# Year used when the file name contains no 4-digit number.
_DEFAULT_YEAR = 2026


# ── Utilities ─────────────────────────────────────────────────────────────────

def _notify(log: Optional[Callable], message: str) -> None:
    """Forward *message* to the progress callback when one was supplied."""
    if log:
        log(message)


def _strip_accents(text: str) -> str:
    """Return *text* with combining marks removed (e.g. 'É' -> 'E').

    Works for both precomposed and already-decomposed input because the text
    is decomposed (NFD) before the combining characters are dropped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


# Accent-free month keys, in MONTH_MAP order (duplicates collapse onto the
# first occurrence, which carries the same month number).
_ASCII_MONTH_MAP = {_strip_accents(key): month for key, month in MONTH_MAP.items()}

# One pattern per key matching it only as a standalone run of letters, so that
# e.g. "MAI" is not found inside "SEMAINE".
_MONTH_TOKEN_PATTERNS = [
    (re.compile(rf'(?<![A-Z]){re.escape(key)}(?![A-Z])'), month)
    for key, month in _ASCII_MONTH_MAP.items()
]


def normalize_note(note: str) -> str:
    """Collapse every run of whitespace in *note* to one space and trim it."""
    return re.sub(r'\s+', ' ', note).strip()


def is_public_event(titre: str) -> bool:
    """Return True when *titre* contains one of CONCERT_KEYWORDS (case-insensitive)."""
    lowered = titre.lower()
    return any(keyword in lowered for keyword in CONCERT_KEYWORDS)


def sanitize_filename(name: str) -> str:
    """Turn *name* into a file-name stem of at most 80 characters.

    Characters other than word characters, whitespace and '-' are dropped and
    each remaining whitespace character becomes '_'. Returns 'SERIE_INCONNUE'
    when nothing is left.
    """
    clean = re.sub(r'[^\w\s\-éèêàùûîôç]', '', name, flags=re.UNICODE)
    # Replace every whitespace character (not only ' ') so that tabs or line
    # breaks can never end up in a file name.
    return re.sub(r'\s', '_', clean.strip())[:80] or 'SERIE_INCONNUE'


def extract_year_month_from_filename(filename: str):
    """Guess ``(year, month)`` from a planning file name.

    The year is the first 4-digit number in *filename* (``_DEFAULT_YEAR`` when
    there is none). The month is looked up through MONTH_MAP in the upper-cased,
    accent-stripped stem: whole-token matches are preferred, then any substring
    match; the month defaults to 1 when no key is found.
    """
    year_match = re.search(r'(\d{4})', filename)
    year = int(year_match.group(1)) if year_match else _DEFAULT_YEAR

    stem = _strip_accents(Path(filename).stem.upper())

    # First pass: keys standing on their own (not glued to other letters).
    for pattern, month in _MONTH_TOKEN_PATTERNS:
        if pattern.search(stem):
            return year, month

    # Second pass: plain substring search, for names such as "PLANNINGJUIN".
    for key, month in _ASCII_MONTH_MAP.items():
        if key in stem:
            return year, month

    return year, 1


def parse_date(date_str: str, main_year: int, main_month: int) -> Optional[date]:
    """Parse a ``day/month`` string, inferring the year from the planning month.

    A month more than 3 above *main_month* is placed in the previous year, a
    month more than 3 below it in the next year, anything else in *main_year*.
    Returns None when *date_str* is not a valid ``day/month`` pair.
    """
    try:
        day, month = map(int, date_str.strip().split('/'))
        if month > main_month + 3:
            year = main_year - 1
        elif month < main_month - 3:
            year = main_year + 1
        else:
            year = main_year
        return date(year, month, day)
    except (ValueError, AttributeError):
        # ValueError: non-numeric parts, wrong number of parts, impossible date.
        # AttributeError: date_str is not a string.
        return None


def parse_time(s: str) -> Optional[time]:
    """Parse a leading ``H:MM`` / ``HH:MM`` in *s*; None if absent or out of range."""
    m = re.match(r'(\d{1,2}):(\d{2})', s.strip())
    if not m:
        return None
    try:
        return time(int(m.group(1)), int(m.group(2)))
    except ValueError:
        # e.g. "24:00" or "12:75": syntactically a time, but not a valid one.
        return None


def parse_horaires(s: str):
    """Parse a schedule cell into ``(start, end)`` times.

    Accepts ``HH:MM - HH:MM`` (hyphen or en dash) or a lone ``HH:MM``; a
    missing part is returned as None.
    """
    s = s.strip()
    span = re.match(r'(\d{1,2}:\d{2})\s*[-–]\s*(\d{1,2}:\d{2})', s)
    if span:
        return parse_time(span.group(1)), parse_time(span.group(2))
    single = re.match(r'(\d{1,2}:\d{2})', s)
    if single:
        return parse_time(single.group(1)), None
    return None, None


def _read_json_cache(cache_file: Path) -> Optional[dict]:
    """Load a JSON object from *cache_file*; None if missing or unusable.

    A file that cannot be decoded as UTF-8 or parsed as JSON, or whose top
    level is not an object, is treated like a missing cache so the caller
    rebuilds it instead of failing.
    """
    if not cache_file.exists():
        return None
    try:
        with open(cache_file, encoding='utf-8') as f:
            cached = json.load(f)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        return None
    return cached if isinstance(cached, dict) else None


def _write_json_cache(cache_file: Path, payload: dict) -> None:
    """Write *payload* to *cache_file* as UTF-8 JSON, keeping accents readable."""
    # Explicit encoding: ensure_ascii=False emits non-ASCII characters, which
    # the platform default encoding may not be able to represent.
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


# ── PDF extraction ────────────────────────────────────────────────────────────

def extract_events_from_pdf(pdf_path: Path) -> list:
    """Extract timed events from every table of the PDF at *pdf_path*.

    Each kept row yields a dict with the keys ``date``, ``horaires``,
    ``start_time``, ``end_time``, ``titre``, ``lieu``, ``note``, ``dec``,
    ``voy`` and ``source_file``. Header rows, rows with fewer than 5 cells,
    rows marked "repos" and rows without a parsable start time are skipped.
    A row without its own date inherits the last date seen.
    """
    events = []
    main_year, main_month = extract_year_month_from_filename(pdf_path.name)
    current_date = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in (page.extract_tables() or []):
                for row in table:
                    if not row:
                        continue

                    cells = [str(c).strip() if c else '' for c in row]
                    if cells[0].lower() == 'jour' or len(cells) < 5:
                        continue

                    # Columns read: 1=date, 2=schedule, 3=title, 4=venue,
                    # then optional 5=note, 6=dec, 7=voy.
                    date_str, horaires, titre, lieu = cells[1:5]
                    note = cells[5] if len(cells) > 5 else ''
                    dec = cells[6] if len(cells) > 6 else ''
                    voy = cells[7] if len(cells) > 7 else ''

                    if date_str and re.match(r'\d{1,2}/\d{2}', date_str):
                        parsed = parse_date(date_str, main_year, main_month)
                        if parsed:
                            current_date = parsed

                    if not current_date:
                        continue
                    if 'repos' in horaires.lower():
                        continue
                    if not re.search(r'\d{1,2}:\d{2}', horaires):
                        continue

                    start_time, end_time = parse_horaires(horaires)
                    # Identity test rather than truthiness: a valid time must
                    # never be mistaken for "no time".
                    if start_time is None:
                        continue

                    events.append({
                        'date': current_date,
                        'horaires': horaires,
                        'start_time': start_time,
                        'end_time': end_time,
                        'titre': titre,
                        'lieu': lieu,
                        'note': normalize_note(note),
                        'dec': dec,
                        'voy': voy,
                        'source_file': pdf_path.name,
                    })

    return events


# ── Website scraping ──────────────────────────────────────────────────────────

def scrape_catalog(config: dict, cache_dir: Path,
                   log: Optional[Callable] = None, force: bool = False) -> dict:
    """Build ``{title: {url, description, category}}`` from the season website.

    Reads ``config['site']['calendar_url']`` and ``config['site']['base_url']``.
    The result is cached in ``cache_dir / "website_catalog.json"`` and reused
    unless *force* is true or the cache is unreadable.

    Raises whatever ``requests`` raises when the calendar page itself cannot
    be fetched; failures on individual event pages only blank that event's
    description.
    """
    cache_file = cache_dir / "website_catalog.json"
    cache_dir.mkdir(parents=True, exist_ok=True)

    if not force:
        cached = _read_json_cache(cache_file)
        if cached is not None:
            _notify(log, "Catalogue site web chargé depuis le cache")
            return cached

    headers = {'User-Agent': 'Mozilla/5.0 (compatible; planning2ics/1.0)'}
    calendar_url = config['site']['calendar_url']
    site_base = config['site']['base_url']

    _notify(log, "Scraping du site web de l'opéra...")
    resp = requests.get(calendar_url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Collect every link pointing below /evenements/, excluding the listing
    # page itself. Later links with the same title overwrite earlier ones.
    event_links = {}
    for a in soup.find_all('a', href=True):
        href = a['href']
        if '/evenements/' not in href:
            continue
        if href.rstrip('/') == f'{site_base}/evenements':
            continue
        full_url = href if href.startswith('http') else site_base + href
        h3 = a.find('h3')
        cat_tag = a.find('p')
        title = h3.get_text(strip=True) if h3 else a.get_text(strip=True)
        category = cat_tag.get_text(strip=True) if cat_tag else ''
        if len(title) > 3:
            event_links[title] = {'url': full_url, 'category': category}

    catalog = {}
    total = len(event_links)
    _notify(log, f"{total} événements trouvés sur le site, récupération des descriptions...")

    for i, (title, info) in enumerate(event_links.items()):
        if i % 20 == 0:
            _notify(log, f"Descriptions : {i}/{total}")
        try:
            r = requests.get(info['url'], headers=headers, timeout=20)
            r.raise_for_status()
            page_soup = BeautifulSoup(r.text, 'html.parser')
            description = _extract_description(page_soup)
        except Exception as exc:
            # Deliberately best-effort: one broken page must not abort the
            # whole scrape. The event is kept, without a description.
            _notify(log, f"Description indisponible pour « {title} » : {exc}")
            description = ''
        else:
            # Short pause between successful requests.
            time_module.sleep(0.2)
        catalog[title] = {
            'url': info['url'],
            'description': description,
            'category': info['category'],
        }

    _write_json_cache(cache_file, catalog)
    _notify(log, f"Catalogue mis en cache : {len(catalog)} événements")
    return catalog


def _extract_description(soup: BeautifulSoup) -> str:
    """Return the descriptive text of an event page.

    Tries a list of container selectors in order; for the first one that
    yields text, returns up to 40 lines longer than 15 characters. Falls back
    to the first 2000 characters of the whole page text.

    Note: nav/header/footer/button/form tags inside an examined container are
    removed from *soup* in place.
    """
    for selector in ['div.wp-block-group', 'div.entry-content', 'article', 'main']:
        container = soup.select_one(selector)
        if not container:
            continue
        for tag in container.find_all(['nav', 'header', 'footer', 'button', 'form']):
            tag.decompose()
        stripped = (raw.strip() for raw in container.get_text('\n', strip=True).splitlines())
        lines = [text for text in stripped if len(text) > 15][:40]
        if lines:
            return '\n'.join(lines)
    return soup.get_text('\n', strip=True)[:2000]


# ── LLM ───────────────────────────────────────────────────────────────────────

def _llm_call(prompt: str, ollama_url: str, model: str) -> str:
    """Send *prompt* to ``{ollama_url}/api/chat`` and return the streamed reply.

    The response is read line by line; each non-empty line is parsed as JSON
    and its ``message.content`` is appended until a chunk with a truthy
    ``done`` field arrives.

    Raises ``requests`` exceptions on transport/HTTP errors and
    ``json.JSONDecodeError`` when a streamed line is not valid JSON.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
        "options": {"temperature": 0.05, "num_predict": 16384},
        "think": False,
    }
    pieces = []
    # Context manager: the streamed connection is released even when the loop
    # stops early on "done" or an exception is raised while reading.
    with requests.post(
        f"{ollama_url}/api/chat",
        json=payload,
        stream=True,
        timeout=600,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            pieces.append(chunk.get('message', {}).get('content', ''))
            if chunk.get('done'):
                break
    return ''.join(pieces)


def _sanitize_matches(candidate) -> dict:
    """Keep only the ``str -> non-blank str`` entries of an LLM "matches" value.

    Anything that is not a dict yields an empty mapping.
    """
    if not isinstance(candidate, dict):
        return {}
    return {
        note: title
        for note, title in candidate.items()
        if isinstance(note, str) and isinstance(title, str) and title.strip()
    }


def _json_unescape(fragment: str) -> str:
    """Decode the inside of a JSON string literal; return it unchanged on failure."""
    try:
        return json.loads(f'"{fragment}"')
    except ValueError:
        return fragment


def _apply_parallel_heuristic(note: str, catalog: dict) -> Optional[str]:
    """Resolve a note of the form ``(A) : "Title"`` (A or B, optional prime).

    Returns the first catalog title that contains, or is contained in, the
    text after the prefix (case-insensitive); otherwise that text itself,
    stripped of quotes. Returns None when the note does not have this form.
    """
    m = re.match(r"^\([AB]'?\)\s*:\s*[\"']?(.+?)[\"']?\s*$", note, re.IGNORECASE)
    if not m:
        return None
    inner = m.group(1).strip().lower()
    if not inner:
        # An empty string is a substring of every title; do not match on it.
        return None
    for title in catalog:
        lowered_title = title.lower()
        if inner in lowered_title or lowered_title in inner:
            return title
    return m.group(1).strip().strip('"\'')


def cluster_notes_global(unique_notes: set, catalog: dict, config: dict,
                         cache_dir: Path, log: Optional[Callable] = None,
                         force: bool = False) -> dict:
    """Ask the LLM to map every planning note to an official catalog title.

    Uses ``config['ollama']['cluster_model']`` and ``config['ollama']['url']``.
    The mapping is cached in ``cache_dir / "series_mapping.json"`` and reused
    unless *force* is true or the cache is unreadable. An empty mapping is not
    cached, so a failed attempt is retried on the next run.

    Returns ``{note: title}``; empty when there is no non-blank note.
    Raises ValueError when the LLM reply contains no JSON object.
    """
    cache_file = cache_dir / "series_mapping.json"
    cache_dir.mkdir(parents=True, exist_ok=True)

    if not force:
        cached = _read_json_cache(cache_file)
        if cached is not None:
            _notify(log, "Mapping des séries chargé depuis le cache")
            return cached

    notes = sorted(n for n in unique_notes if n.strip())
    if not notes:
        # Nothing to classify: skip the LLM round-trip.
        return {}

    titles_list = '\n'.join(f'- "{t}"' for t in sorted(catalog))
    notes_list = '\n'.join(f'- {n!r}' for n in notes)

    prompt = f"""Tu analyses le planning interne de l'Opéra Orchestre National Montpellier.

Voici les titres OFFICIELS des événements de la saison (depuis le site web) :
{titles_list}

Voici toutes les notes du planning interne (certaines sont des variantes de la même série) :
{notes_list}

Ta tâche : associer CHAQUE note à UN titre officiel.

Règles IMPORTANTES :
1. Les notes listant les mêmes compositeurs (ordre ou sous-titres différents) → MÊME série
2. Les préfixes "(A) :", "(B) :", "(A') :", "(B') :" → séries PARALLÈLES DIFFÉRENTES
   Ex: '(A) : "Magdalena"' → "Magdalena" ; '(B) : "Élémentaire"' → "Élémentaire, mon cher !"
3. Les annotations entre parenthèses (captation, présence de...) ne changent PAS la série
4. Les répétitions partielles (Cordes, Vents...) = même série que le Tutti

Réponds UNIQUEMENT avec un JSON valide, sans texte autour :
{{
  "matches": {{
    "note exacte telle quelle": "Titre Officiel du Site",
    ...
  }}
}}"""

    model = config['ollama']['cluster_model']
    _notify(log, f"Identification des séries avec l'IA ({model})...")

    content = _llm_call(prompt, config['ollama']['url'], model)

    json_match = re.search(r'\{[\s\S]*\}', content)
    if not json_match:
        raise ValueError("Pas de JSON dans la réponse LLM")

    raw = json_match.group()
    try:
        parsed = json.loads(raw)
        matches = parsed.get('matches', {}) if isinstance(parsed, dict) else {}
    except json.JSONDecodeError:
        # Malformed JSON (e.g. truncated reply): salvage every
        # "key": "value" pair, decoding JSON escapes so that keys can still
        # equal the original notes.
        matches = {
            _json_unescape(pair.group(1)): _json_unescape(pair.group(2))
            for pair in re.finditer(r'"((?:[^"\\]|\\.)*)"\s*:\s*"((?:[^"\\]|\\.)*)"', raw)
        }

    result = _sanitize_matches(matches)

    if result:
        _write_json_cache(cache_file, result)

    _notify(log, f"{len(result)} notes associées à des séries")
    return result


def match_notes_to_series(unique_notes: set, catalog: dict, config: dict,
                          cache_dir: Path, log: Optional[Callable] = None,
                          force_series: bool = False) -> dict:
    """Return ``{note: series title}`` for as many notes as possible.

    Three passes: the global LLM clustering (possibly cached), the "(A)/(B)"
    prefix heuristic for notes still unassigned, then one LLM retry with
    ``config['ollama']['local_model']`` for whatever remains. An unparsable
    retry reply is ignored.
    """
    note_to_series = cluster_notes_global(
        unique_notes, catalog, config, cache_dir, log, force_series
    )

    # (A)/(B) heuristic for notes the clustering left out.
    for note in unique_notes:
        if note not in note_to_series and note.strip():
            guessed = _apply_parallel_heuristic(note, catalog)
            if guessed:
                note_to_series[note] = guessed

    # Local retry for the notes that are still unassigned.
    still_missing = [n for n in unique_notes if n.strip() and n not in note_to_series]
    if still_missing:
        _notify(log, f"Retry pour {len(still_missing)} notes non assignées...")
        titles_str = '\n'.join(f'- "{t}"' for t in sorted(catalog))
        notes_str = '\n'.join(f'- {n!r}' for n in still_missing)
        prompt = (
            f"Associe ces notes à des titres officiels.\n"
            f"Titres:\n{titles_str}\nNotes:\n{notes_str}\n"
            f'Réponds UNIQUEMENT avec JSON: {{"matches": {{"note": "Titre"}}}}'
        )
        content = _llm_call(prompt, config['ollama']['url'], config['ollama']['local_model'])
        found = re.search(r'\{[\s\S]*\}', content)
        if found:
            try:
                parsed = json.loads(found.group())
            except ValueError:
                # Best-effort pass: an unparsable reply leaves the notes unassigned.
                parsed = None
            if isinstance(parsed, dict):
                note_to_series.update(_sanitize_matches(parsed.get('matches', {})))

    return note_to_series


# ── ICS generation ────────────────────────────────────────────────────────────

def _build_description(evt: dict, series_title: str, catalog: dict) -> str:
    """Compose the multi-line DESCRIPTION text of one event.

    Public events use the catalog description of *series_title* (first 1500
    characters), falling back to the planning note; other events list the note
    and the event type. Declared duration, travel time and source file follow.
    """
    lines = []
    if is_public_event(evt['titre']):
        desc = catalog.get(series_title, {}).get('description', '')
        lines.append(desc[:1500] if desc else f"Programme : {evt['note']}")
    else:
        if evt['note']:
            lines.append(f"Œuvres : {evt['note']}")
        lines.append(f"Type : {evt['titre']}")

    if evt['dec']:
        lines.append(f"Durée déclarée : {evt['dec']}")
    if evt['voy']:
        lines.append(f"Déplacement : {evt['voy']}h de trajet")
    lines.append(f"Source : {evt['source_file']}")
    return '\n'.join(lines)


def _create_ics_bytes(series_title: str, events: list, catalog: dict) -> bytes:
    """Serialize *events* (dicts from extract_events_from_pdf) as one iCalendar.

    Events are emitted in chronological order. When an event has no end time,
    its duration comes from the ``dec`` field (``H:MM``) or defaults to 2 hours.
    An end time earlier than the start time is taken to fall on the next day.
    """
    cal = Calendar()
    cal.add('prodid', '-//Opéra Orchestre National Montpellier//planning2ics//FR')
    cal.add('version', '2.0')
    cal.add('x-wr-calname', series_title)
    cal.add('x-wr-timezone', 'Europe/Paris')

    # One creation stamp for the whole export (DTSTAMP is mandatory in VEVENT).
    generated_at = datetime.now(timezone.utc)

    for evt in sorted(events, key=lambda e: (e['date'], e['start_time'])):
        vevent = Event()
        start_dt = datetime.combine(evt['date'], evt['start_time'])
        vevent.add('dtstart', start_dt)

        if evt['end_time'] is not None:
            end_dt = datetime.combine(evt['date'], evt['end_time'])
            if end_dt < start_dt:
                # e.g. "22:00 - 00:30": the end belongs to the following day.
                end_dt += timedelta(days=1)
        else:
            dec_m = re.match(r'(\d{1,2}):(\d{2})', evt['dec'])
            if dec_m:
                duration = timedelta(hours=int(dec_m.group(1)),
                                     minutes=int(dec_m.group(2)))
            else:
                duration = timedelta(hours=2)
            end_dt = start_dt + duration
        vevent.add('dtend', end_dt)

        vevent.add('dtstamp', generated_at)
        vevent.add('summary', f"{evt['titre']} – {series_title}")
        if evt['lieu']:
            vevent.add('location', evt['lieu'])
        vevent.add('description', _build_description(evt, series_title, catalog))
        vevent.add('uid', str(uuid4()) + '@planning-orchestre')
        cal.add_component(vevent)

    return cal.to_ical()


# ── Main entry point ──────────────────────────────────────────────────────────

def process_pdfs(pdf_paths: list, config: dict, data_dir: Path,
                 log: Optional[Callable] = None) -> dict:
    """Process a list of planning PDFs into one calendar per series.

    Steps: extract and de-duplicate events, load the website catalog, map each
    note to a series, then build one ICS per series. Caches live under
    ``data_dir / "cache"``. Events whose note maps to no series are dropped.

    Returns ``{series_title: {'filename', 'bytes', 'event_count'}}``.
    """
    cache_dir = data_dir / "cache"

    # 1. Extraction
    _notify(log, f"Extraction de {len(pdf_paths)} PDF(s)...")
    all_events = []
    seen = set()
    for i, pdf_path in enumerate(pdf_paths):
        _notify(log, f"Extraction {i+1}/{len(pdf_paths)} : {pdf_path.name}")
        for evt in extract_events_from_pdf(pdf_path):
            # The same event may appear in several PDFs; keep the first copy.
            key = (evt['date'], evt['start_time'], evt['titre'], evt['note'])
            if key not in seen:
                seen.add(key)
                all_events.append(evt)
    _notify(log, f"{len(all_events)} événements extraits au total")

    # 2. Website catalog
    catalog = scrape_catalog(config, cache_dir, log)

    # 3. Series identification
    unique_notes = {e['note'] for e in all_events}
    _notify(log, f"{len(unique_notes)} notes uniques à analyser...")
    note_to_series = match_notes_to_series(unique_notes, catalog, config, cache_dir, log)

    # 4. Grouping and ICS generation
    series_events: dict[str, list] = {}
    for evt in all_events:
        series = note_to_series.get(evt['note'])
        if series:
            series_events.setdefault(series, []).append(evt)

    _notify(log, f"Génération de {len(series_events)} fichiers ICS...")
    result = {}
    for series_title, events in series_events.items():
        result[series_title] = {
            'filename': sanitize_filename(series_title) + '.ics',
            'bytes': _create_ics_bytes(series_title, events, catalog),
            'event_count': len(events),
        }

    _notify(log, f"Terminé : {len(result)} séries générées")
    return result