Initial commit: planning PDF → ICS par série

- Extraction des événements depuis PDFs de planning mensuel
- Scraping du site web pour titres officiels et descriptions
- Clustering des séries via Ollama (qwen3.5:cloud) avec cache
- Génération d'un fichier ICS par série
- Descriptions riches : œuvres pour répétitions, description site pour concerts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-08 12:00:58 +01:00
commit 325d676ccf
2 changed files with 575 additions and 0 deletions
@@ -0,0 +1,6 @@
+pdf/
+ics/
+cache/
+__pycache__/
+*.pyc
+.env
@@ -0,0 +1,569 @@
+#!/usr/bin/env python3
+"""
+planning2ics.py
+Convertit les PDFs de planning de l'Opéra Orchestre National Montpellier
+en fichiers ICS par série, avec :
+- Titres officiels depuis le site web
+- Descriptions riches (site web pour concerts, œuvres pour répétitions)
+- Identification des séries parallèles (A), (B), (A'), (B')
+- Ollama qwen3:8b local en lots de 20 notes
+"""
+
+import re
+import json
+import time as time_module
+from pathlib import Path
+from datetime import datetime, date, time, timedelta
+
+import pdfplumber
+import requests
+from bs4 import BeautifulSoup
+from icalendar import Calendar, Event
+from uuid import uuid4
+
+# ── Configuration ────────────────────────────────────────────────────────────
+# Working directories, all relative to the current working directory.
+PDF_DIR    = Path("pdf")      # planning PDFs read by main()
+OUTPUT_DIR = Path("ics")      # one .ics file is written here per series
+CACHE_DIR  = Path("cache")    # holds the two JSON caches below
+CACHE_FILE = CACHE_DIR / "website_catalog.json"
+
+# Ollama chat endpoint and the model used by the batched fallback matching.
+OLLAMA_URL   = "http://192.168.0.164:11434/api/chat"
+OLLAMA_MODEL = "qwen3:8b"
+
+SITE_BASE    = "https://www.opera-orchestre-montpellier.fr"
+CALENDAR_URL = f"{SITE_BASE}/calendrier/?saisons=32669"
+
+BATCH_SIZE        = 20   # notes sent per local LLM call (fallback)
+CLUSTER_MODEL     = "qwen3.5:cloud"  # model used for the global clustering (all notes at once)
+SERIES_CACHE_FILE = CACHE_DIR / "series_mapping.json"
+
+# Upper-case French month names / abbreviations -> month number.
+# Accented and unaccented spellings are both listed because file names are
+# matched against these keys after a plain ``str.upper()``.
+MONTH_MAP = {
+    "JANV": 1, "JAN": 1, "JANVIER": 1,
+    "FEV": 2, "FEVR": 2, "FEVRIER": 2,
+    "FÉV": 2, "FÉVR": 2, "FÉVRIER": 2,
+    "MARS": 3, "MAR": 3,
+    "AVRIL": 4, "AVR": 4,
+    "MAI": 5,
+    "JUIN": 6,
+    "JUIL": 7, "JUILLET": 7,
+    "AOUT": 8, "AOÛT": 8,
+    "SEPT": 9, "SEP": 9, "SEPTEMBRE": 9,
+    "OCT": 10, "OCTOBRE": 10,
+    "NOV": 11, "NOVEMBRE": 11,
+    "DEC": 12, "DÉC": 12, "DECEMBRE": 12, "DÉCEMBRE": 12,
+}
+
+# Substrings of an event title that mark it as a public event
+# (such events get the website description instead of the list of works).
+CONCERT_KEYWORDS = {
+    'concert', 'représentation', 'générale publique',
+    'raccord', 'italienne', 'scène orch'
+}
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+def normalize_note(note: str) -> str:
+    """Collapse each run of whitespace in *note* to one space and trim both ends."""
+    collapsed = re.sub(r'\s+', ' ', note)
+    return collapsed.strip()
+
+def is_public_event(titre: str) -> bool:
+    """Return True when *titre* contains one of CONCERT_KEYWORDS (case-insensitive)."""
+    lowered = titre.lower()
+    for keyword in CONCERT_KEYWORDS:
+        if keyword in lowered:
+            return True
+    return False
+
+def sanitize_filename(name: str) -> str:
+    """Turn a series title into a file-name stem.
+
+    Characters other than word characters, whitespace, hyphens and the listed
+    accented letters are dropped, spaces become underscores and the result is
+    cut to 80 characters. An empty result is replaced by 'SERIE_INCONNUE'.
+    """
+    kept = re.sub(r'[^\w\s\-éèêàùûîôç]', '', name, flags=re.UNICODE)
+    stem = kept.strip().replace(' ', '_')[:80]
+    if not stem:
+        return 'SERIE_INCONNUE'
+    return stem
+
+# ── Extraction PDF ────────────────────────────────────────────────────────────
+def extract_year_month_from_filename(filename: str):
+    """Guess the (year, month) a planning PDF covers from its file name.
+
+    The year is the first run of four digits in *filename* (2026 when there is
+    none). The month is looked up among the MONTH_MAP keys, in MONTH_MAP order:
+
+    1. first as a whole word of the stem, so that "MAI" is not picked up
+       inside "SEMAINE" nor "MAR" inside "MARDI";
+    2. then, as before, as a plain substring (covers names written without
+       separators such as "PlanningMars").
+
+    Accents are removed from both the stem and the keys before comparing, so
+    "février" / "août" / "décembre" match whether the file name is stored
+    precomposed (NFC) or decomposed (NFD, as some file systems do).
+
+    Returns:
+        A ``(year, month)`` tuple; month defaults to 1 when nothing matches.
+    """
+    import unicodedata  # local import: only this helper needs it
+
+    def _fold(text: str) -> str:
+        # Decompose accented letters, drop the combining marks, upper-case.
+        decomposed = unicodedata.normalize('NFD', text)
+        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch)).upper()
+
+    year_match = re.search(r'(\d{4})', filename)
+    year = int(year_match.group(1)) if year_match else 2026
+
+    stem = _fold(Path(filename).stem)
+    months = [(_fold(key), val) for key, val in MONTH_MAP.items()]
+
+    # Pass 1: whole-word match (letters only, so "_", "-", digits separate words).
+    words = set(re.findall(r'[A-Z]+', stem))
+    for key, val in months:
+        if key in words:
+            return year, val
+
+    # Pass 2: substring match, the original behaviour.
+    for key, val in months:
+        if key in stem:
+            return year, val
+
+    return year, 1
+
+def parse_date(date_str: str, main_year: int, main_month: int):
+    """Parse a "day/month" cell into a ``date``, inferring the year.
+
+    The year is *main_year*, shifted by one when the month lies more than
+    three months away from *main_month*: later months are taken from the
+    previous year, earlier months from the next one.
+
+    Returns:
+        The parsed ``date``, or None when the text cannot be parsed or does
+        not name a valid calendar day.
+    """
+    try:
+        day_part, month_part = date_str.strip().split('/')
+        day, month = int(day_part), int(month_part)
+        year = main_year
+        if month > main_month + 3:
+            year -= 1
+        elif month < main_month - 3:
+            year += 1
+        return date(year, month, day)
+    except Exception:
+        return None
+
+def parse_time(s: str):
+    """Parse a leading "H:MM" / "HH:MM" in *s* into a ``time``.
+
+    Returns:
+        The parsed ``time``, or None when *s* does not start with a clock
+        time or the value is out of range (e.g. "24:00", "9:75"). Out-of-range
+        values used to raise ``ValueError`` and abort the whole extraction.
+    """
+    m = re.match(r'(\d{1,2}):(\d{2})', s.strip())
+    if not m:
+        return None
+    try:
+        return time(int(m.group(1)), int(m.group(2)))
+    except ValueError:
+        # Digits matched but do not form a valid hour/minute pair.
+        return None
+
+def parse_horaires(s: str):
+    """Split a schedule cell into ``(start, end)`` times.
+
+    Accepts "HH:MM-HH:MM" (hyphen or en dash, optional spaces) or a lone
+    "HH:MM". Returns ``(start, None)`` when only a start time is present and
+    ``(None, None)`` when the text does not start with a clock time.
+    """
+    text = s.strip()
+    span = re.match(r'(\d{1,2}:\d{2})\s*[-–]\s*(\d{1,2}:\d{2})', text)
+    if span is not None:
+        begin, finish = span.groups()
+        return parse_time(begin), parse_time(finish)
+    single = re.match(r'(\d{1,2}:\d{2})', text)
+    if single is None:
+        return None, None
+    return parse_time(single.group(1)), None
+
+def extract_events_from_pdf(pdf_path: Path) -> list:
+    """Read every table row of a planning PDF and return the timed events.
+
+    Columns are read by position: 1 = date, 2 = schedule, 3 = title,
+    4 = place, then optionally 5 = note, 6 = "dec", 7 = "voy". A row without a
+    date inherits the last date seen. Header rows (first cell "jour"), rows
+    with fewer than five cells, "repos" rows and rows without a clock time
+    are skipped.
+
+    Returns:
+        A list of dicts with the keys date, horaires, start_time, end_time,
+        titre, lieu, note (whitespace-normalised), dec, voy and source_file.
+    """
+    main_year, main_month = extract_year_month_from_filename(pdf_path.name)
+    collected = []
+    last_date = None  # carried over to rows whose date cell is empty
+
+    with pdfplumber.open(pdf_path) as pdf:
+        # Flatten pages -> tables -> rows lazily, in document order.
+        rows = (
+            row
+            for page in pdf.pages
+            for table in (page.extract_tables() or [])
+            for row in table
+        )
+        for row in rows:
+            if not row:
+                continue
+            cells = ['' if not c else str(c).strip() for c in row]
+            if cells[0].lower() == 'jour' or len(cells) < 5:
+                continue
+
+            # Pad so the three optional trailing columns always exist.
+            cells += [''] * (8 - len(cells))
+            date_str, horaires, titre, lieu, note, dec, voy = cells[1:8]
+
+            if date_str and re.match(r'\d{1,2}/\d{2}', date_str):
+                last_date = parse_date(date_str, main_year, main_month) or last_date
+
+            if not last_date:
+                continue
+            if 'repos' in horaires.lower() or not re.search(r'\d{1,2}:\d{2}', horaires):
+                continue
+
+            start_time, end_time = parse_horaires(horaires)
+            if not start_time:
+                continue
+
+            collected.append({
+                'date':       last_date,
+                'horaires':   horaires,
+                'start_time': start_time,
+                'end_time':   end_time,
+                'titre':      titre,
+                'lieu':       lieu,
+                'note':       normalize_note(note),
+                'dec':        dec,
+                'voy':        voy,
+                'source_file': pdf_path.name,
+            })
+    return collected
+
+# ── Scraping site web ─────────────────────────────────────────────────────────
+def scrape_catalog(force_refresh: bool = False) -> dict:
+    """Return the season catalog as ``{title: {url, description, category}}``.
+
+    The result is cached in CACHE_FILE (UTF-8 JSON) so the website is not
+    scraped again on every run. The cache is used when it exists and
+    *force_refresh* is false; an unreadable or corrupt cache file is ignored
+    and rebuilt instead of crashing.
+
+    Raises:
+        requests.RequestException: when the calendar page itself cannot be
+            fetched. Failures on individual event pages are only printed and
+            yield an empty description.
+    """
+    from urllib.parse import urljoin, urlsplit  # local import: only needed here
+
+    CACHE_DIR.mkdir(exist_ok=True)
+
+    if not force_refresh and CACHE_FILE.exists():
+        try:
+            # Explicit UTF-8: the file is written with ensure_ascii=False.
+            with open(CACHE_FILE, encoding='utf-8') as f:
+                cached = json.load(f)
+        except (UnicodeDecodeError, json.JSONDecodeError) as e:
+            print(f"  (cache site web illisible : {e} — nouveau scraping)")
+        else:
+            print("  (cache site web trouvé, on l'utilise)")
+            return cached
+
+    headers = {'User-Agent': 'Mozilla/5.0 (compatible; planning2ics/1.0)'}
+    catalog = {}
+
+    # 1. Calendar page -> list of events and their URLs
+    print("  Scraping page calendrier...")
+    resp = requests.get(CALENDAR_URL, headers=headers, timeout=30)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.text, 'html.parser')
+
+    # Event links live under /evenements/
+    # Structure: <a href="/evenements/..."><p>Category</p><h3>Title</h3></a>
+    event_links = {}
+    for a in soup.find_all('a', href=True):
+        href = a['href']
+        if '/evenements/' not in href:
+            continue
+        # Resolve relative / protocol-relative hrefs against the page URL.
+        full_url = urljoin(CALENDAR_URL, href)
+        # Skip the events index itself, whether linked absolutely or relatively.
+        if urlsplit(full_url).path.rstrip('/') == '/evenements':
+            continue
+        h3 = a.find('h3')
+        category_tag = a.find('p')
+        title_text = h3.get_text(strip=True) if h3 else a.get_text(strip=True)
+        category_text = category_tag.get_text(strip=True) if category_tag else ''
+        if title_text and len(title_text) > 3:
+            event_links[title_text] = {'url': full_url, 'category': category_text}
+
+    print(f"  {len(event_links)} événements trouvés sur le calendrier")
+
+    # 2. Fetch the description of each event
+    for i, (title, info) in enumerate(event_links.items()):
+        url = info['url']
+        print(f"  [{i+1}/{len(event_links)}] {title}")
+        try:
+            r = requests.get(url, headers=headers, timeout=20)
+            r.raise_for_status()
+            page_soup = BeautifulSoup(r.text, 'html.parser')
+            description = _extract_description(page_soup)
+            catalog[title] = {
+                'url':         url,
+                'description': description,
+                'category':    info['category'],
+            }
+            time_module.sleep(0.3)  # short pause between page requests
+        except Exception as e:
+            # Best effort: keep the entry, without a description.
+            print(f"    Erreur: {e}")
+            catalog[title] = {'url': url, 'description': '', 'category': info['category']}
+
+    # Save the cache
+    with open(CACHE_FILE, 'w', encoding='utf-8') as f:
+        json.dump(catalog, f, ensure_ascii=False, indent=2)
+    print(f"  Cache sauvegardé : {CACHE_FILE}")
+    return catalog
+
+def _extract_description(soup: BeautifulSoup) -> str:
+    """Pull the descriptive text out of an event page.
+
+    The first element matching one of a few container selectors is taken;
+    its nav/header/footer/button/form descendants are removed (this mutates
+    *soup*) and up to 40 lines longer than 15 characters are kept. When no
+    container yields such lines, the first 2000 characters of the whole page
+    text are returned.
+    """
+    candidate_selectors = ('div.wp-block-group', 'div.entry-content', 'article', 'main')
+    for css in candidate_selectors:
+        node = soup.select_one(css)
+        if not node:
+            continue
+        # Strip navigation, header, footer, buttons and forms
+        for chrome in node.find_all(['nav', 'header', 'footer', 'button', 'form']):
+            chrome.decompose()
+        raw_text = node.get_text(separator='\n', strip=True)
+        kept = []
+        for raw_line in raw_text.splitlines():
+            line = raw_line.strip()
+            # Short lines are dropped along with blank ones
+            if len(line) > 15:
+                kept.append(line)
+        kept = kept[:40]
+        if kept:
+            return '\n'.join(kept)
+
+    return soup.get_text(separator='\n', strip=True)[:2000]
+
+# ── Matching LLM ──────────────────────────────────────────────────────────────
+def _llm_call(prompt: str) -> str:
+    """Send *prompt* to the model named by OLLAMA_MODEL and return the reply text."""
+    return _llm_call_model(prompt, OLLAMA_MODEL)
+
+def _llm_batch(batch: list, catalog_titles: list) -> dict:
+    """Ask the LLM to map one batch of notes to official titles.
+
+    Args:
+        batch: planning notes to classify.
+        catalog_titles: official titles offered to the model.
+
+    Returns:
+        ``{note: title}`` restricted to notes that are actually in *batch*
+        and to non-empty string titles. Empty when the reply holds no JSON.
+    """
+    titles_list = '\n'.join(f'- "{t}"' for t in catalog_titles)
+    notes_str   = '\n'.join(f'- {repr(n)}' for n in batch)
+
+    prompt = f"""Tu analyses le planning interne de l'Opéra Orchestre National Montpellier.
+
+Voici les titres OFFICIELS des événements de la saison (site web) :
+{titles_list}
+
+Associe chaque note de planning interne au titre officiel le plus proche.
+Règles IMPORTANTES :
+- "(A) : X" ou "(A') : X" → cherche "X" dans les titres officiels (ex: '(A) : "Magdalena"' → "Magdalena")
+- "(B) : X" ou "(B') : X" → cherche "X" dans les titres officiels (ex: '(B) : "Élémentaire"' → "Élémentaire, mon cher !")
+- Les variantes avec parenthèses (captation, présence de…) restent la même série
+- Répétitions partielles (Cordes, Vents…) = même série que le Tutti correspondant
+- Si vraiment aucun titre ne convient, invente un nom court et descriptif
+
+Notes à associer :
+{notes_str}
+
+Réponds UNIQUEMENT avec un JSON valide, sans aucun texte autour :
+{{
+  "matches": {{
+    "note exacte telle quelle": "Titre Officiel",
+    ...
+  }}
+}}"""
+
+    content = _llm_call(prompt)
+
+    json_match = re.search(r'\{[\s\S]*\}', content)
+    if not json_match:
+        print("    Avertissement: aucun JSON trouvé")
+        return {}
+
+    raw = json_match.group()
+    try:
+        data = json.loads(raw)
+        matches = data.get('matches', {}) if isinstance(data, dict) else {}
+    except json.JSONDecodeError as e:
+        print(f"    JSON invalide ({e}), récupération regex...")
+        matches = {}
+        for m in re.finditer(r'"((?:[^"\\]|\\.)*)"\s*:\s*"((?:[^"\\]|\\.)*)"', raw):
+            # The captures are still JSON-escaped (\" \\ \uXXXX): decode them,
+            # otherwise a note containing quotes can never equal its key.
+            try:
+                key = json.loads(f'"{m.group(1)}"', strict=False)
+                value = json.loads(f'"{m.group(2)}"', strict=False)
+            except json.JSONDecodeError:
+                key, value = m.group(1), m.group(2)
+            matches[key] = value
+
+    if not isinstance(matches, dict):
+        return {}
+
+    # Keep only the notes that were asked about; the model sometimes invents
+    # or rewrites keys, and those must not leak into the caller's mapping.
+    wanted = set(batch)
+    return {
+        note: title
+        for note, title in matches.items()
+        if note in wanted and isinstance(title, str) and title.strip()
+    }
+
+
+def _llm_call_model(prompt: str, model: str) -> str:
+    """Send *prompt* to *model* through the Ollama chat endpoint, streaming.
+
+    Args:
+        prompt: text sent as a single user message.
+        model: Ollama model name.
+
+    Returns:
+        The concatenated ``message.content`` of every streamed chunk.
+
+    Raises:
+        requests.HTTPError: on a non-2xx response.
+        RuntimeError: when a streamed chunk carries an ``error`` field;
+            previously such a chunk was ignored and a truncated answer was
+            returned.
+    """
+    payload = {
+        "model":    model,
+        "messages": [{"role": "user", "content": prompt}],
+        "stream":   True,
+        "options":  {"temperature": 0.05, "num_predict": 16384},
+        "think":    False,
+    }
+    pieces = []
+    # The context manager closes the streamed connection even when the loop
+    # stops early (done chunk or exception).
+    with requests.post(OLLAMA_URL, json=payload, stream=True, timeout=600) as resp:
+        resp.raise_for_status()
+        for line in resp.iter_lines():
+            if not line:
+                continue
+            chunk = json.loads(line)
+            if chunk.get('error'):
+                raise RuntimeError(f"Erreur Ollama ({model}) : {chunk['error']}")
+            # "message" or "content" may be missing or null in some chunks.
+            pieces.append((chunk.get('message') or {}).get('content') or '')
+            if chunk.get('done'):
+                break
+    return ''.join(pieces)
+
+
+def _apply_parallel_series_heuristic(note: str, catalog: dict) -> str | None:
+    """Deterministic matching for notes prefixed with (A)/(B)/(A')/(B').
+
+    The name after the prefix is extracted (surrounding straight or
+    typographic quotes removed) and looked up in *catalog*,
+    case-insensitively:
+
+    1. an exact title match wins;
+    2. otherwise the first title, in catalog order, that contains the name
+       or is contained in it;
+    3. otherwise the extracted name itself is returned.
+
+    Returns:
+        The matched title or extracted name, or None when *note* has no such
+        prefix or no name follows it.
+    """
+    m = re.match(r'^\([AB]\'?\)\s*:\s*["\']?(.+?)["\']?\s*$', note, re.IGNORECASE)
+    if not m:
+        return None
+    name = m.group(1).strip().strip('"\'«»“”').strip()
+    if not name:
+        return None
+    inner = name.lower()
+
+    lowered = [(title, title.lower()) for title in catalog]
+
+    # Exact match first: a longer title that merely contains the name must
+    # not shadow the title that is the name.
+    for title, low in lowered:
+        if low == inner:
+            return title
+    for title, low in lowered:
+        if low and (inner in low or low in inner):
+            return title
+
+    # Fallback: the cleaned extracted name
+    return name
+
+
+def cluster_notes_global(unique_notes: set, catalog: dict, force_refresh: bool = False) -> dict:
+    """Map every note to an official title with one call to CLUSTER_MODEL.
+
+    The mapping is cached in SERIES_CACHE_FILE (UTF-8 JSON) and reused on
+    later runs unless *force_refresh* is true. An empty mapping is not
+    cached, so a failed model answer does not stick.
+
+    Args:
+        unique_notes: distinct planning notes; blank ones are ignored.
+        catalog: website catalog whose keys are the official titles.
+        force_refresh: ignore an existing cache file.
+
+    Returns:
+        ``{note: official_title}`` as answered by the model (or as cached).
+
+    Raises:
+        ValueError: when the model reply contains no JSON object.
+    """
+    CACHE_DIR.mkdir(exist_ok=True)
+
+    if not force_refresh and SERIES_CACHE_FILE.exists():
+        print("  (cache séries trouvé, on l'utilise)")
+        with open(SERIES_CACHE_FILE, encoding='utf-8') as f:
+            return json.load(f)
+
+    catalog_titles = sorted(catalog.keys())
+    notes          = sorted(n for n in unique_notes if n.strip())
+    titles_list    = '\n'.join(f'- "{t}"' for t in catalog_titles)
+    notes_list     = '\n'.join(f'- {repr(n)}' for n in notes)
+
+    # The note count in the prompt is computed; it used to be a fixed "157".
+    prompt = f"""Tu analyses le planning interne de l'Opéra Orchestre National Montpellier.
+
+Voici les titres OFFICIELS des événements de la saison (depuis le site web) :
+{titles_list}
+
+Voici toutes les notes du planning interne ({len(notes)} notes, certaines sont des variantes de la même série) :
+{notes_list}
+
+Ta tâche : associer CHAQUE note à UN titre officiel.
+Règles IMPORTANTES :
+1. Les notes qui listent les mêmes compositeurs (même si l'ordre ou les sous-titres varient) appartiennent à la MÊME série
+   Ex: "CASALS / KORNGOLD / PÄRT / GERHARD..." et "KORNGOLD / CASALS / PÄRT..." → même concert
+2. Les préfixes "(A) :", "(B) :", "(A') :", "(B') :" indiquent des séries PARALLÈLES DIFFÉRENTES.
+   Ex: '(A) : "Magdalena"' → "Magdalena" ; '(B) : "Élémentaire, mon cher !"' → "Élémentaire, mon cher !"
+3. Les annotations entre parenthèses (captation, présence de, avec le chef assistant...) ne changent PAS la série
+4. Les répétitions partielles (Cordes, Vents, Harmonie) appartiennent à la même série que le Tutti
+
+Réponds UNIQUEMENT avec un JSON valide (sans texte autour) :
+{{
+  "matches": {{
+    "note exacte telle quelle": "Titre Officiel du Site",
+    ...
+  }}
+}}"""
+
+    print(f"  Clustering global avec {CLUSTER_MODEL}...")
+    content = _llm_call_model(prompt, CLUSTER_MODEL)
+
+    json_match = re.search(r'\{[\s\S]*\}', content)
+    if not json_match:
+        raise ValueError(f"Pas de JSON dans la réponse:\n{content[:300]}")
+
+    raw = json_match.group()
+    try:
+        data = json.loads(raw)
+        result = data.get('matches', {}) if isinstance(data, dict) else {}
+    except json.JSONDecodeError as e:
+        print(f"  JSON invalide ({e}), récupération regex...")
+        result = {}
+        for m in re.finditer(r'"((?:[^"\\]|\\.)*)"\s*:\s*"((?:[^"\\]|\\.)*)"', raw):
+            # Decode the JSON escapes of each recovered string so that notes
+            # containing quotes still equal their key.
+            try:
+                key = json.loads(f'"{m.group(1)}"', strict=False)
+                value = json.loads(f'"{m.group(2)}"', strict=False)
+            except json.JSONDecodeError:
+                key, value = m.group(1), m.group(2)
+            result[key] = value
+
+    if not isinstance(result, dict):
+        result = {}
+
+    if result:
+        with open(SERIES_CACHE_FILE, 'w', encoding='utf-8') as f:
+            json.dump(result, f, ensure_ascii=False, indent=2)
+        print(f"  Cache séries sauvegardé : {SERIES_CACHE_FILE}")
+    return result
+
+
+def match_notes_to_series(unique_notes: set, catalog: dict) -> dict:
+    """Find the official title for every planning note.
+
+    1. Global clustering (cached)
+    2. Deterministic heuristic for (A)/(B) notes still unassigned
+    3. Batched retry through the fallback LLM for whatever is left
+
+    Notes are processed in sorted order so batches are the same from one run
+    to the next, and a later step never overwrites an earlier assignment.
+
+    Returns:
+        ``{note: official_title}``.
+    """
+    # Step 1: global clustering
+    note_to_series = cluster_notes_global(unique_notes, catalog)
+
+    # Set iteration order varies between runs; sort once and reuse.
+    pending = sorted(n for n in unique_notes if n.strip())
+
+    # Step 2: (A)/(B) heuristic for unassigned notes
+    for note in pending:
+        if note not in note_to_series:
+            result = _apply_parallel_series_heuristic(note, catalog)
+            if result:
+                note_to_series[note] = result
+
+    # Step 3: batched retry for notes that are still missing
+    still_missing = [n for n in pending if n not in note_to_series]
+    if still_missing:
+        print(f"  Retry local pour {len(still_missing)} notes non assignées...")
+        catalog_titles = sorted(catalog.keys())
+        batches = [still_missing[i:i+BATCH_SIZE] for i in range(0, len(still_missing), BATCH_SIZE)]
+        for batch_num, batch in enumerate(batches):
+            print(f"    Lot {batch_num+1}/{len(batches)}...")
+            result = _llm_batch(batch, catalog_titles)
+            for note, title in result.items():
+                # Accept only answers for notes of this batch, and never
+                # replace a mapping obtained in an earlier step.
+                if note in batch and title:
+                    note_to_series.setdefault(note, title)
+
+    return note_to_series
+
+# ── Génération ICS ────────────────────────────────────────────────────────────
+def build_event_description(evt: dict, series_title: str, catalog: dict) -> str:
+    """Compose the DESCRIPTION text of one event.
+
+    - Public event: the website description of the series (first 1500
+      characters), or "Programme : <note>" when the catalog has none.
+    - Anything else: the works from the note, then the event type.
+
+    The "dec" and "voy" values, when present, and the source file name are
+    appended in every case.
+    """
+    parts = []
+
+    if not is_public_event(evt['titre']):
+        # Rehearsal: list the works, then the kind of session
+        if evt['note']:
+            parts.append(f"Œuvres : {evt['note']}")
+        parts.append(f"Type : {evt['titre']}")
+    else:
+        site_text = catalog.get(series_title, {}).get('description', '')
+        if site_text:
+            parts.append(site_text[:1500])  # keep the size bounded
+        elif evt['note']:
+            # No website text: fall back to the works
+            parts.append(f"Programme : {evt['note']}")
+
+    # Common trailer
+    if evt['dec']:
+        parts.append(f"Durée déclarée : {evt['dec']}")
+    if evt['voy']:
+        parts.append(f"Déplacement : {evt['voy']}h de trajet")
+    parts.append(f"Source : {evt['source_file']}")
+
+    return '\n'.join(parts)
+
+def create_ics(series_title: str, events: list, catalog: dict) -> Calendar:
+    """Build the calendar of one series.
+
+    Args:
+        series_title: calendar name, also appended to each event summary.
+        events: event dicts as produced by ``extract_events_from_pdf``.
+        catalog: website catalog, passed on to ``build_event_description``.
+
+    Returns:
+        A ``Calendar`` with one VEVENT per event, sorted by date and start
+        time. Start and end are naive (floating) local times. Without an end
+        time, the duration is read from the "dec" value ("H:MM"), or two
+        hours by default. An end time earlier than the start is taken to be
+        on the following day.
+    """
+    from datetime import timezone  # local import: only DTSTAMP needs it
+
+    cal = Calendar()
+    cal.add('prodid', '-//Opéra Orchestre National Montpellier//planning2ics//FR')
+    cal.add('version', '2.0')
+    cal.add('x-wr-calname', series_title)
+    cal.add('x-wr-timezone', 'Europe/Paris')
+
+    # RFC 5545 requires a DTSTAMP (UTC) on every VEVENT.
+    generated_at = datetime.now(timezone.utc)
+
+    for evt in sorted(events, key=lambda e: (e['date'], e['start_time'])):
+        vevent = Event()
+
+        start_dt = datetime.combine(evt['date'], evt['start_time'])
+        vevent.add('dtstart', start_dt)
+
+        if evt['end_time']:
+            end_dt = datetime.combine(evt['date'], evt['end_time'])
+            if end_dt < start_dt:
+                # e.g. 20:00–00:30: the session ends after midnight.
+                end_dt += timedelta(days=1)
+        else:
+            dec_m = re.match(r'(\d{1,2}):(\d{2})', evt['dec'])
+            duration = timedelta(
+                hours=int(dec_m.group(1)), minutes=int(dec_m.group(2))
+            ) if dec_m else timedelta(hours=2)
+            end_dt = start_dt + duration
+        vevent.add('dtend', end_dt)
+        vevent.add('dtstamp', generated_at)
+
+        vevent.add('summary', f"{evt['titre']} – {series_title}")
+
+        if evt['lieu']:
+            vevent.add('location', evt['lieu'])
+
+        vevent.add('description', build_event_description(evt, series_title, catalog))
+        vevent.add('uid', str(uuid4()) + '@planning-orchestre')
+        cal.add_component(vevent)
+
+    return cal
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+def main():
+    """Run the whole pipeline: PDFs -> events -> series -> one ICS file per series.
+
+    Reads every ``*.pdf`` in PDF_DIR, loads the website catalog, maps each
+    note to a series and writes the calendars into OUTPUT_DIR. Events whose
+    note has no series go to a '_NON_ASSIGNE' calendar. When two series
+    titles sanitize to the same file name, later ones get a numeric suffix
+    instead of overwriting the first file.
+    """
+    OUTPUT_DIR.mkdir(exist_ok=True)
+
+    # 1. Extract every event from the PDFs
+    all_events = []
+    pdf_files = sorted(PDF_DIR.glob("*.pdf"))
+    print(f"PDFs trouvés : {len(pdf_files)}")
+    for pdf_path in pdf_files:
+        events = extract_events_from_pdf(pdf_path)
+        print(f"  {pdf_path.name}: {len(events)} événements")
+        all_events.extend(events)
+    print(f"Total : {len(all_events)} événements")
+
+    # 2. Scrape the website (cached)
+    print("\nRécupération du catalogue site web...")
+    catalog = scrape_catalog()
+    print(f"  {len(catalog)} événements dans le catalogue")
+
+    # 3. Identify the series through the LLM
+    unique_notes = {e['note'] for e in all_events}
+    print(f"\nNotes uniques : {len(unique_notes)}")
+    print("Identification des séries...")
+    note_to_series = match_notes_to_series(unique_notes, catalog)
+    print(f"  {len(note_to_series)} notes associées")
+
+    # 4. Group the events by series
+    series_events: dict[str, list] = {}
+    unassigned = []
+    for evt in all_events:
+        series = note_to_series.get(evt['note'])
+        # The mapping comes from an LLM: anything but a non-empty string
+        # (None, list, dict...) counts as "no series".
+        if isinstance(series, str) and series:
+            series_events.setdefault(series, []).append(evt)
+        else:
+            unassigned.append(evt)
+
+    print(f"\nSéries identifiées : {len(series_events)}")
+    for name, evts in sorted(series_events.items(), key=lambda x: -len(x[1])):
+        print(f"  [{name}] — {len(evts)} événements")
+
+    if unassigned:
+        print(f"\nNon assignés : {len(unassigned)}")
+        for e in unassigned[:5]:
+            print(f"  {e['date']} | {e['note']!r}")
+        series_events['_NON_ASSIGNE'] = unassigned
+
+    # 5. Write the ICS files
+    print(f"\nGénération des ICS dans {OUTPUT_DIR}/")
+    used_names = set()  # lower-cased: some file systems ignore case
+    for series_title, events in series_events.items():
+        cal = create_ics(series_title, events, catalog)
+        stem = sanitize_filename(series_title)
+        fname = f"{stem}.ics"
+        suffix = 2
+        while fname.lower() in used_names:
+            fname = f"{stem}_{suffix}.ics"
+            suffix += 1
+        used_names.add(fname.lower())
+        with open(OUTPUT_DIR / fname, 'wb') as f:
+            f.write(cal.to_ical())
+        print(f"  {fname} ({len(events)} événements)")
+
+    print("\nTerminé !")
+
+if __name__ == '__main__':
+    main()