# planning2ics/webapp/core.py

"""
core.py - Logique métier pour planning2ics web app.
Adapté de planning2ics.py pour usage web (config injectable, callback de progression).
"""

import re
import json
import time as time_module
from pathlib import Path
from datetime import datetime, date, time, timedelta
from typing import Callable, Optional

import pdfplumber
import requests
from bs4 import BeautifulSoup
from icalendar import Calendar, Event
from uuid import uuid4

# Upper-cased French month names and abbreviations mapped to month numbers.
# Both accented and unaccented spellings are listed because file names are
# upper-cased as-is (accents are kept) before being looked up here.
# Insertion order matters: extract_year_month_from_filename scans the keys in
# this order and stops at the first hit.
MONTH_MAP = {
    "JANV": 1, "JAN": 1, "JANVIER": 1,
    "FEV": 2, "FEVR": 2, "FEVRIER": 2,
    "FÉV": 2, "FÉVR": 2, "FÉVRIER": 2,
    "MARS": 3, "MAR": 3,
    "AVRIL": 4, "AVR": 4,
    "MAI": 5, "JUIN": 6,
    "JUIL": 7, "JUILLET": 7,
    "AOUT": 8, "AOÛT": 8,
    "SEPT": 9, "SEP": 9, "SEPTEMBRE": 9,
    "OCT": 10, "OCTOBRE": 10,
    "NOV": 11, "NOVEMBRE": 11,
    "DEC": 12, "DÉC": 12, "DECEMBRE": 12, "DÉCEMBRE": 12,
}

# Lower-case fragments looked for inside an event title; is_public_event()
# reports a title as public as soon as it contains one of them.
CONCERT_KEYWORDS = {
    'concert',
    'représentation',
    'générale publique',
    'raccord',
    'italienne',
    'scène orch',
}


# ── Utilitaires ───────────────────────────────────────────────────────────────

def normalize_note(note: str) -> str:
    """Collapse every run of whitespace in *note* to one space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', note)
    return single_spaced.strip()

def is_public_event(titre: str) -> bool:
    """Return True when the lower-cased title contains one of CONCERT_KEYWORDS."""
    lowered = titre.lower()
    for keyword in CONCERT_KEYWORDS:
        if keyword in lowered:
            return True
    return False

def sanitize_filename(name: str) -> str:
    """Turn a series title into a file-name stem.

    Characters other than word characters, whitespace and hyphens are removed,
    the result is trimmed, every remaining whitespace character becomes an
    underscore and the stem is cut to 80 characters.

    Returns:
        The sanitized stem, or ``'SERIE_INCONNUE'`` when nothing usable is left.
    """
    clean = re.sub(r'[^\w\s\-éèêàùûîôç]', '', name, flags=re.UNICODE)
    # The filter above keeps any whitespace (tabs, newlines, ...), so replace
    # each whitespace character, not only plain spaces, to keep control
    # characters out of the file name.
    stem = re.sub(r'\s', '_', clean.strip())
    return stem[:80] or 'SERIE_INCONNUE'

def extract_year_month_from_filename(filename: str):
    """Guess the reference year and month of a planning from its file name.

    The year is the first run of four digits found in *filename* (2026 when
    there is none). The month is looked up in MONTH_MAP against the upper-cased
    file stem.

    Returns:
        A ``(year, month)`` tuple; the month defaults to 1 when no month name
        is recognised.
    """
    year_match = re.search(r'(\d{4})', filename)
    year = int(year_match.group(1)) if year_match else 2026
    stem = Path(filename).stem.upper()

    # First pass: only accept a month key that appears as a whole word (a run
    # of letters). A pure substring test misfires on ordinary words, e.g.
    # "SEMAINE" contains "MAI".
    words = set(re.findall(r'[^\W\d_]+', stem))
    for key, val in MONTH_MAP.items():
        if key in words:
            return year, val

    # Second pass: plain substring scan, for names where the month is glued to
    # other letters.
    for key, val in MONTH_MAP.items():
        if key in stem:
            return year, val

    return year, 1

def parse_date(date_str: str, main_year: int, main_month: int) -> Optional[date]:
    """Build a date from a ``day/month`` string, inferring the year.

    A month more than three months after *main_month* is placed in the previous
    year, a month more than three months before it in the following year, and
    anything else in *main_year*.

    Returns:
        The resulting date, or None when the string cannot be turned into one.
    """
    try:
        day, month = (int(piece) for piece in date_str.strip().split('/'))
        year_shift = 0
        if month > main_month + 3:
            year_shift = -1
        elif month < main_month - 3:
            year_shift = 1
        return date(main_year + year_shift, month, day)
    except Exception:
        return None

def parse_time(s: str) -> Optional[time]:
    """Parse a leading ``H:MM`` / ``HH:MM`` clock value from *s*.

    Returns:
        The parsed time, or None when *s* does not start with a clock value or
        when the value is out of range (hour above 23 or minute above 59).
    """
    m = re.match(r'(\d{1,2}):(\d{2})', s.strip())
    if not m:
        return None
    hour, minute = int(m.group(1)), int(m.group(2))
    # The pattern also accepts values such as "24:00" or "12:75"; time() would
    # raise ValueError on them, so report them as unparseable instead.
    if hour > 23 or minute > 59:
        return None
    return time(hour, minute)

def parse_horaires(s: str):
    """Split a schedule cell into ``(start, end)`` times.

    Accepts ``start - end`` (hyphen or en dash) or a lone start time at the
    beginning of the stripped string.

    Returns:
        ``(start, end)``, ``(start, None)`` when only one time is present, or
        ``(None, None)`` when the cell does not begin with a time.
    """
    text = s.strip()

    span = re.match(r'(\d{1,2}:\d{2})\s*[-–]\s*(\d{1,2}:\d{2})', text)
    if span is not None:
        begin, finish = span.group(1), span.group(2)
        return parse_time(begin), parse_time(finish)

    single = re.match(r'(\d{1,2}:\d{2})', text)
    if single is None:
        return None, None
    return parse_time(single.group(1)), None


# ── Extraction PDF ────────────────────────────────────────────────────────────

def extract_events_from_pdf(pdf_path: Path) -> list:
    """Read every table of a planning PDF and return its timed rows as dicts.

    Column layout used: index 1 date, 2 schedule, 3 title, 4 place, then the
    optional 5 note, 6 "dec" and 7 "voy" cells. Rows with fewer than five
    cells, header rows (first cell "jour"), rows marked "repos" and rows
    without a parseable start time are skipped. A row with no date of its own
    reuses the most recent date seen.
    """
    source_name = pdf_path.name
    main_year, main_month = extract_year_month_from_filename(source_name)
    collected = []
    # Last successfully parsed date; carried across rows, tables and pages.
    active_date = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables() or []
            for table in tables:
                for row in table:
                    if not row:
                        continue

                    cols = ['' if not cell else str(cell).strip() for cell in row]
                    if len(cols) < 5 or cols[0].lower() == 'jour':
                        continue

                    # Pad so the optional trailing columns can be unpacked.
                    cols += [''] * (8 - len(cols))
                    date_str, horaires, titre, lieu, note, dec, voy = cols[1:8]

                    if re.match(r'\d{1,2}/\d{2}', date_str):
                        candidate = parse_date(date_str, main_year, main_month)
                        if candidate:
                            active_date = candidate

                    has_clock = re.search(r'\d{1,2}:\d{2}', horaires)
                    if (not active_date
                            or 'repos' in horaires.lower()
                            or not has_clock):
                        continue

                    start_time, end_time = parse_horaires(horaires)
                    if not start_time:
                        continue

                    collected.append({
                        'date':        active_date,
                        'horaires':    horaires,
                        'start_time':  start_time,
                        'end_time':    end_time,
                        'titre':       titre,
                        'lieu':        lieu,
                        'note':        normalize_note(note),
                        'dec':         dec,
                        'voy':         voy,
                        'source_file': source_name,
                    })
    return collected


# ── Scraping site web ─────────────────────────────────────────────────────────

def scrape_catalog(config: dict, cache_dir: Path,
                   log: Optional[Callable] = None, force: bool = False) -> dict:
    """Build (or load from cache) the catalogue of events listed on the web site.

    Args:
        config: Settings; ``config['site']['calendar_url']`` and
            ``config['site']['base_url']`` are read when scraping.
        cache_dir: Directory holding ``website_catalog.json`` (created if
            missing).
        log: Optional callable receiving progress messages.
        force: When True, ignore an existing cache file and scrape again.

    Returns:
        ``{title: {'url', 'description', 'category'}}``.

    Raises:
        requests.HTTPError: If the calendar page itself cannot be fetched.
            Failures on individual event pages are logged and yield an empty
            description instead.
    """
    cache_file = cache_dir / "website_catalog.json"
    cache_dir.mkdir(parents=True, exist_ok=True)

    if not force and cache_file.exists():
        try:
            # The cache is written with ensure_ascii=False, so the encoding is
            # pinned rather than left to the platform default.
            with open(cache_file, encoding='utf-8') as f:
                cached = json.load(f)
        except ValueError:
            # Covers both invalid JSON and undecodable bytes (e.g. a cache
            # written earlier with another default encoding).
            cached = None
        if isinstance(cached, dict):
            if log:
                log("Catalogue site web chargé depuis le cache")
            return cached
        if log:
            log("Cache du catalogue illisible, nouveau scraping")

    headers     = {'User-Agent': 'Mozilla/5.0 (compatible; planning2ics/1.0)'}
    calendar_url = config['site']['calendar_url']
    site_base   = config['site']['base_url']

    if log:
        log("Scraping du site web de l'opéra...")

    resp = requests.get(calendar_url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Collect one entry per event title from the links of the calendar page.
    event_links = {}
    for a in soup.find_all('a', href=True):
        href = a['href']
        if '/evenements/' in href and href.rstrip('/') != f'{site_base}/evenements':
            full_url = href if href.startswith('http') else site_base + href
            h3 = a.find('h3')
            cat_tag = a.find('p')
            title = h3.get_text(strip=True) if h3 else a.get_text(strip=True)
            category = cat_tag.get_text(strip=True) if cat_tag else ''
            if title and len(title) > 3:
                event_links[title] = {'url': full_url, 'category': category}

    catalog = {}
    total = len(event_links)
    if log:
        log(f"{total} événements trouvés sur le site, récupération des descriptions...")

    for i, (title, info) in enumerate(event_links.items()):
        if log and i % 20 == 0:
            log(f"Descriptions : {i}/{total}")
        try:
            r = requests.get(info['url'], headers=headers, timeout=20)
            r.raise_for_status()
            page_soup = BeautifulSoup(r.text, 'html.parser')
            catalog[title] = {
                'url':         info['url'],
                'description': _extract_description(page_soup),
                'category':    info['category'],
            }
            # Small pause between successful page fetches.
            time_module.sleep(0.2)
        except Exception as exc:
            # Best effort: keep the event with an empty description, but make
            # the failure visible instead of swallowing it.
            if log:
                log(f"Description indisponible pour « {title} » : {exc}")
            catalog[title] = {
                'url': info['url'], 'description': '', 'category': info['category']
            }

    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(catalog, f, ensure_ascii=False, indent=2)

    if log:
        log(f"Catalogue mis en cache : {len(catalog)} événements")
    return catalog


def _extract_description(soup: BeautifulSoup) -> str:
    """Pull a textual description out of an event page.

    Tries a few container selectors in order. In the first container found,
    nav/header/footer/button/form elements are removed (this mutates *soup*),
    then up to 40 lines longer than 15 characters are kept. Falls back to the
    first 2000 characters of the page text when no container yields any line.
    """
    noise_tags = ['nav', 'header', 'footer', 'button', 'form']
    for css in ('div.wp-block-group', 'div.entry-content', 'article', 'main'):
        node = soup.select_one(css)
        if not node:
            continue
        for junk in node.find_all(noise_tags):
            junk.decompose()
        stripped = (raw.strip() for raw in node.get_text('\n', strip=True).splitlines())
        kept = [text for text in stripped if len(text) > 15][:40]
        if kept:
            return '\n'.join(kept)
    return soup.get_text('\n', strip=True)[:2000]


# ── LLM ───────────────────────────────────────────────────────────────────────

def _llm_call(prompt: str, ollama_url: str, model: str) -> str:
    """Send *prompt* as a single user message to ``{ollama_url}/api/chat``.

    The reply is streamed as one JSON object per line; the ``message.content``
    fragments are concatenated until a chunk flagged ``done`` arrives.

    Returns:
        The concatenated reply text (possibly empty).

    Raises:
        requests.HTTPError: On a non-success HTTP status.
        json.JSONDecodeError: If a streamed line is not valid JSON.
    """
    payload = {
        "model":    model,
        "messages": [{"role": "user", "content": prompt}],
        "stream":   True,
        "options":  {"temperature": 0.05, "num_predict": 16384},
        "think":    False,
    }
    fragments = []
    # The context manager releases the connection even when the loop stops
    # early (on "done") or a line fails to parse; with stream=True it would
    # otherwise stay checked out until the body is fully consumed.
    with requests.post(
        f"{ollama_url}/api/chat",
        json=payload,
        stream=True,
        timeout=600,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            # "or ''" guards against an explicit null content.
            fragments.append(chunk.get('message', {}).get('content', '') or '')
            if chunk.get('done'):
                break
    return ''.join(fragments)


def _apply_parallel_heuristic(note: str, catalog: dict) -> Optional[str]:
    """Resolve a note of the form ``(A) : "Title"`` / ``(B') : Title``.

    Returns:
        None when the note does not have that form; otherwise the first catalog
        title that contains, or is contained in, the text after the prefix
        (case-insensitively); failing that, the text itself with surrounding
        quotes removed.
    """
    hit = re.match(r"^\([AB]'?\)\s*:\s*[\"']?(.+?)[\"']?\s*$", note, re.IGNORECASE)
    if hit is None:
        return None

    label = hit.group(1).strip()
    needle = label.lower()
    found = next(
        (
            title for title in catalog
            if needle in title.lower() or title.lower() in needle
        ),
        None,
    )
    if found is not None:
        return found
    return label.strip('"\'')


def cluster_notes_global(unique_notes: set, catalog: dict, config: dict,
                          cache_dir: Path, log: Optional[Callable] = None,
                          force: bool = False) -> dict:
    """Map every planning note to an official series title with one LLM call.

    Args:
        unique_notes: Distinct notes found in the planning.
        catalog: Site catalogue; only its keys (titles) are used here.
        config: Settings; ``config['ollama']['cluster_model']`` and
            ``config['ollama']['url']`` are read.
        cache_dir: Directory holding ``series_mapping.json`` (created if
            missing).
        log: Optional callable receiving progress messages.
        force: When True, ignore an existing cache file.

    Returns:
        ``{note: title}`` containing only non-empty string titles.

    Raises:
        ValueError: If the model reply contains no JSON object.
    """
    cache_file = cache_dir / "series_mapping.json"
    cache_dir.mkdir(parents=True, exist_ok=True)

    if not force and cache_file.exists():
        try:
            # Written with ensure_ascii=False, so pin the encoding instead of
            # relying on the platform default.
            with open(cache_file, encoding='utf-8') as f:
                cached = json.load(f)
        except ValueError:
            # Invalid JSON or undecodable bytes: rebuild the mapping.
            cached = None
        if isinstance(cached, dict):
            if log:
                log("Mapping des séries chargé depuis le cache")
            return cached
        if log:
            log("Cache des séries illisible, nouvelle identification")

    catalog_titles = sorted(catalog.keys())
    titles_list    = '\n'.join(f'- "{t}"' for t in catalog_titles)
    notes_list     = '\n'.join(f'- {repr(n)}' for n in sorted(unique_notes) if n.strip())

    prompt = f"""Tu analyses le planning interne de l'Opéra Orchestre National Montpellier.

Voici les titres OFFICIELS des événements de la saison (depuis le site web) :
{titles_list}

Voici toutes les notes du planning interne (certaines sont des variantes de la même série) :
{notes_list}

Ta tâche : associer CHAQUE note à UN titre officiel.
Règles IMPORTANTES :
1. Les notes listant les mêmes compositeurs (ordre ou sous-titres différents) → MÊME série
2. Les préfixes "(A) :", "(B) :", "(A') :", "(B') :" → séries PARALLÈLES DIFFÉRENTES
   Ex: '(A) : "Magdalena"' → "Magdalena" ; '(B) : "Élémentaire"' → "Élémentaire, mon cher !"
3. Les annotations entre parenthèses (captation, présence de...) ne changent PAS la série
4. Les répétitions partielles (Cordes, Vents...) = même série que le Tutti

Réponds UNIQUEMENT avec un JSON valide, sans texte autour :
{{
  "matches": {{
    "note exacte telle quelle": "Titre Officiel du Site",
    ...
  }}
}}"""

    model = config['ollama']['cluster_model']
    if log:
        log(f"Identification des séries avec l'IA ({model})...")

    content = _llm_call(prompt, config['ollama']['url'], model)

    json_match = re.search(r'\{[\s\S]*\}', content)
    if not json_match:
        raise ValueError("Pas de JSON dans la réponse LLM")

    raw = json_match.group()
    try:
        candidates = json.loads(raw).get('matches', {})
    except json.JSONDecodeError:
        # Salvage "key": "value" string pairs from a malformed reply.
        candidates = {}
        for m in re.finditer(r'"((?:[^"\\]|\\.)*)"\s*:\s*"((?:[^"\\]|\\.)*)"', raw):
            candidates[m.group(1)] = m.group(2)

    # Keep only usable pairs: a null/empty/non-string title would otherwise
    # count as "assigned" and prevent the caller's fallbacks from handling
    # that note, and a list/dict title cannot be used as a grouping key.
    if not isinstance(candidates, dict):
        candidates = {}
    result = {
        note: title for note, title in candidates.items()
        if isinstance(title, str) and title.strip()
    }

    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    if log:
        log(f"{len(result)} notes associées à des séries")
    return result


def match_notes_to_series(unique_notes: set, catalog: dict, config: dict,
                           cache_dir: Path, log: Optional[Callable] = None,
                           force_series: bool = False) -> dict:
    """Assign a series title to each note, with two fallbacks.

    1. Global LLM clustering (``cluster_notes_global``, possibly cached).
    2. The ``(A)/(B)`` prefix heuristic for notes still unassigned.
    3. One retry call to ``config['ollama']['local_model']`` for the rest.

    Returns:
        The ``{note: title}`` mapping; notes nobody could assign are absent.
    """
    note_to_series = cluster_notes_global(
        unique_notes, catalog, config, cache_dir, log, force_series
    )

    # (A)/(B) heuristic for notes the clustering did not assign.
    for note in unique_notes:
        if note.strip() and note not in note_to_series:
            guess = _apply_parallel_heuristic(note, catalog)
            if guess:
                note_to_series[note] = guess

    # Retry with the local model for whatever is left. Sorted so the prompt
    # does not depend on set iteration order.
    still_missing = sorted(
        n for n in unique_notes if n.strip() and n not in note_to_series
    )
    if still_missing:
        if log:
            log(f"Retry pour {len(still_missing)} notes non assignées...")
        titles_str = '\n'.join(f'- "{t}"' for t in sorted(catalog.keys()))
        notes_str  = '\n'.join(f'- {repr(n)}' for n in still_missing)
        prompt = (
            f"Associe ces notes à des titres officiels.\n"
            f"Titres:\n{titles_str}\nNotes:\n{notes_str}\n"
            f'Réponds UNIQUEMENT avec JSON: {{"matches": {{"note": "Titre"}}}}'
        )
        content = _llm_call(prompt, config['ollama']['url'], config['ollama']['local_model'])
        j = re.search(r'\{[\s\S]*\}', content)
        if j:
            try:
                note_to_series.update(json.loads(j.group()).get('matches', {}))
            except (ValueError, TypeError, AttributeError) as exc:
                # The retry stays best-effort, but an unusable reply is
                # reported rather than dropped silently.
                if log:
                    log(f"Réponse du retry inexploitable, ignorée : {exc}")
        elif log:
            log("Pas de JSON dans la réponse du retry, notes laissées sans série")

    return note_to_series


# ── Génération ICS ────────────────────────────────────────────────────────────

def _build_description(evt: dict, series_title: str, catalog: dict) -> str:
    """Compose the multi-line DESCRIPTION text of one event.

    Public events get the catalogue description of the series (cut to 1500
    characters) or, failing that, the note; other events get the note (if any)
    and the event type. The "dec" and "voy" values, when present, and the
    source file name are appended.
    """
    parts = []

    if not is_public_event(evt['titre']):
        if evt['note']:
            parts.append(f"Œuvres : {evt['note']}")
        parts.append(f"Type : {evt['titre']}")
    else:
        entry = catalog.get(series_title, {})
        blurb = entry.get('description', '')
        if blurb:
            parts.append(blurb[:1500])
        else:
            parts.append(f"Programme : {evt['note']}")

    if evt['dec']:
        parts.append(f"Durée déclarée : {evt['dec']}")
    if evt['voy']:
        parts.append(f"Déplacement : {evt['voy']}h de trajet")
    parts.append(f"Source : {evt['source_file']}")

    return '\n'.join(parts)


def _create_ics_bytes(series_title: str, events: list, catalog: dict) -> bytes:
    """Serialize the events of one series as an iCalendar document.

    Events are emitted in (date, start time) order. When an event has no end
    time, the end is the start plus the ``H:MM`` duration found in ``dec``,
    or two hours by default.

    Returns:
        The calendar as bytes, as produced by ``Calendar.to_ical()``.
    """
    # Imported locally: the module-level datetime import does not bring in
    # `timezone`.
    from datetime import timezone

    cal = Calendar()
    cal.add('prodid', '-//Opéra Orchestre National Montpellier//planning2ics//FR')
    cal.add('version', '2.0')
    cal.add('x-wr-calname', series_title)
    cal.add('x-wr-timezone', 'Europe/Paris')

    # One creation timestamp shared by every event of this export.
    generated_at = datetime.now(timezone.utc)

    for evt in sorted(events, key=lambda e: (e['date'], e['start_time'])):
        vevent = Event()
        start_dt = datetime.combine(evt['date'], evt['start_time'])
        vevent.add('dtstart', start_dt)

        if evt['end_time']:
            end_dt = datetime.combine(evt['date'], evt['end_time'])
            # An end at or before the start means the session runs past
            # midnight (e.g. 22:00 - 00:30): move the end to the next day so
            # DTEND is not earlier than DTSTART.
            if end_dt <= start_dt:
                end_dt += timedelta(days=1)
        else:
            dec_m = re.match(r'(\d{1,2}):(\d{2})', evt['dec'])
            duration = (
                timedelta(hours=int(dec_m.group(1)), minutes=int(dec_m.group(2)))
                if dec_m else timedelta(hours=2)
            )
            end_dt = start_dt + duration

        vevent.add('dtend', end_dt)
        # DTSTAMP is a required VEVENT property in RFC 5545.
        vevent.add('dtstamp', generated_at)
        vevent.add('summary', f"{evt['titre']} – {series_title}")
        if evt['lieu']:
            vevent.add('location', evt['lieu'])
        vevent.add('description', _build_description(evt, series_title, catalog))
        vevent.add('uid', str(uuid4()) + '@planning-orchestre')
        cal.add_component(vevent)

    return cal.to_ical()


# ── Point d'entrée principal ──────────────────────────────────────────────────

def process_pdfs(pdf_paths: list, config: dict, data_dir: Path,
                 log: Optional[Callable] = None) -> dict:
    """Process a list of planning PDFs into one ICS payload per series.

    Steps: extract events from each PDF (dropping duplicates with the same
    date, start time, title and note), load the site catalogue, map notes to
    series, then group the events and serialize each group.

    Args:
        pdf_paths: Paths of the PDFs to read.
        config: Settings passed on to the scraping and LLM steps.
        data_dir: Base directory; caches live in ``data_dir / "cache"``.
        log: Optional callable receiving progress messages.

    Returns:
        ``{series_title: {'filename', 'bytes', 'event_count'}}``. File names
        are unique within the result.
    """
    cache_dir = data_dir / "cache"

    # 1. Extraction
    if log:
        log(f"Extraction de {len(pdf_paths)} PDF(s)...")
    all_events = []
    seen = set()
    for i, pdf_path in enumerate(pdf_paths):
        if log:
            log(f"Extraction {i+1}/{len(pdf_paths)} : {pdf_path.name}")
        for evt in extract_events_from_pdf(pdf_path):
            key = (evt['date'], evt['start_time'], evt['titre'], evt['note'])
            if key not in seen:
                seen.add(key)
                all_events.append(evt)
    if log:
        log(f"{len(all_events)} événements extraits au total")

    # 2. Web site catalogue
    catalog = scrape_catalog(config, cache_dir, log)

    # 3. Series identification
    unique_notes = {e['note'] for e in all_events}
    if log:
        log(f"{len(unique_notes)} notes uniques à analyser...")
    note_to_series = match_notes_to_series(unique_notes, catalog, config, cache_dir, log)

    # 4. Grouping and ICS generation
    series_events: dict[str, list] = {}
    unassigned = 0
    for evt in all_events:
        s = note_to_series.get(evt['note'])
        # The mapping comes from an LLM reply: only a non-empty string can be
        # used as a series title (and as a dict key).
        if isinstance(s, str) and s:
            series_events.setdefault(s, []).append(evt)
        else:
            unassigned += 1

    if log and unassigned:
        # Make dropped events visible instead of losing them silently.
        log(f"{unassigned} événement(s) sans série identifiée, ignoré(s)")

    if log:
        log(f"Génération de {len(series_events)} fichiers ICS...")

    result = {}
    used_stems = set()
    for series_title, events in series_events.items():
        # Different titles can sanitize to the same stem; add a numeric suffix
        # so one file cannot overwrite another. Compared case-insensitively
        # for file systems that ignore case.
        base = sanitize_filename(series_title)
        stem, n = base, 2
        while stem.casefold() in used_stems:
            stem = f"{base}_{n}"
            n += 1
        used_stems.add(stem.casefold())

        result[series_title] = {
            'filename':    stem + '.ics',
            'bytes':       _create_ics_bytes(series_title, events, catalog),
            'event_count': len(events),
        }

    if log:
        log(f"Terminé : {len(result)} séries générées")
    return result