Files
planning2ics/webapp/core.py
T
sylvain 59cddee470 Fix: déduplication des événements pour éviter les doublons
Ajout d'une clé (date, start_time, titre, note) pour éviter qu'un même
événement soit ajouté plusieurs fois (PDFs qui se chevauchent ou lignes
dupliquées par pdfplumber sur tables multi-pages).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-08 15:27:02 +01:00

476 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
core.py - Logique métier pour planning2ics web app.
Adapté de planning2ics.py pour usage web (config injectable, callback de progression).
"""
import re
import json
import time as time_module
from pathlib import Path
from datetime import datetime, date, time, timedelta
from typing import Callable, Optional
import pdfplumber
import requests
from bs4 import BeautifulSoup
from icalendar import Calendar, Event
from uuid import uuid4
# Upper-case French month names and abbreviations mapped to their month number.
# extract_year_month_from_filename() looks these keys up in the PDF file stem.
# Both accented and unaccented spellings are listed because filenames are not
# normalized consistently; February previously lacked its accented variants.
MONTH_MAP = {
    "JANV": 1, "JAN": 1, "JANVIER": 1,
    "FEV": 2, "FEVR": 2, "FEVRIER": 2,
    "FÉV": 2, "FÉVR": 2, "FÉVRIER": 2,
    "MARS": 3, "MAR": 3,
    "AVRIL": 4, "AVR": 4,
    "MAI": 5, "JUIN": 6,
    "JUIL": 7, "JUILLET": 7,
    "AOUT": 8, "AOÛT": 8,
    "SEPT": 9, "SEP": 9, "SEPTEMBRE": 9,
    "OCT": 10, "OCTOBRE": 10,
    "NOV": 11, "NOVEMBRE": 11,
    "DEC": 12, "DÉC": 12, "DECEMBRE": 12, "DÉCEMBRE": 12,
}
# Lower-case substrings: is_public_event() treats a title containing any of
# them as a public event.
CONCERT_KEYWORDS = {
    'concert',
    'représentation',
    'générale publique',
    'raccord',
    'italienne',
    'scène orch',
}
# ── Utilitaires ───────────────────────────────────────────────────────────────
def normalize_note(note: str) -> str:
    """Collapse every whitespace run in *note* to one space and trim both ends."""
    # str.split() with no argument splits on whitespace runs and drops the
    # leading/trailing empties, so joining the pieces gives the same result.
    return ' '.join(note.split())
def is_public_event(titre: str) -> bool:
    """Return True when the lower-cased title contains any CONCERT_KEYWORDS entry."""
    lowered = titre.lower()
    for keyword in CONCERT_KEYWORDS:
        if keyword in lowered:
            return True
    return False
def sanitize_filename(name: str) -> str:
    """Turn a series title into a filesystem-friendly file stem.

    Characters other than word characters, whitespace, hyphens and the listed
    accented letters are removed. Each remaining whitespace character becomes
    an underscore and the result is capped at 80 characters.

    Args:
        name: Free-form title.

    Returns:
        The sanitized stem, or ``'SERIE_INCONNUE'`` when nothing is left.
    """
    cleaned = re.sub(r'[^\w\s\-éèêàùûîôç]', '', name, flags=re.UNICODE).strip()
    # Replace every whitespace character, not only U+0020: the first pattern
    # keeps tabs and newlines, which must not end up inside a filename.
    # One underscore per character keeps the output unchanged for plain spaces.
    safe = re.sub(r'\s', '_', cleaned)
    return safe[:80] or 'SERIE_INCONNUE'
def extract_year_month_from_filename(filename: str):
    """Guess the reference year and month of a planning from its file name.

    The year is the first run of four digits in *filename* (2026 when there is
    none). The month is looked up in MONTH_MAP, first as a whole alphabetic
    word of the file stem, then as a substring as a last resort.

    Args:
        filename: File name of the PDF, with or without directories.

    Returns:
        A ``(year, month)`` tuple of ints; month defaults to 1 when no month
        name is recognized.
    """
    import unicodedata

    def _fold(text: str) -> str:
        # Upper-case and drop diacritics so "Février", "FEVRIER" and an
        # NFD-decomposed "Fe\u0301vrier" all compare equal.
        decomposed = unicodedata.normalize('NFD', text.upper())
        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    year_match = re.search(r'(\d{4})', filename)
    # NOTE(review): the 2026 fallback is hard-coded in the original; confirm it
    # should not follow the current year instead.
    year = int(year_match.group(1)) if year_match else 2026

    stem = _fold(Path(filename).stem)
    folded_months = {}
    for key, val in MONTH_MAP.items():
        folded_months.setdefault(_fold(key), val)

    # Whole-word match first: a raw substring test finds "MAI" inside
    # "SEMAINE" or "MAR" inside "MARDI" and returns the wrong month.
    for word in re.findall(r'[A-Z]+', stem):
        if word in folded_months:
            return year, folded_months[word]

    # Fallback for names glued to other letters ("PLANNINGMARS"): substring
    # search, longest key first so "JUIN" beats an accidental "MAI".
    for key in sorted(folded_months, key=len, reverse=True):
        if key in stem:
            return year, folded_months[key]

    return year, 1
def parse_date(date_str: str, main_year: int, main_month: int) -> Optional[date]:
    """Build a date from a ``day/month`` string, inferring the year.

    A month more than three months after *main_month* is placed in the
    previous year; a month more than three months before it is placed in the
    next year; otherwise *main_year* is used.

    Returns:
        The resulting date, or None when the string cannot be parsed or does
        not denote a valid calendar day.
    """
    try:
        numbers = [int(piece) for piece in date_str.strip().split('/')]
        day, month = numbers
        if month > main_month + 3:
            year_shift = -1
        elif month < main_month - 3:
            year_shift = 1
        else:
            year_shift = 0
        return date(main_year + year_shift, month, day)
    except Exception:
        return None
def parse_time(s: str) -> Optional[time]:
    """Parse a leading ``H:MM`` or ``HH:MM`` into a ``datetime.time``.

    Args:
        s: Text starting (after optional whitespace) with a clock time.

    Returns:
        The parsed time, or None when the text does not start with a clock
        time or the values are out of range (e.g. ``24:00``, ``12:75``).
    """
    m = re.match(r'(\d{1,2}):(\d{2})', s.strip())
    if not m:
        return None
    try:
        return time(int(m.group(1)), int(m.group(2)))
    except ValueError:
        # time() rejects hour > 23 / minute > 59. Honour the Optional contract
        # instead of letting one odd table cell abort the whole extraction.
        return None
def parse_horaires(s: str):
    """Split a schedule cell into start and end times.

    Accepts ``HH:MM-HH:MM`` (optional spaces around the separator) or a lone
    ``HH:MM``. The separator may be an ASCII hyphen or a typographic dash.

    Returns:
        ``(start, end)`` as produced by parse_time(); ``end`` is None for a
        lone time, and both are None when no time is found.
    """
    s = s.strip()
    # Besides "-", accept U+2010..U+2015 (hyphen, figure/en/em dash, bar) and
    # U+2212 (minus). Escapes are used so the source holds no look-alike
    # characters.
    m = re.match(
        r'(\d{1,2}:\d{2})\s*[-\u2010-\u2015\u2212]\s*(\d{1,2}:\d{2})', s
    )
    if m:
        return parse_time(m.group(1)), parse_time(m.group(2))
    m = re.match(r'(\d{1,2}:\d{2})', s)
    if m:
        return parse_time(m.group(1)), None
    return None, None
# ── Extraction PDF ────────────────────────────────────────────────────────────
def extract_events_from_pdf(pdf_path: Path) -> list:
    """Read every table of a planning PDF and return its timed events.

    Columns are read by position: 1 = date, 2 = schedule, 3 = title,
    4 = place, then optionally 5 = note, 6 = "dec" and 7 = "voy". Rows whose
    first cell is "jour" (header) or that have fewer than five cells are
    skipped. A row without its own date reuses the last date seen, including
    across tables and pages. Rows mentioning "repos", or without a parsable
    start time, are skipped.

    Returns:
        A list of dicts with keys date, horaires, start_time, end_time, titre,
        lieu, note (whitespace-normalized), dec, voy and source_file.
    """
    main_year, main_month = extract_year_month_from_filename(pdf_path.name)
    collected = []
    last_date = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables() or []
            for table in tables:
                for row in table:
                    if not row:
                        continue
                    cells = [str(cell).strip() if cell else '' for cell in row]
                    if cells[0].lower() == 'jour' or len(cells) < 5:
                        continue

                    # Pad to eight columns so optional trailing cells read as ''.
                    padded = (cells + [''] * 8)[:8]
                    _, date_str, horaires, titre, lieu, note, dec, voy = padded

                    if date_str and re.match(r'\d{1,2}/\d{2}', date_str):
                        candidate = parse_date(date_str, main_year, main_month)
                        if candidate:
                            last_date = candidate
                    if not last_date:
                        continue

                    if 'repos' in horaires.lower():
                        continue
                    if not re.search(r'\d{1,2}:\d{2}', horaires):
                        continue

                    start_time, end_time = parse_horaires(horaires)
                    if not start_time:
                        continue

                    collected.append({
                        'date': last_date,
                        'horaires': horaires,
                        'start_time': start_time,
                        'end_time': end_time,
                        'titre': titre,
                        'lieu': lieu,
                        'note': normalize_note(note),
                        'dec': dec,
                        'voy': voy,
                        'source_file': pdf_path.name,
                    })
    return collected
# ── Scraping site web ─────────────────────────────────────────────────────────
def scrape_catalog(config: dict, cache_dir: Path,
                   log: Callable = None, force: bool = False) -> dict:
    """Build (or load from cache) the catalogue of events published on the site.

    Args:
        config: Must provide ``config['site']['calendar_url']`` and
            ``config['site']['base_url']``.
        cache_dir: Directory holding ``website_catalog.json``; created if missing.
        log: Optional callable receiving progress messages.
        force: When True, ignore any existing cache and scrape again.

    Returns:
        ``{title: {'url', 'description', 'category'}}``. The description is
        empty when an event page could not be fetched or parsed.

    Raises:
        requests.HTTPError / requests.RequestException: when the calendar page
            itself cannot be fetched.
    """
    cache_file = cache_dir / "website_catalog.json"
    cache_dir.mkdir(parents=True, exist_ok=True)

    if not force and cache_file.exists():
        try:
            # The cache is written with ensure_ascii=False, so the encoding
            # must be pinned rather than left to the platform locale.
            with open(cache_file, encoding='utf-8') as f:
                cached = json.load(f)
        except (UnicodeDecodeError, json.JSONDecodeError):
            cached = None
        if isinstance(cached, dict):
            if log:
                log("Catalogue site web chargé depuis le cache")
            return cached
        # An unreadable cache falls through to a fresh scrape instead of
        # failing every run until the file is deleted by hand.
        if log:
            log("Cache du catalogue illisible, nouveau scraping...")

    headers = {'User-Agent': 'Mozilla/5.0 (compatible; planning2ics/1.0)'}
    calendar_url = config['site']['calendar_url']
    site_base = config['site']['base_url'].rstrip('/')
    index_url = f'{site_base}/evenements'

    if log:
        log("Scraping du site web de l'opéra...")
    resp = requests.get(calendar_url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    event_links = {}
    for a in soup.find_all('a', href=True):
        href = a['href']
        if '/evenements/' not in href:
            continue
        full_url = href if href.startswith('http') else site_base + href
        # Compare the resolved URL so a relative link to the listing page is
        # excluded just like an absolute one.
        if full_url.rstrip('/') == index_url:
            continue
        h3 = a.find('h3')
        cat_tag = a.find('p')
        title = h3.get_text(strip=True) if h3 else a.get_text(strip=True)
        category = cat_tag.get_text(strip=True) if cat_tag else ''
        if title and len(title) > 3:
            event_links[title] = {'url': full_url, 'category': category}

    catalog = {}
    total = len(event_links)
    if log:
        log(f"{total} événements trouvés sur le site, récupération des descriptions...")

    for i, (title, info) in enumerate(event_links.items()):
        if log and i % 20 == 0:
            log(f"Descriptions : {i}/{total}")
        try:
            r = requests.get(info['url'], headers=headers, timeout=20)
            r.raise_for_status()
            page_soup = BeautifulSoup(r.text, 'html.parser')
            catalog[title] = {
                'url': info['url'],
                'description': _extract_description(page_soup),
                'category': info['category'],
            }
            # Small pause between page fetches.
            time_module.sleep(0.2)
        except Exception as exc:
            # Best effort: one broken page must not abort the whole scrape,
            # but the failure is reported instead of being swallowed.
            if log:
                log(f"Description indisponible pour « {title} » : {exc}")
            catalog[title] = {
                'url': info['url'], 'description': '', 'category': info['category']
            }

    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(catalog, f, ensure_ascii=False, indent=2)
    if log:
        log(f"Catalogue mis en cache : {len(catalog)} événements")
    return catalog
def _extract_description(soup: BeautifulSoup) -> str:
    """Extract a plain-text description from an event page.

    Tries a few container selectors in order. In the first one that yields
    text, nav/header/footer/button/form elements are removed (the soup is
    modified in place) and up to 40 lines longer than 15 characters are kept.
    Falls back to the first 2000 characters of the whole page text.
    """
    noise_tags = ['nav', 'header', 'footer', 'button', 'form']
    for selector in ('div.wp-block-group', 'div.entry-content', 'article', 'main'):
        container = soup.select_one(selector)
        if not container:
            continue
        for noisy in container.find_all(noise_tags):
            noisy.decompose()
        kept = []
        for raw_line in container.get_text('\n', strip=True).splitlines():
            text = raw_line.strip()
            if len(text) > 15:
                kept.append(text)
        kept = kept[:40]
        if kept:
            return '\n'.join(kept)
    return soup.get_text('\n', strip=True)[:2000]
# ── LLM ───────────────────────────────────────────────────────────────────────
def _llm_call(prompt: str, ollama_url: str, model: str) -> str:
    """Send one user message to the Ollama chat endpoint and return the reply.

    The answer is streamed and the ``message.content`` fragments are
    concatenated until a chunk with ``done`` set is received.

    Args:
        prompt: Content of the single user message.
        ollama_url: Base URL of the Ollama server (no trailing ``/api/chat``).
        model: Model name passed through to the server.

    Returns:
        The full generated text.

    Raises:
        requests.HTTPError / requests.RequestException: on transport or HTTP
            status errors.
        RuntimeError: when the stream carries an ``error`` object.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
        "options": {"temperature": 0.05, "num_predict": 16384},
        "think": False,
    }
    fragments = []
    # Context manager: the streamed connection is released even when the loop
    # exits early on "done" or an exception is raised.
    with requests.post(
        f"{ollama_url}/api/chat",
        json=payload,
        stream=True,
        timeout=600,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            if chunk.get('error'):
                # Fail loudly: a truncated answer would otherwise be parsed
                # (and cached) as if it were complete.
                raise RuntimeError(f"Erreur Ollama : {chunk['error']}")
            fragments.append((chunk.get('message') or {}).get('content') or '')
            if chunk.get('done'):
                break
    return ''.join(fragments)
def _apply_parallel_heuristic(note: str, catalog: dict) -> Optional[str]:
m = re.match(r"^\([AB]'?\)\s*:\s*[\"']?(.+?)[\"']?\s*$", note, re.IGNORECASE)
if not m:
return None
inner = m.group(1).strip().lower()
for title in catalog:
if inner in title.lower() or title.lower() in inner:
return title
return m.group(1).strip().strip('"\'')
def cluster_notes_global(unique_notes: set, catalog: dict, config: dict,
                         cache_dir: Path, log: Callable = None,
                         force: bool = False) -> dict:
    """Ask the LLM to map every planning note to an official series title.

    Args:
        unique_notes: Distinct note strings found in the plannings.
        catalog: Catalogue whose keys are the official titles.
        config: Must provide ``config['ollama']['url']`` and
            ``config['ollama']['cluster_model']``.
        cache_dir: Directory holding ``series_mapping.json``; created if missing.
        log: Optional callable receiving progress messages.
        force: When True, ignore any cached mapping.

    Returns:
        ``{note: title}`` with string keys and values only.

    Raises:
        ValueError: when the LLM answer contains no JSON object at all.
    """
    cache_file = cache_dir / "series_mapping.json"
    cache_dir.mkdir(parents=True, exist_ok=True)

    if not force and cache_file.exists():
        try:
            # Written with ensure_ascii=False, so read it back as UTF-8
            # whatever the platform locale is.
            with open(cache_file, encoding='utf-8') as f:
                cached = json.load(f)
        except (UnicodeDecodeError, json.JSONDecodeError):
            cached = None
        if isinstance(cached, dict):
            if log:
                log("Mapping des séries chargé depuis le cache")
            return cached

    catalog_titles = sorted(catalog.keys())
    titles_list = '\n'.join(f'- "{t}"' for t in catalog_titles)
    notes_list = '\n'.join(f'- {repr(n)}' for n in sorted(unique_notes) if n.strip())
    # The "Ex:" line had lost the arrows between each note and its title,
    # which made the example unreadable; they are restored here.
    prompt = f"""Tu analyses le planning interne de l'Opéra Orchestre National Montpellier.
Voici les titres OFFICIELS des événements de la saison (depuis le site web) :
{titles_list}
Voici toutes les notes du planning interne (certaines sont des variantes de la même série) :
{notes_list}
Ta tâche : associer CHAQUE note à UN titre officiel.
Règles IMPORTANTES :
1. Les notes listant les mêmes compositeurs (ordre ou sous-titres différents) → MÊME série
2. Les préfixes "(A) :", "(B) :", "(A') :", "(B') :" → séries PARALLÈLES DIFFÉRENTES
Ex: '(A) : "Magdalena"' → "Magdalena" ; '(B) : "Élémentaire"' → "Élémentaire, mon cher !"
3. Les annotations entre parenthèses (captation, présence de...) ne changent PAS la série
4. Les répétitions partielles (Cordes, Vents...) = même série que le Tutti
Réponds UNIQUEMENT avec un JSON valide, sans texte autour :
{{
"matches": {{
"note exacte telle quelle": "Titre Officiel du Site",
...
}}
}}"""

    model = config['ollama']['cluster_model']
    if log:
        log(f"Identification des séries avec l'IA ({model})...")
    content = _llm_call(prompt, config['ollama']['url'], model)

    json_match = re.search(r'\{[\s\S]*\}', content)
    if not json_match:
        raise ValueError("Pas de JSON dans la réponse LLM")
    raw = json_match.group()

    try:
        parsed = json.loads(raw)
        matches = parsed.get('matches', {}) if isinstance(parsed, dict) else {}
    except json.JSONDecodeError:
        # Salvage path for almost-valid JSON: pick up every "key": "value"
        # string pair that can still be recognized.
        matches = {}
        for m in re.finditer(r'"((?:[^"\\]|\\.)*)"\s*:\s*"((?:[^"\\]|\\.)*)"', raw):
            matches[m.group(1)] = m.group(2)

    # Keep only str -> str pairs: anything else (null, nested objects) would
    # break the grouping and filename generation downstream.
    result = {}
    if isinstance(matches, dict):
        result = {
            note: title for note, title in matches.items()
            if isinstance(note, str) and isinstance(title, str)
        }

    if result:
        # An empty mapping is not cached, otherwise every later run would
        # load it and never query the model again.
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
    if log:
        log(f"{len(result)} notes associées à des séries")
    return result
def match_notes_to_series(unique_notes: set, catalog: dict, config: dict,
                          cache_dir: Path, log: Callable = None,
                          force_series: bool = False) -> dict:
    """Map every note to a series title in three passes.

    1. Global LLM clustering (cluster_notes_global, possibly cached).
    2. The (A)/(B) prefix heuristic for notes still unassigned.
    3. One retry LLM call, using ``config['ollama']['local_model']``, for
       whatever remains.

    Returns:
        ``{note: title}``; notes that no pass could resolve are absent.
    """
    note_to_series = cluster_notes_global(
        unique_notes, catalog, config, cache_dir, log, force_series
    )

    # (A)/(B) heuristic for unassigned notes.
    for note in unique_notes:
        if note not in note_to_series and note.strip():
            guess = _apply_parallel_heuristic(note, catalog)
            if guess:
                note_to_series[note] = guess

    # Retry with the local model for the notes that are still missing.
    still_missing = [n for n in unique_notes if n.strip() and n not in note_to_series]
    if still_missing:
        if log:
            log(f"Retry pour {len(still_missing)} notes non assignées...")
        titles_str = '\n'.join(f'- "{t}"' for t in sorted(catalog.keys()))
        notes_str = '\n'.join(f'- {repr(n)}' for n in still_missing)
        prompt = (
            f"Associe ces notes à des titres officiels.\n"
            f"Titres:\n{titles_str}\nNotes:\n{notes_str}\n"
            f'Réponds UNIQUEMENT avec JSON: {{"matches": {{"note": "Titre"}}}}'
        )
        content = _llm_call(prompt, config['ollama']['url'], config['ollama']['local_model'])

        retry_matches = {}
        found = re.search(r'\{[\s\S]*\}', content)
        if found:
            try:
                parsed = json.loads(found.group())
            except json.JSONDecodeError as exc:
                # Still best effort, but the failure is reported instead of
                # being swallowed silently.
                parsed = None
                if log:
                    log(f"Réponse du retry illisible : {exc}")
            if isinstance(parsed, dict) and isinstance(parsed.get('matches'), dict):
                retry_matches = parsed['matches']

        # Accept answers only for the notes that were asked about, so the
        # retry can never overwrite an assignment made by an earlier pass.
        wanted = set(still_missing)
        for note, title in retry_matches.items():
            if note in wanted and isinstance(title, str) and title:
                note_to_series[note] = title

    return note_to_series
# ── Génération ICS ────────────────────────────────────────────────────────────
def _build_description(evt: dict, series_title: str, catalog: dict) -> str:
    """Compose the multi-line DESCRIPTION text of one calendar event.

    Public events get the catalogue description (first 1500 characters), or
    the note when the catalogue has none. Other events get the note (if any)
    and the event type. The declared duration, travel time and source file
    follow in both cases.
    """
    if is_public_event(evt['titre']):
        official = catalog.get(series_title, {}).get('description', '')
        parts = [official[:1500] if official else f"Programme : {evt['note']}"]
    else:
        parts = [f"Œuvres : {evt['note']}"] if evt['note'] else []
        parts.append(f"Type : {evt['titre']}")

    if evt['dec']:
        parts.append(f"Durée déclarée : {evt['dec']}")
    if evt['voy']:
        parts.append(f"Déplacement : {evt['voy']}h de trajet")
    parts.append(f"Source : {evt['source_file']}")
    return '\n'.join(parts)
def _create_ics_bytes(series_title: str, events: list, catalog: dict) -> bytes:
    """Serialize the events of one series as an iCalendar document.

    Args:
        series_title: Calendar name, also appended to every event summary.
        events: Event dicts as produced by extract_events_from_pdf().
        catalog: Website catalogue, used for the event descriptions.

    Returns:
        The ``.ics`` payload as bytes. Events are sorted by date and start
        time. Start/end are written as naive local times; the calendar
        advertises Europe/Paris through ``X-WR-TIMEZONE``.
    """
    from datetime import timezone

    cal = Calendar()
    cal.add('prodid', '-//Opéra Orchestre National Montpellier//planning2ics//FR')
    cal.add('version', '2.0')
    cal.add('x-wr-calname', series_title)
    cal.add('x-wr-timezone', 'Europe/Paris')

    # RFC 5545 requires DTSTAMP on every VEVENT; one UTC timestamp is shared
    # by the whole export.
    generated_at = datetime.now(timezone.utc)

    for evt in sorted(events, key=lambda e: (e['date'], e['start_time'])):
        vevent = Event()
        start_dt = datetime.combine(evt['date'], evt['start_time'])
        vevent.add('dtstart', start_dt)

        if evt['end_time']:
            end_dt = datetime.combine(evt['date'], evt['end_time'])
            if end_dt < start_dt:
                # The end time is past midnight (e.g. 20:00-00:30): move it to
                # the next day so DTEND never precedes DTSTART.
                end_dt += timedelta(days=1)
        else:
            # No explicit end: use the declared duration "H:MM" when present,
            # otherwise assume two hours.
            dec_m = re.match(r'(\d{1,2}):(\d{2})', evt['dec'] or '')
            duration = (
                timedelta(hours=int(dec_m.group(1)), minutes=int(dec_m.group(2)))
                if dec_m else timedelta(hours=2)
            )
            end_dt = start_dt + duration
        vevent.add('dtend', end_dt)

        vevent.add('dtstamp', generated_at)
        vevent.add('summary', f"{evt['titre']} {series_title}")
        if evt['lieu']:
            vevent.add('location', evt['lieu'])
        vevent.add('description', _build_description(evt, series_title, catalog))
        vevent.add('uid', str(uuid4()) + '@planning-orchestre')
        cal.add_component(vevent)

    return cal.to_ical()
# ── Point d'entrée principal ──────────────────────────────────────────────────
def process_pdfs(pdf_paths: list, config: dict, data_dir: Path,
                 log: Callable = None) -> dict:
    """Process a list of planning PDFs into one ICS calendar per series.

    Args:
        pdf_paths: Paths of the PDFs to read.
        config: Configuration passed on to the scraping and LLM steps.
        data_dir: Working directory; caches live in ``data_dir / "cache"``.
        log: Optional callable receiving progress messages.

    Returns:
        ``{series_title: {'filename', 'bytes', 'event_count'}}``.
    """
    cache_dir = data_dir / "cache"

    # 1. Extraction, dropping duplicates (overlapping PDFs, repeated rows).
    if log:
        log(f"Extraction de {len(pdf_paths)} PDF(s)...")
    all_events = []
    seen = set()
    for i, pdf_path in enumerate(pdf_paths):
        if log:
            log(f"Extraction {i+1}/{len(pdf_paths)} : {pdf_path.name}")
        for evt in extract_events_from_pdf(pdf_path):
            key = (evt['date'], evt['start_time'], evt['titre'], evt['note'])
            if key not in seen:
                seen.add(key)
                all_events.append(evt)
    if log:
        log(f"{len(all_events)} événements extraits au total")

    # 2. Website catalogue.
    catalog = scrape_catalog(config, cache_dir, log)

    # 3. Series identification.
    unique_notes = {e['note'] for e in all_events}
    if log:
        log(f"{len(unique_notes)} notes uniques à analyser...")
    note_to_series = match_notes_to_series(unique_notes, catalog, config, cache_dir, log)

    # 4. Grouping and ICS generation.
    series_events: dict[str, list] = {}
    unassigned = 0
    for evt in all_events:
        series = note_to_series.get(evt['note'])
        if series:
            series_events.setdefault(series, []).append(evt)
        else:
            unassigned += 1
    if unassigned and log:
        # These events are left out of every calendar; say so rather than
        # dropping them without a trace.
        log(f"{unassigned} événement(s) ignoré(s) : aucune série associée")

    if log:
        log(f"Génération de {len(series_events)} fichiers ICS...")
    result = {}
    used_filenames = set()
    for series_title, events in series_events.items():
        stem = sanitize_filename(series_title)
        filename = f"{stem}.ics"
        # Two titles can sanitize to the same stem; add a numeric suffix so
        # one file cannot overwrite another (compared case-insensitively).
        suffix = 2
        while filename.lower() in used_filenames:
            filename = f"{stem}_{suffix}.ics"
            suffix += 1
        used_filenames.add(filename.lower())
        result[series_title] = {
            'filename': filename,
            'bytes': _create_ics_bytes(series_title, events, catalog),
            'event_count': len(events),
        }
    if log:
        log(f"Terminé : {len(result)} séries générées")
    return result