Initial commit: agent_logwatch v1.0
- Réception logs MQTT depuis machines distantes (agents/logwatch/+/logs) - Pré-filtrage sans LLM (15 patterns: ERROR, FATAL, OOM, segfault, auth fail...) - Analyse LLM par créneau horaire configurable (APScheduler) - Gestion round-robin avec reprise sur interruption - Extension de créneau (+30 min) avec confirmation admin - Skills: machine (gestion machines) + logwatch (contrôle) - Script send_logs.sh pour machines distantes Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,677 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Agent LogWatch — Analyse de logs multi-machines avec fenêtre horaire programmée.
|
||||
|
||||
Les machines distantes envoient leurs logs via MQTT vers agents/logwatch/<hostname>/logs.
|
||||
L'agent pré-filtre (sans LLM), stocke en SQLite, puis analyse avec le LLM
|
||||
pendant les créneaux horaires configurés.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
from agents_core import BaseAgent, AgentContext, Message, MessageType
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Pré-filtres sans LLM ────────────────────────────────────────────────────
|
||||
|
||||
# Pre-filter expressions: a log line is kept when at least one of them matches.
# Each entry is (regex source, flags); the compiled list keeps this order.
FILTER_PATTERNS = [
    re.compile(expression, flags)
    for expression, flags in (
        (r'\b(ERROR|CRITICAL|FATAL|PANIC|EMERG|ALERT|CRIT)\b', 0),
        (r'\bException\b|\bTraceback\b|\bTraceback \(most recent', 0),
        (r'\bsegfault\b|\bSegmentation fault\b', re.IGNORECASE),
        (r'\bout of memory\b|\bOOM killer\b|\bOOM-killer\b', re.IGNORECASE),
        (r'\b(failed|failure)\b', re.IGNORECASE),
        (r'\bkilled\b', re.IGNORECASE),
        (r'\b(BUG|Oops):\s', 0),
        # syslog priorities 0=emerg, 1=alert, 2=crit, 3=err
        (r'<[0-3]>', 0),
        (r'\bcore dumped\b', re.IGNORECASE),
        (r'\bpanic\b', re.IGNORECASE),
        (r'\bdenied\b.*\bpermission\b|\bpermission\b.*\bdenied\b', re.IGNORECASE),
        (r'\bauthentication failure\b|\bfailed login\b|\bfailed password\b', re.IGNORECASE),
        (r'\bdisk full\b|\bno space left\b', re.IGNORECASE),
        (r'\bconnection refused\b|\bconnection timed out\b', re.IGNORECASE),
        (r'\bssh.*invalid user\b|\binvalid user.*ssh\b', re.IGNORECASE),
    )
]
|
||||
|
||||
SEVERITY_RANK = {
|
||||
'EMERG': 0, 'ALERT': 1, 'CRIT': 2, 'CRITICAL': 2, 'FATAL': 2, 'PANIC': 2,
|
||||
'ERROR': 3, 'ERR': 3,
|
||||
'FAILED': 4, 'FAILURE': 4, 'DENIED': 4,
|
||||
'EXCEPTION': 5, 'TRACEBACK': 5,
|
||||
'KILLED': 6, 'OOM': 6, 'SEGFAULT': 6, 'CORE': 6,
|
||||
}
|
||||
|
||||
CHUNK_SIZE = 150 # lignes envoyées au LLM par appel
|
||||
|
||||
|
||||
def _detect_severity(line: str) -> str:
|
||||
line_up = line.upper()
|
||||
for kw, _ in sorted(SEVERITY_RANK.items(), key=lambda x: x[1]):
|
||||
if kw in line_up:
|
||||
return kw
|
||||
return 'ERROR'
|
||||
|
||||
|
||||
class LogWatchAgent(BaseAgent):
    # Agent that stores pre-filtered logs received from remote machines and
    # analyses them with the LLM during scheduled time slots.

    # Agent identifier and human-readable description
    # (presumably consumed by BaseAgent — confirm in agents_core).
    AGENT_TYPE = "logwatch"
    DESCRIPTION = "".join((
        "Analyse de logs multi-machines. Reçoit les logs des machines distantes via MQTT, ",
        "pré-filtre les erreurs, les analyse avec le LLM pendant les créneaux programmés, ",
        "envoie des rapports par XMPP. Gestion de file de machines, round-robin, ",
        "reprise sur interruption et analyse à la demande.",
    ))

    # Configuration file location used when none is supplied
    # (NOTE(review): how BaseAgent reads it is not visible here).
    DEFAULT_CONFIG_PATH = "/opt/agent_logwatch/config/config.json"
|
||||
|
||||
def get_skills_dir(self) -> str:
    """Return the path of the ``skills`` directory located next to this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, "skills")
|
||||
|
||||
# ─── Init ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def __init__(self, config_path=None):
    """Set up storage, the optional scheduler and the analysis state.

    ``config_path`` is forwarded unchanged to the base class constructor.
    """
    super().__init__(config_path)

    # SQLite storage: make sure the parent directory exists, then create the schema.
    configured_db = self.config.get("db_path", "/opt/agent_logwatch/data/logwatch.db")
    self.db_path = Path(configured_db)
    self.db_path.parent.mkdir(parents=True, exist_ok=True)
    self._db_lock = threading.Lock()
    self._init_db()

    # APScheduler is optional: without it the agent runs with no scheduled slots.
    self._scheduler = None
    try:
        from apscheduler.schedulers.background import BackgroundScheduler
        self._scheduler = BackgroundScheduler(timezone="Europe/Paris")
    except ImportError:
        logger.error("apscheduler non installé — `pip install apscheduler`")

    # Analysis state: worker thread, stop signal and end of the current slot.
    self._analysis_thread = None
    self._analysis_stop = threading.Event()
    self._slot_end_time = None

    # Pending extension request: dict {machine_id, hostname} while a question
    # is awaiting an admin answer, None otherwise.
    self._pending_extension = None
    self._extension_event = threading.Event()
    self._extension_granted = False
|
||||
|
||||
# ─── DB ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def _get_db(self) -> sqlite3.Connection:
|
||||
conn = sqlite3.connect(str(self.db_path), timeout=10)
|
||||
conn.row_factory = sqlite3.Row
|
||||
return conn
|
||||
|
||||
def _init_db(self):
|
||||
with self._get_db() as conn:
|
||||
conn.executescript("""
|
||||
CREATE TABLE IF NOT EXISTS machines (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
hostname TEXT UNIQUE NOT NULL,
|
||||
registered_at TEXT NOT NULL,
|
||||
last_log_at TEXT,
|
||||
last_analyzed_at TEXT,
|
||||
queue_position INTEGER DEFAULT 0,
|
||||
active INTEGER DEFAULT 1
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS filtered_logs (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
machine_id INTEGER NOT NULL,
|
||||
log_line TEXT NOT NULL,
|
||||
severity TEXT,
|
||||
received_at TEXT NOT NULL,
|
||||
analyzed INTEGER DEFAULT 0,
|
||||
FOREIGN KEY (machine_id) REFERENCES machines(id)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_fl_machine_analyzed
|
||||
ON filtered_logs(machine_id, analyzed);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS analysis_sessions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
machine_id INTEGER NOT NULL,
|
||||
slot_date TEXT NOT NULL,
|
||||
status TEXT DEFAULT 'pending',
|
||||
started_at TEXT,
|
||||
completed_at TEXT,
|
||||
last_log_id INTEGER DEFAULT 0,
|
||||
UNIQUE(machine_id, slot_date),
|
||||
FOREIGN KEY (machine_id) REFERENCES machines(id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS agent_config (
|
||||
key TEXT PRIMARY KEY,
|
||||
value TEXT NOT NULL
|
||||
);
|
||||
|
||||
INSERT OR IGNORE INTO agent_config VALUES ('analysis_start', '02:00');
|
||||
INSERT OR IGNORE INTO agent_config VALUES ('analysis_end', '04:00');
|
||||
INSERT OR IGNORE INTO agent_config VALUES ('max_overage_minutes', '30');
|
||||
INSERT OR IGNORE INTO agent_config VALUES ('enabled', '1');
|
||||
INSERT OR IGNORE INTO agent_config VALUES ('log_retention_days', '7');
|
||||
""")
|
||||
|
||||
def _cfg(self, key: str, default: str = '') -> str:
|
||||
with self._get_db() as conn:
|
||||
row = conn.execute("SELECT value FROM agent_config WHERE key=?", (key,)).fetchone()
|
||||
return row['value'] if row else default
|
||||
|
||||
def _set_cfg(self, key: str, value: str):
|
||||
with self._get_db() as conn:
|
||||
conn.execute("INSERT OR REPLACE INTO agent_config VALUES (?,?)", (key, value))
|
||||
|
||||
# ─── Démarrage ────────────────────────────────────────────────────────────
|
||||
|
||||
def on_start(self):
    """Subscribe to the MQTT topics, start the scheduler and purge old logs."""
    # MQTT topics on which remote machines publish logs and registrations.
    subscriptions = (
        ("agents/logwatch/+/logs", self._on_log_received),
        ("agents/logwatch/register", self._on_machine_register),
    )
    for topic, handler in subscriptions:
        self.mqtt.subscribe(topic, handler)

    # The scheduler is None when APScheduler could not be imported.
    scheduler = self._scheduler
    if scheduler:
        self._reload_schedule()
        scheduler.start()
        logger.info("Scheduler APScheduler démarré.")

    # Drop expired logs once at start-up.
    self._cleanup_old_logs()

    logger.info("Agent LogWatch démarré. En attente de logs sur agents/logwatch/+/logs")
|
||||
|
||||
def setup_extra_subscriptions(self):
    """Do nothing: every subscription is made in ``on_start``."""
    return None
|
||||
|
||||
# ─── Réception des logs ──────────────────────────────────────────────────
|
||||
|
||||
def _on_machine_register(self, msg, topic: str):
|
||||
"""Enregistrement explicite d'une machine via MQTT."""
|
||||
payload = msg.payload if hasattr(msg, 'payload') else str(msg)
|
||||
try:
|
||||
data = json.loads(payload) if isinstance(payload, str) else payload
|
||||
hostname = str(data.get('hostname', '')).strip()
|
||||
if hostname:
|
||||
self._register_machine(hostname)
|
||||
except Exception as e:
|
||||
logger.error(f"[register] {e}")
|
||||
|
||||
def _on_log_received(self, msg, topic: str):
|
||||
"""
|
||||
Reçoit des logs bruts depuis une machine distante.
|
||||
Topic : agents/logwatch/<hostname>/logs
|
||||
Payload JSON : {"lines": [...]} ou {"log": "..."} ou texte brut multiligne
|
||||
"""
|
||||
payload = msg.payload if hasattr(msg, 'payload') else str(msg)
|
||||
try:
|
||||
parts = topic.split('/')
|
||||
hostname = parts[2] if len(parts) >= 4 else 'unknown'
|
||||
|
||||
# Parser le payload
|
||||
if isinstance(payload, str):
|
||||
try:
|
||||
data = json.loads(payload)
|
||||
if isinstance(data, dict):
|
||||
lines = data.get('lines') or data.get('logs') or []
|
||||
if isinstance(lines, str):
|
||||
lines = lines.splitlines()
|
||||
if not lines and 'log' in data:
|
||||
lines = str(data['log']).splitlines()
|
||||
elif isinstance(data, list):
|
||||
lines = data
|
||||
else:
|
||||
lines = payload.splitlines()
|
||||
except json.JSONDecodeError:
|
||||
lines = payload.splitlines()
|
||||
elif isinstance(payload, bytes):
|
||||
lines = payload.decode('utf-8', errors='replace').splitlines()
|
||||
else:
|
||||
lines = []
|
||||
|
||||
if not lines:
|
||||
return
|
||||
|
||||
machine_id = self._register_machine(hostname)
|
||||
filtered = self._prefilter(lines)
|
||||
|
||||
if filtered:
|
||||
now = datetime.now().isoformat()
|
||||
with self._get_db() as conn:
|
||||
conn.executemany(
|
||||
"INSERT INTO filtered_logs (machine_id, log_line, severity, received_at) VALUES (?,?,?,?)",
|
||||
[(machine_id, line, sev, now) for line, sev in filtered]
|
||||
)
|
||||
conn.execute(
|
||||
"UPDATE machines SET last_log_at=? WHERE id=?",
|
||||
(now, machine_id)
|
||||
)
|
||||
logger.info(f"[{hostname}] {len(filtered)}/{len(lines)} lignes filtrées conservées")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[_on_log_received] {e}", exc_info=True)
|
||||
|
||||
def _prefilter(self, lines: list) -> list:
    """Keep the lines matching a filter pattern; return ``[(line, severity)]``.

    Each entry is converted to ``str`` and stripped; blank lines are skipped.
    """
    kept = []
    for raw in lines:
        text = str(raw).strip()
        if text and any(pattern.search(text) for pattern in FILTER_PATTERNS):
            kept.append((text, _detect_severity(text)))
    return kept
|
||||
|
||||
def _register_machine(self, hostname: str) -> int:
|
||||
"""Enregistre ou met à jour une machine, retourne son id."""
|
||||
with self._get_db() as conn:
|
||||
row = conn.execute("SELECT id FROM machines WHERE hostname=?", (hostname,)).fetchone()
|
||||
if row:
|
||||
return row['id']
|
||||
max_pos = conn.execute(
|
||||
"SELECT COALESCE(MAX(queue_position), 0) FROM machines"
|
||||
).fetchone()[0]
|
||||
cur = conn.execute(
|
||||
"INSERT INTO machines (hostname, registered_at, queue_position) VALUES (?,?,?)",
|
||||
(hostname, datetime.now().isoformat(), max_pos + 1)
|
||||
)
|
||||
logger.info(f"Nouvelle machine enregistrée: {hostname} (pos={max_pos+1})")
|
||||
return cur.lastrowid
|
||||
|
||||
# ─── Scheduler ────────────────────────────────────────────────────────────
|
||||
|
||||
def _reload_schedule(self):
|
||||
"""(Re)programme les jobs APScheduler selon la config DB."""
|
||||
if not self._scheduler:
|
||||
return
|
||||
for job_id in ('_slot_start', '_slot_end'):
|
||||
try:
|
||||
self._scheduler.remove_job(job_id)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if self._cfg('enabled') != '1':
|
||||
logger.info("Analyse automatique désactivée.")
|
||||
return
|
||||
|
||||
start_str = self._cfg('analysis_start', '02:00')
|
||||
end_str = self._cfg('analysis_end', '04:00')
|
||||
try:
|
||||
sh, sm = map(int, start_str.split(':'))
|
||||
eh, em = map(int, end_str.split(':'))
|
||||
except ValueError:
|
||||
logger.error(f"Format horaire invalide: {start_str}/{end_str}")
|
||||
return
|
||||
|
||||
self._scheduler.add_job(
|
||||
self._start_slot, 'cron', hour=sh, minute=sm, id='_slot_start'
|
||||
)
|
||||
self._scheduler.add_job(
|
||||
self._signal_slot_end, 'cron', hour=eh, minute=em, id='_slot_end'
|
||||
)
|
||||
logger.info(f"Analyse programmée: {start_str} → {end_str}")
|
||||
|
||||
def _start_slot(self):
    """Open the analysis window and start the analysis thread.

    Called by APScheduler at the configured start time. If a previous
    analysis thread is still alive the call is ignored: starting a
    second loop would clear the stop signal the running one relies on
    and analyse the same machines twice.
    """
    running = self._analysis_thread
    if running is not None and running.is_alive():
        logger.warning("Créneau d'analyse ignoré: une analyse est déjà en cours.")
        return

    end_str = self._cfg('analysis_end', '04:00')
    eh, em = map(int, end_str.split(':'))
    now = datetime.now()
    self._slot_end_time = now.replace(hour=eh, minute=em, second=0, microsecond=0)
    if self._slot_end_time <= now:
        # The end time is earlier in the day than now: the slot ends tomorrow.
        self._slot_end_time += timedelta(days=1)

    self._analysis_stop.clear()
    self._analysis_thread = threading.Thread(
        target=self._analysis_loop, daemon=True, name="logwatch-analysis"
    )
    self._analysis_thread.start()
    logger.info(f"Créneau d'analyse démarré → fin à {self._slot_end_time.strftime('%H:%M')}")
|
||||
|
||||
def _signal_slot_end(self):
    """Mark the end of the analysis slot (called by APScheduler).

    Only sets the stop event; the analysis loop decides what to do with it.
    """
    logger.info("Fin de créneau signalée.")
    stop_flag = self._analysis_stop
    stop_flag.set()
|
||||
|
||||
# ─── Boucle d'analyse ────────────────────────────────────────────────────
|
||||
|
||||
def _analysis_loop(self):
|
||||
"""Thread principal d'analyse, tourne pendant le créneau."""
|
||||
try:
|
||||
machines = self._get_active_machines()
|
||||
if not machines:
|
||||
self._notify_admin("📭 LogWatch: aucune machine enregistrée à analyser.")
|
||||
return
|
||||
|
||||
start_idx = self._find_resume_index(machines)
|
||||
total = len(machines)
|
||||
|
||||
for i in range(total):
|
||||
idx = (start_idx + i) % total
|
||||
machine = machines[idx]
|
||||
mid = machine['id']
|
||||
host = machine['hostname']
|
||||
|
||||
# Vérifier si le créneau est terminé avant de commencer une machine
|
||||
if self._analysis_stop.is_set():
|
||||
overage_min = self._overage_minutes()
|
||||
max_ov = int(self._cfg('max_overage_minutes', '30'))
|
||||
|
||||
if overage_min > max_ov:
|
||||
# Demander extension
|
||||
if not self._ask_extension(mid, host, overage_min):
|
||||
# Refusée ou timeout → pause
|
||||
self._set_session_status(mid, 'paused')
|
||||
self._notify_admin(
|
||||
f"⏸️ LogWatch: analyse de **{host}** reportée au prochain créneau."
|
||||
)
|
||||
break
|
||||
|
||||
self._analyze_machine(mid, host)
|
||||
|
||||
else:
|
||||
# Boucle complète sans interruption
|
||||
self._notify_admin(
|
||||
f"✅ LogWatch: analyse complète de {total} machine(s) terminée."
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[analysis_loop] {e}", exc_info=True)
|
||||
self._notify_admin(f"❌ LogWatch: erreur dans la boucle d'analyse: {e}")
|
||||
|
||||
def _get_active_machines(self) -> list:
|
||||
with self._get_db() as conn:
|
||||
rows = conn.execute(
|
||||
"SELECT id, hostname, queue_position FROM machines "
|
||||
"WHERE active=1 ORDER BY queue_position ASC"
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
def _find_resume_index(self, machines: list) -> int:
|
||||
"""Trouve l'index de la machine à reprendre (paused) ou commence à 0."""
|
||||
today = datetime.now().strftime('%Y-%m-%d')
|
||||
with self._get_db() as conn:
|
||||
row = conn.execute("""
|
||||
SELECT machine_id FROM analysis_sessions
|
||||
WHERE slot_date=? AND status='paused'
|
||||
ORDER BY id DESC LIMIT 1
|
||||
""", (today,)).fetchone()
|
||||
if not row:
|
||||
return 0
|
||||
paused_id = row['machine_id']
|
||||
for i, m in enumerate(machines):
|
||||
if m['id'] == paused_id:
|
||||
return i
|
||||
return 0
|
||||
|
||||
def _overage_minutes(self) -> float:
|
||||
"""Retourne les minutes de dépassement (positif = dépassement)."""
|
||||
if not self._slot_end_time:
|
||||
return 0.0
|
||||
delta = (datetime.now() - self._slot_end_time).total_seconds() / 60
|
||||
return max(0.0, delta)
|
||||
|
||||
def _ask_extension(self, machine_id: int, hostname: str, overage: float) -> bool:
|
||||
"""
|
||||
Demande à l'admin une extension du créneau.
|
||||
Attend la réponse (max 10 min).
|
||||
Retourne True si extension accordée.
|
||||
"""
|
||||
max_ov = int(self._cfg('max_overage_minutes', '30'))
|
||||
self._pending_extension = {'machine_id': machine_id, 'hostname': hostname}
|
||||
self._extension_event.clear()
|
||||
self._extension_granted = False
|
||||
|
||||
self._notify_admin(
|
||||
f"⏰ LogWatch: créneau terminé (dépassement {overage:.0f} min > max {max_ov} min).\n"
|
||||
f"Analyse en cours: **{hostname}** non terminée.\n"
|
||||
f"Tapez `/extend` pour accorder +{max_ov} min supplémentaires, "
|
||||
f"ou `/skip` pour reporter au prochain créneau."
|
||||
)
|
||||
|
||||
# Attendre la réponse max 10 minutes
|
||||
answered = self._extension_event.wait(timeout=600)
|
||||
self._pending_extension = None
|
||||
|
||||
if not answered:
|
||||
self._notify_admin(
|
||||
f"⏰ LogWatch: pas de réponse après 10 min → analyse de **{hostname}** reportée."
|
||||
)
|
||||
return False
|
||||
|
||||
return self._extension_granted
|
||||
|
||||
# ─── Analyse d'une machine ───────────────────────────────────────────────
|
||||
|
||||
def _analyze_machine(self, machine_id: int, hostname: str):
    """Analyse the stored, not yet analysed logs of one machine with the LLM.

    Logs are sent in chunks of CHUNK_SIZE lines. After each chunk the
    lines are marked as analysed and the session offset is saved. If the
    slot has ended and the overage exceeds the configured maximum, the
    session is paused, whatever was already analysed is reported, and
    the method returns.

    Today's session is reused whatever its state, except 'done'. That
    includes a session left 'in_progress' by an interrupted run, which
    would otherwise violate the (machine_id, slot_date) uniqueness
    constraint on insert.
    """
    today = datetime.now().strftime('%Y-%m-%d')

    # Create today's session or pick up the existing one.
    with self._get_db() as conn:
        session = conn.execute(
            "SELECT id, last_log_id, status FROM analysis_sessions "
            "WHERE machine_id=? AND slot_date=?",
            (machine_id, today)
        ).fetchone()

        if session and session['status'] == 'done':
            logger.info(f"[{hostname}] déjà analysée aujourd'hui.")
            return

        if session:
            session_id = session['id']
            last_log_id = session['last_log_id'] or 0
            conn.execute(
                "UPDATE analysis_sessions SET status='in_progress', started_at=? WHERE id=?",
                (datetime.now().isoformat(), session_id)
            )
        else:
            cur = conn.execute(
                "INSERT INTO analysis_sessions (machine_id, slot_date, status, started_at) VALUES (?,?,?,?)",
                (machine_id, today, 'in_progress', datetime.now().isoformat())
            )
            session_id = cur.lastrowid
            last_log_id = 0

    # Fetch the filtered logs that have not been analysed yet.
    with self._get_db() as conn:
        logs = conn.execute(
            "SELECT id, log_line, severity, received_at FROM filtered_logs "
            "WHERE machine_id=? AND id > ? AND analyzed=0 ORDER BY id ASC",
            (machine_id, last_log_id)
        ).fetchall()

    if not logs:
        logger.info(f"[{hostname}] Aucun log filtré à analyser.")
        self._set_session_status(machine_id, 'done', session_id=session_id)
        return

    self._notify_admin(
        f"🔍 LogWatch: analyse de **{hostname}** ({len(logs)} erreurs filtrées)…"
    )

    all_reports = []
    last_id = last_log_id
    logs_list = [dict(r) for r in logs]

    def _send_report(analyzed_count: int):
        # Send the collected chunk reports as one message to the admins.
        header = (
            f"📊 **Rapport LogWatch — {hostname}**\n"
            f"📅 {datetime.now().strftime('%Y-%m-%d %H:%M')} | "
            f"{analyzed_count} erreurs analysées\n"
            f"{'─'*40}\n\n"
        )
        self._notify_admin(header + '\n\n'.join(all_reports))

    for chunk_start in range(0, len(logs_list), CHUNK_SIZE):
        # Check for slot overrun before each chunk.
        if self._analysis_stop.is_set():
            overage = self._overage_minutes()
            max_ov = int(self._cfg('max_overage_minutes', '30'))
            if overage > max_ov:
                # Save the resume point.
                with self._get_db() as conn:
                    conn.execute(
                        "UPDATE analysis_sessions SET status='paused', last_log_id=? WHERE id=?",
                        (last_id, session_id)
                    )
                # The processed lines are already marked as analysed, so
                # their reports must be delivered now or they are lost.
                if all_reports:
                    _send_report(chunk_start)
                self._notify_admin(
                    f"⏸️ LogWatch: pause mid-analyse de **{hostname}** "
                    f"(dépassement {overage:.0f} min). Reprise au prochain créneau."
                )
                return

        chunk = logs_list[chunk_start:chunk_start + CHUNK_SIZE]
        chunk_txt = '\n'.join(
            f"[{r['received_at'][:19]}][{r['severity']}] {r['log_line']}"
            for r in chunk
        )

        prompt = (
            f"Tu analyses des logs d'erreurs de la machine **{hostname}**.\n"
            f"Synthétise les problèmes importants : type d'erreur, criticité (critique/haute/moyenne), "
            f"fréquence, cause probable, action recommandée.\n"
            f"Ne répète pas chaque ligne individuellement. Groupe les erreurs similaires.\n"
            f"Format de réponse : 🔴/🟠/🟡 Problème → Cause → Action\n\n"
            f"Logs ({chunk_start+1}–{min(chunk_start+CHUNK_SIZE, len(logs_list))}):\n{chunk_txt}"
        )

        report_chunk = self._call_llm(prompt)
        if report_chunk:
            all_reports.append(report_chunk)

        # Mark the chunk as analysed and move the session offset forward.
        ids = [r['id'] for r in chunk]
        last_id = ids[-1]
        with self._get_db() as conn:
            conn.execute(
                f"UPDATE filtered_logs SET analyzed=1 WHERE id IN ({','.join('?'*len(ids))})",
                ids
            )
            conn.execute(
                "UPDATE analysis_sessions SET last_log_id=? WHERE id=?",
                (last_id, session_id)
            )

    # Final report.
    if all_reports:
        _send_report(len(logs_list))
    else:
        self._notify_admin(f"ℹ️ LogWatch: **{hostname}** — LLM n'a pas retourné de rapport.")

    # Mark the session as completed.
    with self._get_db() as conn:
        conn.execute(
            "UPDATE analysis_sessions SET status='done', completed_at=?, last_log_id=? WHERE id=?",
            (datetime.now().isoformat(), last_id, session_id)
        )
        conn.execute(
            "UPDATE machines SET last_analyzed_at=? WHERE id=?",
            (datetime.now().isoformat(), machine_id)
        )
|
||||
|
||||
def _set_session_status(self, machine_id: int, status: str, session_id: int = None):
|
||||
today = datetime.now().strftime('%Y-%m-%d')
|
||||
with self._get_db() as conn:
|
||||
if session_id:
|
||||
conn.execute(
|
||||
"UPDATE analysis_sessions SET status=? WHERE id=?",
|
||||
(status, session_id)
|
||||
)
|
||||
else:
|
||||
conn.execute(
|
||||
"UPDATE analysis_sessions SET status=? WHERE machine_id=? AND slot_date=?",
|
||||
(status, machine_id, today)
|
||||
)
|
||||
|
||||
# ─── LLM ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def _call_llm(self, prompt: str) -> str:
|
||||
"""Appelle le LLM en respectant le lock BaseAgent."""
|
||||
lock = getattr(self, '_llm_lock', None)
|
||||
acquired = False
|
||||
try:
|
||||
if lock:
|
||||
acquired = lock.acquire(timeout=300)
|
||||
if not acquired:
|
||||
return "(LLM indisponible après 5 min d'attente)"
|
||||
self.llm.reset_history()
|
||||
return self.llm.chat(prompt)
|
||||
except Exception as e:
|
||||
logger.error(f"[LLM] {e}")
|
||||
return f"(Erreur LLM: {e})"
|
||||
finally:
|
||||
if acquired and lock:
|
||||
lock.release()
|
||||
|
||||
# ─── XMPP helpers ────────────────────────────────────────────────────────
|
||||
|
||||
def _notify_admin(self, message: str):
|
||||
"""Envoie un message à tous les admins XMPP."""
|
||||
try:
|
||||
if self.xmpp:
|
||||
self.xmpp.send_to_all_admins(message)
|
||||
except Exception as e:
|
||||
logger.error(f"[notify_admin] {e}")
|
||||
|
||||
# ─── Commandes custom (/extend, /skip, /update) ──────────────────────────
|
||||
|
||||
def handle_custom_command(self, cmd: str, args: str, source_msg=None):
    """Handle the agent-specific commands ``extend``, ``skip`` and ``update``.

    ``extend`` and ``skip`` are only recognised while an extension
    request is pending; they record the answer and wake the waiting
    analysis thread. Returns the reply text for the sender.
    """
    command = cmd.lower()

    # Answer to a pending slot-extension request.
    if self._pending_extension and command in ('extend', 'skip'):
        granted = command == 'extend'
        self._extension_granted = granted
        self._extension_event.set()
        if granted:
            max_ov = self._cfg('max_overage_minutes', '30')
            return f"⏱️ Extension accordée (+{max_ov} min). L'analyse continue."
        return "⏸️ Analyse reportée au prochain créneau."

    if command == 'update':
        return self._self_update()

    return f"Commande inconnue : /{cmd}"
|
||||
|
||||
def on_broadcast(self, msg: Message):
    """Ignore broadcast messages: no action is taken on them."""
    return None
|
||||
|
||||
def _self_update(self) -> str:
|
||||
import subprocess
|
||||
try:
|
||||
out = subprocess.check_output(
|
||||
"cd /opt/agent_logwatch && git pull",
|
||||
shell=True, text=True, stderr=subprocess.STDOUT
|
||||
)
|
||||
subprocess.Popen(["systemctl", "restart", "agent_logwatch"])
|
||||
return f"Mise à jour:\n{out}\nRedémarrage…"
|
||||
except subprocess.CalledProcessError as e:
|
||||
return f"Erreur mise à jour: {e.output}"
|
||||
|
||||
# ─── Nettoyage ────────────────────────────────────────────────────────────
|
||||
|
||||
def _cleanup_old_logs(self):
|
||||
"""Supprime les logs filtrés plus vieux que log_retention_days."""
|
||||
days = int(self._cfg('log_retention_days', '7'))
|
||||
cutoff = (datetime.now() - timedelta(days=days)).isoformat()
|
||||
with self._get_db() as conn:
|
||||
cur = conn.execute(
|
||||
"DELETE FROM filtered_logs WHERE received_at < ? AND analyzed=1",
|
||||
(cutoff,)
|
||||
)
|
||||
if cur.rowcount:
|
||||
logger.info(f"Nettoyage: {cur.rowcount} logs anciens supprimés.")
|
||||
|
||||
|
||||
# Script entry point: build the agent with its default configuration and run it.
if __name__ == "__main__":
    agent = LogWatchAgent()
    agent.run()
|
||||
Reference in New Issue
Block a user