feat: stockage rapports DB + filtres affinés

- Table 'reports' : stockage des rapports LLM (machine, date, contenu, nb erreurs)
- logwatch report [hostname] [date] : relire un rapport stocké
- Filtres refactorisés : tier 1 (uppercase exacts) + tier 2 (contextuels précis)
- EXCLUDE_PATTERNS : exclure le bruit connu (Started, LogWatch lui-même...)
- Déduplication : max 5 occurrences de la même ligne par collecte
- Résultat : 0.7% de rétention vs 33% avant

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-02 09:42:25 +00:00
parent 7c5b270144
commit d496e1d188
8 changed files with 147 additions and 19 deletions
Binary file not shown.
+97 -19
View File
@@ -21,33 +21,63 @@ from agents_core import BaseAgent, AgentContext, Message, MessageType
logger = logging.getLogger(__name__)
# ─── Pré-filtres sans LLM ────────────────────────────────────────────────────
# Tier 1 — signaux critiques (mots-clés uppercase exacts, très peu de faux positifs)
# Tier 2 — patterns contextuels précis (évite les faux positifs du lowercase générique)
FILTER_PATTERNS = [
re.compile(r'\b(ERROR|CRITICAL|FATAL|PANIC|EMERG|ALERT|CRIT)\b'),
re.compile(r'\bException\b|\bTraceback\b|\bTraceback \(most recent'),
# Tier 1 : mots-clés uppercase — très fiables
re.compile(r'\b(EMERG|ALERT|CRIT|CRITICAL|FATAL|PANIC)\b'),
re.compile(r'\bERROR\b'), # uppercase uniquement
re.compile(r'\bException\b|\bTraceback\b'), # Python/Java
re.compile(r'<[0-3]>'), # syslog prio 0-3
# Tier 2 : patterns précis avec contexte
re.compile(r'\bsegfault\b|\bSegmentation fault\b', re.IGNORECASE),
re.compile(r'\bout of memory\b|\bOOM killer\b|\bOOM-killer\b', re.IGNORECASE),
re.compile(r'\b(failed|failure)\b', re.IGNORECASE),
re.compile(r'\bkilled\b', re.IGNORECASE),
re.compile(r'\b(BUG|Oops):\s'),
re.compile(r'<[0-3]>'), # syslog priorities 0=emerg, 1=alert, 2=crit, 3=err
re.compile(r'\bout of memory\b|\bOOM[ -]killer\b', re.IGNORECASE),
re.compile(r'\bcore dumped\b', re.IGNORECASE),
re.compile(r'\bpanic\b', re.IGNORECASE),
re.compile(r'\bdenied\b.*\bpermission\b|\bpermission\b.*\bdenied\b', re.IGNORECASE),
re.compile(r'\bauthentication failure\b|\bfailed login\b|\bfailed password\b', re.IGNORECASE),
re.compile(r'\bdisk full\b|\bno space left\b', re.IGNORECASE),
re.compile(r'\bconnection refused\b|\bconnection timed out\b', re.IGNORECASE),
re.compile(r'\bssh.*invalid user\b|\binvalid user.*ssh\b', re.IGNORECASE),
re.compile(r'\b(BUG|Oops):\s'), # kernel bugs
# systemd : "Failed to start X" ou "failed with result"
re.compile(r'systemd.*:\s+Failed\b', re.IGNORECASE),
re.compile(r'\bfailed with result\b', re.IGNORECASE),
re.compile(r'\.service.*failed\b', re.IGNORECASE),
# kernel : OOM kill, panic noyau
re.compile(r'kernel:.*[Kk]ill\b.*\bprocess\b'),
re.compile(r'kernel:.*[Pp]anic\b'),
# Authentification : patterns précis, pas juste "failed"
re.compile(r'\bauthentication failure\b', re.IGNORECASE),
re.compile(r'\bFailed password\b|\bFailed publickey\b'), # sshd exact
re.compile(r'\bInvalid user\b'), # sshd exact
# Disque / espace
re.compile(r'\bno space left on device\b', re.IGNORECASE),
re.compile(r'\bdisk full\b', re.IGNORECASE),
# Réseau : refus explicite (pas les retries normaux)
re.compile(r'\bconnection refused\b', re.IGNORECASE),
]
# Lignes à exclure même si un pattern matche (bruit connu)
EXCLUDE_PATTERNS = [
re.compile(r'\bsystemd\b.*\bStarted\b', re.IGNORECASE),
re.compile(r'\bLogWatch\b', re.IGNORECASE), # éviter de s'auto-analyser
re.compile(r'^\s*$'),
]
SEVERITY_RANK = {
'EMERG': 0, 'ALERT': 1, 'CRIT': 2, 'CRITICAL': 2, 'FATAL': 2, 'PANIC': 2,
'ERROR': 3, 'ERR': 3,
'FAILED': 4, 'FAILURE': 4, 'DENIED': 4,
'ERROR': 3,
'FAILED': 4, 'FAILURE': 4,
'EXCEPTION': 5, 'TRACEBACK': 5,
'KILLED': 6, 'OOM': 6, 'SEGFAULT': 6, 'CORE': 6,
'OOM': 6, 'SEGFAULT': 6, 'CORE': 6,
'INVALID USER': 7, 'AUTH': 7,
}
# Déduplication : max N occurrences de la même signature par session de filtrage
MAX_DUPLICATES = 5
CHUNK_SIZE = 150 # lignes envoyées au LLM par appel
@@ -155,6 +185,16 @@ class LogWatchAgent(BaseAgent):
INSERT OR IGNORE INTO agent_config VALUES ('enabled', '1');
INSERT OR IGNORE INTO agent_config VALUES ('log_retention_days', '7');
INSERT OR IGNORE INTO agent_config VALUES ('local_collect_time', '');
CREATE TABLE IF NOT EXISTS reports (
id INTEGER PRIMARY KEY AUTOINCREMENT,
machine_id INTEGER NOT NULL,
report_date TEXT NOT NULL,
content TEXT NOT NULL,
logs_count INTEGER DEFAULT 0,
created_at TEXT NOT NULL,
FOREIGN KEY (machine_id) REFERENCES machines(id)
);
""")
def _cfg(self, key: str, default: str = '') -> str:
@@ -255,16 +295,47 @@ class LogWatchAgent(BaseAgent):
logger.error(f"[_on_log_received] {e}", exc_info=True)
def _prefilter(self, lines: list) -> list:
"""Filtre les lignes, retourne [(line, severity)]."""
result = []
"""
Filtre les lignes, retourne [(line, severity)].
- Applique les patterns d'exclusion en premier
- Déduplique les lignes similaires (même signature, max MAX_DUPLICATES)
"""
result = []
seen_sigs = {} # signature → count
for line in lines:
line = str(line).strip()
if not line:
continue
# Exclusions d'abord
if any(ex.search(line) for ex in EXCLUDE_PATTERNS):
continue
# Test des patterns d'inclusion
matched = False
for pat in FILTER_PATTERNS:
if pat.search(line):
result.append((line, _detect_severity(line)))
matched = True
break
if not matched:
continue
# Déduplication : signature = partie fixe de la ligne (sans timestamp/PID)
sig = re.sub(r'\b\d+\b', 'N', line) # remplace les nombres
sig = re.sub(r'\[[\w/]+\]', '[X]', sig) # remplace les identifiants entre []
sig = sig[:120]
count = seen_sigs.get(sig, 0)
if count >= MAX_DUPLICATES:
continue
seen_sigs[sig] = count + 1
# Annoter si répétition
sev = _detect_severity(line)
entry = line if count == 0 else f"{line} [×{count+1}]"
result.append((entry, sev))
return result
def _register_machine(self, hostname: str) -> int:
@@ -644,6 +715,13 @@ class LogWatchAgent(BaseAgent):
)
report += '\n\n'.join(all_reports)
self._notify_admin(report)
# Stocker le rapport en DB
with self._get_db() as conn:
conn.execute(
"INSERT INTO reports (machine_id, report_date, content, logs_count, created_at) "
"VALUES (?,?,?,?,?)",
(machine_id, today, report, len(logs_list), datetime.now().isoformat())
)
else:
self._notify_admin(f"️ LogWatch: **{hostname}** — LLM n'a pas retourné de rapport.")
+1
View File
@@ -25,6 +25,7 @@ Tu reçois des instructions via MQTT (depuis Nexus) ou XMPP (directement).
- `retention <jours>` : durée de conservation des logs filtrés
- `analyze <hostname>` : lancer l'analyse d'une machine spécifique maintenant
- `analyze_all` : lancer l'analyse complète de toutes les machines
- `report [hostname] [YYYY-MM-DD]` : relire un rapport stocké (sans hostname = liste tous)
- `collect [since]` : collecter maintenant les logs locaux (ex: collect "1 hour ago")
- `logs <hostname> [N]` : voir les N derniers logs filtrés d'une machine
- `reset <hostname>` : réinitialiser l'analyse d'une machine
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
BIN
View File
Binary file not shown.
Binary file not shown.
+49
View File
@@ -26,6 +26,7 @@ USAGE = (
"SKILL:logwatch ARGS:overage <minutes>\n"
"SKILL:logwatch ARGS:analyze <hostname>\n"
"SKILL:logwatch ARGS:analyze_all\n"
"SKILL:logwatch ARGS:report [hostname] [YYYY-MM-DD]\n"
"SKILL:logwatch ARGS:collect [since]\n"
"SKILL:logwatch ARGS:retention <jours>\n"
"SKILL:logwatch ARGS:logs <hostname> [N]\n"
@@ -260,6 +261,54 @@ def run(args: str, context) -> str:
)
return "\n".join(lines)
# ── report <hostname> [date] ──────────────────────────────────────────────
if action == 'report':
p = rest.split(None, 1)
hostname = p[0].strip() if p else ''
date_str = p[1].strip() if len(p) > 1 else ''
if not hostname:
# Sans hostname : liste les derniers rapports toutes machines
with _db(context) as conn:
rows = conn.execute(
"SELECT m.hostname, r.report_date, r.logs_count, r.created_at "
"FROM reports r JOIN machines m ON m.id=r.machine_id "
"ORDER BY r.id DESC LIMIT 20"
).fetchall()
if not rows:
return "Aucun rapport stocké."
lines = ["── Rapports disponibles ──────────────────────"]
for r in rows:
lines.append(
f" {r['report_date']} | {r['hostname']:<30s} | {r['logs_count']} erreurs"
)
lines.append("\nUtilise : logwatch report <hostname> [YYYY-MM-DD]")
return "\n".join(lines)
with _db(context) as conn:
m = conn.execute(
"SELECT id FROM machines WHERE hostname=?", (hostname,)
).fetchone()
if not m:
return f"Machine '{hostname}' introuvable."
if date_str:
row = conn.execute(
"SELECT content, report_date, logs_count FROM reports "
"WHERE machine_id=? AND report_date=? ORDER BY id DESC LIMIT 1",
(m['id'], date_str)
).fetchone()
else:
row = conn.execute(
"SELECT content, report_date, logs_count FROM reports "
"WHERE machine_id=? ORDER BY id DESC LIMIT 1",
(m['id'],)
).fetchone()
if not row:
return f"Aucun rapport trouvé pour '{hostname}'" + (f" le {date_str}" if date_str else "") + "."
return f"[{row['report_date']}{row['logs_count']} erreurs]\n\n{row['content']}"
# ── collect [since] ───────────────────────────────────────────────────────
if action == 'collect':
since = rest.strip() or 'yesterday'