feat: stockage rapports DB + filtres affinés
- Table 'reports' : stockage des rapports LLM (machine, date, contenu, nb erreurs)
- logwatch report [hostname] [date] : relire un rapport stocké
- Filtres refactorisés : tier 1 (uppercase exacts) + tier 2 (contextuels précis)
- EXCLUDE_PATTERNS : exclure le bruit connu (Started, LogWatch lui-même...)
- Déduplication : max 5 occurrences de la même ligne par collecte
- Résultat : 0.7% de rétention vs 33% avant

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Binary file not shown.
+96
-18
@@ -21,33 +21,63 @@ from agents_core import BaseAgent, AgentContext, Message, MessageType
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ─── Pré-filtres sans LLM ────────────────────────────────────────────────────
|
# ─── Pré-filtres sans LLM ────────────────────────────────────────────────────
|
||||||
|
# Tier 1 — signaux critiques (mots-clés uppercase exacts, très peu de faux positifs)
|
||||||
|
# Tier 2 — patterns contextuels précis (évite les faux positifs du lowercase générique)
|
||||||
|
|
||||||
FILTER_PATTERNS = [
|
FILTER_PATTERNS = [
|
||||||
re.compile(r'\b(ERROR|CRITICAL|FATAL|PANIC|EMERG|ALERT|CRIT)\b'),
|
# Tier 1 : mots-clés uppercase — très fiables
|
||||||
re.compile(r'\bException\b|\bTraceback\b|\bTraceback \(most recent'),
|
re.compile(r'\b(EMERG|ALERT|CRIT|CRITICAL|FATAL|PANIC)\b'),
|
||||||
|
re.compile(r'\bERROR\b'), # uppercase uniquement
|
||||||
|
re.compile(r'\bException\b|\bTraceback\b'), # Python/Java
|
||||||
|
re.compile(r'<[0-3]>'), # syslog prio 0-3
|
||||||
|
|
||||||
|
# Tier 2 : patterns précis avec contexte
|
||||||
re.compile(r'\bsegfault\b|\bSegmentation fault\b', re.IGNORECASE),
|
re.compile(r'\bsegfault\b|\bSegmentation fault\b', re.IGNORECASE),
|
||||||
re.compile(r'\bout of memory\b|\bOOM killer\b|\bOOM-killer\b', re.IGNORECASE),
|
re.compile(r'\bout of memory\b|\bOOM[ -]killer\b', re.IGNORECASE),
|
||||||
re.compile(r'\b(failed|failure)\b', re.IGNORECASE),
|
|
||||||
re.compile(r'\bkilled\b', re.IGNORECASE),
|
|
||||||
re.compile(r'\b(BUG|Oops):\s'),
|
|
||||||
re.compile(r'<[0-3]>'), # syslog priorities 0=emerg, 1=alert, 2=crit, 3=err
|
|
||||||
re.compile(r'\bcore dumped\b', re.IGNORECASE),
|
re.compile(r'\bcore dumped\b', re.IGNORECASE),
|
||||||
re.compile(r'\bpanic\b', re.IGNORECASE),
|
re.compile(r'\b(BUG|Oops):\s'), # kernel bugs
|
||||||
re.compile(r'\bdenied\b.*\bpermission\b|\bpermission\b.*\bdenied\b', re.IGNORECASE),
|
|
||||||
re.compile(r'\bauthentication failure\b|\bfailed login\b|\bfailed password\b', re.IGNORECASE),
|
# systemd : "Failed to start X" ou "failed with result"
|
||||||
re.compile(r'\bdisk full\b|\bno space left\b', re.IGNORECASE),
|
re.compile(r'systemd.*:\s+Failed\b', re.IGNORECASE),
|
||||||
re.compile(r'\bconnection refused\b|\bconnection timed out\b', re.IGNORECASE),
|
re.compile(r'\bfailed with result\b', re.IGNORECASE),
|
||||||
re.compile(r'\bssh.*invalid user\b|\binvalid user.*ssh\b', re.IGNORECASE),
|
re.compile(r'\.service.*failed\b', re.IGNORECASE),
|
||||||
|
|
||||||
|
# kernel : OOM kill, panic noyau
|
||||||
|
re.compile(r'kernel:.*[Kk]ill\b.*\bprocess\b'),
|
||||||
|
re.compile(r'kernel:.*[Pp]anic\b'),
|
||||||
|
|
||||||
|
# Authentification : patterns précis, pas juste "failed"
|
||||||
|
re.compile(r'\bauthentication failure\b', re.IGNORECASE),
|
||||||
|
re.compile(r'\bFailed password\b|\bFailed publickey\b'), # sshd exact
|
||||||
|
re.compile(r'\bInvalid user\b'), # sshd exact
|
||||||
|
|
||||||
|
# Disque / espace
|
||||||
|
re.compile(r'\bno space left on device\b', re.IGNORECASE),
|
||||||
|
re.compile(r'\bdisk full\b', re.IGNORECASE),
|
||||||
|
|
||||||
|
# Réseau : refus explicite (pas les retries normaux)
|
||||||
|
re.compile(r'\bconnection refused\b', re.IGNORECASE),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Lignes à exclure même si un pattern matche (bruit connu)
|
||||||
|
EXCLUDE_PATTERNS = [
|
||||||
|
re.compile(r'\bsystemd\b.*\bStarted\b', re.IGNORECASE),
|
||||||
|
re.compile(r'\bLogWatch\b', re.IGNORECASE), # éviter de s'auto-analyser
|
||||||
|
re.compile(r'^\s*$'),
|
||||||
]
|
]
|
||||||
|
|
||||||
SEVERITY_RANK = {
|
SEVERITY_RANK = {
|
||||||
'EMERG': 0, 'ALERT': 1, 'CRIT': 2, 'CRITICAL': 2, 'FATAL': 2, 'PANIC': 2,
|
'EMERG': 0, 'ALERT': 1, 'CRIT': 2, 'CRITICAL': 2, 'FATAL': 2, 'PANIC': 2,
|
||||||
'ERROR': 3, 'ERR': 3,
|
'ERROR': 3,
|
||||||
'FAILED': 4, 'FAILURE': 4, 'DENIED': 4,
|
'FAILED': 4, 'FAILURE': 4,
|
||||||
'EXCEPTION': 5, 'TRACEBACK': 5,
|
'EXCEPTION': 5, 'TRACEBACK': 5,
|
||||||
'KILLED': 6, 'OOM': 6, 'SEGFAULT': 6, 'CORE': 6,
|
'OOM': 6, 'SEGFAULT': 6, 'CORE': 6,
|
||||||
|
'INVALID USER': 7, 'AUTH': 7,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Déduplication : max N occurrences de la même signature par session de filtrage
|
||||||
|
MAX_DUPLICATES = 5
|
||||||
|
|
||||||
CHUNK_SIZE = 150 # lignes envoyées au LLM par appel
|
CHUNK_SIZE = 150 # lignes envoyées au LLM par appel
|
||||||
|
|
||||||
|
|
||||||
@@ -155,6 +185,16 @@ class LogWatchAgent(BaseAgent):
|
|||||||
INSERT OR IGNORE INTO agent_config VALUES ('enabled', '1');
|
INSERT OR IGNORE INTO agent_config VALUES ('enabled', '1');
|
||||||
INSERT OR IGNORE INTO agent_config VALUES ('log_retention_days', '7');
|
INSERT OR IGNORE INTO agent_config VALUES ('log_retention_days', '7');
|
||||||
INSERT OR IGNORE INTO agent_config VALUES ('local_collect_time', '');
|
INSERT OR IGNORE INTO agent_config VALUES ('local_collect_time', '');
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS reports (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
machine_id INTEGER NOT NULL,
|
||||||
|
report_date TEXT NOT NULL,
|
||||||
|
content TEXT NOT NULL,
|
||||||
|
logs_count INTEGER DEFAULT 0,
|
||||||
|
created_at TEXT NOT NULL,
|
||||||
|
FOREIGN KEY (machine_id) REFERENCES machines(id)
|
||||||
|
);
|
||||||
""")
|
""")
|
||||||
|
|
||||||
def _cfg(self, key: str, default: str = '') -> str:
|
def _cfg(self, key: str, default: str = '') -> str:
|
||||||
@@ -255,16 +295,47 @@ class LogWatchAgent(BaseAgent):
|
|||||||
logger.error(f"[_on_log_received] {e}", exc_info=True)
|
logger.error(f"[_on_log_received] {e}", exc_info=True)
|
||||||
|
|
||||||
def _prefilter(self, lines: list) -> list:
|
def _prefilter(self, lines: list) -> list:
|
||||||
"""Filtre les lignes, retourne [(line, severity)]."""
|
"""
|
||||||
|
Filtre les lignes, retourne [(line, severity)].
|
||||||
|
- Applique les patterns d'exclusion en premier
|
||||||
|
- Déduplique les lignes similaires (même signature, max MAX_DUPLICATES)
|
||||||
|
"""
|
||||||
result = []
|
result = []
|
||||||
|
seen_sigs = {} # signature → count
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
line = str(line).strip()
|
line = str(line).strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Exclusions d'abord
|
||||||
|
if any(ex.search(line) for ex in EXCLUDE_PATTERNS):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Test des patterns d'inclusion
|
||||||
|
matched = False
|
||||||
for pat in FILTER_PATTERNS:
|
for pat in FILTER_PATTERNS:
|
||||||
if pat.search(line):
|
if pat.search(line):
|
||||||
result.append((line, _detect_severity(line)))
|
matched = True
|
||||||
break
|
break
|
||||||
|
if not matched:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Déduplication : signature = partie fixe de la ligne (sans timestamp/PID)
|
||||||
|
sig = re.sub(r'\b\d+\b', 'N', line) # remplace les nombres
|
||||||
|
sig = re.sub(r'\[[\w/]+\]', '[X]', sig) # remplace les identifiants entre []
|
||||||
|
sig = sig[:120]
|
||||||
|
|
||||||
|
count = seen_sigs.get(sig, 0)
|
||||||
|
if count >= MAX_DUPLICATES:
|
||||||
|
continue
|
||||||
|
seen_sigs[sig] = count + 1
|
||||||
|
|
||||||
|
# Annoter si répétition
|
||||||
|
sev = _detect_severity(line)
|
||||||
|
entry = line if count == 0 else f"{line} [×{count+1}]"
|
||||||
|
result.append((entry, sev))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _register_machine(self, hostname: str) -> int:
|
def _register_machine(self, hostname: str) -> int:
|
||||||
@@ -644,6 +715,13 @@ class LogWatchAgent(BaseAgent):
|
|||||||
)
|
)
|
||||||
report += '\n\n'.join(all_reports)
|
report += '\n\n'.join(all_reports)
|
||||||
self._notify_admin(report)
|
self._notify_admin(report)
|
||||||
|
# Stocker le rapport en DB
|
||||||
|
with self._get_db() as conn:
|
||||||
|
conn.execute(
|
||||||
|
"INSERT INTO reports (machine_id, report_date, content, logs_count, created_at) "
|
||||||
|
"VALUES (?,?,?,?,?)",
|
||||||
|
(machine_id, today, report, len(logs_list), datetime.now().isoformat())
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
self._notify_admin(f"ℹ️ LogWatch: **{hostname}** — LLM n'a pas retourné de rapport.")
|
self._notify_admin(f"ℹ️ LogWatch: **{hostname}** — LLM n'a pas retourné de rapport.")
|
||||||
|
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ Tu reçois des instructions via MQTT (depuis Nexus) ou XMPP (directement).
|
|||||||
- `retention <jours>` : durée de conservation des logs filtrés
|
- `retention <jours>` : durée de conservation des logs filtrés
|
||||||
- `analyze <hostname>` : lancer l'analyse d'une machine spécifique maintenant
|
- `analyze <hostname>` : lancer l'analyse d'une machine spécifique maintenant
|
||||||
- `analyze_all` : lancer l'analyse complète de toutes les machines
|
- `analyze_all` : lancer l'analyse complète de toutes les machines
|
||||||
|
- `report [hostname] [YYYY-MM-DD]` : relire un rapport stocké (sans hostname : liste les 20 derniers rapports, toutes machines confondues ; sans date : dernier rapport de la machine)
|
||||||
- `collect [since]` : collecter maintenant les logs locaux (ex: collect "1 hour ago")
|
- `collect [since]` : collecter maintenant les logs locaux (ex: collect "1 hour ago")
|
||||||
- `logs <hostname> [N]` : voir les N derniers logs filtrés d'une machine
|
- `logs <hostname> [N]` : voir les N derniers logs filtrés d'une machine
|
||||||
- `reset <hostname>` : réinitialiser l'analyse d'une machine
|
- `reset <hostname>` : réinitialiser l'analyse d'une machine
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -26,6 +26,7 @@ USAGE = (
|
|||||||
"SKILL:logwatch ARGS:overage <minutes>\n"
|
"SKILL:logwatch ARGS:overage <minutes>\n"
|
||||||
"SKILL:logwatch ARGS:analyze <hostname>\n"
|
"SKILL:logwatch ARGS:analyze <hostname>\n"
|
||||||
"SKILL:logwatch ARGS:analyze_all\n"
|
"SKILL:logwatch ARGS:analyze_all\n"
|
||||||
|
"SKILL:logwatch ARGS:report [hostname] [YYYY-MM-DD]\n"
|
||||||
"SKILL:logwatch ARGS:collect [since]\n"
|
"SKILL:logwatch ARGS:collect [since]\n"
|
||||||
"SKILL:logwatch ARGS:retention <jours>\n"
|
"SKILL:logwatch ARGS:retention <jours>\n"
|
||||||
"SKILL:logwatch ARGS:logs <hostname> [N]\n"
|
"SKILL:logwatch ARGS:logs <hostname> [N]\n"
|
||||||
@@ -260,6 +261,54 @@ def run(args: str, context) -> str:
|
|||||||
)
|
)
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
# ── report [hostname] [date] ──────────────────────────────────────────────
|
||||||
|
if action == 'report':
|
||||||
|
p = rest.split(None, 1)
|
||||||
|
hostname = p[0].strip() if p else ''
|
||||||
|
date_str = p[1].strip() if len(p) > 1 else ''
|
||||||
|
|
||||||
|
if not hostname:
|
||||||
|
# Sans hostname : liste les derniers rapports toutes machines
|
||||||
|
with _db(context) as conn:
|
||||||
|
rows = conn.execute(
|
||||||
|
"SELECT m.hostname, r.report_date, r.logs_count, r.created_at "
|
||||||
|
"FROM reports r JOIN machines m ON m.id=r.machine_id "
|
||||||
|
"ORDER BY r.id DESC LIMIT 20"
|
||||||
|
).fetchall()
|
||||||
|
if not rows:
|
||||||
|
return "Aucun rapport stocké."
|
||||||
|
lines = ["── Rapports disponibles ──────────────────────"]
|
||||||
|
for r in rows:
|
||||||
|
lines.append(
|
||||||
|
f" {r['report_date']} | {r['hostname']:<30s} | {r['logs_count']} erreurs"
|
||||||
|
)
|
||||||
|
lines.append("\nUtilise : logwatch report <hostname> [YYYY-MM-DD]")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
with _db(context) as conn:
|
||||||
|
m = conn.execute(
|
||||||
|
"SELECT id FROM machines WHERE hostname=?", (hostname,)
|
||||||
|
).fetchone()
|
||||||
|
if not m:
|
||||||
|
return f"Machine '{hostname}' introuvable."
|
||||||
|
|
||||||
|
if date_str:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT content, report_date, logs_count FROM reports "
|
||||||
|
"WHERE machine_id=? AND report_date=? ORDER BY id DESC LIMIT 1",
|
||||||
|
(m['id'], date_str)
|
||||||
|
).fetchone()
|
||||||
|
else:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT content, report_date, logs_count FROM reports "
|
||||||
|
"WHERE machine_id=? ORDER BY id DESC LIMIT 1",
|
||||||
|
(m['id'],)
|
||||||
|
).fetchone()
|
||||||
|
|
||||||
|
if not row:
|
||||||
|
return f"Aucun rapport trouvé pour '{hostname}'" + (f" le {date_str}" if date_str else "") + "."
|
||||||
|
return f"[{row['report_date']} — {row['logs_count']} erreurs]\n\n{row['content']}"
|
||||||
|
|
||||||
# ── collect [since] ───────────────────────────────────────────────────────
|
# ── collect [since] ───────────────────────────────────────────────────────
|
||||||
if action == 'collect':
|
if action == 'collect':
|
||||||
since = rest.strip() or 'yesterday'
|
since = rest.strip() or 'yesterday'
|
||||||
|
|||||||
Reference in New Issue
Block a user