ea1c67b33f
Agent système complet remplaçant agent_debian : - 20 skills : apt, systemd, cron, process, network, user, sysinfo, journal, container, shell, filesystem (enhanced), git, ssh, web_fetch, todo, script, mqtt_send, mqtt_subscribe, muc_send, agents_status - filesystem : read avec numéros de lignes, edit, multiedit (style SHAI) - git : status, log, diff, add, commit, push, pull, clone, branch, checkout - ssh : exécution distante + SCP (password ou clé) - web_fetch : GET/HEAD/POST avec nettoyage HTML - todo : liste de tâches en mémoire
110 lines
4.0 KiB
Python
110 lines
4.0 KiB
Python
"""
|
|
Skill WEB_FETCH — récupérer le contenu d'une URL HTTP/HTTPS.
|
|
|
|
Usage LLM :
|
|
SKILL:web_fetch ARGS:get <url>
|
|
SKILL:web_fetch ARGS:head <url>
|
|
SKILL:web_fetch ARGS:post <url> | <body_json>
|
|
"""
|
|
import urllib.request
|
|
import urllib.error
|
|
import urllib.parse
|
|
import json
|
|
import re
|
|
|
|
# One-line summary of the skill (runtime text, intentionally kept in French).
# NOTE(review): presumably read by the skill loader — the consumer is not
# visible in this file; confirm before renaming.
DESCRIPTION = "Récupérer le contenu d'une URL HTTP/HTTPS (GET, HEAD, POST)"
# Invocation syntax for the three actions handled by run().
USAGE = "SKILL:web_fetch ARGS:get <url> | head <url> | post <url>|<body_json>"

# Size cap used by run(): GET output is truncated to this many characters
# (after reading up to 3x this many bytes from the socket), and POST reads at
# most this many bytes of the response body.
MAX_SIZE = 8000
|
|
|
|
|
|
def _strip_html(html: str) -> str:
    """Reduce an HTML document to plain, whitespace-normalised text.

    Steps, in order:
      1. drop ``<script>`` / ``<style>`` elements together with their content;
      2. drop HTML comments;
      3. replace every remaining tag with a space;
      4. decode character references (named and numeric) in a single pass;
      5. collapse runs of whitespace and trim both ends.

    Args:
        html: Raw HTML markup (may be empty).

    Returns:
        The visible text on a single line, words separated by single spaces.
    """
    # Imported locally under an alias: the ``html`` parameter shadows the
    # stdlib module of the same name inside this function.
    from html import unescape as _unescape

    # Closing tags may legally carry trailing whitespace ("</script >").
    text = re.sub(
        r'<(script|style)[^>]*>.*?</\1\s*>', ' ', html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Comments can contain '>' and would otherwise leak through the tag regex.
    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.DOTALL)
    # A space (not '') keeps words from adjacent elements apart.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities only AFTER tags are gone so that an escaped "&lt;b&gt;"
    # survives as literal text. A single unescape pass also avoids
    # double-decoding sequences such as "&amp;lt;".
    text = _unescape(text)
    # \s matches Unicode whitespace, so the U+00A0 produced by &nbsp; is
    # collapsed here as well.
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
|
def _check_http_url(url: str):
    """Return an error message if *url* is not an absolute http(s) URL, else ``None``."""
    try:
        parsed = urllib.parse.urlsplit(url)
    except ValueError:
        # urlsplit raises on malformed netlocs such as an unbalanced "[".
        return f"URL invalide : {url}"
    # urlopen() also understands file:, ftp: and data: URLs; this skill is
    # HTTP/HTTPS only, so anything else is rejected before any I/O happens.
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        return f"URL non supportée (http/https uniquement) : {url}"
    return None


def _format_error(exc: Exception, url: str) -> str:
    """Turn an exception raised while fetching *url* into a user-facing message."""
    # HTTPError subclasses URLError, so it must be tested first.
    if isinstance(exc, urllib.error.HTTPError):
        return f"Erreur HTTP {exc.code} : {exc.reason} — {url}"
    if isinstance(exc, urllib.error.URLError):
        return f"Erreur URL : {exc.reason} — {url}"
    return f"Erreur : {exc}"


def _decode_body(raw: bytes, headers) -> str:
    """Decode *raw* using the charset declared in *headers*, defaulting to UTF-8."""
    # get_content_charset() handles case, quoting and extra parameters.
    charset = headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a codec Python does not know: fall back
        # rather than losing the whole response.
        return raw.decode("utf-8", errors="replace")


def run(args: str, context) -> str:
    """Execute a web_fetch action and return its result as text.

    Supported forms of *args*:
      * ``get <url>``  — fetch the body; HTML responses are reduced to plain
        text and the output is truncated to ``MAX_SIZE`` characters.
      * ``head <url>`` — return the status line and the response headers.
      * ``post <url> | <body_json>`` — send the body as ``application/json``
        and return up to ``MAX_SIZE`` bytes of the response.

    An empty *args* behaves like ``get`` without a URL.

    Args:
        args: Action keyword followed by its arguments.
        context: Skill context; not used by this skill.

    Returns:
        ``"[<status> <url>]"`` followed by the payload on success, otherwise
        a human-readable error message. This function never raises for
        network or HTTP failures.
    """
    parts = (args or "").strip().split(None, 1)
    action = parts[0].lower() if parts else "get"
    rest = parts[1].strip() if len(parts) > 1 else ""

    if action not in ("get", "head", "post"):
        return "Action inconnue. Disponible : get, head, post"

    body = None
    if action == "post":
        if "|" not in rest:
            return "Format : post <url> | <body_json>"
        # Split on the first "|" only: the JSON body may itself contain pipes.
        url, payload = rest.split("|", 1)
        url = url.strip()
        body = payload.strip().encode("utf-8")
    else:
        url = rest

    if not url:
        return "Précise une URL."
    problem = _check_http_url(url)
    if problem:
        return problem

    if action == "post":
        headers = {
            "User-Agent": "HAL-Agent/1.0",
            "Content-Type": "application/json",
        }
    else:
        headers = {"User-Agent": "HAL-Agent/1.0 (compatible; Python urllib)"}
        if action == "get":
            headers["Accept"] = "text/html,text/plain,application/json,*/*"
    timeout = 10 if action == "head" else 15

    try:
        req = urllib.request.Request(
            url, data=body, method=action.upper(), headers=headers
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            banner = f"[{resp.status} {url}]"

            if action == "head":
                # items() keeps repeated headers (e.g. Set-Cookie) that a
                # dict() conversion would collapse into one entry.
                lines = [banner]
                lines.extend(f" {k}: {v}" for k, v in resp.headers.items())
                return "\n".join(lines)

            if action == "post":
                text = _decode_body(resp.read(MAX_SIZE), resp.headers)
                return f"{banner}\n{text}"

            # GET: read more than MAX_SIZE so enough text is left once the
            # markup has been stripped.
            content_type = resp.headers.get("Content-Type", "")
            text = _decode_body(resp.read(MAX_SIZE * 3), resp.headers)
            if "html" in content_type.lower():
                text = _strip_html(text)
            if len(text) > MAX_SIZE:
                text = text[:MAX_SIZE] + f"\n... (tronqué à {MAX_SIZE} caractères)"
            return f"{banner}\n{text}"
    except Exception as exc:
        # Skill boundary: the caller expects a string, never an exception.
        return _format_error(exc, url)
|