""" Skill WEB_FETCH — récupérer le contenu d'une URL HTTP/HTTPS. Usage LLM : SKILL:web_fetch ARGS:get SKILL:web_fetch ARGS:head SKILL:web_fetch ARGS:post | """ import urllib.request import urllib.error import urllib.parse import json import re DESCRIPTION = "Récupérer le contenu d'une URL HTTP/HTTPS (GET, HEAD, POST)" USAGE = "SKILL:web_fetch ARGS:get | head | post |" MAX_SIZE = 8000 def _strip_html(html: str) -> str: """Supprime les balises HTML et nettoie le texte.""" # Supprime scripts et styles html = re.sub(r'<(script|style)[^>]*>.*?', ' ', html, flags=re.DOTALL | re.IGNORECASE) # Supprime les balises html = re.sub(r'<[^>]+>', ' ', html) # Décode les entités HTML basiques html = html.replace('&', '&').replace('<', '<').replace('>', '>') \ .replace('"', '"').replace(''', "'").replace(' ', ' ') # Nettoie les espaces multiples html = re.sub(r'\s+', ' ', html).strip() return html def run(args: str, context) -> str: parts = args.strip().split(None, 1) action = parts[0].lower() if parts else "get" rest = parts[1] if len(parts) > 1 else "" if action == "get": url = rest.strip() if not url: return "Précise une URL." try: req = urllib.request.Request( url, headers={ "User-Agent": "HAL-Agent/1.0 (compatible; Python urllib)", "Accept": "text/html,text/plain,application/json,*/*" } ) with urllib.request.urlopen(req, timeout=15) as resp: content_type = resp.headers.get("Content-Type", "") raw = resp.read(MAX_SIZE * 3) # Lit plus pour avoir du contenu après stripping charset = "utf-8" if "charset=" in content_type: charset = content_type.split("charset=")[-1].split(";")[0].strip() text = raw.decode(charset, errors="replace") # Si HTML, nettoie les balises if "html" in content_type.lower(): text = _strip_html(text) if len(text) > MAX_SIZE: text = text[:MAX_SIZE] + f"\n... (tronqué à {MAX_SIZE} caractères)" return f"[{resp.status} {url}]\n{text}" except urllib.error.HTTPError as e: return f"Erreur HTTP {e.code} : {e.reason} — {url}" except urllib.error.URLError as e: return f"Erreur URL : {e.reason} — {url}" except Exception as e: return f"Erreur : {e}" if action == "head": url = rest.strip() if not url: return "Précise une URL." try: req = urllib.request.Request(url, method="HEAD") with urllib.request.urlopen(req, timeout=10) as resp: headers = dict(resp.headers) lines = [f"[{resp.status} {url}]"] for k, v in headers.items(): lines.append(f" {k}: {v}") return "\n".join(lines) except Exception as e: return f"Erreur : {e}" if action == "post": if "|" not in rest: return "Format : post | " url, body = rest.split("|", 1) url = url.strip() body = body.strip().encode("utf-8") try: req = urllib.request.Request( url, data=body, method="POST", headers={ "User-Agent": "HAL-Agent/1.0", "Content-Type": "application/json" } ) with urllib.request.urlopen(req, timeout=15) as resp: text = resp.read(MAX_SIZE).decode("utf-8", errors="replace") return f"[{resp.status} {url}]\n{text}" except Exception as e: return f"Erreur : {e}" return "Action inconnue. Disponible : get, head, post"