import json
import os
import re
import sys
import urllib.error
import urllib.request

# Configuration: the knowledge base JSON lives next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
KB_FILE = os.path.join(BASE_DIR, "knowledge_base.json")


def load_kb():
    """Load the knowledge base from KB_FILE.

    Returns the parsed JSON (a list of entries), or [] when the file is
    missing or unreadable. Best-effort by design: a corrupt KB degrades
    to "empty" with a printed error rather than crashing.
    """
    if not os.path.exists(KB_FILE):
        return []
    try:
        with open(KB_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading KB: {e}")
        return []


def save_kb(data):
    """Write *data* to KB_FILE as pretty-printed UTF-8 JSON.

    Exits the process with status 1 on failure: a KB that cannot be
    persisted is treated as fatal here.
    """
    try:
        with open(KB_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
    except Exception as e:
        print(f"Error saving KB: {e}")
        sys.exit(1)


def validate_url(url, description):
    """Fetch *url* and heuristically check that its content matches *description*.

    Returns True when the page answers 200 and either contains keywords
    derived from the description or no usable keywords remain after
    stop-word filtering (trust the 200); False on a non-200 status or a
    network error.
    """
    print(f"Checking: {url}...", end=" ", flush=True)
    try:
        # User-Agent is often required to avoid 403 Forbidden from some
        # documentation sites.
        req = urllib.request.Request(
            url,
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
        )
        with urllib.request.urlopen(req, timeout=15) as response:
            if response.status != 200:
                print(f"[FAIL] Status {response.status}")
                return False
            content = response.read().decode('utf-8', errors='ignore').lower()

        # Simple content check: look for significant words from the
        # description, excluding common stop words.
        stop_words = {'documentation', 'official', 'manual', 'reference',
                      'guide', 'wiki', 'site', 'docs', 'for', 'enterprise',
                      'support', 'user', 'the', 'and', 'of', 'a', 'to', 'in'}
        keywords = [w.lower() for w in re.split(r'\W+', description)
                    if w and w.lower() not in stop_words]

        # Special cases: products whose pages mention the vendor/domain
        # rather than the product name itself.
        desc_lower = description.lower()
        if "pfsense" in desc_lower:
            keywords.append("netgate")
        if "truenas" in desc_lower:
            keywords.append("ixsystems")
        if "proxmox" in desc_lower:
            keywords.append("virtualization")

        if not keywords:
            # If no keywords remain, just trust the 200 OK.
            print("[OK] (Status 200)")
            return True

        matched = [k for k in keywords if k in content]
        if matched:
            print(f"[OK] Found keywords: {matched}")
            return True

        # Be lenient when the status is 200 but the fuzzy keyword match
        # failed; report the page <title> so a human can judge whether it
        # really has the expected content.
        # NOTE(review): the source file was truncated from this point on;
        # the title-extraction regex, the lenient True return, and the
        # except handlers below are reconstructed from the surviving
        # comments — confirm against the original script.
        title_match = re.search(r'<title[^>]*>(.*?)</title>', content,
                                re.IGNORECASE | re.DOTALL)
        title = title_match.group(1).strip() if title_match else "(no title)"
        print(f"[WARN] Status 200 but keywords not found; title: {title!r}")
        return True
    except urllib.error.HTTPError as e:
        print(f"[FAIL] HTTP {e.code}")
        return False
    except urllib.error.URLError as e:
        print(f"[FAIL] {e.reason}")
        return False
    except Exception as e:
        print(f"[FAIL] {e}")
        return False