import json
import os
import re
import sys
import urllib.error
import urllib.request

# Configuration
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
KB_FILE = os.path.join(BASE_DIR, "knowledge_base.json")


def load_kb():
    """Load the knowledge-base JSON file.

    Returns the parsed list of entries, or [] when the file is missing
    or unreadable (a load error is reported but never fatal).
    """
    if not os.path.exists(KB_FILE):
        return []
    try:
        with open(KB_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading KB: {e}")
        return []


def save_kb(data):
    """Write *data* back to the knowledge-base file.

    Exits the process with status 1 on any write failure, since a
    half-written KB would corrupt later runs.
    """
    try:
        with open(KB_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
    except Exception as e:
        print(f"Error saving KB: {e}")
        sys.exit(1)


def validate_url(url, description):
    """Fetch *url* and heuristically verify it matches *description*.

    Returns True when the page answers 200 OK and either contains
    significant keywords from the description or is not an obvious
    error page; returns False on HTTP errors, connection failures,
    or a 200 page whose title clearly indicates a 404.
    """
    print(f"Checking: {url}...", end=" ", flush=True)
    try:
        # User-Agent is often required to avoid 403 Forbidden from some documentation sites
        req = urllib.request.Request(
            url,
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
        )
        with urllib.request.urlopen(req, timeout=15) as response:
            if response.status != 200:
                print(f"[FAIL] Status {response.status}")
                return False
            # Lowercase once so all later matching is case-insensitive.
            content = response.read().decode('utf-8', errors='ignore').lower()

            # Simple content check: look for significant words from description
            # Exclude common stop words
            stop_words = {'documentation', 'official', 'manual', 'reference',
                          'guide', 'wiki', 'site', 'docs', 'for', 'enterprise',
                          'support', 'user', 'the', 'and', 'of', 'a', 'to', 'in'}
            keywords = [w.lower() for w in re.split(r'\W+', description)
                        if w.lower() and w.lower() not in stop_words]

            # Special case for abbreviations: these products' official docs
            # often mention the vendor name rather than the product name.
            if "pfsense" in description.lower():
                keywords.append("netgate")
            if "truenas" in description.lower():
                keywords.append("ixsystems")
            if "proxmox" in description.lower():
                keywords.append("virtualization")

            if not keywords:
                # If no keywords remain, just trust the 200 OK
                print("[OK] (Status 200)")
                return True

            found = any(k in content for k in keywords)
            if found:
                print(f"[OK] Found keywords: {[k for k in keywords if k in content]}")
                return True
            else:
                # Be lenient if status is 200 but keywords not found (the
                # fuzzy match may simply be too strict), but first try to
                # extract the page title for diagnostics.
                # BUG FIX: the original pattern r'(.*?)' had lost its
                # <title>...</title> delimiters and always matched the empty
                # string, so the error-page check below was dead code.
                title_match = re.search(r'<title>(.*?)</title>', content,
                                        re.IGNORECASE | re.DOTALL)
                title = title_match.group(1).strip() if title_match else "No Title"
                print(f"[WARNING] 200 OK but keywords {keywords} not found. Title: '{title}'")
                # Keep the entry on a plain 200 OK, unless the title clearly
                # indicates an error page.
                # BUG FIX: content (hence title) is lowercased above, so the
                # marker must be lowercase — "Not Found" could never match.
                if "404" in title or "not found" in title:
                    return False
                return True
    except urllib.error.HTTPError as e:
        print(f"[FAIL] HTTP Error {e.code}")
        return False
    except urllib.error.URLError as e:
        print(f"[FAIL] Connection Error {e.reason}")
        return False
    except Exception as e:
        print(f"[FAIL] Unexpected Error {e}")
        return False


def main():
    """Validate every KB entry's URL and rewrite the KB without dead links."""
    kb = load_kb()
    valid_entries = []
    modified = False
    print(f"Validating {len(kb)} sources...\n")
    for entry in kb:
        if validate_url(entry['url'], entry['description']):
            valid_entries.append(entry)
        else:
            print(f" -> REMOVING: {entry['description']} ({entry['url']})")
            modified = True
    if modified:
        save_kb(valid_entries)
        print(f"\nUpdate Complete. Removed {len(kb) - len(valid_entries)} invalid sources.")
    else:
        print("\nAll sources are valid. No changes made.")


if __name__ == "__main__":
    main()