# manuais-e-documentacao-itguys/.gemini/validate_knowledge.py
#
# (file-viewer metadata: 112 lines, 4.2 KiB, Python)

import json
import os
import sys
import urllib.request
import urllib.error
import re
# Configuration
# Directory containing this script; the knowledge base JSON lives alongside it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Path to the knowledge base: a JSON list of {"url": ..., "description": ...} entries.
KB_FILE = os.path.join(BASE_DIR, "knowledge_base.json")
def load_kb():
    """Load the knowledge base from KB_FILE.

    Returns:
        list: The parsed JSON entries, or an empty list when the file is
        missing, unreadable, or contains invalid JSON.
    """
    if not os.path.exists(KB_FILE):
        return []
    try:
        with open(KB_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        # Narrowed from a bare `except Exception`: file errors and malformed
        # JSON are the expected failure modes; anything else is a bug and
        # should surface as a traceback rather than be silently swallowed.
        print(f"Error loading KB: {e}")
        return []
def save_kb(data):
    """Write *data* to KB_FILE as pretty-printed UTF-8 JSON.

    Exits the process with status 1 when the file cannot be written or
    *data* is not JSON-serializable.
    """
    try:
        with open(KB_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
    except (OSError, TypeError, ValueError) as e:
        # Narrowed from a bare `except Exception`: OSError covers write
        # failures; TypeError/ValueError cover non-serializable data.
        print(f"Error saving KB: {e}")
        sys.exit(1)
def _keywords_from_description(description):
    """Extract significant lowercase keywords from a source description.

    Splits on non-word characters, drops common stop words, and appends a
    few vendor aliases so rebranded documentation sites still match
    (e.g. pfSense docs live under the Netgate brand).
    """
    stop_words = {'documentation', 'official', 'manual', 'reference', 'guide',
                  'wiki', 'site', 'docs', 'for', 'enterprise', 'support',
                  'user', 'the', 'and', 'of', 'a', 'to', 'in'}
    keywords = [w.lower() for w in re.split(r'\W+', description)
                if w.lower() and w.lower() not in stop_words]
    desc_lower = description.lower()
    # Special case for abbreviations / vendor rebrands
    if "pfsense" in desc_lower:
        keywords.append("netgate")
    if "truenas" in desc_lower:
        keywords.append("ixsystems")
    if "proxmox" in desc_lower:
        keywords.append("virtualization")
    return keywords


def validate_url(url, description):
    """Fetch *url* and heuristically check it serves the expected content.

    Prints a one-line status for the URL and returns True when the entry
    looks valid (keep it), False when it should be removed from the KB.
    """
    print(f"Checking: {url}...", end=" ", flush=True)
    try:
        # User-Agent is often required to avoid 403 Forbidden from some documentation sites
        req = urllib.request.Request(
            url,
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
        )
        with urllib.request.urlopen(req, timeout=15) as response:
            if response.status != 200:
                print(f"[FAIL] Status {response.status}")
                return False
            # NOTE: the body (and anything derived from it, e.g. the title)
            # is lowercased from here on.
            content = response.read().decode('utf-8', errors='ignore').lower()
            # Simple content check: look for significant words from description
            keywords = _keywords_from_description(description)
            if not keywords:
                # If no keywords remain, just trust the 200 OK
                print("[OK] (Status 200)")
                return True
            hits = [k for k in keywords if k in content]
            if hits:
                print(f"[OK] Found keywords: {hits}")
                return True
            # Be lenient if status is 200 but keywords not found: the fuzzy
            # match may simply be too strict. Fall back to inspecting the
            # page title for an explicit error indicator.
            title_match = re.search(r'<title>(.*?)</title>', content,
                                    re.IGNORECASE | re.DOTALL)
            title = title_match.group(1).strip() if title_match else "No Title"
            print(f"[WARNING] 200 OK but keywords {keywords} not found. Title: '{title}'")
            # BUG FIX: *title* comes from the lowercased *content*, so the
            # original check `"Not Found" in title` could never match; the
            # comparison must be lowercase as well.
            if "404" in title or "not found" in title:
                return False
            return True
    except urllib.error.HTTPError as e:
        print(f"[FAIL] HTTP Error {e.code}")
        return False
    except urllib.error.URLError as e:
        print(f"[FAIL] Connection Error {e.reason}")
        return False
    except Exception as e:
        # Catch-all so one misbehaving URL never aborts the whole run.
        print(f"[FAIL] Unexpected Error {e}")
        return False
def main():
    """Validate every KB entry and rewrite the KB without the failures."""
    kb = load_kb()
    print(f"Validating {len(kb)} sources...\n")
    survivors = []
    for source in kb:
        if validate_url(source['url'], source['description']):
            survivors.append(source)
        else:
            print(f" -> REMOVING: {source['description']} ({source['url']})")
    removed_count = len(kb) - len(survivors)
    if removed_count:
        # At least one entry failed validation: persist the pruned list.
        save_kb(survivors)
        print(f"\nUpdate Complete. Removed {removed_count} invalid sources.")
    else:
        print("\nAll sources are valid. No changes made.")


if __name__ == "__main__":
    main()