"""
|
|
RAG Ingestion Pipeline for Arthur.
|
|
|
|
Processes Markdown and PDF documents, extracts text,
|
|
generates embeddings and indexes in Qdrant.
|
|
"""
|
|
|
|
import asyncio
import hashlib
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List

from src.clients import get_qdrant_client, get_ollama_client
logger = logging.getLogger("ArthurRAG")
|
|
|
|
|
|
@dataclass
|
|
class DocumentMetadata:
|
|
"""Metadata for an ingested document."""
|
|
filename: str
|
|
filepath: str
|
|
technology: str
|
|
tenant_id: str
|
|
doc_type: str # "manual", "procedure", "troubleshoot", "faq"
|
|
language: str = "pt"
|
|
version: str = "1.0"
|
|
author: Optional[str] = None
|
|
tags: List[str] = field(default_factory=list)
|
|
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
|
|
|
|
|
@dataclass
|
|
class DocumentChunk:
|
|
"""A chunk of a document ready for indexing."""
|
|
id: str
|
|
content: str
|
|
embedding: Optional[List[float]] = None
|
|
metadata: Optional[DocumentMetadata] = None
|
|
chunk_index: int = 0
|
|
total_chunks: int = 1
|
|
|
|
|
|
@dataclass
|
|
class IngestionResult:
|
|
"""Result of document ingestion."""
|
|
success: bool
|
|
filepath: str
|
|
chunks_created: int = 0
|
|
error: Optional[str] = None
|
|
|
|
|
|
class RAGIngestionPipeline:
|
|
"""
|
|
Pipeline for ingesting technical documentation into Qdrant.
|
|
|
|
Features:
|
|
- Process Markdown and plain text files
|
|
- Chunk documents for optimal retrieval
|
|
- Extract technology metadata from content
|
|
- Generate embeddings (placeholder for now)
|
|
- Index in Qdrant with tenant isolation
|
|
"""
|
|
|
|
# Technology detection patterns
|
|
TECH_PATTERNS = {
|
|
"linux": r"\b(linux|ubuntu|debian|centos|rhel|bash|shell|systemd)\b",
|
|
"windows": r"\b(windows|powershell|cmd|iis|active.?directory|ad)\b",
|
|
"docker": r"\b(docker|container|dockerfile|compose|kubernetes|k8s)\b",
|
|
"postgresql": r"\b(postgres(ql)?|pg_|psql|pgadmin)\b",
|
|
"mysql": r"\b(mysql|mariadb|mysqldump)\b",
|
|
"nginx": r"\b(nginx|reverse.?proxy|upstream)\b",
|
|
"apache": r"\b(apache|httpd|mod_)\b",
|
|
"zabbix": r"\b(zabbix|zbx|trigger|host.?group)\b",
|
|
"network": r"\b(network|firewall|iptables|routing|vlan|switch)\b",
|
|
"security": r"\b(security|ssl|tls|certificate|vulnerab|cve)\b",
|
|
"backup": r"\b(backup|restore|recovery|snapshot|rsync)\b",
|
|
}
|
|
|
|
def __init__(
|
|
self,
|
|
chunk_size: int = 500,
|
|
chunk_overlap: int = 50,
|
|
embedding_dim: int = 384
|
|
):
|
|
"""Initialize ingestion pipeline."""
|
|
self._chunk_size = chunk_size
|
|
self._chunk_overlap = chunk_overlap
|
|
self._embedding_dim = embedding_dim
|
|
self._qdrant = get_qdrant_client()
|
|
self._ollama = get_ollama_client()
|
|
|
|
def ingest_directory(
|
|
self,
|
|
directory: str,
|
|
tenant_id: str,
|
|
doc_type: str = "manual",
|
|
recursive: bool = True
|
|
) -> List[IngestionResult]:
|
|
"""
|
|
Ingest all documents from a directory.
|
|
|
|
Args:
|
|
directory: Path to directory
|
|
tenant_id: Tenant for isolation
|
|
doc_type: Type of documentation
|
|
recursive: Search subdirectories
|
|
|
|
Returns:
|
|
List of ingestion results
|
|
"""
|
|
results = []
|
|
path = Path(directory)
|
|
|
|
if not path.exists():
|
|
logger.error(f"Directory not found: {directory}")
|
|
return [IngestionResult(
|
|
success=False,
|
|
filepath=directory,
|
|
error="Directory not found"
|
|
)]
|
|
|
|
# Find all supported files
|
|
patterns = ["*.md", "*.txt", "*.rst"]
|
|
files = []
|
|
|
|
for pattern in patterns:
|
|
if recursive:
|
|
files.extend(path.rglob(pattern))
|
|
else:
|
|
files.extend(path.glob(pattern))
|
|
|
|
logger.info(f"Found {len(files)} documents in {directory}")
|
|
|
|
for filepath in files:
|
|
result = self.ingest_file(
|
|
filepath=str(filepath),
|
|
tenant_id=tenant_id,
|
|
doc_type=doc_type
|
|
)
|
|
results.append(result)
|
|
|
|
return results
|
|
|
|
def ingest_file(
|
|
self,
|
|
filepath: str,
|
|
tenant_id: str,
|
|
doc_type: str = "manual"
|
|
) -> IngestionResult:
|
|
"""
|
|
Ingest a single file.
|
|
|
|
Args:
|
|
filepath: Path to file
|
|
tenant_id: Tenant for isolation
|
|
doc_type: Type of documentation
|
|
|
|
Returns:
|
|
IngestionResult with status
|
|
"""
|
|
try:
|
|
path = Path(filepath)
|
|
|
|
if not path.exists():
|
|
return IngestionResult(
|
|
success=False,
|
|
filepath=filepath,
|
|
error="File not found"
|
|
)
|
|
|
|
# Read content
|
|
content = self._read_file(path)
|
|
if not content:
|
|
return IngestionResult(
|
|
success=False,
|
|
filepath=filepath,
|
|
error="Failed to read file"
|
|
)
|
|
|
|
# Sanitize content (security)
|
|
content = self._sanitize_content(content)
|
|
|
|
# Detect technology
|
|
technology = self._detect_technology(content)
|
|
|
|
# Extract tags from content
|
|
tags = self._extract_tags(content)
|
|
|
|
# Create metadata
|
|
metadata = DocumentMetadata(
|
|
filename=path.name,
|
|
filepath=str(path),
|
|
technology=technology,
|
|
tenant_id=tenant_id,
|
|
doc_type=doc_type,
|
|
tags=tags
|
|
)
|
|
|
|
# Chunk the document
|
|
chunks = self._chunk_content(content, metadata)
|
|
|
|
# Generate embeddings and index
|
|
indexed = 0
|
|
for chunk in chunks:
|
|
# Generate real embedding using Ollama
|
|
chunk.embedding = await self._generate_embedding(chunk.content)
|
|
if not chunk.embedding:
|
|
logger.warning(f"Failed to generate embedding for chunk {chunk.id}")
|
|
continue
|
|
|
|
# Index in Qdrant
|
|
success = self._qdrant.upsert_document(
|
|
doc_id=chunk.id,
|
|
content=chunk.content,
|
|
embedding=chunk.embedding,
|
|
tenant_id=tenant_id,
|
|
metadata={
|
|
"filename": metadata.filename,
|
|
"technology": metadata.technology,
|
|
"doc_type": metadata.doc_type,
|
|
"chunk_index": chunk.chunk_index,
|
|
"total_chunks": chunk.total_chunks,
|
|
"tags": metadata.tags
|
|
}
|
|
)
|
|
|
|
if success:
|
|
indexed += 1
|
|
|
|
logger.info(f"Indexed {indexed}/{len(chunks)} chunks from {path.name}")
|
|
|
|
return IngestionResult(
|
|
success=indexed > 0,
|
|
filepath=filepath,
|
|
chunks_created=indexed
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Ingestion failed for {filepath}: {e}")
|
|
return IngestionResult(
|
|
success=False,
|
|
filepath=filepath,
|
|
error=str(e)
|
|
)
|
|
|
|
def _read_file(self, path: Path) -> Optional[str]:
|
|
"""Read file content with encoding detection."""
|
|
encodings = ["utf-8", "latin-1", "cp1252"]
|
|
|
|
for encoding in encodings:
|
|
try:
|
|
return path.read_text(encoding=encoding)
|
|
except UnicodeDecodeError:
|
|
continue
|
|
|
|
return None
|
|
|
|
def _sanitize_content(self, content: str) -> str:
|
|
"""
|
|
Sanitize content for security.
|
|
|
|
Removes:
|
|
- Embedded scripts
|
|
- Base64 encoded content
|
|
- Suspicious patterns
|
|
"""
|
|
# Remove script tags
|
|
content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.I | re.S)
|
|
|
|
# Remove style tags
|
|
content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.I | re.S)
|
|
|
|
# Remove HTML comments (may contain hidden content)
|
|
content = re.sub(r'<!--.*?-->', '', content, flags=re.S)
|
|
|
|
# Remove large base64 blocks (potential hidden payloads)
|
|
content = re.sub(r'[A-Za-z0-9+/]{100,}={0,2}', '[BASE64_REMOVED]', content)
|
|
|
|
# Normalize whitespace
|
|
content = re.sub(r'\n{3,}', '\n\n', content)
|
|
|
|
return content.strip()
|
|
|
|
def _detect_technology(self, content: str) -> str:
|
|
"""Detect primary technology from content."""
|
|
content_lower = content.lower()
|
|
|
|
scores = {}
|
|
for tech, pattern in self.TECH_PATTERNS.items():
|
|
matches = len(re.findall(pattern, content_lower, re.I))
|
|
if matches > 0:
|
|
scores[tech] = matches
|
|
|
|
if scores:
|
|
return max(scores, key=scores.get)
|
|
|
|
return "general"
|
|
|
|
def _extract_tags(self, content: str) -> List[str]:
|
|
"""Extract relevant tags from content."""
|
|
tags = set()
|
|
|
|
# Check for each technology pattern
|
|
content_lower = content.lower()
|
|
for tech, pattern in self.TECH_PATTERNS.items():
|
|
if re.search(pattern, content_lower, re.I):
|
|
tags.add(tech)
|
|
|
|
return list(tags)[:10] # Limit tags
|
|
|
|
def _chunk_content(
|
|
self,
|
|
content: str,
|
|
metadata: DocumentMetadata
|
|
) -> List[DocumentChunk]:
|
|
"""Split content into chunks for indexing."""
|
|
chunks = []
|
|
|
|
# Split by paragraphs first
|
|
paragraphs = content.split('\n\n')
|
|
|
|
current_chunk = ""
|
|
chunk_index = 0
|
|
|
|
for para in paragraphs:
|
|
if len(current_chunk) + len(para) <= self._chunk_size:
|
|
current_chunk += para + "\n\n"
|
|
else:
|
|
if current_chunk.strip():
|
|
chunk_id = self._generate_chunk_id(
|
|
metadata.filepath,
|
|
chunk_index
|
|
)
|
|
chunks.append(DocumentChunk(
|
|
id=chunk_id,
|
|
content=current_chunk.strip(),
|
|
metadata=metadata,
|
|
chunk_index=chunk_index
|
|
))
|
|
chunk_index += 1
|
|
|
|
current_chunk = para + "\n\n"
|
|
|
|
# Add final chunk
|
|
if current_chunk.strip():
|
|
chunk_id = self._generate_chunk_id(metadata.filepath, chunk_index)
|
|
chunks.append(DocumentChunk(
|
|
id=chunk_id,
|
|
content=current_chunk.strip(),
|
|
metadata=metadata,
|
|
chunk_index=chunk_index
|
|
))
|
|
|
|
# Update total chunks
|
|
for chunk in chunks:
|
|
chunk.total_chunks = len(chunks)
|
|
|
|
return chunks
|
|
|
|
def _generate_chunk_id(self, filepath: str, chunk_index: int) -> str:
|
|
"""Generate unique ID for a chunk."""
|
|
content = f"{filepath}:{chunk_index}"
|
|
return hashlib.md5(content.encode()).hexdigest()
|
|
|
|
async def _generate_embedding(self, text: str) -> List[float]:
|
|
"""
|
|
Generate embedding for text using Ollama.
|
|
"""
|
|
embedding = await self._ollama.get_embeddings(text)
|
|
|
|
# If model dimension differs, we might need padding/truncating (or just trust the model)
|
|
# For now we assume the model returns correct DIM or we handle it downstream
|
|
return embedding
|
|
|
|
|
|
# Singleton
|
|
_pipeline: Optional[RAGIngestionPipeline] = None
|
|
|
|
|
|
def get_rag_pipeline() -> RAGIngestionPipeline:
|
|
"""Get global RAG pipeline instance."""
|
|
global _pipeline
|
|
if _pipeline is None:
|
|
_pipeline = RAGIngestionPipeline()
|
|
return _pipeline
|