"""
RAG Ingestion Pipeline for Arthur.

Processes Markdown, plain-text and reStructuredText documents, extracts
their text, generates embeddings and indexes the result in Qdrant.
"""

import asyncio
import concurrent.futures
import os
import re
import hashlib
import logging
from pathlib import Path
from typing import Optional, List
from dataclasses import dataclass, field
from datetime import datetime, timezone

from src.clients import get_qdrant_client, get_ollama_client

logger = logging.getLogger("ArthurRAG")

# Patterns used by RAGIngestionPipeline._sanitize_content, compiled once.
_SCRIPT_TAG_RE = re.compile(r'<script\b[^>]*>.*?</script\s*>', re.I | re.S)
_STYLE_TAG_RE = re.compile(r'<style\b[^>]*>.*?</style\s*>', re.I | re.S)
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.S)
_BASE64_BLOB_RE = re.compile(r'[A-Za-z0-9+/]{100,}={0,2}')
_EXCESS_NEWLINES_RE = re.compile(r'\n{3,}')


def _run_coroutine(coro):
    """Run *coro* to completion from synchronous code and return its result."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread: the ordinary synchronous-caller case.
        return asyncio.run(coro)
    # asyncio.run() refuses to nest inside a running loop, so drive the
    # coroutine on a fresh loop in a worker thread and block for the result.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


@dataclass
class DocumentMetadata:
    """Metadata for an ingested document."""

    filename: str
    filepath: str
    technology: str
    tenant_id: str
    doc_type: str  # "manual", "procedure", "troubleshoot", "faq"
    language: str = "pt"
    version: str = "1.0"
    author: Optional[str] = None
    tags: List[str] = field(default_factory=list)
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))


@dataclass
class DocumentChunk:
    """A chunk of a document ready for indexing."""

    id: str
    content: str
    embedding: Optional[List[float]] = None
    metadata: Optional[DocumentMetadata] = None
    chunk_index: int = 0
    total_chunks: int = 1


@dataclass
class IngestionResult:
    """Result of document ingestion."""

    success: bool
    filepath: str
    chunks_created: int = 0
    error: Optional[str] = None


class RAGIngestionPipeline:
    """
    Pipeline for ingesting technical documentation into Qdrant.

    Features:
    - Process Markdown, plain text and reStructuredText files
    - Chunk documents on paragraph boundaries
    - Extract technology metadata from content
    - Generate embeddings through the Ollama client
    - Index in Qdrant, passing the tenant id with every document
    """

    # Glob patterns of the file types picked up by ingest_directory.
    SUPPORTED_PATTERNS = ("*.md", "*.txt", "*.rst")

    # Technology detection patterns
    TECH_PATTERNS = {
        "linux": r"\b(linux|ubuntu|debian|centos|rhel|bash|shell|systemd)\b",
        "windows": r"\b(windows|powershell|cmd|iis|active.?directory|ad)\b",
        "docker": r"\b(docker|container|dockerfile|compose|kubernetes|k8s)\b",
        "postgresql": r"\b(postgres(ql)?|pg_|psql|pgadmin)\b",
        "mysql": r"\b(mysql|mariadb|mysqldump)\b",
        "nginx": r"\b(nginx|reverse.?proxy|upstream)\b",
        "apache": r"\b(apache|httpd|mod_)\b",
        "zabbix": r"\b(zabbix|zbx|trigger|host.?group)\b",
        "network": r"\b(network|firewall|iptables|routing|vlan|switch)\b",
        "security": r"\b(security|ssl|tls|certificate|vulnerab|cve)\b",
        "backup": r"\b(backup|restore|recovery|snapshot|rsync)\b",
    }

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        embedding_dim: int = 384
    ):
        """
        Initialize ingestion pipeline.

        Args:
            chunk_size: Target maximum number of characters per chunk
            chunk_overlap: Stored but not applied by the current chunker
            embedding_dim: Stored but not checked against returned embeddings
        """
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._embedding_dim = embedding_dim
        self._qdrant = get_qdrant_client()
        self._ollama = get_ollama_client()

    def ingest_directory(
        self,
        directory: str,
        tenant_id: str,
        doc_type: str = "manual",
        recursive: bool = True
    ) -> List[IngestionResult]:
        """
        Ingest all documents from a directory.

        Args:
            directory: Path to directory
            tenant_id: Tenant for isolation
            doc_type: Type of documentation
            recursive: Search subdirectories

        Returns:
            List of ingestion results, one per file found. A single failed
            result is returned when the directory does not exist.
        """
        path = Path(directory)
        # is_dir() also rejects a path that exists but is a regular file.
        if not path.is_dir():
            logger.error("Directory not found: %s", directory)
            return [IngestionResult(
                success=False,
                filepath=directory,
                error="Directory not found"
            )]

        # Find all supported files; skip directories whose name happens to
        # match one of the patterns.
        search = path.rglob if recursive else path.glob
        files = [
            match
            for pattern in self.SUPPORTED_PATTERNS
            for match in search(pattern)
            if match.is_file()
        ]
        logger.info("Found %s documents in %s", len(files), directory)

        return [
            self.ingest_file(
                filepath=str(filepath),
                tenant_id=tenant_id,
                doc_type=doc_type
            )
            for filepath in files
        ]

    def ingest_file(
        self,
        filepath: str,
        tenant_id: str,
        doc_type: str = "manual"
    ) -> IngestionResult:
        """
        Ingest a single file.

        The file is read, sanitized, tagged, split into chunks, embedded and
        upserted chunk by chunk. Any exception is logged and reported through
        the returned result instead of being raised.

        Args:
            filepath: Path to file
            tenant_id: Tenant for isolation
            doc_type: Type of documentation

        Returns:
            IngestionResult with status; ``success`` is True when at least
            one chunk was indexed.
        """
        try:
            path = Path(filepath)
            if not path.exists():
                return IngestionResult(
                    success=False,
                    filepath=filepath,
                    error="File not found"
                )

            # Read content (an empty file is reported as a read failure)
            content = self._read_file(path)
            if not content:
                return IngestionResult(
                    success=False,
                    filepath=filepath,
                    error="Failed to read file"
                )

            # Sanitize content (security)
            content = self._sanitize_content(content)

            metadata = DocumentMetadata(
                filename=path.name,
                filepath=str(path),
                technology=self._detect_technology(content),
                tenant_id=tenant_id,
                doc_type=doc_type,
                tags=self._extract_tags(content)
            )

            # Chunk the document
            chunks = self._chunk_content(content, metadata)

            # This method is synchronous, so the async embedding calls are
            # driven to completion here, on one event loop for the whole file.
            # NOTE(review): assumes the Ollama client is not bound to one
            # specific event loop across calls - confirm against src.clients.
            embeddings = _run_coroutine(self._embed_chunks(chunks))

            indexed = 0
            for chunk, embedding in zip(chunks, embeddings):
                chunk.embedding = embedding
                if not chunk.embedding:
                    logger.warning(
                        "Failed to generate embedding for chunk %s", chunk.id
                    )
                    continue

                # Index in Qdrant
                success = self._qdrant.upsert_document(
                    doc_id=chunk.id,
                    content=chunk.content,
                    embedding=chunk.embedding,
                    tenant_id=tenant_id,
                    metadata={
                        "filename": metadata.filename,
                        "technology": metadata.technology,
                        "doc_type": metadata.doc_type,
                        "chunk_index": chunk.chunk_index,
                        "total_chunks": chunk.total_chunks,
                        "tags": metadata.tags
                    }
                )
                if success:
                    indexed += 1

            logger.info(
                "Indexed %s/%s chunks from %s", indexed, len(chunks), path.name
            )
            return IngestionResult(
                success=indexed > 0,
                filepath=filepath,
                chunks_created=indexed
            )

        except Exception as e:
            # Boundary handler: one bad file must not abort a directory run.
            logger.error("Ingestion failed for %s: %s", filepath, e)
            return IngestionResult(
                success=False,
                filepath=filepath,
                error=str(e)
            )

    async def _embed_chunks(
        self,
        chunks: List[DocumentChunk]
    ) -> List[List[float]]:
        """Return one embedding per chunk, requested sequentially in order."""
        embeddings = []
        for chunk in chunks:
            embeddings.append(await self._generate_embedding(chunk.content))
        return embeddings

    def _read_file(self, path: Path) -> Optional[str]:
        """
        Read file content, trying several encodings in turn.

        Returns:
            The decoded text, or None if no encoding could decode the file.
        """
        # latin-1 maps every byte and therefore never fails, so it has to
        # come last; cp1252 rejects a few undefined bytes and goes before it.
        for encoding in ("utf-8", "cp1252", "latin-1"):
            try:
                return path.read_text(encoding=encoding)
            except UnicodeDecodeError:
                continue
        return None

    def _sanitize_content(self, content: str) -> str:
        """
        Sanitize content for security.

        Removes:
        - <script> and <style> elements together with their bodies
        - HTML comments (may contain hidden content)
        - Runs of 100+ base64-alphabet characters, replaced by a marker

        Runs of three or more newlines are collapsed to a blank line and the
        result is stripped of leading/trailing whitespace.
        """
        content = _SCRIPT_TAG_RE.sub('', content)
        content = _STYLE_TAG_RE.sub('', content)
        content = _HTML_COMMENT_RE.sub('', content)

        # Remove large base64 blocks (potential hidden payloads)
        content = _BASE64_BLOB_RE.sub('[BASE64_REMOVED]', content)

        # Normalize whitespace
        content = _EXCESS_NEWLINES_RE.sub('\n\n', content)
        return content.strip()

    def _detect_technology(self, content: str) -> str:
        """
        Detect primary technology from content.

        Returns:
            The TECH_PATTERNS key with the most matches (the earliest key
            wins a tie), or "general" when nothing matches.
        """
        content_lower = content.lower()
        scores = {}
        for tech, pattern in self.TECH_PATTERNS.items():
            matches = len(re.findall(pattern, content_lower, re.I))
            if matches > 0:
                scores[tech] = matches

        if scores:
            return max(scores, key=scores.get)
        return "general"

    def _extract_tags(self, content: str) -> List[str]:
        """
        Extract relevant tags from content.

        Returns:
            Up to 10 TECH_PATTERNS keys whose pattern occurs in the content,
            in TECH_PATTERNS order so the result is stable between runs.
        """
        content_lower = content.lower()
        tags = [
            tech
            for tech, pattern in self.TECH_PATTERNS.items()
            if re.search(pattern, content_lower, re.I)
        ]
        return tags[:10]  # Limit tags

    def _chunk_content(
        self,
        content: str,
        metadata: DocumentMetadata
    ) -> List[DocumentChunk]:
        """
        Split content into chunks for indexing.

        Paragraphs (separated by a blank line) are packed greedily into
        chunks of at most ``chunk_size`` characters. A single paragraph
        longer than that becomes a chunk of its own and is not split.
        ``chunk_overlap`` is not applied.
        """
        chunks: List[DocumentChunk] = []
        buffer = ""

        for para in content.split('\n\n'):
            if len(buffer) + len(para) <= self._chunk_size:
                buffer += para + "\n\n"
                continue
            # The paragraph does not fit: emit what is buffered, start anew.
            self._append_chunk(chunks, buffer, metadata)
            buffer = para + "\n\n"

        # Add final chunk
        self._append_chunk(chunks, buffer, metadata)

        # The total is only known once every chunk has been created.
        total = len(chunks)
        for chunk in chunks:
            chunk.total_chunks = total

        return chunks

    def _append_chunk(
        self,
        chunks: List[DocumentChunk],
        text: str,
        metadata: DocumentMetadata
    ) -> None:
        """Append *text* to *chunks* as the next chunk unless it is blank."""
        stripped = text.strip()
        if not stripped:
            return
        chunk_index = len(chunks)
        chunks.append(DocumentChunk(
            id=self._generate_chunk_id(metadata.filepath, chunk_index),
            content=stripped,
            metadata=metadata,
            chunk_index=chunk_index
        ))

    def _generate_chunk_id(self, filepath: str, chunk_index: int) -> str:
        """
        Generate a deterministic ID for a chunk.

        The ID is the MD5 hex digest of "<filepath>:<chunk_index>", so
        re-ingesting the same path yields the same IDs.
        """
        # NOTE(review): the tenant is not part of the key; confirm that the
        # Qdrant client keeps identical paths of different tenants apart.
        content = f"{filepath}:{chunk_index}"
        return hashlib.md5(content.encode()).hexdigest()

    async def _generate_embedding(self, text: str) -> List[float]:
        """
        Generate embedding for text using Ollama.

        The value returned by the client is passed through unchanged; it is
        not padded, truncated or checked against ``embedding_dim``.
        """
        embedding = await self._ollama.get_embeddings(text)
        return embedding


# Singleton
_pipeline: Optional[RAGIngestionPipeline] = None


def get_rag_pipeline() -> RAGIngestionPipeline:
    """Get global RAG pipeline instance, creating it on first use."""
    global _pipeline
    if _pipeline is None:
        _pipeline = RAGIngestionPipeline()
    return _pipeline