minions-ai-agents/src/flywheel/rag_pipeline.py

391 lines
12 KiB
Python

"""
RAG Ingestion Pipeline for Arthur.
Processes Markdown and PDF documents, extracts text,
generates embeddings and indexes in Qdrant.
"""
import os
import re
import hashlib
import logging
from pathlib import Path
from typing import Optional, List
from dataclasses import dataclass, field
from datetime import datetime, timezone
from src.clients import get_qdrant_client, get_ollama_client
logger = logging.getLogger("ArthurRAG")
@dataclass
class DocumentMetadata:
    """Metadata for an ingested document."""

    filename: str    # base name of the source file
    filepath: str    # full path the file was read from
    technology: str  # dominant technology detected from content (see RAGIngestionPipeline.TECH_PATTERNS)
    tenant_id: str   # tenant identifier used for multi-tenant isolation in Qdrant
    doc_type: str  # "manual", "procedure", "troubleshoot", "faq"
    language: str = "pt"  # document language code (defaults to Portuguese)
    version: str = "1.0"
    author: Optional[str] = None
    tags: List[str] = field(default_factory=list)  # technology tags extracted from the content
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))  # timezone-aware UTC ingestion timestamp
@dataclass
class DocumentChunk:
    """A chunk of a document ready for indexing."""

    id: str                                      # deterministic id: md5 of "filepath:chunk_index"
    content: str                                 # chunk text (stripped)
    embedding: Optional[List[float]] = None      # filled in by the pipeline just before indexing
    metadata: Optional[DocumentMetadata] = None  # metadata of the parent document
    chunk_index: int = 0                         # zero-based position of this chunk in the document
    total_chunks: int = 1                        # total chunks produced for the parent document
@dataclass
class IngestionResult:
    """Result of document ingestion."""

    success: bool             # True when at least one chunk was indexed
    filepath: str             # path of the file (or directory) this result refers to
    chunks_created: int = 0   # number of chunks actually indexed in Qdrant
    error: Optional[str] = None  # human-readable failure reason, None on success
class RAGIngestionPipeline:
"""
Pipeline for ingesting technical documentation into Qdrant.
Features:
- Process Markdown and plain text files
- Chunk documents for optimal retrieval
- Extract technology metadata from content
- Generate embeddings (placeholder for now)
- Index in Qdrant with tenant isolation
"""
# Technology detection patterns
TECH_PATTERNS = {
"linux": r"\b(linux|ubuntu|debian|centos|rhel|bash|shell|systemd)\b",
"windows": r"\b(windows|powershell|cmd|iis|active.?directory|ad)\b",
"docker": r"\b(docker|container|dockerfile|compose|kubernetes|k8s)\b",
"postgresql": r"\b(postgres(ql)?|pg_|psql|pgadmin)\b",
"mysql": r"\b(mysql|mariadb|mysqldump)\b",
"nginx": r"\b(nginx|reverse.?proxy|upstream)\b",
"apache": r"\b(apache|httpd|mod_)\b",
"zabbix": r"\b(zabbix|zbx|trigger|host.?group)\b",
"network": r"\b(network|firewall|iptables|routing|vlan|switch)\b",
"security": r"\b(security|ssl|tls|certificate|vulnerab|cve)\b",
"backup": r"\b(backup|restore|recovery|snapshot|rsync)\b",
}
def __init__(
self,
chunk_size: int = 500,
chunk_overlap: int = 50,
embedding_dim: int = 384
):
"""Initialize ingestion pipeline."""
self._chunk_size = chunk_size
self._chunk_overlap = chunk_overlap
self._embedding_dim = embedding_dim
self._qdrant = get_qdrant_client()
self._ollama = get_ollama_client()
def ingest_directory(
self,
directory: str,
tenant_id: str,
doc_type: str = "manual",
recursive: bool = True
) -> List[IngestionResult]:
"""
Ingest all documents from a directory.
Args:
directory: Path to directory
tenant_id: Tenant for isolation
doc_type: Type of documentation
recursive: Search subdirectories
Returns:
List of ingestion results
"""
results = []
path = Path(directory)
if not path.exists():
logger.error(f"Directory not found: {directory}")
return [IngestionResult(
success=False,
filepath=directory,
error="Directory not found"
)]
# Find all supported files
patterns = ["*.md", "*.txt", "*.rst"]
files = []
for pattern in patterns:
if recursive:
files.extend(path.rglob(pattern))
else:
files.extend(path.glob(pattern))
logger.info(f"Found {len(files)} documents in {directory}")
for filepath in files:
result = self.ingest_file(
filepath=str(filepath),
tenant_id=tenant_id,
doc_type=doc_type
)
results.append(result)
return results
def ingest_file(
self,
filepath: str,
tenant_id: str,
doc_type: str = "manual"
) -> IngestionResult:
"""
Ingest a single file.
Args:
filepath: Path to file
tenant_id: Tenant for isolation
doc_type: Type of documentation
Returns:
IngestionResult with status
"""
try:
path = Path(filepath)
if not path.exists():
return IngestionResult(
success=False,
filepath=filepath,
error="File not found"
)
# Read content
content = self._read_file(path)
if not content:
return IngestionResult(
success=False,
filepath=filepath,
error="Failed to read file"
)
# Sanitize content (security)
content = self._sanitize_content(content)
# Detect technology
technology = self._detect_technology(content)
# Extract tags from content
tags = self._extract_tags(content)
# Create metadata
metadata = DocumentMetadata(
filename=path.name,
filepath=str(path),
technology=technology,
tenant_id=tenant_id,
doc_type=doc_type,
tags=tags
)
# Chunk the document
chunks = self._chunk_content(content, metadata)
# Generate embeddings and index
indexed = 0
for chunk in chunks:
# Generate real embedding using Ollama
chunk.embedding = await self._generate_embedding(chunk.content)
if not chunk.embedding:
logger.warning(f"Failed to generate embedding for chunk {chunk.id}")
continue
# Index in Qdrant
success = self._qdrant.upsert_document(
doc_id=chunk.id,
content=chunk.content,
embedding=chunk.embedding,
tenant_id=tenant_id,
metadata={
"filename": metadata.filename,
"technology": metadata.technology,
"doc_type": metadata.doc_type,
"chunk_index": chunk.chunk_index,
"total_chunks": chunk.total_chunks,
"tags": metadata.tags
}
)
if success:
indexed += 1
logger.info(f"Indexed {indexed}/{len(chunks)} chunks from {path.name}")
return IngestionResult(
success=indexed > 0,
filepath=filepath,
chunks_created=indexed
)
except Exception as e:
logger.error(f"Ingestion failed for {filepath}: {e}")
return IngestionResult(
success=False,
filepath=filepath,
error=str(e)
)
def _read_file(self, path: Path) -> Optional[str]:
"""Read file content with encoding detection."""
encodings = ["utf-8", "latin-1", "cp1252"]
for encoding in encodings:
try:
return path.read_text(encoding=encoding)
except UnicodeDecodeError:
continue
return None
def _sanitize_content(self, content: str) -> str:
"""
Sanitize content for security.
Removes:
- Embedded scripts
- Base64 encoded content
- Suspicious patterns
"""
# Remove script tags
content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.I | re.S)
# Remove style tags
content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.I | re.S)
# Remove HTML comments (may contain hidden content)
content = re.sub(r'<!--.*?-->', '', content, flags=re.S)
# Remove large base64 blocks (potential hidden payloads)
content = re.sub(r'[A-Za-z0-9+/]{100,}={0,2}', '[BASE64_REMOVED]', content)
# Normalize whitespace
content = re.sub(r'\n{3,}', '\n\n', content)
return content.strip()
def _detect_technology(self, content: str) -> str:
"""Detect primary technology from content."""
content_lower = content.lower()
scores = {}
for tech, pattern in self.TECH_PATTERNS.items():
matches = len(re.findall(pattern, content_lower, re.I))
if matches > 0:
scores[tech] = matches
if scores:
return max(scores, key=scores.get)
return "general"
def _extract_tags(self, content: str) -> List[str]:
"""Extract relevant tags from content."""
tags = set()
# Check for each technology pattern
content_lower = content.lower()
for tech, pattern in self.TECH_PATTERNS.items():
if re.search(pattern, content_lower, re.I):
tags.add(tech)
return list(tags)[:10] # Limit tags
def _chunk_content(
self,
content: str,
metadata: DocumentMetadata
) -> List[DocumentChunk]:
"""Split content into chunks for indexing."""
chunks = []
# Split by paragraphs first
paragraphs = content.split('\n\n')
current_chunk = ""
chunk_index = 0
for para in paragraphs:
if len(current_chunk) + len(para) <= self._chunk_size:
current_chunk += para + "\n\n"
else:
if current_chunk.strip():
chunk_id = self._generate_chunk_id(
metadata.filepath,
chunk_index
)
chunks.append(DocumentChunk(
id=chunk_id,
content=current_chunk.strip(),
metadata=metadata,
chunk_index=chunk_index
))
chunk_index += 1
current_chunk = para + "\n\n"
# Add final chunk
if current_chunk.strip():
chunk_id = self._generate_chunk_id(metadata.filepath, chunk_index)
chunks.append(DocumentChunk(
id=chunk_id,
content=current_chunk.strip(),
metadata=metadata,
chunk_index=chunk_index
))
# Update total chunks
for chunk in chunks:
chunk.total_chunks = len(chunks)
return chunks
def _generate_chunk_id(self, filepath: str, chunk_index: int) -> str:
"""Generate unique ID for a chunk."""
content = f"{filepath}:{chunk_index}"
return hashlib.md5(content.encode()).hexdigest()
async def _generate_embedding(self, text: str) -> List[float]:
"""
Generate embedding for text using Ollama.
"""
embedding = await self._ollama.get_embeddings(text)
# If model dimension differs, we might need padding/truncating (or just trust the model)
# For now we assume the model returns correct DIM or we handle it downstream
return embedding
# Lazily-created module-level singleton.
_pipeline: Optional[RAGIngestionPipeline] = None


def get_rag_pipeline() -> RAGIngestionPipeline:
    """Return the shared RAG pipeline, creating it on first use."""
    global _pipeline
    if _pipeline is None:
        _pipeline = RAGIngestionPipeline()
    return _pipeline