"""
|
|
RAG Ingestion Pipeline for Arthur.
|
|
|
|
Processes Markdown and PDF documents, extracts text,
|
|
generates embeddings and indexes in Qdrant.
|
|
"""
|
|
|
|
import asyncio
import hashlib
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List

from src.clients import get_qdrant_client, get_ollama_client
logger = logging.getLogger("ArthurRAG")
|
|
|
|
|
|
@dataclass
|
|
class DocumentMetadata:
|
|
"""Metadata for an ingested document."""
|
|
filename: str
|
|
filepath: str
|
|
technology: str
|
|
tenant_id: str
|
|
doc_type: str # "manual", "procedure", "troubleshoot", "faq"
|
|
language: str = "pt"
|
|
version: str = "1.0"
|
|
author: Optional[str] = None
|
|
tags: List[str] = field(default_factory=list)
|
|
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
|
|
|
|
|
@dataclass
|
|
class DocumentChunk:
|
|
"""A chunk of a document ready for indexing."""
|
|
id: str
|
|
content: str
|
|
embedding: Optional[List[float]] = None
|
|
metadata: Optional[DocumentMetadata] = None
|
|
chunk_index: int = 0
|
|
total_chunks: int = 1
|
|
|
|
|
|
@dataclass
|
|
class IngestionResult:
|
|
"""Result of document ingestion."""
|
|
success: bool
|
|
filepath: str
|
|
chunks_created: int = 0
|
|
error: Optional[str] = None
|
|
|
|
|
|
class RAGIngestionPipeline:
|
|
"""
|
|
Pipeline for ingesting technical documentation into Qdrant.
|
|
|
|
Features:
|
|
- Process Markdown and plain text files
|
|
- Chunk documents for optimal retrieval
|
|
- Extract technology metadata from content
|
|
- Generate embeddings (placeholder for now)
|
|
- Index in Qdrant with tenant isolation
|
|
"""
|
|
|
|
# Technology detection patterns
|
|
TECH_PATTERNS = {
|
|
"linux": r"\b(linux|ubuntu|debian|centos|rhel|bash|shell|systemd)\b",
|
|
"windows": r"\b(windows|powershell|cmd|iis|active.?directory|ad)\b",
|
|
"docker": r"\b(docker|container|dockerfile|compose|kubernetes|k8s)\b",
|
|
"postgresql": r"\b(postgres(ql)?|pg_|psql|pgadmin)\b",
|
|
"mysql": r"\b(mysql|mariadb|mysqldump)\b",
|
|
"nginx": r"\b(nginx|reverse.?proxy|upstream)\b",
|
|
"apache": r"\b(apache|httpd|mod_)\b",
|
|
"zabbix": r"\b(zabbix|zbx|trigger|host.?group)\b",
|
|
"network": r"\b(network|firewall|iptables|routing|vlan|switch)\b",
|
|
"security": r"\b(security|ssl|tls|certificate|vulnerab|cve)\b",
|
|
"backup": r"\b(backup|restore|recovery|snapshot|rsync)\b",
|
|
}
|
|
|
|
def __init__(
|
|
self,
|
|
chunk_size: int = 500,
|
|
chunk_overlap: int = 50,
|
|
embedding_dim: int = 384
|
|
):
|
|
"""Initialize ingestion pipeline."""
|
|
self._chunk_size = chunk_size
|
|
self._chunk_overlap = chunk_overlap
|
|
self._embedding_dim = embedding_dim
|
|
self._qdrant = get_qdrant_client()
|
|
self._ollama = get_ollama_client()
|
|
|
|
def ingest_directory(
|
|
self,
|
|
directory: str,
|
|
tenant_id: str,
|
|
doc_type: str = "manual",
|
|
recursive: bool = True
|
|
) -> List[IngestionResult]:
|
|
"""
|
|
Ingest all documents from a directory.
|
|
|
|
Args:
|
|
directory: Path to directory
|
|
tenant_id: Tenant for isolation
|
|
doc_type: Type of documentation
|
|
recursive: Search subdirectories
|
|
|
|
Returns:
|
|
List of ingestion results
|
|
"""
|
|
results = []
|
|
path = Path(directory)
|
|
|
|
if not path.exists():
|
|
logger.error(f"Directory not found: {directory}")
|
|
return [IngestionResult(
|
|
success=False,
|
|
filepath=directory,
|
|
error="Directory not found"
|
|
)]
|
|
|
|
# Find all supported files
|
|
patterns = ["*.md", "*.txt", "*.rst"]
|
|
files = []
|
|
|
|
for pattern in patterns:
|
|
if recursive:
|
|
files.extend(path.rglob(pattern))
|
|
else:
|
|
files.extend(path.glob(pattern))
|
|
|
|
logger.info(f"Found {len(files)} documents in {directory}")
|
|
|
|
for filepath in files:
|
|
result = self.ingest_file(
|
|
filepath=str(filepath),
|
|
tenant_id=tenant_id,
|
|
doc_type=doc_type
|
|
)
|
|
results.append(result)
|
|
|
|
return results
|
|
|
|
def ingest_file(
|
|
self,
|
|
filepath: str,
|
|
tenant_id: str,
|
|
doc_type: str = "manual"
|
|
) -> IngestionResult:
|
|
"""
|
|
Ingest a single file.
|
|
|
|
Args:
|
|
filepath: Path to file
|
|
tenant_id: Tenant for isolation
|
|
doc_type: Type of documentation
|
|
|
|
Returns:
|
|
IngestionResult with status
|
|
"""
|
|
try:
|
|
path = Path(filepath)
|
|
|
|
if not path.exists():
|
|
return IngestionResult(
|
|
success=False,
|
|
filepath=filepath,
|
|
error="File not found"
|
|
)
|
|
|
|
# Read content
|
|
content = self._read_file(path)
|
|
if not content:
|
|
return IngestionResult(
|
|
success=False,
|
|
filepath=filepath,
|
|
error="Failed to read file"
|
|
)
|
|
|
|
# Sanitize content (security)
|
|
content = self._sanitize_content(content)
|
|
|
|
# Detect technology
|
|
technology = self._detect_technology(content)
|
|
|
|
# Extract tags from content
|
|
tags = self._extract_tags(content)
|
|
|
|
# Create metadata
|
|
metadata = DocumentMetadata(
|
|
filename=path.name,
|
|
filepath=str(path),
|
|
technology=technology,
|
|
tenant_id=tenant_id,
|
|
doc_type=doc_type,
|
|
tags=tags
|
|
)
|
|
|
|
# Chunk the document
|
|
chunks = self._chunk_content(content, metadata)
|
|
|
|
# Generate embeddings and index
|
|
indexed = 0
|
|
for chunk in chunks:
|
|
# Generate real embedding using Ollama
|
|
chunk.embedding = await self._generate_embedding(chunk.content)
|
|
if not chunk.embedding:
|
|
logger.warning(f"Failed to generate embedding for chunk {chunk.id}")
|
|
continue
|
|
|
|
# Index in Qdrant
|
|
success = self._qdrant.upsert_document(
|
|
doc_id=chunk.id,
|
|
content=chunk.content,
|
|
embedding=chunk.embedding,
|
|
tenant_id=tenant_id,
|
|
metadata={
|
|
"filename": metadata.filename,
|
|
"technology": metadata.technology,
|
|
"doc_type": metadata.doc_type,
|
|
"chunk_index": chunk.chunk_index,
|
|
"total_chunks": chunk.total_chunks,
|
|
"tags": metadata.tags
|
|
}
|
|
)
|
|
|
|
if success:
|
|
indexed += 1
|
|
|
|
logger.info(f"Indexed {indexed}/{len(chunks)} chunks from {path.name}")
|
|
|
|
return IngestionResult(
|
|
success=indexed > 0,
|
|
filepath=filepath,
|
|
chunks_created=indexed
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Ingestion failed for {filepath}: {e}")
|
|
return IngestionResult(
|
|
success=False,
|
|
filepath=filepath,
|
|
error=str(e)
|
|
)
|
|
|
|
def _read_file(self, path: Path) -> Optional[str]:
|
|
"""Read file content with encoding detection."""
|
|
encodings = ["utf-8", "latin-1", "cp1252"]
|
|
|
|
for encoding in encodings:
|
|
try:
|
|
return path.read_text(encoding=encoding)
|
|
except UnicodeDecodeError:
|
|
continue
|
|
|
|
return None
|
|
|
|
def _sanitize_content(self, content: str) -> str:
|
|
"""
|
|
Sanitize content for security.
|
|
|
|
Removes:
|
|
- Embedded scripts
|
|
- Base64 encoded content
|
|
- Suspicious patterns
|
|
"""
|
|
# Remove script tags
|
|
content = re.sub(r'<script[^>]*>.*?</script>', '', content, flags=re.I | re.S)
|
|
|
|
# Remove style tags
|
|
content = re.sub(r'<style[^>]*>.*?</style>', '', content, flags=re.I | re.S)
|
|
|
|
# Remove HTML comments (may contain hidden content)
|
|
content = re.sub(r'<!--.*?-->', '', content, flags=re.S)
|
|
|
|
# Remove large base64 blocks (potential hidden payloads)
|
|
content = re.sub(r'[A-Za-z0-9+/]{100,}={0,2}', '[BASE64_REMOVED]', content)
|
|
|
|
# Normalize whitespace
|
|
content = re.sub(r'\n{3,}', '\n\n', content)
|
|
|
|
return content.strip()
|
|
|
|
def _detect_technology(self, content: str) -> str:
|
|
"""Detect primary technology from content."""
|
|
content_lower = content.lower()
|
|
|
|
scores = {}
|
|
for tech, pattern in self.TECH_PATTERNS.items():
|
|
matches = len(re.findall(pattern, content_lower, re.I))
|
|
if matches > 0:
|
|
scores[tech] = matches
|
|
|
|
if scores:
|
|
return max(scores, key=scores.get)
|
|
|
|
return "general"
|
|
|
|
def _extract_tags(self, content: str) -> List[str]:
|
|
"""Extract relevant tags from content."""
|
|
tags = set()
|
|
|
|
# Check for each technology pattern
|
|
content_lower = content.lower()
|
|
for tech, pattern in self.TECH_PATTERNS.items():
|
|
if re.search(pattern, content_lower, re.I):
|
|
tags.add(tech)
|
|
|
|
return list(tags)[:10] # Limit tags
|
|
|
|
def _chunk_content(
|
|
self,
|
|
content: str,
|
|
metadata: DocumentMetadata
|
|
) -> List[DocumentChunk]:
|
|
"""Split content into chunks for indexing."""
|
|
chunks = []
|
|
|
|
# Split by paragraphs first
|
|
paragraphs = content.split('\n\n')
|
|
|
|
current_chunk = ""
|
|
chunk_index = 0
|
|
|
|
for para in paragraphs:
|
|
if len(current_chunk) + len(para) <= self._chunk_size:
|
|
current_chunk += para + "\n\n"
|
|
else:
|
|
if current_chunk.strip():
|
|
chunk_id = self._generate_chunk_id(
|
|
metadata.filepath,
|
|
chunk_index
|
|
)
|
|
chunks.append(DocumentChunk(
|
|
id=chunk_id,
|
|
content=current_chunk.strip(),
|
|
metadata=metadata,
|
|
chunk_index=chunk_index
|
|
))
|
|
chunk_index += 1
|
|
|
|
current_chunk = para + "\n\n"
|
|
|
|
# Add final chunk
|
|
if current_chunk.strip():
|
|
chunk_id = self._generate_chunk_id(metadata.filepath, chunk_index)
|
|
chunks.append(DocumentChunk(
|
|
id=chunk_id,
|
|
content=current_chunk.strip(),
|
|
metadata=metadata,
|
|
chunk_index=chunk_index
|
|
))
|
|
|
|
# Update total chunks
|
|
for chunk in chunks:
|
|
chunk.total_chunks = len(chunks)
|
|
|
|
return chunks
|
|
|
|
def _generate_chunk_id(self, filepath: str, chunk_index: int) -> str:
|
|
"""Generate unique ID for a chunk."""
|
|
content = f"{filepath}:{chunk_index}"
|
|
return hashlib.md5(content.encode()).hexdigest()
|
|
|
|
async def _generate_embedding(self, text: str) -> List[float]:
|
|
"""
|
|
Generate embedding for text using Ollama.
|
|
"""
|
|
embedding = await self._ollama.get_embeddings(text)
|
|
|
|
# If model dimension differs, we might need padding/truncating (or just trust the model)
|
|
# For now we assume the model returns correct DIM or we handle it downstream
|
|
return embedding
|
|
|
|
|
|
# Singleton
|
|
_pipeline: Optional[RAGIngestionPipeline] = None
|
|
|
|
|
|
def get_rag_pipeline() -> RAGIngestionPipeline:
|
|
"""Get global RAG pipeline instance."""
|
|
global _pipeline
|
|
if _pipeline is None:
|
|
_pipeline = RAGIngestionPipeline()
|
|
return _pipeline
|