diff --git a/requirements.txt b/requirements.txt
index 11d4d11..75ba4cf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,5 @@ google-generativeai
 # For local embeddings if needed
 sentence-transformers
 # Graph database for relationship mapping
-neo4j
\ No newline at end of file
+neo4j
+qdrant-client
diff --git a/scripts/init_athena_db.py b/scripts/init_athena_db.py
new file mode 100644
index 0000000..b9e01ba
--- /dev/null
+++ b/scripts/init_athena_db.py
@@ -0,0 +1,217 @@
+"""Initialize the Athena routing index in Qdrant from the agent catalog.
+
+Parses the markdown agent catalog, embeds each agent's description with a
+sentence-transformers model and upserts one point per agent into Qdrant.
+"""
+
+import os
+import re
+from typing import List, Dict
+import logging
+from dotenv import load_dotenv
+from qdrant_client import QdrantClient
+from qdrant_client.http import models
+from sentence_transformers import SentenceTransformer
+
+# Setup logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Load environment variables
+load_dotenv()
+
+# Configuration
+AGENT_CATALOG_PATH = os.path.join(os.path.dirname(__file__), '../docs/AGENT_CATALOG.md')
+QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
+QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
+COLLECTION_NAME = "routing_index"
+EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Defines the vector size as 384
+VECTOR_SIZE = 384
+
+# Bullet prefixes recognised inside each "### <agent name>" catalog section
+ROLE_PREFIX = "- **Papel:**"
+SPECIALTY_PREFIX = "- **Especialidade:**"
+CREW_PREFIX = "- **Crews:**"
+
+
+def read_agent_catalog(file_path: str) -> List[Dict]:
+    """Read the agent catalog and extract one record per agent.
+
+    Each agent is expected to be a ``### Agent Name`` section containing
+    ``- **Papel:**``, ``- **Especialidade:**`` and ``- **Crews:**`` bullets.
+
+    Args:
+        file_path: Path to the markdown catalog file.
+
+    Returns:
+        A list of dicts with ``name``, ``description`` and ``crew`` keys.
+        Sections missing a name, a description or a crew are skipped, and
+        an empty list is returned when the file does not exist.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+    except FileNotFoundError:
+        logger.error("Agent catalog not found at %s", file_path)
+        return []
+
+    # Anchor the split to line starts so "#### " sub-headings and inline
+    # "### " text do not open a bogus section. The first piece is whatever
+    # precedes the first agent heading, so it is dropped.
+    agent_blocks = re.split(r'^### ', content, flags=re.MULTILINE)[1:]
+
+    agents = []
+    for block in agent_blocks:
+        # Strip every line so indented bullets and CRLF endings still match.
+        lines = [line.strip() for line in block.strip().splitlines()]
+        name = lines[0] if lines else ""
+
+        description = ""
+        crew = ""
+
+        for line in lines:
+            # Slice off the whole prefix: splitting on the first ":" would
+            # leave the closing "**" of the bold marker glued to the value.
+            if line.startswith(ROLE_PREFIX):
+                role = line[len(ROLE_PREFIX):].strip()
+                description += f"Papel: {role}. "
+            elif line.startswith(SPECIALTY_PREFIX):
+                specialty = line[len(SPECIALTY_PREFIX):].strip()
+                description += f"Especialidade: {specialty}. "
+            elif line.startswith(CREW_PREFIX):
+                crew = line[len(CREW_PREFIX):].strip()
+
+        if name and description and crew:
+            agents.append({
+                "name": name,
+                "description": description,
+                "crew": crew,
+            })
+            logger.info("Found agent: %s (Crew: %s)", name, crew)
+        else:
+            logger.debug("Skipping catalog section %r: no description or crew", name)
+
+    return agents
+
+
+def get_embeddings(texts: List[str]) -> List[List[float]]:
+    """Generate one embedding vector per input text.
+
+    Args:
+        texts: Strings to embed with ``EMBEDDING_MODEL_NAME``.
+
+    Returns:
+        The embeddings as plain lists of floats, in the same order as
+        ``texts``. An empty input returns ``[]`` without loading the model.
+    """
+    if not texts:
+        return []
+
+    logger.info("Generating embeddings using model %s...", EMBEDDING_MODEL_NAME)
+    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
+    embeddings = model.encode(texts)
+    return embeddings.tolist()
+
+
+def init_qdrant_collection(client: QdrantClient, collection_name: str, vector_size: int) -> None:
+    """Create the Qdrant collection if it does not exist.
+
+    Args:
+        client: Connected Qdrant client.
+        collection_name: Name of the collection to look up or create.
+        vector_size: Dimension of the vectors the collection will store.
+    """
+    collections = client.get_collections().collections
+    exists = any(c.name == collection_name for c in collections)
+
+    if exists:
+        logger.info("Collection '%s' already exists.", collection_name)
+        return
+
+    logger.info("Creating collection '%s' with vector size %s...", collection_name, vector_size)
+    client.create_collection(
+        collection_name=collection_name,
+        vectors_config=models.VectorParams(
+            size=vector_size,
+            distance=models.Distance.COSINE,
+        ),
+    )
+
+
+def populate_collection(client: QdrantClient, collection_name: str, agents: List[Dict]) -> None:
+    """Embed the agent descriptions and upsert one point per agent.
+
+    Args:
+        client: Connected Qdrant client.
+        collection_name: Collection that receives the points.
+        agents: Records as returned by ``read_agent_catalog``.
+    """
+    if not agents:
+        logger.warning("No agents to index.")
+        return
+
+    descriptions = [agent["description"] for agent in agents]
+    embeddings = get_embeddings(descriptions)
+
+    # The crew string is stored exactly as written in the catalog; an agent
+    # that lists several crews keeps them in a single "target_crew" value.
+    # Point ids are the 1-based position of the agent in the catalog.
+    points = [
+        models.PointStruct(
+            id=point_id,
+            vector=vector,
+            payload={
+                "agent_name": agent["name"],
+                "target_crew": agent["crew"],
+                "description": agent["description"],
+            },
+        )
+        for point_id, (agent, vector) in enumerate(zip(agents, embeddings), start=1)
+    ]
+
+    logger.info("Upserting %d points into '%s'...", len(points), collection_name)
+    client.upsert(
+        collection_name=collection_name,
+        points=points,
+    )
+    logger.info("Indexing complete.")
+
+
+def main() -> int:
+    """Run the initialization: read the catalog, connect, create, populate.
+
+    Returns:
+        0 on success; 1 when no agents could be extracted or Qdrant could
+        not be reached, so the failure is visible in the exit status.
+    """
+    logger.info("Starting Athena DB Initialization...")
+
+    # 1. Read Catalog
+    agents = read_agent_catalog(AGENT_CATALOG_PATH)
+    if not agents:
+        logger.error("Failed to extract agents from catalog.")
+        return 1
+
+    # 2. Connect to Qdrant
+    try:
+        client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
+        # Issue a cheap request so an unreachable server is reported here
+        # rather than in a later step.
+        client.get_collections()
+    except Exception as e:  # top-level boundary: log and report failure
+        logger.error("Failed to connect to Qdrant: %s", e)
+        return 1
+    logger.info("Connected to Qdrant at %s:%s", QDRANT_HOST, QDRANT_PORT)
+
+    # 3. Init Collection
+    init_qdrant_collection(client, COLLECTION_NAME, VECTOR_SIZE)
+
+    # 4. Populate
+    populate_collection(client, COLLECTION_NAME, agents)
+
+    logger.info("Athena DB Initialization finished successfully.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())