refactored project to use poetry

2025-05-20 22:19:30 -05:00
parent 3beb160c22
commit b3da128396
20 changed files with 5113 additions and 23 deletions

@@ -0,0 +1,139 @@
import os
from typing import Tuple
from uuid import uuid4

from dotenv import load_dotenv
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings

load_dotenv()  # Load environment variables from a local .env file
required_env_vars = [
    "AZURE_DEPLOYMENT",
    "AZURE_OPENAI_API_VERSION",
    "AZURE_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "VECTOR_STORE_ADDRESS",
    "VECTOR_STORE_PASSWORD",
    "INDEX_NAME",
    "RETRY_TOTAL",
]
missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
if missing_vars:
    raise ValueError(
        f"Missing required environment variables: {', '.join(missing_vars)}"
    )
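
# A sample .env sketch with hypothetical placeholder values (deployment name,
# endpoints, keys, and index name below are illustrative, not real; substitute
# the values from your own Azure OpenAI and Azure AI Search resources):
#
#   AZURE_DEPLOYMENT=text-embedding-ada-002
#   AZURE_OPENAI_API_VERSION=2024-02-01
#   AZURE_ENDPOINT=https://<your-openai-resource>.openai.azure.com/
#   AZURE_OPENAI_API_KEY=<your-azure-openai-key>
#   VECTOR_STORE_ADDRESS=https://<your-search-service>.search.windows.net
#   VECTOR_STORE_PASSWORD=<your-search-admin-key>
#   INDEX_NAME=langchain-vector-demo
#   RETRY_TOTAL=4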
# Use AzureOpenAIEmbeddings with an Azure account
embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.getenv("AZURE_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

# Additional properties for the Azure client can be configured; see
# https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md#configurations
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=os.getenv("VECTOR_STORE_ADDRESS"),
    azure_search_key=os.getenv("VECTOR_STORE_PASSWORD"),
    index_name=os.getenv("INDEX_NAME"),
    embedding_function=embeddings.embed_query,
    # Configure max retries for the Azure client (retry_total expects an int)
    additional_search_client_options={"retry_total": int(os.getenv("RETRY_TOTAL"))},
)

def get_document_id(document):
    """
    Get the document ID from the document object.
    """
    if hasattr(document, "metadata") and "id" in document.metadata:
        return document.metadata["id"]
    elif hasattr(document, "id"):
        return document.id
    else:
        raise ValueError("Document does not have a valid ID.")

def delete_all_documents():
    """
    Delete all documents from the AzureSearch vector store.
    """
    try:
        while True:
            # Fetch a batch of documents and delete it, until the index is empty
            docs_to_delete = retrieve("", 10)
            if not docs_to_delete:
                break
            vector_store.delete([get_document_id(doc) for doc in docs_to_delete])
        print("All documents deleted successfully.")
    except Exception as e:
        print(f"Error deleting documents: {str(e)}")

def add_documents(documents):
    # uuids = [str(uuid4()) for _ in range(len(documents))]
    try:
        vector_store.add_documents(documents)
    except Exception as e:
        print(f"Error adding document to vector store: {str(e)}")

def retrieve(query_text, n_results=1):
    # Perform a similarity search against the Azure Search index
    docs = vector_store.similarity_search(
        query=query_text,
        k=n_results,
        search_type="similarity",
    )
    return docs

# def add_document_to_vector_store(document):
#     """
#     Add a document to the AzureSearch vector store.
#
#     Args:
#         vector_store: The initialized AzureSearch vector store instance.
#         document: A dictionary or object representing the document to be added.
#             Example format:
#             {
#                 "id": "unique_document_id",
#                 "content": "The text content of the document",
#                 "metadata": {
#                     "source": "source_url",
#                     "created": "2025-03-04T14:14:40.421666",
#                     "modified": "2025-03-04T14:14:40.421666"
#                 }
#             }
#     """
#     try:
#         # Add the document to the vector store
#         vector_store.add_documents([document])
#         print(f"Document with ID {document['id']} added successfully.")
#     except Exception as e:
#         print(f"Error adding document to vector store: {str(e)}")

# add_document_to_vector_store("https://api.python.langchain.com/en/latest/langchain_api_reference.html", None)

# Example document to add
# doc = Document(
#     page_content="This is the content of the document. For testing IVA demo integration",
#     metadata={
#         "source": "https://example.com/source",
#         "created": "2025-03-04T14:14:40.421666",
#         "modified": "2025-03-04T14:14:40.421666"
#     }
# )
# Add the document to the vector store
# add_document_to_vector_store(doc)
# result = retrieve("iva", 1)
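
# A minimal usage sketch for this module (hypothetical example content; assumes
# langchain_core's Document class and a reachable Azure AI Search index):
#
# from langchain_core.documents import Document
#
# sample = Document(
#     page_content="Poetry manages Python dependencies via pyproject.toml.",
#     metadata={"id": "doc-001", "source": "https://example.com/poetry-notes"},
# )
# add_documents([sample])
# for hit in retrieve("How are project dependencies managed?", n_results=2):
#     print(hit.page_content)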

@@ -0,0 +1,55 @@
from typing import Tuple
from uuid import uuid4

import chromadb
from chromadb.api.types import Document, Metadata, OneOrMany
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# from chromadb.utils.embedding_functions.ollama_embedding_function import (
#     OllamaEmbeddingFunction,
# )

# Define a custom embedding function for ChromaDB using Ollama
class ChromaDBEmbeddingFunction:
    """
    Custom embedding function for ChromaDB using embeddings from Ollama.
    """

    def __init__(self, langchain_embeddings):
        self.langchain_embeddings = langchain_embeddings

    def __call__(self, input):
        # Ensure the input is in list format before embedding
        if isinstance(input, str):
            input = [input]
        return self.langchain_embeddings.embed_documents(input)

# Initialize the embedding function with Ollama embeddings
embedding = ChromaDBEmbeddingFunction(
    OllamaEmbeddings(
        model="nomic-embed-text",
        base_url="http://localhost:11434",  # Adjust the base URL to match your Ollama server
    )
)

persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection(
    name="collection_name",
    metadata={"description": "A collection for RAG with Ollama - Demo1"},
    embedding_function=embedding,  # Use the custom embedding function
)

def add_documents(documents: Tuple[OneOrMany[Document], OneOrMany[Metadata]]):
    docs, metas = documents
    uuids = [str(uuid4()) for _ in range(len(docs))]
    collection.add(documents=docs, ids=uuids, metadatas=metas)


def retrieve(query_text, n_results=1):
    # return vector_store.similarity_search(query, k=3)
    results = collection.query(query_texts=[query_text], n_results=n_results)
    return results["documents"], results["metadatas"]
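
# A minimal usage sketch for this module (hypothetical strings; assumes the
# Ollama server configured above is running and serving nomic-embed-text):
#
# add_documents((
#     ["Poetry pins dependencies in pyproject.toml and poetry.lock."],
#     [{"source": "https://example.com/poetry-notes"}],
# ))
# docs, metas = retrieve("How are dependencies pinned?", n_results=1)
# print(docs, metas)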