refactored project to use poetry

2025-05-20 22:19:30 -05:00
parent 3beb160c22
commit b3da128396
20 changed files with 5113 additions and 23 deletions

@@ -0,0 +1,139 @@
import os
from typing import Tuple
from uuid import uuid4

from dotenv import load_dotenv
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings

load_dotenv()  # Load environment variables from a local .env file
required_env_vars = [
    "AZURE_DEPLOYMENT",
    "AZURE_OPENAI_API_VERSION",
    "AZURE_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "VECTOR_STORE_ADDRESS",
    "VECTOR_STORE_PASSWORD",
    "INDEX_NAME",
    "RETRY_TOTAL",
]
missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
if missing_vars:
    raise ValueError(
        f"Missing required environment variables: {', '.join(missing_vars)}"
    )
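
# A sample .env sketch with hypothetical placeholder values (deployment name,
# endpoints, keys, and index name below are illustrative, not real; substitute
# the values from your own Azure OpenAI and Azure AI Search resources):
#
#   AZURE_DEPLOYMENT=text-embedding-ada-002
#   AZURE_OPENAI_API_VERSION=2024-02-01
#   AZURE_ENDPOINT=https://<your-openai-resource>.openai.azure.com/
#   AZURE_OPENAI_API_KEY=<your-azure-openai-key>
#   VECTOR_STORE_ADDRESS=https://<your-search-service>.search.windows.net
#   VECTOR_STORE_PASSWORD=<your-search-admin-key>
#   INDEX_NAME=langchain-vector-demo
#   RETRY_TOTAL=4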
# Use AzureOpenAIEmbeddings with an Azure account
embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.getenv("AZURE_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

# Additional properties for the Azure client can be configured; see
# https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md#configurations
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=os.getenv("VECTOR_STORE_ADDRESS"),
    azure_search_key=os.getenv("VECTOR_STORE_PASSWORD"),
    index_name=os.getenv("INDEX_NAME"),
    embedding_function=embeddings.embed_query,
    # Configure max retries for the Azure client (retry_total expects an int)
    additional_search_client_options={"retry_total": int(os.getenv("RETRY_TOTAL"))},
)

def get_document_id(document):
    """
    Get the document ID from the document object.
    """
    if hasattr(document, "metadata") and "id" in document.metadata:
        return document.metadata["id"]
    elif hasattr(document, "id"):
        return document.id
    else:
        raise ValueError("Document does not have a valid ID.")

def delete_all_documents():
    """
    Delete all documents from the AzureSearch vector store.
    """
    try:
        while True:
            # Fetch a batch of documents and delete it, until the index is empty
            docs_to_delete = retrieve("", 10)
            if not docs_to_delete:
                break
            vector_store.delete([get_document_id(doc) for doc in docs_to_delete])
        print("All documents deleted successfully.")
    except Exception as e:
        print(f"Error deleting documents: {str(e)}")

def add_documents(documents):
    # uuids = [str(uuid4()) for _ in range(len(documents))]
    try:
        vector_store.add_documents(documents)
    except Exception as e:
        print(f"Error adding document to vector store: {str(e)}")

def retrieve(query_text, n_results=1):
    # Perform a similarity search against the Azure Search index
    docs = vector_store.similarity_search(
        query=query_text,
        k=n_results,
        search_type="similarity",
    )
    return docs

# def add_document_to_vector_store(document):
#     """
#     Add a document to the AzureSearch vector store.
#
#     Args:
#         vector_store: The initialized AzureSearch vector store instance.
#         document: A dictionary or object representing the document to be added.
#             Example format:
#             {
#                 "id": "unique_document_id",
#                 "content": "The text content of the document",
#                 "metadata": {
#                     "source": "source_url",
#                     "created": "2025-03-04T14:14:40.421666",
#                     "modified": "2025-03-04T14:14:40.421666"
#                 }
#             }
#     """
#     try:
#         # Add the document to the vector store
#         vector_store.add_documents([document])
#         print(f"Document with ID {document['id']} added successfully.")
#     except Exception as e:
#         print(f"Error adding document to vector store: {str(e)}")

# add_document_to_vector_store("https://api.python.langchain.com/en/latest/langchain_api_reference.html", None)

# Example document to add
# doc = Document(
#     page_content="This is the content of the document. For testing IVA demo integration",
#     metadata={
#         "source": "https://example.com/source",
#         "created": "2025-03-04T14:14:40.421666",
#         "modified": "2025-03-04T14:14:40.421666"
#     }
# )
# Add the document to the vector store
# add_document_to_vector_store(doc)
# result = retrieve("iva", 1)
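
# A minimal usage sketch for this module (hypothetical example content; assumes
# langchain_core's Document class and a reachable Azure AI Search index):
#
# from langchain_core.documents import Document
#
# sample = Document(
#     page_content="Poetry manages Python dependencies via pyproject.toml.",
#     metadata={"id": "doc-001", "source": "https://example.com/poetry-notes"},
# )
# add_documents([sample])
# for hit in retrieve("How are project dependencies managed?", n_results=2):
#     print(hit.page_content)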

@@ -0,0 +1,55 @@
from typing import Tuple
from uuid import uuid4

import chromadb
from chromadb.api.types import Document, Metadata, OneOrMany
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# from chromadb.utils.embedding_functions.ollama_embedding_function import (
#     OllamaEmbeddingFunction,
# )

# Define a custom embedding function for ChromaDB using Ollama
class ChromaDBEmbeddingFunction:
    """
    Custom embedding function for ChromaDB using embeddings from Ollama.
    """

    def __init__(self, langchain_embeddings):
        self.langchain_embeddings = langchain_embeddings

    def __call__(self, input):
        # Ensure the input is in list format before embedding
        if isinstance(input, str):
            input = [input]
        return self.langchain_embeddings.embed_documents(input)

# Initialize the embedding function with Ollama embeddings
embedding = ChromaDBEmbeddingFunction(
    OllamaEmbeddings(
        model="nomic-embed-text",
        base_url="http://localhost:11434",  # Adjust the base URL to match your Ollama server
    )
)

persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection(
    name="collection_name",
    metadata={"description": "A collection for RAG with Ollama - Demo1"},
    embedding_function=embedding,  # Use the custom embedding function
)

def add_documents(documents: Tuple[OneOrMany[Document], OneOrMany[Metadata]]):
    docs, metas = documents
    uuids = [str(uuid4()) for _ in range(len(docs))]
    collection.add(documents=docs, ids=uuids, metadatas=metas)


def retrieve(query_text, n_results=1):
    # return vector_store.similarity_search(query, k=3)
    results = collection.query(query_texts=[query_text], n_results=n_results)
    return results["documents"], results["metadatas"]
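
# A minimal usage sketch for this module (hypothetical strings; assumes the
# Ollama server configured above is running and serving nomic-embed-text):
#
# add_documents((
#     ["Poetry pins dependencies in pyproject.toml and poetry.lock."],
#     [{"source": "https://example.com/poetry-notes"}],
# ))
# docs, metas = retrieve("How are dependencies pinned?", n_results=1)
# print(docs, metas)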