Refactored and moved all keys and configuration into .env files and provided samples

2025-05-24 12:33:40 -05:00
parent 4d62015470
commit c381b0434a
9 changed files with 330 additions and 215 deletions
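
The hunks below replace hard-coded keys and crawl settings with os.getenv lookups. For reference, a minimal sketch of what the accompanying .env sample might contain, assembled only from the variable names and former hard-coded values visible in this diff (the Azure endpoint and the RETRY_TOTAL value are placeholder assumptions):

    # Crawl target for the ingestion script
    CRAWLER_COMPANY_URL=https://www.verint.com
    CRAWLER_COMPANY_NAME=Verint

    # Firecrawl connection (values mirror the removed hard-coded defaults)
    FIRECRAWL_API_KEY=changeme
    FIRECRAWL_API_URL=http://localhost:3002
    FIRECRAWL_MODE=crawl
    FIRECRAWL_PARAMS='{"limit": 100, "include_paths": ["/.*"], "ignore_sitemap": true, "poll_interval": 5}'

    # Azure OpenAI and Azure AI Search (endpoint is a placeholder)
    AZURE_OPENAI_API_KEY=changeme
    VECTOR_STORE_ADDRESS=https://<your-service>.search.windows.net
    # Assumed default; must parse as an integer
    RETRY_TOTAL=4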

View File

@@ -1,17 +1,23 @@
+import os
 from rag_system.loaders.pdf_loader import load_pdf
 from rag_system.loaders.web_loader import load_web_crawl
 from rag_system.vectordb.azure_search import add_documents
+from dotenv import load_dotenv
+
+load_dotenv()  # take environment variables from .env
 
 def main():
     print("[1/2] Splitting and processing documents...")
-    pdf_documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
-    # web_documents = load_web_crawl(["https://excalibur.mgmresorts.com/en.html"])
-    # web_documents = load_web_crawl(["https://www.verint.com"])
-    # web_documents = load_web_crawl("https://firecrawl.dev")
+    # pdf_documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
+    documents = load_web_crawl(os.getenv("CRAWLER_COMPANY_URL"))
+    for doc in documents:
+        doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")
     print("[2/2] Generating and storing embeddings...")
-    add_documents(pdf_documents)
-    # add_documents(web_documents)
+    # add_documents(pdf_documents)
+    add_documents(documents)
     print("Embeddings stored. You can now run the Streamlit app with:\n")
     print("    streamlit run rag_system/app/streamlit_app.py")

rag_system/loaders/web_loader.py

@@ -1,24 +1,39 @@
+import json
+import os
 import logging
 from langchain_community.document_loaders import WebBaseLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from rag_system.loaders.firecrawl import FireCrawlLoader
+from dotenv import load_dotenv
+
+load_dotenv()  # take environment variables from .env
+
 # Configure logging
 logging.basicConfig(level=logging.INFO)
+
+firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
+firecrawl_api_url = os.getenv("FIRECRAWL_API_URL")
+firecrawl_mode = os.getenv("FIRECRAWL_MODE")
+firecrawl_params = os.getenv("FIRECRAWL_PARAMS")
+if firecrawl_params:
+    firecrawl_params = json.loads(firecrawl_params)
+
+logging.info(f"web_loader firecrawl_api_url: {firecrawl_api_url}")
+logging.info(f"web_loader firecrawl_mode: {firecrawl_mode}")
+logging.info(f"web_loader firecrawl_params: {firecrawl_params}")
+
 def load_web_crawl(url):
-    documents = []
-    metadatas = []
     logging.info(f"load_web_crawl url: {url}")
     loader = FireCrawlLoader(
         url=url,
-        api_key="changeme",
-        api_url="http://localhost:3002",
-        mode="crawl",
-        params={
-            "limit": 100,
-            "include_paths": ["/.*"],
-            "ignore_sitemap": True,
-            "poll_interval": 5,
-        },
+        api_key=firecrawl_api_key,
+        api_url=firecrawl_api_url,
+        mode=firecrawl_mode,
+        params=firecrawl_params,
     )
     docs = []
     docs_lazy = loader.load()

@@ -35,10 +50,4 @@ def load_web_crawl(url):
         chunk_size=250, chunk_overlap=0
     )
     # Split the documents into chunks
-    splits = text_splitter.split_documents(docs)
-    for split in splits:
-        documents.append(split.page_content)
-        metadatas.append(split.metadata)
-    return (documents, metadatas)
+    return text_splitter.split_documents(docs)
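
Packing the crawl options into a single FIRECRAWL_PARAMS variable works because json.loads turns the stored string back into the dict the loader used to hard-code. A quick round-trip sketch (the value mirrors the literal removed above; note the JSON spelling, lowercase true rather than Python's True):

    import json
    import os

    os.environ["FIRECRAWL_PARAMS"] = (
        '{"limit": 100, "include_paths": ["/.*"], '
        '"ignore_sitemap": true, "poll_interval": 5}'
    )

    params = json.loads(os.environ["FIRECRAWL_PARAMS"])
    assert params["limit"] == 100
    assert params["ignore_sitemap"] is True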

rag_system/vectordb/azure_search.py

@@ -1,7 +1,13 @@
 import os
 from typing import Tuple
-from langchain_community.vectorstores.azuresearch import AzureSearch
+from langchain_community.vectorstores.azuresearch import (
+    AzureSearch,
+    FIELDS_CONTENT,
+    FIELDS_CONTENT_VECTOR,
+    FIELDS_ID,
+    FIELDS_METADATA,
+)
 from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
 from dotenv import load_dotenv
 from uuid import uuid4
@@ -32,6 +38,50 @@ embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
     api_key=os.getenv("AZURE_OPENAI_API_KEY"),
 )
+
+try:
+    from azure.search.documents.indexes.models import (
+        SearchableField,
+        SearchField,
+        SearchFieldDataType,
+        SimpleField,
+    )
+except ImportError as e:
+    raise ImportError(
+        "Unable to import azure.search.documents. Please install with "
+        "`pip install -U azure-search-documents`."
+    ) from e
+
+fields = [
+    SimpleField(
+        name=FIELDS_ID,
+        type=SearchFieldDataType.String,
+        key=True,
+        filterable=True,
+    ),
+    # New filterable field so queries can be scoped to one company
+    SimpleField(
+        name="company",
+        type=SearchFieldDataType.String,
+        key=False,
+        filterable=True,
+    ),
+    SearchableField(
+        name=FIELDS_CONTENT,
+        type=SearchFieldDataType.String,
+    ),
+    SearchField(
+        name=FIELDS_CONTENT_VECTOR,
+        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+        searchable=True,
+        # Dimensions must match the embedding model's output vector
+        vector_search_dimensions=len(embeddings.embed_query("Text")),
+        vector_search_profile_name="myHnswProfile",
+    ),
+    SearchableField(
+        name=FIELDS_METADATA,
+        type=SearchFieldDataType.String,
+    ),
+]
+
 # Specify additional properties for the Azure client such as the following https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md#configurations
 vector_store: AzureSearch = AzureSearch(
     azure_search_endpoint=os.getenv("VECTOR_STORE_ADDRESS"),
@@ -40,6 +90,7 @@ vector_store: AzureSearch = AzureSearch(
     embedding_function=embeddings.embed_query,
     # Configure max retries for the Azure client (retry_total expects an int)
     additional_search_client_options={"retry_total": int(os.getenv("RETRY_TOTAL"))},
+    fields=fields,
 )
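
The filterable company field is the counterpart of the metadata tag set in the ingestion script, and it lets retrieval be scoped per company. A hedged usage sketch against the vector_store constructed above (filters is the OData filter string that LangChain's AzureSearch forwards to Azure AI Search; the query text and company value are illustrative):

    results = vector_store.similarity_search(
        "What is the company's position on responsible AI?",
        k=3,
        filters="company eq 'Verint'",  # matches the tag set at ingestion time
    )
    for doc in results:
        print(doc.metadata.get("company"), doc.page_content[:80])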