Refactored and moved all keys and configuration into .env files and provided samples
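The sample .env files mentioned in the commit message are not part of the diff shown below. Based on the os.getenv calls this change introduces, a sample would need to cover roughly the following variables; the values here are placeholders taken from the hard-coded defaults being removed, not the shipped sample:

# Firecrawl crawler configuration
FIRECRAWL_API_KEY=changeme
FIRECRAWL_API_URL=http://localhost:3002
FIRECRAWL_MODE=crawl
FIRECRAWL_PARAMS={"limit": 100, "include_paths": ["/.*"], "ignore_sitemap": true, "poll_interval": 5}

# Target site and metadata tag for the crawl
CRAWLER_COMPANY_URL=https://www.example.com
CRAWLER_COMPANY_NAME=Example

# Azure OpenAI / Azure AI Search (additional variables may exist outside this diff)
AZURE_OPENAI_API_KEY=changeme
VECTOR_STORE_ADDRESS=https://<search-service>.search.windows.net
RETRY_TOTAL=4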
@@ -1,17 +1,23 @@
import os
from rag_system.loaders.pdf_loader import load_pdf
from rag_system.loaders.web_loader import load_web_crawl
from rag_system.vectordb.azure_search import add_documents

from dotenv import load_dotenv

load_dotenv()  # take environment variables


def main():
    print("[1/2] Splitting and processing documents...")
    pdf_documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
    # web_documents = load_web_crawl(["https://excalibur.mgmresorts.com/en.html"])
    # web_documents = load_web_crawl(["https://www.verint.com"])
    # web_documents = load_web_crawl("https://firecrawl.dev")
    # pdf_documents = load_pdf("data/verint-responsible-ethical-ai.pdf")

    documents = load_web_crawl(os.getenv("CRAWLER_COMPANY_URL"))
    for doc in documents:
        doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")
    print("[2/2] Generating and storing embeddings...")
    add_documents(pdf_documents)
    # add_documents(web_documents)
    # add_documents(pdf_documents)
    add_documents(documents)
    print("Embeddings stored. You can now run the Streamlit app with:\n")
    print("    streamlit run rag_system/app/streamlit_app.py")
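A note on the new entry point: os.getenv returns None when a variable is absent, so a missing .env entry only surfaces later, when the crawl or the metadata tagging misbehaves. A small guard along these lines (not part of this commit, purely a sketch covering the variables main() reads) would fail fast instead:

import os

# Hypothetical fail-fast check; only covers the variables the new main() reads.
REQUIRED_VARS = ("CRAWLER_COMPANY_URL", "CRAWLER_COMPANY_NAME")
missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")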
@@ -1,24 +1,39 @@
import json
import os
import logging

from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rag_system.loaders.firecrawl import FireCrawlLoader

from dotenv import load_dotenv

load_dotenv()  # take environment variables

# Configure logging
logging.basicConfig(level=logging.INFO)

firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
firecrawl_api_url = os.getenv("FIRECRAWL_API_URL")
firecrawl_mode = os.getenv("FIRECRAWL_MODE")
firecrawl_params = os.getenv("FIRECRAWL_PARAMS")
if firecrawl_params:
    firecrawl_params = json.loads(firecrawl_params)

logging.info(f"web_loader firecrawl_api_url: {firecrawl_api_url}")
logging.info(f"web_loader firecrawl_mode: {firecrawl_mode}")
logging.info(f"web_loader firecrawl_params: {firecrawl_params}")


def load_web_crawl(url):

    documents = []
    metadatas = []
    logging.info(f"load_web_crawl url: {url}")

    loader = FireCrawlLoader(
        url=url,
        api_key="changeme",
        api_url="http://localhost:3002",
        mode="crawl",
        params={
            "limit": 100,
            "include_paths": ["/.*"],
            "ignore_sitemap": True,
            "poll_interval": 5,
        },
        api_key=firecrawl_api_key,
        api_url=firecrawl_api_url,
        mode=firecrawl_mode,
        params=firecrawl_params,
    )
    docs = []
    docs_lazy = loader.load()
@@ -35,10 +50,4 @@ def load_web_crawl(url):
        chunk_size=250, chunk_overlap=0
    )
    # Split the documents into chunks
    splits = text_splitter.split_documents(docs)

    for split in splits:
        documents.append(split.page_content)
        metadatas.append(split.metadata)

    return (documents, metadatas)
    return text_splitter.split_documents(docs)
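Because FIRECRAWL_PARAMS is now read as a plain string and passed through json.loads, the value in the .env file must be valid JSON rather than a Python dict literal; in particular, the old ignore_sitemap=True becomes lowercase true. A quick sketch of the equivalent value and how the loader would parse it:

import json

# JSON equivalent of the crawl parameters that used to be hard-coded above.
# Note the lowercase "true": json.loads() rejects Python's "True".
params_env_value = '{"limit": 100, "include_paths": ["/.*"], "ignore_sitemap": true, "poll_interval": 5}'
print(json.loads(params_env_value))
# {'limit': 100, 'include_paths': ['/.*'], 'ignore_sitemap': True, 'poll_interval': 5}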
@@ -1,7 +1,13 @@
import os

from typing import Tuple
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_community.vectorstores.azuresearch import (
    AzureSearch,
    FIELDS_CONTENT,
    FIELDS_CONTENT_VECTOR,
    FIELDS_ID,
    FIELDS_METADATA,
)
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from dotenv import load_dotenv
from uuid import uuid4
@@ -32,6 +38,50 @@ embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

try:
    from azure.search.documents.indexes.models import (
        SearchableField,
        SearchField,
        SearchFieldDataType,
        SimpleField,
    )
except ImportError as e:
    raise ImportError(
        "Unable to import azure.search.documents. Please install with "
        "`pip install -U azure-search-documents`."
    ) from e

fields = [
    SimpleField(
        name=FIELDS_ID,
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SimpleField(
        name="company",
        type=SearchFieldDataType.String,
        key=False,
        filterable=True,
    ),
    SearchableField(
        name=FIELDS_CONTENT,
        type=SearchFieldDataType.String,
    ),
    SearchField(
        name=FIELDS_CONTENT_VECTOR,
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # Probe the embedding model once to size the vector field correctly
        vector_search_dimensions=len(embeddings.embed_query("Text")),
        vector_search_profile_name="myHnswProfile",
    ),
    SearchableField(
        name=FIELDS_METADATA,
        type=SearchFieldDataType.String,
    ),
]


# Specify additional properties for the Azure client such as the following https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/core/azure-core/README.md#configurations
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=os.getenv("VECTOR_STORE_ADDRESS"),
@@ -40,6 +90,7 @@ vector_store: AzureSearch = AzureSearch(
    embedding_function=embeddings.embed_query,
    # Configure max retries for the Azure client
    additional_search_client_options={"retry_total": os.getenv("RETRY_TOTAL")},
    fields=fields,
)
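The extra filterable "company" field is what main() populates via doc.metadata["company"]; langchain's AzureSearch copies metadata keys that match declared index fields into those fields. Assuming that behaviour, retrieval can then be scoped per company with an OData filter, roughly as follows (a sketch, not code from this commit; the query text and company value are illustrative, and the filters keyword is assumed to be supported by the AzureSearch wrapper):

# Hypothetical usage sketch: scope retrieval to one company's documents.
results = vector_store.similarity_search(
    "What does the responsible AI policy cover?",
    k=4,
    filters="company eq 'Verint'",
)
for doc in results:
    print(doc.metadata.get("company"), doc.page_content[:80])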