Crawler now supports loading multiple PDFs from a directory
This commit is contained in:
10
.gitignore
vendored
10
.gitignore
vendored
@@ -171,3 +171,13 @@ cython_debug/
|
|||||||
chroma/*
|
chroma/*
|
||||||
|
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
files/AdaptHealth-Nebulizer-Order-Checklist.pdf
|
||||||
|
files/AH_Financial_Policy_2024.pdf
|
||||||
|
files/AH_PatientGuide_122024.pdf
|
||||||
|
files/AH-RAD-Booklet-July-2023.pdf
|
||||||
|
files/CGM-Order-Checklist-AdaptHealth-REV1a.pdf
|
||||||
|
files/Maze-Booklet-AdaptHealth-August-2023.pdf
|
||||||
|
files/ResMed-AirSense-10-Elite-Auto-CPAP-Quick-Start-Guide.pdf
|
||||||
|
files/ResMed-AirSense-10-Elite-Auto-CPAP-User-Manual.pdf
|
||||||
|
files/Tandem Insulim Pump Instructions from YouTube Captions on AdaptHealth Website.pdf
|
||||||
|
files/Tandem Tslim X2 User Guide.pdf
|
||||||
|
|||||||
@@ -1,25 +1,68 @@
|
|||||||
# Crawler entry point: loads PDFs and/or crawled web pages, then stores
# their embeddings. Configuration is taken from environment variables
# (loaded from a .env file via python-dotenv).

# stdlib
import logging
import os

# third-party
import coloredlogs
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env

# Module-level logger; coloredlogs.install() attaches a colored handler
# to the root logger, so no basicConfig() call is needed.
logger: logging.Logger = logging.getLogger("crawler")
coloredlogs.install(level="DEBUG")
def main():
    """Ingest documents and store their embeddings in Azure Search.

    Documents come from two optional sources, controlled by environment
    variables:
      CRAWLER_COMPANY_PDF_DIR -- directory whose *.pdf files are loaded
      CRAWLER_COMPANY_URL     -- site root to web-crawl
      CRAWLER_COMPANY_NAME    -- written into each document's metadata

    Either source may be unset; it is then skipped with an info log.
    If no documents are found at all, the function returns early.
    """
    documents = []

    logger.info("[1/2] Splitting and processing documents...")

    documents += _load_pdf_documents()
    documents += _load_web_documents()

    # Tag every document with its company. The env var is invariant per
    # run, so it is read once outside the loop.
    company = os.getenv("CRAWLER_COMPANY_NAME")
    for doc in documents:
        doc.metadata["company"] = company

    logger.info("[2/2] Generating and storing embeddings...")
    if not documents:
        logger.info("No documents found to process. Exiting.")
        return

    # Imported lazily, matching the lazy imports of the loaders above,
    # so the script can run (and exit early) without this dependency.
    from rag_system.vectordb.azure_search import add_documents

    logger.info("Total documents to process: %d", len(documents))
    logger.info("Storing embeddings in Azure Search...")
    # add_documents handles the embedding generation and storage
    # in Azure Search (or whichever vector database backs it).
    add_documents(documents)

    logger.info("Embeddings stored. You can now run the Streamlit app with:\n")
    logger.info(" streamlit run rag_system/app/streamlit_app.py")


def _load_pdf_documents():
    """Load every PDF in CRAWLER_COMPANY_PDF_DIR; return [] when unset."""
    directory = os.getenv("CRAWLER_COMPANY_PDF_DIR")
    if not directory:  # Checks for both empty and None
        logger.info(
            "No CRAWLER_COMPANY_PDF_DIR set in the environment variables. Skipping PDF loading."
        )
        return []

    # Lazy import: the PDF loader is only needed when a directory is set.
    from rag_system.loaders.pdf_loader import load_pdf

    # Case-insensitive extension check so files named *.PDF are included.
    pdf_files = [
        name for name in os.listdir(directory) if name.lower().endswith(".pdf")
    ]
    logger.info(pdf_files)

    documents = []
    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory, pdf_file)
        logger.info("Loading PDF: %s", pdf_path)
        documents += load_pdf(pdf_path)
    return documents


def _load_web_documents():
    """Crawl CRAWLER_COMPANY_URL; return [] when the URL is unset."""
    url = os.getenv("CRAWLER_COMPANY_URL")
    if not url:  # Checks for both empty and None
        logger.info(
            "No CRAWLER_COMPANY_URL set in the environment variables. Skipping web crawling."
        )
        return []

    # Lazy import: the web loader is only needed when a URL is set.
    from rag_system.loaders.web_loader import load_web_crawl

    return load_web_crawl(url)
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user