Crawler now supports directories for loading multiple PDFs

Peter Morton 2025-05-30 16:28:35 -05:00
parent f07afccdb3
commit 25867c0f31
2 changed files with 65 additions and 12 deletions

.gitignore (10 additions)

@@ -171,3 +171,13 @@ cython_debug/
 chroma/*
 .DS_Store
+files/AdaptHealth-Nebulizer-Order-Checklist.pdf
+files/AH_Financial_Policy_2024.pdf
+files/AH_PatientGuide_122024.pdf
+files/AH-RAD-Booklet-July-2023.pdf
+files/CGM-Order-Checklist-AdaptHealth-REV1a.pdf
+files/Maze-Booklet-AdaptHealth-August-2023.pdf
+files/ResMed-AirSense-10-Elite-Auto-CPAP-Quick-Start-Guide.pdf
+files/ResMed-AirSense-10-Elite-Auto-CPAP-User-Manual.pdf
+files/Tandem Insulim Pump Instructions from YouTube Captions on AdaptHealth Website.pdf
+files/Tandem Tslim X2 User Guide.pdf


@@ -1,25 +1,68 @@
 import os
-from rag_system.loaders.pdf_loader import load_pdf
-from rag_system.loaders.web_loader import load_web_crawl
-from rag_system.vectordb.azure_search import add_documents
+import coloredlogs, logging
+from dotenv import load_dotenv
+load_dotenv()  # take environment variables
+# logging.basicConfig(level=logging.INFO)
+logger: logging.Logger = logging.getLogger("crawler")
+coloredlogs.install(level="DEBUG")
 def main():
-    print("[1/2] Splitting and processing documents...")
-    # pdf_documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
-    documents = load_web_crawl(os.getenv("CRAWLER_COMPANY_URL"))
-    for doc in documents:
-        doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")
-    print("[2/2] Generating and storing embeddings...")
-    # add_documents(pdf_documents)
+    documents = []
+    logger.info("[1/2] Splitting and processing documents...")
+    # documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
+    directory = os.getenv("CRAWLER_COMPANY_PDF_DIR")
+    if not directory:  # Checks for both empty and None
+        logger.info(
+            "No CRAWLER_COMPANY_PDF_DIR set in the environment variables. Skipping PDF loading."
+        )
+    else:
+        from rag_system.loaders.pdf_loader import load_pdf
+        pdf_files = [file for file in os.listdir(directory) if file.endswith(".pdf")]
+        logger.info(pdf_files)
+        for pdf_file in pdf_files:
+            pdf_path = os.path.join(directory, pdf_file)
+            logger.info(f"Loading PDF: {pdf_path}")
+            documents += load_pdf(pdf_path)
+    url = os.getenv("CRAWLER_COMPANY_URL")
+    if not url:  # Checks for both empty and None
+        logger.info(
+            "No CRAWLER_COMPANY_URL set in the environment variables. Skipping web crawling."
+        )
+    else:
+        from rag_system.loaders.web_loader import load_web_crawl
+        documents += load_web_crawl(url)
+    for doc in documents:
+        doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")
+    logger.info("[2/2] Generating and storing embeddings...")
+    if not documents:
+        logger.info("No documents found to process. Exiting.")
+        return
+    from rag_system.vectordb.azure_search import add_documents
+    logger.info(f"Total documents to process: {len(documents)}")
+    logger.info("Storing embeddings in Azure Search...")
+    # Add the documents to the vector database
+    # This function should handle the embedding generation and storage
+    # in Azure Search or any other vector database you are using.
     add_documents(documents)
-    print("Embeddings stored. You can now run the Streamlit app with:\n")
-    print(" streamlit run rag_system/app/streamlit_app.py")
+    logger.info("Embeddings stored. You can now run the Streamlit app with:\n")
+    logger.info(" streamlit run rag_system/app/streamlit_app.py")
 if __name__ == "__main__":
     main()
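
The rewritten main() is driven entirely by environment variables, which load_dotenv() picks up from a .env file: CRAWLER_COMPANY_PDF_DIR names a directory whose *.pdf files are loaded one by one, CRAWLER_COMPANY_URL (optional) triggers the web crawl, and CRAWLER_COMPANY_NAME is stamped into each document's metadata. A minimal .env sketch for exercising the new directory-loading path might look like the lines below; the company name and URL are placeholder values, and "files" is only assumed to be the PDF directory based on the .gitignore entries above:

    CRAWLER_COMPANY_PDF_DIR=files
    CRAWLER_COMPANY_NAME=ExampleCo
    CRAWLER_COMPANY_URL=https://www.example.com

If CRAWLER_COMPANY_URL is unset the crawl is skipped and only the PDFs are embedded; if no documents are collected at all, main() logs a message and returns without calling add_documents().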