Crawler now supports directories for loading multiple PDFs
parent f07afccdb3
commit 25867c0f31

.gitignore (vendored, 10 lines changed)
@@ -171,3 +171,13 @@ cython_debug/
 chroma/*
 
 .DS_Store
+files/AdaptHealth-Nebulizer-Order-Checklist.pdf
+files/AH_Financial_Policy_2024.pdf
+files/AH_PatientGuide_122024.pdf
+files/AH-RAD-Booklet-July-2023.pdf
+files/CGM-Order-Checklist-AdaptHealth-REV1a.pdf
+files/Maze-Booklet-AdaptHealth-August-2023.pdf
+files/ResMed-AirSense-10-Elite-Auto-CPAP-Quick-Start-Guide.pdf
+files/ResMed-AirSense-10-Elite-Auto-CPAP-User-Manual.pdf
+files/Tandem Insulim Pump Instructions from YouTube Captions on AdaptHealth Website.pdf
+files/Tandem Tslim X2 User Guide.pdf
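Aside: the ten new entries share a single shape, so one glob pattern would cover them and any PDFs added later; a sketch, assuming nothing under files/ should ever be tracked:

# .gitignore sketch (assumption: no PDF under files/ is ever committed)
files/*.pdf

Listing each file explicitly, as the commit does, keeps the opt-out visible per document, at the cost of editing .gitignore for every new download.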
@@ -1,25 +1,68 @@
 import os
-from rag_system.loaders.pdf_loader import load_pdf
-from rag_system.loaders.web_loader import load_web_crawl
-from rag_system.vectordb.azure_search import add_documents
 import coloredlogs, logging
 
 from dotenv import load_dotenv
 
 load_dotenv()  # take environment variables
 
 # logging.basicConfig(level=logging.INFO)
 logger: logging.Logger = logging.getLogger("crawler")
 coloredlogs.install(level="DEBUG")
 
 
 def main():
-    print("[1/2] Splitting and processing documents...")
-    # pdf_documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
-
-    documents = load_web_crawl(os.getenv("CRAWLER_COMPANY_URL"))
-    for doc in documents:
-        doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")
-    print("[2/2] Generating and storing embeddings...")
-    # add_documents(pdf_documents)
+    documents = []
+
+    logger.info("[1/2] Splitting and processing documents...")
+    # documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
+
+    directory = os.getenv("CRAWLER_COMPANY_PDF_DIR")
+
+    if not directory:  # Checks for both empty and None
+        logger.info(
+            "No CRAWLER_COMPANY_PDF_DIR set in the environment variables. Skipping PDF loading."
+        )
+    else:
+        from rag_system.loaders.pdf_loader import load_pdf
+
+        pdf_files = [file for file in os.listdir(directory) if file.endswith(".pdf")]
+
+        logger.info(pdf_files)
+        for pdf_file in pdf_files:
+            pdf_path = os.path.join(directory, pdf_file)
+            logger.info(f"Loading PDF: {pdf_path}")
+            documents += load_pdf(pdf_path)
+
+    url = os.getenv("CRAWLER_COMPANY_URL")
+
+    if not url:  # Checks for both empty and None
+        logger.info(
+            "No CRAWLER_COMPANY_URL set in the environment variables. Skipping web crawling."
+        )
+    else:
+        from rag_system.loaders.web_loader import load_web_crawl
+
+        documents += load_web_crawl(url)
+    for doc in documents:
+        doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")
+
+    logger.info("[2/2] Generating and storing embeddings...")
+    if not documents:
+        logger.info("No documents found to process. Exiting.")
+        return
+
+    from rag_system.vectordb.azure_search import add_documents
+
+    logger.info(f"Total documents to process: {len(documents)}")
+    logger.info("Storing embeddings in Azure Search...")
+    # Add the documents to the vector database
+    # This function should handle the embedding generation and storage
+    # in Azure Search or any other vector database you are using.
     add_documents(documents)
-    print("Embeddings stored. You can now run the Streamlit app with:\n")
-    print(" streamlit run rag_system/app/streamlit_app.py")
+    logger.info("Embeddings stored. You can now run the Streamlit app with:\n")
+    logger.info(" streamlit run rag_system/app/streamlit_app.py")
 
 
 if __name__ == "__main__":
     main()
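For reference, the script reads its configuration from three environment variables via python-dotenv. A sample .env with hypothetical placeholder values (only the variable names come from the code above):

# .env (values are hypothetical placeholders)
CRAWLER_COMPANY_PDF_DIR=files
CRAWLER_COMPANY_URL=https://www.example.com
CRAWLER_COMPANY_NAME=Example Health Co

Leaving either CRAWLER_COMPANY_PDF_DIR or CRAWLER_COMPANY_URL unset now skips that source instead of crashing, which is the behavioral point of this commit.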
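The load_pdf helper itself is not part of this diff. Because the loaded documents expose a .metadata dict, they look like LangChain Document objects, so a minimal sketch of what rag_system/loaders/pdf_loader.py might contain (an assumption, not the project's actual code) is:

# Hypothetical sketch only; the real rag_system/loaders/pdf_loader.py is not shown in this commit.
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


def load_pdf(path: str):
    """Load one PDF and split it into embedding-sized chunks."""
    pages = PyPDFLoader(path).load()  # one Document per page, each with .metadata
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return splitter.split_documents(pages)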
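One caveat in the new listing code: file.endswith(".pdf") misses uppercase .PDF names, and os.listdir() does not descend into subdirectories. If either case matters, a pathlib-based variant (a sketch, not part of the commit) covers both:

from pathlib import Path

# Case-insensitive, recursive PDF discovery; rglob("*") also walks subdirectories.
pdf_files = sorted(p for p in Path(directory).rglob("*") if p.suffix.lower() == ".pdf")
for pdf_path in pdf_files:
    logger.info(f"Loading PDF: {pdf_path}")
    documents += load_pdf(str(pdf_path))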