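"""Ingestion entry point: loads company PDFs and/or crawls a company website,
then embeds the resulting documents into Azure Search.

Configuration is read from environment variables (e.g. via a .env file).
The values below are illustrative placeholders, not defaults:

    CRAWLER_COMPANY_NAME=ExampleCo
    CRAWLER_COMPANY_PDF_DIR=data/pdfs
    CRAWLER_COMPANY_URL=https://example.com
"""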

import logging
import os

import coloredlogs
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env

# logging.basicConfig(level=logging.INFO)
logger: logging.Logger = logging.getLogger("crawler")
coloredlogs.install(level="DEBUG")


def main():
    documents = []

    logger.info("[1/2] Splitting and processing documents...")
    # documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
    directory = os.getenv("CRAWLER_COMPANY_PDF_DIR")
    if not directory:  # Checks for both empty and None
        logger.info(
            "No CRAWLER_COMPANY_PDF_DIR set in the environment variables. Skipping PDF loading."
        )
    else:
        from rag_system.loaders.pdf_loader import load_pdf

        pdf_files = [file for file in os.listdir(directory) if file.endswith(".pdf")]
        logger.info(pdf_files)
        for pdf_file in pdf_files:
            pdf_path = os.path.join(directory, pdf_file)
            logger.info(f"Loading PDF: {pdf_path}")
            documents += load_pdf(pdf_path)

    url = os.getenv("CRAWLER_COMPANY_URL")
    if not url:  # Checks for both empty and None
        logger.info(
            "No CRAWLER_COMPANY_URL set in the environment variables. Skipping web crawling."
        )
    else:
        from rag_system.loaders.web_loader import load_web_crawl

        documents += load_web_crawl(url)

    logger.info("[2/2] Generating and storing embeddings...")
    if not documents:
        logger.info("No documents found to process. Exiting.")
        return

    from rag_system.vectordb.azure_search import add_documents

    logger.info(f"Total documents to process: {len(documents)}")

    # Tag each document with the company name so results can be filtered per company
    for doc in documents:
        doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")

    logger.info("Storing embeddings in Azure Search...")
    # Add the documents to the vector database.
    # This function should handle the embedding generation and storage
    # in Azure Search or any other vector database you are using.
    add_documents(documents)

    logger.info("Embeddings stored. You can now run the Streamlit app with:\n")
    logger.info("    streamlit run rag_system/app/streamlit_app.py")


if __name__ == "__main__":
    main()