"""Ingestion script: crawl a company's PDFs and website, then embed the
documents and store them in Azure Search.

Configuration comes from environment variables (optionally loaded from a
.env file): CRAWLER_COMPANY_PDF_DIR, CRAWLER_COMPANY_URL, and
CRAWLER_COMPANY_NAME.
"""

import logging
import os

import coloredlogs
from dotenv import load_dotenv

load_dotenv()  # take environment variables from a local .env file

logger: logging.Logger = logging.getLogger("crawler")
coloredlogs.install(level="DEBUG")


def main():
    documents = []
    logger.info("[1/2] Splitting and processing documents...")

    directory = os.getenv("CRAWLER_COMPANY_PDF_DIR")
    if not directory:  # catches both unset and empty
        logger.info(
            "No CRAWLER_COMPANY_PDF_DIR set in the environment variables. "
            "Skipping PDF loading."
        )
    else:
        # Imported lazily so the PDF dependencies are only required when used.
        from rag_system.loaders.pdf_loader import load_pdf

        pdf_files = [
            file for file in os.listdir(directory) if file.lower().endswith(".pdf")
        ]
        logger.info(f"Found {len(pdf_files)} PDF file(s): {pdf_files}")
        for pdf_file in pdf_files:
            pdf_path = os.path.join(directory, pdf_file)
            logger.info(f"Loading PDF: {pdf_path}")
            documents += load_pdf(pdf_path)

    url = os.getenv("CRAWLER_COMPANY_URL")
    if not url:  # catches both unset and empty
        logger.info(
            "No CRAWLER_COMPANY_URL set in the environment variables. "
            "Skipping web crawling."
        )
    else:
        from rag_system.loaders.web_loader import load_web_crawl

        documents += load_web_crawl(url)

    logger.info("[2/2] Generating and storing embeddings...")
    if not documents:
        logger.info("No documents found to process. Exiting.")
        return

    from rag_system.vectordb.azure_search import add_documents

    logger.info(f"Total documents to process: {len(documents)}")

    # Tag every document with the company name so results can be filtered later.
    company = os.getenv("CRAWLER_COMPANY_NAME")
    for doc in documents:
        doc.metadata["company"] = company

    logger.info("Storing embeddings in Azure Search...")
    # add_documents handles both embedding generation and storage in the
    # vector database (Azure Search here).
    add_documents(documents)

    logger.info("Embeddings stored. You can now run the Streamlit app with:\n")
    logger.info("    streamlit run rag_system/app/streamlit_app.py")


if __name__ == "__main__":
    main()
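
# Example .env consumed by load_dotenv() above. The variable names come from
# this script; the values are placeholders, not from the original project:
#
#   CRAWLER_COMPANY_PDF_DIR=data/pdfs
#   CRAWLER_COMPANY_URL=https://example.com
#   CRAWLER_COMPANY_NAME=Example Corp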
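
# The rag_system.loaders and rag_system.vectordb modules are not shown here.
# As a rough sketch only (an assumption, not the project's actual code),
# load_pdf could be a thin wrapper over LangChain's PyPDFLoader, which
# returns one Document per page with .metadata attached:
#
#   from langchain_community.document_loaders import PyPDFLoader
#
#   def load_pdf(path: str):
#       return PyPDFLoader(path).load()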