72 lines
2.2 KiB
Python
72 lines
2.2 KiB
Python
import os
|
|
import coloredlogs, logging
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment variables through python-dotenv before anything below
# reads the environment.
load_dotenv()

# Install the coloredlogs handler at DEBUG level, then fetch the named logger
# used by this script.
coloredlogs.install(level="DEBUG")
logger: logging.Logger = logging.getLogger("crawler")
|
|
|
|
|
|
def main() -> None:
    """Collect documents from PDFs and/or a website and store them in the vector DB.

    The run is driven by environment variables:

    * ``CRAWLER_COMPANY_PDF_DIR`` -- directory whose PDF files are loaded;
      PDF loading is skipped when it is unset or empty.
    * ``CRAWLER_COMPANY_URL`` -- URL handed to the web crawler; crawling is
      skipped when it is unset or empty.
    * ``CRAWLER_COMPANY_NAME`` -- value written to ``metadata["company"]`` of
      every document (``None`` when the variable is unset).

    Returns early, without touching the vector database, when neither source
    produced any document. Errors from ``os.listdir`` (for example a missing
    directory) and from the loaders are not handled here and propagate to the
    caller.
    """
    documents = []

    logger.info("[1/2] Splitting and processing documents...")

    directory = os.getenv("CRAWLER_COMPANY_PDF_DIR")
    if not directory:  # covers both an unset and an empty variable
        logger.info(
            "No CRAWLER_COMPANY_PDF_DIR set in the environment variables. Skipping PDF loading."
        )
    else:
        # Imported here, so the PDF loader module is only imported when a
        # PDF directory is actually configured.
        from rag_system.loaders.pdf_loader import load_pdf

        pdf_files = _list_pdf_files(directory)
        logger.info(pdf_files)
        for pdf_file in pdf_files:
            pdf_path = os.path.join(directory, pdf_file)
            logger.info("Loading PDF: %s", pdf_path)
            documents += load_pdf(pdf_path)

    url = os.getenv("CRAWLER_COMPANY_URL")
    if not url:  # covers both an unset and an empty variable
        logger.info(
            "No CRAWLER_COMPANY_URL set in the environment variables. Skipping web crawling."
        )
    else:
        # Imported here, so the web loader module is only imported when a
        # URL is actually configured.
        from rag_system.loaders.web_loader import load_web_crawl

        documents += load_web_crawl(url)

    logger.info("[2/2] Generating and storing embeddings...")
    if not documents:
        logger.info("No documents found to process. Exiting.")
        return

    from rag_system.vectordb.azure_search import add_documents

    logger.info("Total documents to process: %s", len(documents))

    # The company name is the same for every document: read it once.
    company = os.getenv("CRAWLER_COMPANY_NAME")
    for doc in documents:
        doc.metadata["company"] = company

    logger.info("Storing embeddings in Azure Search...")
    # add_documents is expected to take care of both embedding generation and
    # storage in the vector database.
    add_documents(documents)
    logger.info("Embeddings stored. You can now run the Streamlit app with:\n")
    logger.info(" streamlit run rag_system/app/streamlit_app.py")


def _list_pdf_files(directory: str) -> list:
    """Return the names of the PDF files directly inside *directory*, sorted.

    The ``.pdf`` extension is matched case-insensitively and entries that are
    not regular files (e.g. a sub-directory named ``x.pdf``) are skipped.
    Sorting makes the processing order deterministic, since ``os.listdir``
    returns entries in arbitrary order.
    """
    return [
        name
        for name in sorted(os.listdir(directory))
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(directory, name))
    ]
|
|
|
|
|
|
# Run the ingestion pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|