Refactored the code to move all keys and configuration into .env files, and provided sample .env files

2025-05-24 12:33:40 -05:00
parent 4d62015470
commit c381b0434a
9 changed files with 330 additions and 215 deletions
--- a/rag_system/crawler.py
+++ b/rag_system/crawler.py
@@ -1,17 +1,23 @@
+import os
 from rag_system.loaders.pdf_loader import load_pdf
 from rag_system.loaders.web_loader import load_web_crawl
 from rag_system.vectordb.azure_search import add_documents

+from dotenv import load_dotenv
+
+load_dotenv()  # take environment variables
+

 def main():
    print("[1/2] Splitting and processing documents...")
-    pdf_documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
-    # web_documents = load_web_crawl(["https://excalibur.mgmresorts.com/en.html"])
-    # web_documents = load_web_crawl(["https://www.verint.com"])
-    # web_documents = load_web_crawl("https://firecrawl.dev")
+    # pdf_documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
+
+    documents = load_web_crawl(os.getenv("CRAWLER_COMPANY_URL"))
+    for doc in documents:
+        doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")
    print("[2/2] Generating and storing embeddings...")
-    add_documents(pdf_documents)
-    # add_documents(web_documents)
+    # add_documents(pdf_documents)
+    add_documents(documents)
    print("Embeddings stored. You can now run the Streamlit app with:\n")
    print("   streamlit run rag_system/app/streamlit_app.py")