Fix for not applying company field to documents

This commit is contained in:
Peter Morton 2025-05-30 17:31:09 -05:00
parent 4717cb35ab
commit f0503e7aac
2 changed files with 5 additions and 2 deletions

View File

@ -13,6 +13,7 @@ RETRY_TOTAL=4
# Crawler settings
CRAWLER_LOADER=web # web or file
CRAWLER_COMPANY_NAME=Morton
CRAWLER_COMPANY_PDF_DIR=files
CRAWLER_COMPANY_URL=https://blog.mortons.site/
CRAWLER_COMPANY_FILE=data/verint-responsible-ethical-ai.pdf
FIRECRAWL_API_KEY=your-firecrawl-api-key

View File

@ -45,8 +45,6 @@ def main():
from rag_system.loaders.web_loader import load_web_crawl
documents += load_web_crawl(url)
for doc in documents:
doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")
logger.info("[2/2] Generating and storing embeddings...")
if not documents:
@ -56,6 +54,10 @@ def main():
from rag_system.vectordb.azure_search import add_documents
logger.info(f"Total documents to process: {len(documents)}")
for doc in documents:
doc.metadata["company"] = os.getenv("CRAWLER_COMPANY_NAME")
logger.info("Storing embeddings in Azure Search...")
# Add the documents to the vector database.
# add_documents (imported above) is expected to handle embedding generation and storage.