From 2552d867da7089b6763d97ca2c052e6b13e59944 Mon Sep 17 00:00:00 2001
From: "Peter.Morton"
Date: Fri, 30 May 2025 16:27:48 -0500
Subject: [PATCH] Add module-level loggers and replace print statements

---
 rag_system/loaders/pdf_loader.py    |  6 ++-
 rag_system/loaders/web_loader.py    | 16 +++------
 rag_system/vectordb/azure_search.py | 62 ++++++-----------------------
 3 files changed, 22 insertions(+), 62 deletions(-)

diff --git a/rag_system/loaders/pdf_loader.py b/rag_system/loaders/pdf_loader.py
index 3b3ece8..c4b0203 100644
--- a/rag_system/loaders/pdf_loader.py
+++ b/rag_system/loaders/pdf_loader.py
@@ -1,11 +1,15 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import PyPDFLoader
+import logging
+
+logger: logging.Logger = logging.getLogger("pdf_loader")
+
 
 
 def load_pdf(file_path):
     loader = PyPDFLoader(file_path)
     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     documents = loader.load_and_split(splitter)
-    print(f"Loaded and Split into {len(documents)} documents from {file_path}")
+    logger.info(f"Loaded and split into {len(documents)} documents from {file_path}")
 
     return documents
diff --git a/rag_system/loaders/web_loader.py b/rag_system/loaders/web_loader.py
index 33a3ce5..d62488e 100644
--- a/rag_system/loaders/web_loader.py
+++ b/rag_system/loaders/web_loader.py
@@ -2,6 +2,8 @@ import json
 import os
 import logging
 
+logger: logging.Logger = logging.getLogger("web_loader")
+
 from langchain_community.document_loaders import WebBaseLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from rag_system.loaders.firecrawl import FireCrawlLoader
@@ -11,9 +13,6 @@ from dotenv import load_dotenv
 
 load_dotenv()  # take environment variables]
 
-# Configure the logging
-logging.basicConfig(level=logging.INFO)
-
 firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
 firecrawl_api_url = os.getenv("FIRECRAWL_API_URL")
 firecrawl_mode = os.getenv("FIRECRAWL_MODE")
@@ -27,14 +26,13 @@ if firecrawl_params["scrape_options"]:
     )
 
 
-logging.info(f"web_loader firecrawl_params: {firecrawl_params}")
-logging.info(f"web_loader firecrawl_api_url: {firecrawl_api_url}")
-logging.info(f"web_loader firecrawl_mode: {firecrawl_mode}")
-logging.info(f"web_loader firecrawl_params: {firecrawl_params}")
+logger.info(f"web_loader firecrawl_params: {firecrawl_params}")
+logger.info(f"web_loader firecrawl_api_url: {firecrawl_api_url}")
+logger.info(f"web_loader firecrawl_mode: {firecrawl_mode}")
 
 
 def load_web_crawl(url):
-    logging.info(f"load_web_crawl url: {url}")
+    logger.info(f"load_web_crawl url: {url}")
 
     loader = FireCrawlLoader(
         url=url,
@@ -46,9 +44,7 @@ def load_web_crawl(url):
     docs = []
     docs_lazy = loader.load()
     for doc in docs_lazy:
-        print(".", end="")
         docs.append(doc)
-    print()
 
     # Load documents from the URLs
     # docs = [WebBaseLoader(url).load() for url in urls]
diff --git a/rag_system/vectordb/azure_search.py b/rag_system/vectordb/azure_search.py
index d6278a0..96c4360 100644
--- a/rag_system/vectordb/azure_search.py
+++ b/rag_system/vectordb/azure_search.py
@@ -1,4 +1,7 @@
 import os
+import logging
+
+logger: logging.Logger = logging.getLogger("azure_search")
 
-from typing import Tuple
+from typing import Optional, Tuple
 from langchain_community.vectorstores.azuresearch import (
@@ -106,24 +109,24 @@ def get_document_id(document):
     raise ValueError("Document does not have a valid ID.")
 
 
-def delete_all_documents():
+def delete_documents(filters: Optional[str] = None):
     """
-    Delete all documents from the AzureSearch vector store.
+    Delete documents from the AzureSearch vector store.
     """
 
     try:
         docs_to_delete = []
         while True:
             # Delete all documents in the index
-            docs_to_delete = retrieve("", 10)
-
+            docs_to_delete = vector_store.vector_search("*", 20, filters=filters)
+            logger.debug("Deleting documents: %s", docs_to_delete)
             vector_store.delete(list(map(get_document_id, docs_to_delete)))
 
             if len(docs_to_delete) > 0:
                 continue
             else:
                 break
 
-        print("All documents deleted successfully.")
+        logger.debug("All documents deleted successfully.")
     except Exception as e:
-        print(f"Error deleting documents: {str(e)}")
+        logger.error(f"Error deleting documents: {str(e)}")
@@ -134,7 +137,7 @@ def add_documents(documents):
     try:
         vector_store.add_documents(documents)
     except Exception as e:
-        print(f"Error adding document to vector store: {str(e)}")
+        logger.error(f"Error adding document to vector store: {str(e)}")
 
 
 def retrieve(query_text, n_results=1):
@@ -145,46 +148,3 @@ def retrieve(query_text, n_results=1):
         search_type="similarity",
     )
     return docs
-
-
-# def add_document_to_vector_store(document):
-#     """
-#     Add a document to the AzureSearch vector store.
-
-#     Args:
-#         vector_store: The initialized AzureSearch vector store instance.
-#         document: A dictionary or object representing the document to be added.
-#         Example format:
-#         {
-#             "id": "unique_document_id",
-#             "content": "The text content of the document",
-#             "metadata": {
-#                 "source": "source_url",
-#                 "created": "2025-03-04T14:14:40.421666",
-#                 "modified": "2025-03-04T14:14:40.421666"
-#             }
-#         }
-#     """
-#     try:
-
-#         # Add the document to the vector store
-#         vector_store.add_documents([document])
-#         print(f"Document with ID {document['id']} added successfully.")
-#     except Exception as e:
-#         print(f"Error adding document to vector store: {str(e)}")
-
-# add_document_to_vector_store("https://api.python.langchain.com/en/latest/langchain_api_reference.html",None)
-# Example document to add
-
-# doc = Document(
-#     page_content="This is the content of the document.For testing IVA demo integration ",
-#     metadata= {
-#         "source": "https://example.com/source",
-#         "created": "2025-03-04T14:14:40.421666",
-#         "modified": "2025-03-04T14:14:40.421666"
-#     }
-# )
-# Add the document to the vector store
-# add_document_to_vector_store( doc)
-
-# result = retrieve("iva",1)
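
A note for consumers of these modules: since the patch removes logging.basicConfig(level=logging.INFO) from web_loader.py, none of the new module loggers emit anything until the application configures logging itself. A minimal sketch of what an entry point might do — the logger names ("pdf_loader", "web_loader", "azure_search") come from the getLogger() calls above, while the format string and levels are illustrative assumptions, not part of the change:

    import logging

    # Send all loggers to stderr with a timestamped format.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    )

    # Opt a single module into more detail, e.g. to see the
    # "Deleting documents" debug output from azure_search.
    logging.getLogger("azure_search").setLevel(logging.DEBUG)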