initial commit

This commit is contained in:
Peter Morton 2025-05-01 12:21:47 -05:00
parent 2b9c4289e7
commit 226b51a6a1
18 changed files with 13479 additions and 0 deletions

5
.gitignore vendored

@@ -168,3 +168,8 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
chroma/chroma.sqlite3
chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/data_level0.bin
chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/header.bin
chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/length.bin
chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/link_lists.bin

26
.vscode/launch.json vendored Normal file

@@ -0,0 +1,26 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python:Streamlit",
"type": "debugpy",
"request": "launch",
"module": "streamlit",
"args": [
"run",
"app/streamlit_app.py",
]
},
{
"name": "Python Debugger: main.py",
"type": "debugpy",
"request": "launch",
"program": "main.py",
"console": "integratedTerminal",
"justMyCode": false
}
]
}

3
.vscode/settings.json vendored Normal file

@@ -0,0 +1,3 @@
{
"nixEnvSelector.nixFile": "${workspaceFolder}/shell.nix"
}

0
app/__init__.py Normal file

39
app/rag_chain.py Normal file

@@ -0,0 +1,39 @@
from llm.ollama import load_llm
from vectordb.vector_store import retrieve
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
# Define the prompt template for the LLM
prompt = PromptTemplate(
template="""You are an assistant for question-answering tasks.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise:
Question: {question}
Context: {context}
Answer:
""",
input_variables=["question", "documents"],
)
def get_rag_response(query):
print("⌄⌄⌄⌄ Retrieving ⌄⌄⌄⌄")
retrieved_docs, metadata = retrieve(query, 10)
print("Query Found %d documents." % len(retrieved_docs[0]))
for meta in metadata[0]:
print("Metadata: ", meta)
print("⌃⌃⌃⌃ Retrieving ⌃⌃⌃⌃ " )
print("⌄⌄⌄⌄ Augmented Prompt ⌄⌄⌄⌄")
llm = load_llm()
# Create a chain combining the prompt template and LLM
rag_chain = prompt | llm | StrOutputParser()
context = " ".join(retrieved_docs[0]) if retrieved_docs else "No relevant documents found."
print("⌃⌃⌃⌃ Augmented Prompt ⌃⌃⌃⌃")
print("⌄⌄⌄⌄ Generation ⌄⌄⌄⌄")
response = rag_chain.invoke({"question": query, "context": context})
print(response)
print("⌃⌃⌃⌃ Generation ⌃⌃⌃⌃")
return response
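
A hypothetical REPL check for this chain (it assumes the Ollama server is up and main.py has already populated the Chroma collection; the question is illustrative):

from app.rag_chain import get_rag_response

get_rag_response("What does Firecrawl do?")  # prints the retrieval and generation traces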

9
app/streamlit_app.py Normal file

@@ -0,0 +1,9 @@
import streamlit as st
from app.rag_chain import get_rag_response
st.title("RAG System")
query = st.text_input("Ask a question:")
if query:
response = get_rag_response(query)
st.write("### Response:")
st.write(response)

File diff suppressed because one or more lines are too long

0
llm/__init__.py Normal file

7
llm/ollama.py Normal file

@@ -0,0 +1,7 @@
from langchain_ollama import OllamaLLM
def load_llm():
    """Return a llama3.2 client pointed at the local Ollama server."""
    return OllamaLLM(
        model="llama3.2",
        base_url="http://localhost:11434",
        temperature=0,  # deterministic output for reproducible answers
    )
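
A quick smoke test for this wrapper, assuming an Ollama server on localhost:11434 with llama3.2 already pulled (the prompt is illustrative):

from llm.ollama import load_llm

llm = load_llm()
print(llm.invoke("Reply with one word: ready"))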

0
loaders/__init__.py Normal file

106
loaders/firecrawl.py Normal file

@@ -0,0 +1,106 @@
from typing import Iterator, Literal, Optional
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_env
class FireCrawlLoader(BaseLoader):
def __init__(
self,
url: str,
*,
api_key: Optional[str] = None,
api_url: Optional[str] = None,
mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
params: Optional[dict] = None,
):
"""Initialize with API key and url.
Args:
url: The url to be crawled.
api_key: The Firecrawl API key. If not specified, it will be read from the env var
FIRECRAWL_API_KEY. Get an API key at https://firecrawl.dev.
api_url: The Firecrawl API URL. If not specified, it will be read from the env var
FIRECRAWL_API_URL, or defaults to https://api.firecrawl.dev.
mode: The mode to run the loader in. Default is "crawl".
Options include "scrape" (single url),
"crawl" (all accessible sub pages),
"map" (returns a list of semantically related links), and
"extract" (extracts structured data from a page).
params: The parameters to pass to the Firecrawl API.
Examples include crawlerOptions.
For more details, visit: https://github.com/mendableai/firecrawl-py
"""
try:
from firecrawl import FirecrawlApp
except ImportError:
raise ImportError(
"`firecrawl` package not found, please run `pip install firecrawl-py`"
)
if mode not in ("crawl", "scrape", "search", "map", "extract"):
raise ValueError(
f"""Invalid mode '{mode}'.
Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
)
if not url:
raise ValueError("Url must be provided")
api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
self.url = url
self.mode = mode
self.params = params or {}
def lazy_load(self) -> Iterator[Document]:
if self.mode == "scrape":
firecrawl_docs = [
self.firecrawl.scrape_url(
self.url, **self.params
)
]
elif self.mode == "crawl":
if not self.url:
raise ValueError("URL is required for crawl mode")
crawl_response = self.firecrawl.crawl_url(
self.url, **self.params
)
firecrawl_docs = crawl_response.data or []
elif self.mode == "map":
if not self.url:
raise ValueError("URL is required for map mode")
firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
elif self.mode == "extract":
if not self.url:
raise ValueError("URL is required for extract mode")
firecrawl_docs = [
str(self.firecrawl.extract([self.url], params=self.params))
]
elif self.mode == "search":
raise ValueError(
"Search mode is not supported in this version, please downgrade."
)
else:
raise ValueError(
f"""Invalid mode '{self.mode}'.
Allowed: 'crawl', 'scrape', 'map', 'extract'."""
)
for doc in firecrawl_docs:
if self.mode == "map" or self.mode == "extract":
page_content = doc
metadata = {}
else:
page_content = (
doc.markdown or doc.html or doc.rawHtml or ""
)
metadata = doc.metadata or {}
if not page_content:
continue
yield Document(
page_content=page_content,
metadata=metadata,
)
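
A sketch of the simpler "scrape" mode, reusing the self-hosted endpoint and placeholder key that web_loader.py assumes:

from loaders.firecrawl import FireCrawlLoader

loader = FireCrawlLoader(
    url="https://firecrawl.dev",
    api_key="changeme",
    api_url="http://localhost:3002",
    mode="scrape",
)
for doc in loader.lazy_load():
    print(doc.metadata, len(doc.page_content))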

16
loaders/pdf_loader.py Normal file

@@ -0,0 +1,16 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
def load_pdf(file_path):
loader = PyPDFLoader(file_path)
pages = loader.load()
print(f"Loaded {len(pages)} documents from {file_path}")
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
splits = splitter.split_documents(pages)
documents = []
metadatas = []
for split in splits:
documents.append(split.page_content)
metadatas.append(split.metadata)
return (documents, metadatas)
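
Typical use, with a hypothetical PDF path (main.py makes the same call, commented out):

from loaders.pdf_loader import load_pdf

docs, metas = load_pdf("data/example.pdf")  # hypothetical sample file
print(len(docs), "chunks; first metadata:", metas[0])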

37
loaders/web_loader.py Normal file

@@ -0,0 +1,37 @@
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from loaders.firecrawl import FireCrawlLoader
def load_web_crawl(url):
documents = []
metadatas = []
loader = FireCrawlLoader(
    url=url,
    api_key="changeme",
    api_url="http://localhost:3002",
    mode="crawl",
    params={"limit": 100, "include_paths": ["/.*"], "ignore_sitemap": True, "poll_interval": 5},
)
docs = []
docs_lazy = loader.lazy_load()
for doc in docs_lazy:
print('.', end="")
docs.append(doc)
print()
# Load documents from the URLs
# docs = [WebBaseLoader(url).load() for url in urls]
# docs_list = [item for sublist in docs for item in sublist]
# Initialize a text splitter with specified chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=250, chunk_overlap=0
)
# Split the documents into chunks
splits = text_splitter.split_documents(docs)
for split in splits:
documents.append(split.page_content)
metadatas.append(split.metadata)
return (documents, metadatas)
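
The commented-out WebBaseLoader path above, as a standalone sketch for when no Firecrawl instance is running (the URL is illustrative):

from langchain_community.document_loaders import WebBaseLoader

docs = WebBaseLoader("https://example.com").load()
print(len(docs), "documents fetched")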

16
main.py Normal file

@@ -0,0 +1,16 @@
from loaders.pdf_loader import load_pdf
from loaders.web_loader import load_web_crawl
from vectordb.vector_store import add_documents
def main():
print("[1/2] Splitting and processing documents...")
# pdf_documents = load_pdf("data/verint-responsible-ethical-ai.pdf")
# web_documents = load_web(["https://excalibur.mgmresorts.com/en.html"])
web_documents = load_web_crawl("https://firecrawl.dev")
print("[2/2] Generating and storing embeddings...")
# add_documents(pdf_documents)
add_documents(web_documents)
print("Embeddings stored. You can now run the Streamlit app with:\n")
print(" streamlit run app/streamlit_app.py")
if __name__ == "__main__":
main()

11
requirements.txt Normal file

@@ -0,0 +1,11 @@
langchain
langchain-community
langchain-chroma
chromadb
pypdf
streamlit
ollama
langchain-ollama
beautifulsoup4
tiktoken
firecrawl-py

14
shell.nix Normal file

@@ -0,0 +1,14 @@
let
pkgs = import <nixpkgs> {};
in pkgs.mkShell {
packages = [
(pkgs.python3.withPackages (python-pkgs: [
python-pkgs.langchain
python-pkgs.langchain-community
python-pkgs.chromadb
python-pkgs.pypdf
python-pkgs.streamlit
python-pkgs.ollama
]))
];
}
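
With this file in place, the environment is entered from the repository root with:

nix-shell

The nixEnvSelector setting in .vscode/settings.json points VS Code at the same file.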

0
vectordb/__init__.py Normal file

53
vectordb/vector_store.py Normal file

@@ -0,0 +1,53 @@
from typing import Tuple
import chromadb
from uuid import uuid4
# from chromadb.utils.embedding_functions.ollama_embedding_function import (
# OllamaEmbeddingFunction,
# )
from langchain_ollama import OllamaEmbeddings
from chromadb.api.types import Document, Metadata, OneOrMany
# Define a custom embedding function for ChromaDB using Ollama
class ChromaDBEmbeddingFunction:
"""
Custom embedding function for ChromaDB using embeddings from Ollama.
"""
def __init__(self, langchain_embeddings):
self.langchain_embeddings = langchain_embeddings
def __call__(self, input):
# Ensure the input is in a list format for processing
if isinstance(input, str):
input = [input]
return self.langchain_embeddings.embed_documents(input)
# Initialize the embedding function with Ollama embeddings
embedding = ChromaDBEmbeddingFunction(
OllamaEmbeddings(
model="nomic-embed-text",
base_url="http://localhost:11434" # Adjust the base URL as per your Ollama server configuration
)
)
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection(
name="collection_name",
metadata={"description": "A collection for RAG with Ollama - Demo1"},
embedding_function=embedding,  # Use the custom embedding function
)
def add_documents(documents: Tuple[OneOrMany[Document], OneOrMany[Metadata]]):
docs, metas = documents
uuids = [str(uuid4()) for _ in range(len(docs))]
collection.add(documents=docs, ids=uuids, metadatas=metas)
def retrieve(query_text, n_results=1):
# return vector_store.similarity_search(query, k=3)
results = collection.query(
query_texts=[query_text],
n_results=n_results
)
return results["documents"], results["metadatas"]