initial commit

2025-05-01 12:21:47 -05:00
parent 2b9c4289e7
commit 226b51a6a1
18 changed files with 13479 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -168,3 +168,8 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 chroma/chroma.sqlite3
 chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/data_level0.bin
 chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/header.bin
 chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/length.bin
 chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/link_lists.bin
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -0,0 +1,26 @@
 {
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python:Streamlit",
            "type": "debugpy",
            "request": "launch",
            "module": "streamlit",
            "args": [
                "run",
                "app/streamlit_app.py",
            ]
        },
        {
            "name": "Python Debugger: main.py",
            "type": "debugpy",
            "request": "launch",
            "program": "main.py",
            "console": "integratedTerminal",
            "justMyCode": false
        }
    ]
 }
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
 {
    "nixEnvSelector.nixFile": "${workspaceFolder}/shell.nix"
 }
--- a/app/init.py
+++ b/app/init.py
--- a/app/rag_chain.py
+++ b/app/rag_chain.py
@@ -0,0 +1,39 @@
 from llm.ollama import load_llm
 from vectordb.vector_store import retrieve
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 # Prompt for the answer-generation step. The template has exactly two
 # placeholders, {question} and {context}; input_variables must name those
 # same two keys, which are also the keys get_rag_response() passes to invoke().
 # NOTE(review): some langchain-core versions appear to re-derive
 # input_variables from the template, which may have masked the old mismatch
 # ("documents" instead of "context") -- confirm for the installed version.
 prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks.
    Use the following context to answer the question.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise:
    Question: {question}
    Context: {context}
    Answer:
    """,
    input_variables=["question", "context"],
 )
 def get_rag_response(query):
 	"""Answer ``query`` with retrieval-augmented generation.

 	Asks the vector store for up to 10 matching chunks, joins them into one
 	context string, and runs the module-level ``prompt`` through the LLM
 	returned by ``load_llm()``. Progress markers and the final answer are
 	printed to stdout along the way.

 	Args:
 		query: The user's question.

 	Returns:
 		The chain output after ``StrOutputParser`` (the answer text).
 	"""
 	print("⌄⌄⌄⌄ Retrieving ⌄⌄⌄⌄")
 	retrieved_docs, metadata = retrieve(query, 10)
 	# retrieve() sends a single query text and the results are indexed per
 	# query, so entry 0 holds this query's hits. Guard both levels: the outer
 	# list may be missing/empty and the inner list may simply have no hits.
 	docs = (retrieved_docs[0] if retrieved_docs else None) or []
 	metas = (metadata[0] if metadata else None) or []
 	print("Query Found %d documents." % len(docs))
 	for meta in metas:
 		print("Metadata: ", meta)
 	print("⌃⌃⌃⌃ Retrieving ⌃⌃⌃⌃ " )
 	print("⌄⌄⌄⌄ Augmented Prompt ⌄⌄⌄⌄")
 	llm = load_llm()
 	# Create a chain combining the prompt template and LLM
 	rag_chain = prompt | llm | StrOutputParser()
 	# Fall back on the placeholder text whenever there are no hits; checking
 	# only the outer list would let an empty hit list produce an empty context.
 	context = " ".join(docs) if docs else "No relevant documents found."
 	print("⌃⌃⌃⌃ Augmented Prompt ⌃⌃⌃⌃")
 	print("⌄⌄⌄⌄ Generation ⌄⌄⌄⌄")
 	response = rag_chain.invoke({"question": query, "context": context})
 	print(response)
 	print("⌃⌃⌃⌃ Generation ⌃⌃⌃⌃")
 	return response
--- a/app/streamlit_app.py
+++ b/app/streamlit_app.py
@@ -0,0 +1,9 @@
 import streamlit as st
 from app.rag_chain import get_rag_response
 def _show_answer(question):
 	"""Run the RAG pipeline for ``question`` and render its answer."""
 	# Compute the answer first, then emit the heading and the answer text.
 	answer = get_rag_response(question)
 	st.write("### Response:")
 	st.write(answer)


 # Page layout: a title and a single free-text question box.
 st.title("RAG System")
 query = st.text_input("Ask a question:")
 # Skip the pipeline while the input is empty (falsy).
 if query:
 	_show_answer(query)
--- a/data/verint-responsible-ethical-ai.pdf
+++ b/data/verint-responsible-ethical-ai.pdf
--- a/llm/init.py
+++ b/llm/init.py
--- a/llm/ollama.py
+++ b/llm/ollama.py
@@ -0,0 +1,7 @@
 from langchain_ollama import OllamaLLM
 def load_llm(model="llama3.2", base_url="http://localhost:11434", temperature=0):
    """Create the Ollama-backed LLM used for answer generation.

    Calling ``load_llm()`` with no arguments yields the same configuration
    as before; the parameters only make the previously hard-coded values
    overridable.

    Args:
        model: Model name forwarded to ``OllamaLLM``.
        base_url: Server URL forwarded to ``OllamaLLM``.
        temperature: Temperature forwarded to ``OllamaLLM``.

    Returns:
        A new ``OllamaLLM`` instance built from the given settings.
    """
    return OllamaLLM(
        model=model,
        base_url=base_url,
        temperature=temperature,
    )
--- a/loaders/init.py
+++ b/loaders/init.py
--- a/loaders/firecrawl.py
+++ b/loaders/firecrawl.py
@@ -0,0 +1,106 @@
 import warnings
 from typing import Iterator, Literal, Optional
 from langchain_core.document_loaders import BaseLoader
 from langchain_core.documents import Document
 from langchain_core.utils import get_from_env
 class FireCrawlLoader(BaseLoader):
    """Document loader backed by a ``FirecrawlApp`` client.

    The ``mode`` chosen at construction selects which client call
    ``lazy_load`` makes; each non-empty result becomes one ``Document``.
    """

    def __init__(
        self,
        url: str,
        *,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
        params: Optional[dict] = None,
    ):
        """Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified it is read from
                the env var FIRECRAWL_API_KEY.
            api_url: The Firecrawl API URL. Passed to ``FirecrawlApp``
                unchanged (including ``None``).
                NOTE(review): presumably the client then falls back to
                FIRECRAWL_API_URL / https://api.firecrawl.dev -- confirm
                against the installed firecrawl-py.
            mode: The mode to run the loader in. Default is "crawl".
                 Options include "scrape" (single url),
                 "crawl" (all accessible sub pages),
                 "map" (returns list of links that are semantically related).
                 "extract" (extracts structured data from a page).
                 "search" also passes validation here, but ``lazy_load``
                 always rejects it with a ``ValueError``.
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py

        Raises:
            ImportError: If the ``firecrawl`` package is not installed.
            ValueError: If ``mode`` is not recognised or ``url`` is empty.
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            ) from err
        if mode not in ("crawl", "scrape", "search", "map", "extract"):
            raise ValueError(
                f"""Invalid mode '{mode}'.
                Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
            )
        if not url:
            raise ValueError("Url must be provided")
        api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
        self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
        self.url = url
        self.mode = mode
        self.params = params or {}

    def lazy_load(self) -> Iterator[Document]:
        """Call the Firecrawl endpoint for ``self.mode`` and yield documents.

        For "scrape" and "crawl", ``self.params`` is expanded into keyword
        arguments; for "map" and "extract" it is passed as a single
        ``params=`` argument. Results with empty content are skipped.

        Yields:
            One ``Document`` per non-empty result. For "map"/"extract" the
            raw item is the page content and the metadata is empty; otherwise
            the content is the first non-empty of ``markdown``, ``html``,
            ``rawHtml`` and the metadata is the result's ``metadata``.

        Raises:
            ValueError: For "search" (unsupported), for an unknown mode, or
                when ``self.url`` is empty in crawl/map/extract mode.
        """
        if self.mode == "scrape":
            # A scrape returns one result; wrap it so the loop below is uniform.
            firecrawl_docs = [
                self.firecrawl.scrape_url(
                    self.url, **self.params
                )
            ]
        elif self.mode == "crawl":
            if not self.url:
                raise ValueError("URL is required for crawl mode")
            crawl_response = self.firecrawl.crawl_url(
                self.url, **self.params
            )
            firecrawl_docs = crawl_response.data or []
        elif self.mode == "map":
            if not self.url:
                raise ValueError("URL is required for map mode")
            firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
        elif self.mode == "extract":
            if not self.url:
                raise ValueError("URL is required for extract mode")
            # The whole extract response is stringified into a single item.
            firecrawl_docs = [
                str(self.firecrawl.extract([self.url], params=self.params))
            ]
        elif self.mode == "search":
            raise ValueError(
                "Search mode is not supported in this version, please downgrade."
            )
        else:
            raise ValueError(
                f"""Invalid mode '{self.mode}'.
                Allowed: 'crawl', 'scrape', 'map', 'extract'."""
            )
        # map/extract items are used as-is; scrape/crawl items are objects
        # whose content and metadata are read from attributes.
        raw_items = self.mode in ("map", "extract")
        for doc in firecrawl_docs:
            if raw_items:
                page_content = doc
                metadata = {}
            else:
                page_content = (
                    doc.markdown or doc.html or doc.rawHtml or ""
                )
                metadata = doc.metadata or {}
            if not page_content:
                continue
            yield Document(
                page_content=page_content,
                metadata=metadata,
            )
--- a/loaders/pdf_loader.py
+++ b/loaders/pdf_loader.py
@@ -0,0 +1,16 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import PyPDFLoader
 def load_pdf(file_path):
 	"""Read a PDF and split it into overlapping text chunks.

 	The file is loaded with ``PyPDFLoader`` and the result is split with a
 	``RecursiveCharacterTextSplitter`` (chunk_size=500, chunk_overlap=50).

 	Args:
 		file_path: Path of the PDF to load.

 	Returns:
 		A ``(documents, metadatas)`` tuple of two parallel lists: the text
 		of each chunk and that chunk's metadata.
 	"""
 	pages = PyPDFLoader(file_path).load()
 	print(f"Loaded {len(pages)} documents from {file_path}")
 	chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
 	chunks = chunker.split_documents(pages)
 	# Two parallel lists, index-aligned by chunk.
 	texts = [chunk.page_content for chunk in chunks]
 	metas = [chunk.metadata for chunk in chunks]
 	return (texts, metas)
--- a/loaders/web_loader.py
+++ b/loaders/web_loader.py
@@ -0,0 +1,37 @@
 from langchain_community.document_loaders import WebBaseLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from loaders.firecrawl import FireCrawlLoader
 def load_web_crawl(url, *, api_key="changeme", api_url="http://localhost:3002", params=None):
    """Crawl ``url`` through Firecrawl and split the pages into chunks.

    Calling ``load_web_crawl(url)`` keeps the previous behaviour; the
    keyword-only arguments only make the formerly hard-coded endpoint,
    key and crawl options overridable.

    Args:
        url: Start URL handed to ``FireCrawlLoader`` in "crawl" mode.
        api_key: Firecrawl API key. The default is a placeholder value.
        api_url: Firecrawl API URL. The default is a localhost address.
        params: Crawl options forwarded to the loader. When ``None``, the
            defaults are limit=100, include_paths=["/.*"],
            ignore_sitemap=True, poll_interval=5.

    Returns:
        A ``(documents, metadatas)`` tuple of two parallel lists: the text
        of each chunk and that chunk's metadata.
    """
    if params is None:
        # Built per call so no mutable default is shared between calls.
        params = {
            "limit": 100,
            "include_paths": ["/.*"],
            "ignore_sitemap": True,
            "poll_interval": 5,
        }
    loader = FireCrawlLoader(
        url=url, api_key=api_key, api_url=api_url, mode="crawl", params=params
    )
    docs = []
    # Iterate the generator directly so a dot is printed as each document is
    # yielded, rather than after load() has already built the full list.
    for doc in loader.lazy_load():
        print('.', end="", flush=True)
        docs.append(doc)
    print()
    # Token-based splitting: 250-token chunks with no overlap.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250, chunk_overlap=0
    )
    splits = text_splitter.split_documents(docs)
    documents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    return (documents, metadatas)
--- a/main.py
+++ b/main.py
@@ -0,0 +1,16 @@
 from loaders.pdf_loader import load_pdf
 from loaders.web_loader import load_web_crawl
 from vectordb.vector_store import add_documents
 def main(url="https://firecrawl.dev"):
 	"""Crawl a website, chunk it, and store the chunks in the vector store.

 	``load_pdf(path)`` returns the same ``(documents, metadatas)`` tuple as
 	``load_web_crawl``, so a PDF can be indexed by passing its result to
 	``add_documents`` instead.

 	Args:
 		url: Start URL of the site to crawl.
 	"""
 	print("[1/2] Splitting and processing documents...")
 	web_documents = load_web_crawl(url)
 	print("[2/2] Generating and storing embeddings...")
 	add_documents(web_documents)
 	print("Embeddings stored. You can now run the Streamlit app with:\n")
 	print("   streamlit run app/streamlit_app.py")


 if __name__ == "__main__":
 	main()
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,11 @@
 langchain
 langchain-community
 langchain-chroma
 chromadb
 pypdf
 streamlit
 ollama
 langchain_ollama
 bs4
 tiktoken
 firecrawl-py
--- a/shell.nix
+++ b/shell.nix
@@ -0,0 +1,14 @@
 # Development shell: a python3 interpreter bundled with the packages below.
 let
  pkgs = import <nixpkgs> {};
 in pkgs.mkShell {
  packages = [
    # NOTE(review): this list is narrower than requirements.txt, which also
    # names langchain-chroma, langchain_ollama, bs4, tiktoken and
    # firecrawl-py -- confirm whether those are meant to be provided here.
    (pkgs.python3.withPackages (python-pkgs: [
      python-pkgs.langchain
      python-pkgs.langchain-community
      python-pkgs.chromadb
      python-pkgs.pypdf
      python-pkgs.streamlit
      python-pkgs.ollama
    ]))
  ];
 }
--- a/vectordb/init.py
+++ b/vectordb/init.py
--- a/vectordb/vector_store.py
+++ b/vectordb/vector_store.py
@@ -0,0 +1,53 @@
 from typing import Tuple
 import chromadb
 from langchain_chroma import Chroma
 from uuid import uuid4
 # from chromadb.utils.embedding_functions.ollama_embedding_function import (
 #     OllamaEmbeddingFunction,
 # )
 from langchain_ollama import OllamaEmbeddings
 from chromadb.api.types import (Metadata,Document,OneOrMany)
 # Define a custom embedding function for ChromaDB using Ollama
 class ChromaDBEmbeddingFunction:
    """Callable adapter that lets ChromaDB use a LangChain embeddings object.

    Calling an instance embeds either one string or a list of strings by
    delegating to the wrapped object's ``embed_documents`` method.
    """

    def __init__(self, langchain_embeddings):
        # Any object exposing ``embed_documents(list_of_texts)`` will do.
        self.langchain_embeddings = langchain_embeddings

    def __call__(self, input):
        # A bare string is treated as a one-item batch.
        batch = [input] if isinstance(input, str) else input
        return self.langchain_embeddings.embed_documents(batch)
 # Module-level setup, executed once at import time: build the embedding
 # adapter, open the Chroma client, and get (or create) the collection that
 # add_documents() and retrieve() operate on.
 # Initialize the embedding function with Ollama embeddings
 embedding = ChromaDBEmbeddingFunction(
    OllamaEmbeddings(
        model="nomic-embed-text",
        base_url="http://localhost:11434"  # Adjust the base URL as per your Ollama server configuration
    )
 )
 # No path is given, so the client uses chromadb's default storage location.
 # NOTE(review): the .gitignore entries suggest ./chroma -- confirm.
 persistent_client = chromadb.PersistentClient()
 collection = persistent_client.get_or_create_collection(
    name="collection_name",
    metadata={"description": "A collection for RAG with Ollama - Demo1"},
    embedding_function=embedding  # Use the custom embedding function
 )
 def add_documents(documents: Tuple[OneOrMany[Document], OneOrMany[Metadata]]):
    """Store documents and their metadata in the module-level collection.

    Each document gets a freshly generated UUID4 string as its id.

    Args:
        documents: A ``(docs, metas)`` pair. Each side may be a single item
            or a list; a bare string / bare dict is wrapped into a one-item
            list. An empty ``docs`` is reported on stdout and skipped.
    """
    docs, metas = documents
    # The annotation allows a single item; without wrapping, len() of a bare
    # string would count characters and generate one id per character.
    if isinstance(docs, str):
        docs = [docs]
    if isinstance(metas, dict):
        metas = [metas]
    if not docs:
        # Nothing to embed. NOTE(review): Chroma is expected to reject an
        # empty batch -- confirm for the installed version.
        print("No documents to add; skipping.")
        return
    uuids = [str(uuid4()) for _ in docs]
    collection.add(documents=docs, ids=uuids, metadatas=metas)
 def retrieve(query_text, n_results=1):
    """Query the module-level collection for texts similar to ``query_text``.

    Args:
        query_text: The text to search for; sent as a single-item query list.
        n_results: Number of results requested from the collection.

    Returns:
        A ``(documents, metadatas)`` tuple taken from the query result's
        "documents" and "metadatas" entries.
    """
    hits = collection.query(query_texts=[query_text], n_results=n_results)
    matched_texts = hits["documents"]
    matched_metadata = hits["metadatas"]
    return matched_texts, matched_metadata