initial commit
This commit is contained in:
parent
2b9c4289e7
commit
226b51a6a1
5
.gitignore
vendored
5
.gitignore
vendored
@ -168,3 +168,8 @@ cython_debug/
|
|||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
chroma/chroma.sqlite3
|
||||||
|
chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/data_level0.bin
|
||||||
|
chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/header.bin
|
||||||
|
chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/length.bin
|
||||||
|
chroma/c1d09313-7362-4ef4-b0b1-6b53736ea827/link_lists.bin
|
||||||
|
|||||||
26
.vscode/launch.json
vendored
Normal file
26
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
// Use IntelliSense to learn about possible attributes.
|
||||||
|
// Hover to view descriptions of existing attributes.
|
||||||
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "Python:Streamlit",
|
||||||
|
"type": "debugpy",
|
||||||
|
"request": "launch",
|
||||||
|
"module": "streamlit",
|
||||||
|
"args": [
|
||||||
|
"run",
|
||||||
|
"app/streamlit_app.py",
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Python Debugger: main.py",
|
||||||
|
"type": "debugpy",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "main.py",
|
||||||
|
"console": "integratedTerminal",
|
||||||
|
"justMyCode": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"nixEnvSelector.nixFile": "${workspaceFolder}/shell.nix"
|
||||||
|
}
|
||||||
0
app/__init__.py
Normal file
0
app/__init__.py
Normal file
39
app/rag_chain.py
Normal file
39
app/rag_chain.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
from llm.ollama import load_llm
|
||||||
|
from vectordb.vector_store import retrieve
|
||||||
|
from langchain.prompts import PromptTemplate
|
||||||
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
|
|
||||||
|
# Define the prompt template for the LLM
|
||||||
|
# The template's placeholders are {question} and {context}, so those are the
# names declared in input_variables. The previous list named "documents",
# which the template never references and get_rag_response never supplies.
prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise:
Question: {question}
Context: {context}
Answer:
""",
    input_variables=["question", "context"],
)
|
||||||
|
|
||||||
|
def get_rag_response(query):
    """Answer ``query`` from retrieved context using the local LLM.

    Asks ``retrieve`` for up to 10 stored chunks, joins them into one context
    string, runs ``prompt | llm | StrOutputParser()`` with the question and
    that context, and returns the chain's output. Progress banners, the
    retrieved metadata and the final response are printed to stdout.

    Args:
        query: The user's question.

    Returns:
        The chain output for this question.
    """
    print("⌄⌄⌄⌄ Retrieving ⌄⌄⌄⌄")
    retrieved_docs, metadata = retrieve(query, 10)
    # Only the first entry of each result is used (a single query text is
    # sent). Unwrap it once, tolerating an empty/None result, so the count,
    # the metadata loop and the fallback below all look at the same list.
    docs = retrieved_docs[0] if retrieved_docs else []
    metas = metadata[0] if metadata else []
    print("Query Found %d documents." % len(docs))
    for meta in metas:
        print("Metadata: ", meta)
    print("⌃⌃⌃⌃ Retrieving ⌃⌃⌃⌃ " )

    print("⌄⌄⌄⌄ Augmented Prompt ⌄⌄⌄⌄")
    llm = load_llm()
    # Create a chain combining the prompt template and LLM
    rag_chain = prompt | llm | StrOutputParser()
    # The fallback text must depend on the unwrapped list: testing the outer
    # container would treat "one query, zero hits" as having context.
    context = " ".join(docs) if docs else "No relevant documents found."
    print("⌃⌃⌃⌃ Augmented Prompt ⌃⌃⌃⌃")

    print("⌄⌄⌄⌄ Generation ⌄⌄⌄⌄")
    response = rag_chain.invoke({"question": query, "context": context})
    print(response)
    print("⌃⌃⌃⌃ Generation ⌃⌃⌃⌃")

    return response
|
||||||
9
app/streamlit_app.py
Normal file
9
app/streamlit_app.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
import streamlit as st
|
||||||
|
from app.rag_chain import get_rag_response
|
||||||
|
|
||||||
|
# Page header for the RAG demo.
st.title("RAG System")

# Free-text question box; its value is falsy until something is entered.
user_question = st.text_input("Ask a question:")

if user_question:
    # Run retrieval + generation first, then render the heading and answer.
    answer = get_rag_response(user_question)
    st.write("### Response:")
    st.write(answer)
|
||||||
13137
data/verint-responsible-ethical-ai.pdf
Normal file
13137
data/verint-responsible-ethical-ai.pdf
Normal file
File diff suppressed because one or more lines are too long
0
llm/__init__.py
Normal file
0
llm/__init__.py
Normal file
7
llm/ollama.py
Normal file
7
llm/ollama.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from langchain_ollama import OllamaLLM
|
||||||
|
|
||||||
|
def load_llm(
    model: str = "llama3.2",
    base_url: str = "http://localhost:11434",
    temperature: float = 0,
):
    """Build the Ollama LLM client used for generation.

    Calling ``load_llm()`` with no arguments produces the same configuration
    as before; the parameters exist so another model or server can be used
    without editing this module.

    Args:
        model: Name of the Ollama model to use.
        base_url: Base URL of the Ollama server.
        temperature: Sampling temperature passed to the model.

    Returns:
        A configured ``OllamaLLM`` instance.
    """
    return OllamaLLM(
        model=model,
        base_url=base_url,
        temperature=temperature,
    )
|
||||||
0
loaders/__init__.py
Normal file
0
loaders/__init__.py
Normal file
106
loaders/firecrawl.py
Normal file
106
loaders/firecrawl.py
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
import warnings
|
||||||
|
from typing import Iterator, Literal, Optional
|
||||||
|
|
||||||
|
from langchain_core.document_loaders import BaseLoader
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.utils import get_from_env
|
||||||
|
|
||||||
|
|
||||||
|
class FireCrawlLoader(BaseLoader):
    """Document loader that fetches content through a Firecrawl server.

    Depending on ``mode`` it scrapes a single URL, crawls a site, maps its
    links or runs an extraction, and yields the results as ``Document``
    objects.
    """

    def __init__(
        self,
        url: str,
        *,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
        params: Optional[dict] = None,
    ):
        """Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified it is read from
                the env var FIRECRAWL_API_KEY.
            api_url: The Firecrawl API URL, handed to ``FirecrawlApp`` as
                given (``None`` when omitted).
                NOTE(review): earlier documentation said this falls back to
                env var FIRECRAWL_API_URL or https://api.firecrawl.dev; this
                class does not do that itself, so any fallback presumably
                happens inside ``FirecrawlApp`` - confirm.
            mode: The mode to run the loader in. Default is "crawl".
                Options include "scrape" (single url),
                "crawl" (all accessible sub pages),
                "map" (returns list of links that are semantically related),
                "extract" (extracts structured data from a page).
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py

        Raises:
            ImportError: If the ``firecrawl`` package is not installed.
            ValueError: If ``mode`` is not recognised or ``url`` is empty.
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError as err:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            ) from err

        # "search" is accepted here; lazy_load rejects it with its own,
        # more specific message.
        if mode not in ("crawl", "scrape", "search", "map", "extract"):
            raise ValueError(
                f"Invalid mode '{mode}'.\n"
                "Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."
            )

        if not url:
            raise ValueError("Url must be provided")

        api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
        self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
        self.url = url
        self.mode = mode
        self.params = params or {}

    def lazy_load(self) -> Iterator[Document]:
        """Run the configured Firecrawl operation and yield its documents.

        Entries whose content is empty are skipped.

        Yields:
            One ``Document`` per non-empty result. In "map" and "extract"
            mode the result itself is the page content and the metadata is
            empty; otherwise content is taken from the result's ``markdown``,
            ``html`` or ``rawHtml`` attribute (first non-empty) and metadata
            from its ``metadata`` attribute.

        Raises:
            ValueError: If the URL is empty for crawl/map/extract, if the
                mode is "search", or if the mode is unknown.
        """
        # NOTE(review): scrape/crawl spread params as keyword arguments while
        # map/extract pass them as a single ``params`` argument; verify both
        # call shapes against the installed firecrawl-py version.
        if self.mode == "scrape":
            firecrawl_docs = [self.firecrawl.scrape_url(self.url, **self.params)]
        elif self.mode == "crawl":
            if not self.url:
                raise ValueError("URL is required for crawl mode")
            crawl_response = self.firecrawl.crawl_url(self.url, **self.params)
            firecrawl_docs = crawl_response.data or []
        elif self.mode == "map":
            if not self.url:
                raise ValueError("URL is required for map mode")
            firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
        elif self.mode == "extract":
            if not self.url:
                raise ValueError("URL is required for extract mode")
            # The extraction result is stringified and treated as one document.
            firecrawl_docs = [
                str(self.firecrawl.extract([self.url], params=self.params))
            ]
        elif self.mode == "search":
            raise ValueError(
                "Search mode is not supported in this version, please downgrade."
            )
        else:
            raise ValueError(
                f"Invalid mode '{self.mode}'.\n"
                "Allowed: 'crawl', 'scrape', 'map', 'extract'."
            )

        for doc in firecrawl_docs:
            if self.mode in ("map", "extract"):
                page_content = doc
                metadata = {}
            else:
                page_content = doc.markdown or doc.html or doc.rawHtml or ""
                metadata = doc.metadata or {}
            if not page_content:
                continue
            yield Document(
                page_content=page_content,
                metadata=metadata,
            )
|
||||||
16
loaders/pdf_loader.py
Normal file
16
loaders/pdf_loader.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
|
from langchain_community.document_loaders import PyPDFLoader
|
||||||
|
def load_pdf(file_path, chunk_size=500, chunk_overlap=50):
    """Load a PDF and split it into text chunks with their metadata.

    Args:
        file_path: Path of the PDF file to load.
        chunk_size: Chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the
            text splitter.

    Returns:
        A ``(documents, metadatas)`` tuple of two parallel lists: the text of
        each chunk and the metadata of the same chunk.
    """
    pages = PyPDFLoader(file_path).load()
    print(f"Loaded {len(pages)} documents from {file_path}")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    splits = splitter.split_documents(pages)

    # Two parallel lists, one entry per chunk.
    documents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]

    return (documents, metadatas)
|
||||||
37
loaders/web_loader.py
Normal file
37
loaders/web_loader.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
from langchain_community.document_loaders import WebBaseLoader
|
||||||
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
|
from loaders.firecrawl import FireCrawlLoader
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def load_web_crawl(url, *, api_key="changeme", api_url="http://localhost:3002"):
    """Crawl a site through Firecrawl and split the pages into chunks.

    Args:
        url: Start URL of the crawl.
        api_key: API key given to the Firecrawl loader. The default is the
            placeholder value this module has always used.
        api_url: Base URL of the Firecrawl server.

    Returns:
        A ``(documents, metadatas)`` tuple of two parallel lists: the text of
        each chunk and the metadata of the same chunk.
    """
    loader = FireCrawlLoader(
        url=url,
        api_key=api_key,
        api_url=api_url,
        mode="crawl",
        params={
            "limit": 100,
            "include_paths": ["/.*"],
            "ignore_sitemap": True,
            "poll_interval": 5,
        },
    )

    # Consume the loader's generator directly so one progress dot is printed
    # per document as it is yielded, without first building a throwaway list.
    docs = []
    for doc in loader.lazy_load():
        print('.', end="")
        docs.append(doc)
    print()

    # Initialize a text splitter with specified chunk size and overlap
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250, chunk_overlap=0
    )
    # Split the documents into chunks
    splits = text_splitter.split_documents(docs)

    documents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]

    return (documents, metadatas)
|
||||||
16
main.py
Normal file
16
main.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from loaders.pdf_loader import load_pdf
|
||||||
|
from loaders.web_loader import load_web_crawl
|
||||||
|
from vectordb.vector_store import add_documents
|
||||||
|
|
||||||
|
def main():
    """Crawl the demo site and store its chunks in the vector store.

    Prints a two-step progress log and, at the end, the command for starting
    the Streamlit app.
    """
    print("[1/2] Splitting and processing documents...")
    # To index the bundled PDF instead, build the input with
    # load_pdf("data/verint-responsible-ethical-ai.pdf") and hand that to
    # add_documents in the same way.
    crawled = load_web_crawl("https://firecrawl.dev")

    print("[2/2] Generating and storing embeddings...")
    add_documents(crawled)

    print("Embeddings stored. You can now run the Streamlit app with:\n")
    print(" streamlit run app/streamlit_app.py")


if __name__ == "__main__":
    main()
|
||||||
11
requirements.txt
Normal file
11
requirements.txt
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
langchain
|
||||||
|
langchain-community
|
||||||
|
langchain-chroma
|
||||||
|
chromadb
|
||||||
|
pypdf
|
||||||
|
streamlit
|
||||||
|
ollama
|
||||||
|
langchain_ollama
|
||||||
|
bs4
|
||||||
|
tiktoken
|
||||||
|
firecrawl-py
|
||||||
14
shell.nix
Normal file
14
shell.nix
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
let
|
||||||
|
pkgs = import <nixpkgs> {};
|
||||||
|
in pkgs.mkShell {
|
||||||
|
packages = [
|
||||||
|
(pkgs.python3.withPackages (python-pkgs: [
|
||||||
|
python-pkgs.langchain
|
||||||
|
python-pkgs.langchain-community
|
||||||
|
python-pkgs.chromadb
|
||||||
|
python-pkgs.pypdf
|
||||||
|
python-pkgs.streamlit
|
||||||
|
python-pkgs.ollama
|
||||||
|
]))
|
||||||
|
];
|
||||||
|
}
|
||||||
0
vectordb/__init__.py
Normal file
0
vectordb/__init__.py
Normal file
53
vectordb/vector_store.py
Normal file
53
vectordb/vector_store.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
from typing import Tuple
|
||||||
|
import chromadb
|
||||||
|
from langchain_chroma import Chroma
|
||||||
|
from uuid import uuid4
|
||||||
|
# from chromadb.utils.embedding_functions.ollama_embedding_function import (
|
||||||
|
# OllamaEmbeddingFunction,
|
||||||
|
# )
|
||||||
|
from langchain_ollama import OllamaEmbeddings
|
||||||
|
from chromadb.api.types import (Metadata,Document,OneOrMany)
|
||||||
|
|
||||||
|
|
||||||
|
# Define a custom embedding function for ChromaDB using Ollama
|
||||||
|
class ChromaDBEmbeddingFunction:
    """Callable adapter around an object exposing ``embed_documents``.

    Calling an instance with a string or a list of strings returns whatever
    the wrapped object's ``embed_documents`` returns for the list form.
    """

    def __init__(self, langchain_embeddings):
        # Wrapped embeddings object; only its embed_documents method is used.
        self.langchain_embeddings = langchain_embeddings

    def __call__(self, input):
        # A bare string is promoted to a one-element list; anything else is
        # forwarded untouched.
        texts = [input] if isinstance(input, str) else input
        return self.langchain_embeddings.embed_documents(texts)
|
||||||
|
|
||||||
|
# Initialize the embedding function with Ollama embeddings
|
||||||
|
# Ollama-served embedding model. Adjust the base URL as per your Ollama
# server configuration.
_ollama_embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434",
)

# Adapter that lets the collection call the embeddings object above.
embedding = ChromaDBEmbeddingFunction(_ollama_embeddings)

# Persistent Chroma client created with its default settings.
persistent_client = chromadb.PersistentClient()

# Shared collection used by add_documents and retrieve; it is created on
# first use and embeds text with the custom embedding function.
collection = persistent_client.get_or_create_collection(
    name="collection_name",
    metadata={"description": "A collection for RAG with Ollama - Demo1"},
    embedding_function=embedding,
)
|
||||||
|
|
||||||
|
def add_documents(documents: Tuple[OneOrMany[Document], OneOrMany[Metadata]]):
    """Store documents and their metadata in the shared collection.

    Each document gets a freshly generated UUID4 string as its id.

    Args:
        documents: A ``(docs, metas)`` pair. Each side may be a single item
            or a list; single items are wrapped into one-element lists.
            Nothing is stored when ``docs`` is empty.
    """
    docs, metas = documents

    # OneOrMany permits a bare item. Without wrapping, len() of a single
    # string would count characters and generate one id per character.
    if isinstance(docs, str):
        docs = [docs]
    # assumes a single metadata entry arrives as a dict - TODO confirm
    if isinstance(metas, dict):
        metas = [metas]

    if not docs:
        # Nothing to embed (e.g. a crawl that returned no pages); skip the
        # store call rather than handing it empty lists.
        print("No documents to store; skipping.")
        return

    uuids = [str(uuid4()) for _ in docs]
    collection.add(documents=docs, ids=uuids, metadatas=metas)
|
||||||
|
|
||||||
|
def retrieve(query_text, n_results=1):
    """Query the shared collection for entries matching ``query_text``.

    Args:
        query_text: Text to search for; sent as a single-element query list.
        n_results: Number of results requested from the collection.

    Returns:
        A ``(documents, metadatas)`` tuple taken from the "documents" and
        "metadatas" entries of the query result.
    """
    hits = collection.query(query_texts=[query_text], n_results=n_results)
    documents = hits["documents"]
    metadatas = hits["metadatas"]
    return documents, metadatas
|
||||||
Loading…
x
Reference in New Issue
Block a user