initial commit

2025-05-01 12:21:47 -05:00
parent 2b9c4289e7
commit 226b51a6a1
18 changed files with 13479 additions and 0 deletions

loaders/__init__.py Normal file (empty)

loaders/firecrawl.py Normal file

@@ -0,0 +1,106 @@
from typing import Iterator, Literal, Optional

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_env


class FireCrawlLoader(BaseLoader):
    def __init__(
        self,
        url: str,
        *,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
        params: Optional[dict] = None,
    ):
        """Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified, it will be read
                from the env var FIRECRAWL_API_KEY.
            api_url: The Firecrawl API URL. If not specified, it will be read
                from the env var FIRECRAWL_API_URL or defaults to
                https://api.firecrawl.dev.
            mode: The mode to run the loader in. Default is "crawl".
                Options include "scrape" (single url),
                "crawl" (all accessible sub pages),
                "map" (returns a list of links that are semantically related),
                and "extract" (extracts structured data from a page).
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError:
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            )
        # "search" is accepted here only so it reaches the dedicated error
        # message in lazy_load() instead of the generic one below.
        if mode not in ("crawl", "scrape", "search", "map", "extract"):
            raise ValueError(
                f"Invalid mode '{mode}'. "
                "Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."
            )
        if not url:
            raise ValueError("Url must be provided")
        api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
        self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
        self.url = url
        self.mode = mode
        self.params = params or {}

    def lazy_load(self) -> Iterator[Document]:
        # self.url is validated in __init__, so no per-mode URL checks needed.
        if self.mode == "scrape":
            firecrawl_docs = [self.firecrawl.scrape_url(self.url, **self.params)]
        elif self.mode == "crawl":
            crawl_response = self.firecrawl.crawl_url(self.url, **self.params)
            firecrawl_docs = crawl_response.data or []
        elif self.mode == "map":
            firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
        elif self.mode == "extract":
            firecrawl_docs = [
                str(self.firecrawl.extract([self.url], params=self.params))
            ]
        elif self.mode == "search":
            raise ValueError(
                "Search mode is not supported in this version, please downgrade."
            )
        else:
            raise ValueError(
                f"Invalid mode '{self.mode}'. "
                "Allowed: 'crawl', 'scrape', 'map', 'extract'."
            )
        for doc in firecrawl_docs:
            if self.mode in ("map", "extract"):
                # map yields plain links and extract yields a serialized
                # result, so the raw value becomes the page content.
                page_content = doc
                metadata = {}
            else:
                page_content = doc.markdown or doc.html or doc.rawHtml or ""
                metadata = doc.metadata or {}
            if not page_content:
                continue
            yield Document(page_content=page_content, metadata=metadata)
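
A minimal usage sketch for the loader above, assuming a Firecrawl account with FIRECRAWL_API_KEY set in the environment; the target URL is a placeholder, not repo config:

from loaders.firecrawl import FireCrawlLoader

# Scrape a single page; the URL is hypothetical and the API key is read
# from the FIRECRAWL_API_KEY env var by the loader itself.
loader = FireCrawlLoader(url="https://example.com", mode="scrape")
for doc in loader.lazy_load():
    print(doc.metadata, doc.page_content[:80])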

loaders/pdf_loader.py Normal file

@@ -0,0 +1,16 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


def load_pdf(file_path):
    """Load a PDF, split it into chunks, and return (texts, metadatas)."""
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    print(f"Loaded {len(pages)} documents from {file_path}")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splits = splitter.split_documents(pages)
    documents = []
    metadatas = []
    for split in splits:
        documents.append(split.page_content)
        metadatas.append(split.metadata)
    return (documents, metadatas)
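
A quick usage sketch for load_pdf; the file path is a placeholder:

from loaders.pdf_loader import load_pdf

# Split a local PDF into ~500-character chunks; the path is hypothetical.
documents, metadatas = load_pdf("docs/example.pdf")
print(f"{len(documents)} chunks")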

loaders/web_loader.py Normal file

@@ -0,0 +1,37 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.document_loaders import WebBaseLoader  # only used by the commented-out alternative below
from loaders.firecrawl import FireCrawlLoader


def load_web_crawl(url):
    """Crawl a site with Firecrawl, split the pages, and return (texts, metadatas)."""
    documents = []
    metadatas = []
    loader = FireCrawlLoader(
        url=url,
        api_key="changeme",
        api_url="http://localhost:3002",  # self-hosted Firecrawl instance
        mode="crawl",
        params={
            "limit": 100,
            "include_paths": ["/.*"],
            "ignore_sitemap": True,
            "poll_interval": 5,
        },
    )
    docs = []
    for doc in loader.lazy_load():
        print(".", end="")  # one dot per crawled page
        docs.append(doc)
    print()
    # Alternative: load URLs directly instead of crawling.
    # docs = [WebBaseLoader(url).load() for url in urls]
    # docs_list = [item for sublist in docs for item in sublist]
    # Split the documents into chunks sized by tiktoken token counts.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250, chunk_overlap=0
    )
    splits = text_splitter.split_documents(docs)
    for split in splits:
        documents.append(split.page_content)
        metadatas.append(split.metadata)
    return (documents, metadatas)
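
A usage sketch for load_web_crawl, assuming a self-hosted Firecrawl instance is running at http://localhost:3002 (matching the hard-coded api_url above) and that tiktoken is installed for the token-based splitter; the crawl target is a placeholder:

from loaders.web_loader import load_web_crawl

# Crawl up to 100 pages and split them into 250-token chunks.
documents, metadatas = load_web_crawl("https://example.com")
print(f"{len(documents)} chunks from crawl")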