initial commit
loaders/__init__.py (new file, 0 lines)

loaders/firecrawl.py (new file, 106 lines)
@@ -0,0 +1,106 @@
from typing import Iterator, Literal, Optional

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_env


class FireCrawlLoader(BaseLoader):
    """Load web pages as Documents using the Firecrawl API."""

    def __init__(
        self,
        url: str,
        *,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
        params: Optional[dict] = None,
    ):
        """Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified, it will be read
                from the env var FIRECRAWL_API_KEY.
            api_url: The Firecrawl API URL. If not specified, it will be read
                from the env var FIRECRAWL_API_URL or defaults to
                https://api.firecrawl.dev.
            mode: The mode to run the loader in. Default is "crawl".
                Options include "scrape" (single url),
                "crawl" (all accessible sub pages),
                "map" (returns a list of semantically related links), and
                "extract" (extracts structured data from a page).
            params: The parameters to pass to the Firecrawl API, e.g. crawler
                options. For more details, visit:
                https://github.com/mendableai/firecrawl-py
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError:
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            )
        if mode not in ("crawl", "scrape", "search", "map", "extract"):
            raise ValueError(
                f"Invalid mode '{mode}'. "
                "Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."
            )
        if not url:
            raise ValueError("Url must be provided")

        api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
        self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
        self.url = url
        self.mode = mode
        self.params = params or {}

    def lazy_load(self) -> Iterator[Document]:
        if self.mode == "scrape":
            firecrawl_docs = [self.firecrawl.scrape_url(self.url, **self.params)]
        elif self.mode == "crawl":
            if not self.url:
                raise ValueError("URL is required for crawl mode")
            crawl_response = self.firecrawl.crawl_url(self.url, **self.params)
            firecrawl_docs = crawl_response.data or []
        elif self.mode == "map":
            if not self.url:
                raise ValueError("URL is required for map mode")
            firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
        elif self.mode == "extract":
            if not self.url:
                raise ValueError("URL is required for extract mode")
            firecrawl_docs = [
                str(self.firecrawl.extract([self.url], params=self.params))
            ]
        elif self.mode == "search":
            raise ValueError(
                "Search mode is not supported in this version, please downgrade."
            )
        else:
            raise ValueError(
                f"Invalid mode '{self.mode}'. "
                "Allowed: 'crawl', 'scrape', 'map', 'extract'."
            )
        for doc in firecrawl_docs:
            if self.mode in ("map", "extract"):
                # map/extract return plain strings rather than document objects
                page_content = doc
                metadata = {}
            else:
                page_content = doc.markdown or doc.html or doc.rawHtml or ""
                metadata = doc.metadata or {}
            if not page_content:
                continue
            yield Document(page_content=page_content, metadata=metadata)
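
For context, a minimal usage sketch of the loader above; the URL, key, and mode here are illustrative placeholders, not values from this commit:

# Hypothetical usage of FireCrawlLoader (all values are placeholders).
from loaders.firecrawl import FireCrawlLoader

loader = FireCrawlLoader(
    "https://example.com",        # placeholder URL
    api_key="fc-...",             # or set the FIRECRAWL_API_KEY env var
    mode="scrape",                # fetch just this page; "crawl" walks sub pages
)
for doc in loader.lazy_load():    # Documents are yielded one at a time
    print(len(doc.page_content), doc.metadata)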
loaders/pdf_loader.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


def load_pdf(file_path):
    """Load a PDF, split it into chunks, and return (texts, metadatas)."""
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    print(f"Loaded {len(pages)} documents from {file_path}")

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splits = splitter.split_documents(pages)

    documents = []
    metadatas = []
    for split in splits:
        documents.append(split.page_content)
        metadatas.append(split.metadata)

    return (documents, metadatas)
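
A short, hedged example of calling load_pdf; the path is a placeholder:

# Hypothetical usage of load_pdf (the path is a placeholder).
from loaders.pdf_loader import load_pdf

documents, metadatas = load_pdf("reports/example.pdf")
print(f"{len(documents)} chunks; first chunk starts: {documents[0][:80]!r}")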
loaders/web_loader.py (new file, 37 lines)
@@ -0,0 +1,37 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from loaders.firecrawl import FireCrawlLoader


def load_web_crawl(url):
    """Crawl a site with Firecrawl, split the pages, and return (texts, metadatas)."""
    documents = []
    metadatas = []

    loader = FireCrawlLoader(
        url=url,
        api_key="changeme",
        api_url="http://localhost:3002",
        mode="crawl",
        params={
            "limit": 100,
            "include_paths": ["/.*"],
            "ignore_sitemap": True,
            "poll_interval": 5,
        },
    )
    docs = []
    for doc in loader.lazy_load():  # stream documents as the crawl progresses
        print(".", end="")
        docs.append(doc)
    print()

    # Alternative: load each URL directly with WebBaseLoader instead of crawling:
    #   docs = [WebBaseLoader(u).load() for u in urls]
    #   docs_list = [item for sublist in docs for item in sublist]

    # Split the documents into chunks
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250, chunk_overlap=0
    )
    splits = text_splitter.split_documents(docs)

    for split in splits:
        documents.append(split.page_content)
        metadatas.append(split.metadata)

    return (documents, metadatas)
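
Finally, a sketch of calling load_web_crawl, assuming a self-hosted Firecrawl instance is reachable at the hard-coded http://localhost:3002 above; the target URL is a placeholder:

# Hypothetical usage of load_web_crawl; assumes the local Firecrawl
# instance at http://localhost:3002 hard-coded in the function.
from loaders.web_loader import load_web_crawl

documents, metadatas = load_web_crawl("https://example.com")
print(f"crawled and split into {len(documents)} chunks")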