import warnings
from typing import Iterator, Literal, Optional

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_env


class FireCrawlLoader(BaseLoader):
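    """Load web pages as Documents using FireCrawl.

    Requires the firecrawl-py package and a Firecrawl API key, supplied either
    via the api_key argument or the FIRECRAWL_API_KEY environment variable.

    Example (illustrative; assumes FIRECRAWL_API_KEY is set):

        loader = FireCrawlLoader(url="https://firecrawl.dev", mode="scrape")
        docs = loader.load()
    """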

    def __init__(
        self,
        url: str,
        *,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
        params: Optional[dict] = None,
    ):
        """Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified, it will be read from
                the env var FIRECRAWL_API_KEY. Get an API key at
                https://www.firecrawl.dev.
            api_url: The Firecrawl API URL. If not specified, it will be read from
                the env var FIRECRAWL_API_URL or default to https://api.firecrawl.dev.
            mode: The mode to run the loader in. Default is "crawl".
                Options include "scrape" (single url),
                "crawl" (all accessible sub pages),
                "map" (returns a list of semantically related links), and
                "extract" (extracts structured data from a page).
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError:
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            )
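        # "search" passes this check (presumably kept for backwards compatibility),
        # but lazy_load() rejects it with a dedicated error message.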
        if mode not in ("crawl", "scrape", "search", "map", "extract"):
            raise ValueError(
                f"""Invalid mode '{mode}'.
                Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
            )

        if not url:
            raise ValueError("Url must be provided")
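
        # Fall back to the FIRECRAWL_API_KEY environment variable when no explicit
        # key is passed.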
        api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
        self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
        self.url = url
        self.mode = mode
        self.params = params or {}

    def lazy_load(self) -> Iterator[Document]:
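        # "scrape": fetch a single URL; wrap the response in a list so the loop
        # below can treat every mode uniformly.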
        if self.mode == "scrape":
            firecrawl_docs = [self.firecrawl.scrape_url(self.url, **self.params)]
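        # "crawl": crawl the URL and its accessible sub pages; the scraped pages
        # are carried in the response's `data` attribute.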
        elif self.mode == "crawl":
            if not self.url:
                raise ValueError("URL is required for crawl mode")
            crawl_response = self.firecrawl.crawl_url(self.url, **self.params)
            firecrawl_docs = crawl_response.data or []
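        # "map": return the list of links discovered for the URL.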
        elif self.mode == "map":
            if not self.url:
                raise ValueError("URL is required for map mode")
            firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
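        # "extract": extract structured data from the URL and stringify the result
        # so it can be yielded as a single document.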
        elif self.mode == "extract":
            if not self.url:
                raise ValueError("URL is required for extract mode")
            firecrawl_docs = [
                str(self.firecrawl.extract([self.url], params=self.params))
            ]
        elif self.mode == "search":
            raise ValueError(
                "Search mode is not supported in this version, please downgrade."
            )
        else:
            raise ValueError(
                f"""Invalid mode '{self.mode}'.
                Allowed: 'crawl', 'scrape', 'map', 'extract'."""
            )
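
        # "map" and "extract" items are used directly as page content; the other
        # modes return response objects whose markdown/html payload becomes the
        # page content.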
        for doc in firecrawl_docs:
            if self.mode == "map" or self.mode == "extract":
                page_content = doc
                metadata = {}
            else:
                page_content = doc.markdown or doc.html or doc.rawHtml or ""
                metadata = doc.metadata or {}
            if not page_content:
                continue
            yield Document(
                page_content=page_content,
                metadata=metadata,
            )