rag-system/rag_system/loaders/web_loader.py

import json
import logging
import os

from dotenv import load_dotenv
from firecrawl import ScrapeOptions
from langchain.text_splitter import RecursiveCharacterTextSplitter

from rag_system.loaders.firecrawl import FireCrawlLoader

logger: logging.Logger = logging.getLogger("web_loader")

load_dotenv()  # take environment variables from .env
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
firecrawl_api_url = os.getenv("FIRECRAWL_API_URL")
firecrawl_mode = os.getenv("FIRECRAWL_MODE")

# FIRECRAWL_PARAMS is an optional JSON object; its "scrape_options" entry,
# when present, is converted into a firecrawl ScrapeOptions instance.
firecrawl_params = os.getenv("FIRECRAWL_PARAMS")
if firecrawl_params:
    firecrawl_params = json.loads(firecrawl_params)
    # Use .get() so a params object without "scrape_options" does not raise KeyError
    if firecrawl_params.get("scrape_options"):
        firecrawl_params["scrape_options"] = ScrapeOptions(
            **firecrawl_params["scrape_options"]
        )
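
# Illustrative .env values for the variables read above. The variable names
# come from this module; the example values are assumptions, and the exact
# ScrapeOptions fields depend on the installed firecrawl SDK version:
#
#   FIRECRAWL_API_KEY=fc-...
#   FIRECRAWL_API_URL=https://api.firecrawl.dev
#   FIRECRAWL_MODE=crawl
#   FIRECRAWL_PARAMS={"limit": 10, "scrape_options": {"formats": ["markdown"]}}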
logger.info(f"web_loader firecrawl_params: {firecrawl_params}")
logger.info(f"web_loader firecrawl_api_url: {firecrawl_api_url}")
logger.info(f"web_loader firecrawl_mode: {firecrawl_mode}")
logger.info(f"web_loader firecrawl_params: {firecrawl_params}")
def load_web_crawl(url: str):
    """Crawl `url` with FireCrawl and return token-sized document chunks."""
    logger.info(f"load_web_crawl url: {url}")
    loader = FireCrawlLoader(
        url=url,
        api_key=firecrawl_api_key,
        api_url=firecrawl_api_url,
        mode=firecrawl_mode,
        params=firecrawl_params,
    )
    # load() already returns a list of Documents, so no append loop is needed
    docs = loader.load()

    # Alternative: fetch pages directly with WebBaseLoader
    # (from langchain_community.document_loaders) instead of FireCrawl:
    # docs = [WebBaseLoader(url).load() for url in urls]
    # docs_list = [item for sublist in docs for item in sublist]

    # Split the documents into ~1000-token chunks with a 200-token overlap,
    # measured with the tiktoken encoder
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=200
    )
    return text_splitter.split_documents(docs)
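
# Minimal manual check, assuming the environment above is configured;
# the URL below is a placeholder, not one used by this project.
if __name__ == "__main__":
    chunks = load_web_crawl("https://example.com")
    logger.info(f"load_web_crawl returned {len(chunks)} chunks")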