import json
import logging
import os

from dotenv import load_dotenv
from firecrawl import ScrapeOptions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader  # only used by the commented-out fallback below
from rag_system.loaders.firecrawl import FireCrawlLoader
load_dotenv()  # load environment variables from a .env file

# Configure logging
logging.basicConfig(level=logging.INFO)

firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
firecrawl_api_url = os.getenv("FIRECRAWL_API_URL")
firecrawl_mode = os.getenv("FIRECRAWL_MODE")

firecrawl_params = os.getenv("FIRECRAWL_PARAMS")
if firecrawl_params:
    firecrawl_params = json.loads(firecrawl_params)
    # If scrape_options was provided, rebuild it as a ScrapeOptions object;
    # .get() avoids a KeyError when the key is absent from the JSON
    if firecrawl_params.get("scrape_options"):
        firecrawl_params["scrape_options"] = ScrapeOptions(
            **firecrawl_params["scrape_options"]
        )
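
# Illustrative .env values for the variables read above (the variable names
# come from this file; the params keys are an assumption and depend on your
# Firecrawl version and on what rag_system.loaders.firecrawl forwards):
#   FIRECRAWL_API_KEY=your-api-key
#   FIRECRAWL_API_URL=https://api.firecrawl.dev
#   FIRECRAWL_MODE=crawl
#   FIRECRAWL_PARAMS='{"limit": 10, "scrape_options": {"formats": ["markdown"]}}'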

logging.info(f"web_loader firecrawl_api_url: {firecrawl_api_url}")
logging.info(f"web_loader firecrawl_mode: {firecrawl_mode}")
logging.info(f"web_loader firecrawl_params: {firecrawl_params}")


def load_web_crawl(url):
    """Crawl `url` with Firecrawl and return the documents split into chunks."""
    logging.info(f"load_web_crawl url: {url}")

    loader = FireCrawlLoader(
        url=url,
        api_key=firecrawl_api_key,
        api_url=firecrawl_api_url,
        mode=firecrawl_mode,
        params=firecrawl_params,
    )

    # Collect the loaded documents, printing a dot per document as progress
    docs = []
    for doc in loader.load():
        print(".", end="", flush=True)
        docs.append(doc)
    print()

    # Alternative: load the pages with WebBaseLoader instead of Firecrawl
    # docs = [WebBaseLoader(url).load() for url in urls]
    # docs_list = [item for sublist in docs for item in sublist]

    # Initialize a text splitter with the specified chunk size and overlap
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250, chunk_overlap=0
    )

    # Split the documents into chunks
    return text_splitter.split_documents(docs)
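

# Minimal usage sketch (assumes the FIRECRAWL_* variables above are set and
# the target site is reachable; https://example.com is a placeholder):
if __name__ == "__main__":
    chunks = load_web_crawl("https://example.com")
    logging.info(f"load_web_crawl produced {len(chunks)} chunks")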