import json
import logging
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter

from rag_system.loaders.firecrawl import FireCrawlLoader

load_dotenv()  # take environment variables from .env

# Configure the logging
logging.basicConfig(level=logging.INFO)

# Firecrawl configuration from the environment; FIRECRAWL_PARAMS is a JSON object
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
firecrawl_api_url = os.getenv("FIRECRAWL_API_URL")
firecrawl_mode = os.getenv("FIRECRAWL_MODE")
firecrawl_params = os.getenv("FIRECRAWL_PARAMS")
if firecrawl_params:
    firecrawl_params = json.loads(firecrawl_params)

logging.info(f"web_loader firecrawl_api_url: {firecrawl_api_url}")
logging.info(f"web_loader firecrawl_mode: {firecrawl_mode}")
logging.info(f"web_loader firecrawl_params: {firecrawl_params}")


def load_web_crawl(url):
    """Crawl a URL with Firecrawl and return its documents split into chunks."""
    logging.info(f"load_web_crawl url: {url}")
    loader = FireCrawlLoader(
        url=url,
        api_key=firecrawl_api_key,
        api_url=firecrawl_api_url,
        mode=firecrawl_mode,
        params=firecrawl_params,
    )

    # Collect the loaded documents, printing a dot per document as progress
    docs = []
    for doc in loader.load():
        print(".", end="")
        docs.append(doc)
    print()

    # Initialize a text splitter with the specified chunk size and overlap
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250, chunk_overlap=0
    )
    # Split the documents into chunks
    return text_splitter.split_documents(docs)
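

# --- Usage sketch (not part of the original module; an assumption for illustration) ---
# A minimal example of how load_web_crawl might be invoked, assuming the
# FIRECRAWL_* variables are set in the environment or a .env file, e.g.:
#   FIRECRAWL_API_KEY=fc-...
#   FIRECRAWL_API_URL=https://api.firecrawl.dev
#   FIRECRAWL_MODE=scrape
#   FIRECRAWL_PARAMS={"formats": ["markdown"]}
# The URL below is a placeholder, not one from the original code.
if __name__ == "__main__":
    chunks = load_web_crawl("https://example.com")
    logging.info(f"load_web_crawl returned {len(chunks)} chunks")
    # Preview the first few chunks to sanity-check the split output
    for chunk in chunks[:3]:
        logging.info(f"chunk preview: {chunk.page_content[:80]!r}")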