from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from rag_system.loaders.firecrawl import FireCrawlLoader


def load_web_crawl(
    url,
    *,
    api_key="changeme",
    api_url="http://localhost:3002",
    crawl_limit=100,
    chunk_size=250,
    chunk_overlap=0,
):
    """Crawl *url* with FireCrawl and split the fetched pages into text chunks.

    Args:
        url: Root URL to crawl.
        api_key: FireCrawl API key. Default is a placeholder suitable only for
            a local self-hosted instance — pass a real key in production.
        api_url: Base URL of the FireCrawl service (defaults to a local
            self-hosted instance on port 3002).
        crawl_limit: Maximum number of pages the crawl may visit.
        chunk_size: Token-based chunk size for the splitter.
        chunk_overlap: Token overlap between consecutive chunks.

    Returns:
        A ``(documents, metadatas)`` tuple of parallel lists: the chunk texts
        and the corresponding per-chunk metadata dicts.
    """
    loader = FireCrawlLoader(
        url=url,
        api_key=api_key,
        api_url=api_url,
        mode="crawl",
        params={
            "limit": crawl_limit,
            "include_paths": ["/.*"],
            "ignore_sitemap": True,
            "poll_interval": 5,
        },
    )

    # loader.load() is eager; iterate it so a progress dot is printed per page.
    docs = []
    for doc in loader.load():
        print(".", end="")
        docs.append(doc)
    print()

    # Token-aware splitter (tiktoken) so chunk_size is measured in tokens,
    # not characters.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    splits = text_splitter.split_documents(docs)

    # Parallel lists: documents[i] is the text of the chunk whose metadata
    # is metadatas[i].
    documents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    return (documents, metadatas)