# rag-system/rag_system/loaders/web_loader.py
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rag_system.loaders.firecrawl import FireCrawlLoader
def load_web_crawl(url, api_key="changeme", api_url="http://localhost:3002", limit=100):
    """Crawl a website starting at *url* and return chunked text for indexing.

    Uses a FireCrawl service to crawl up to *limit* pages under the root URL,
    then splits the fetched documents into ~250-token chunks.

    Args:
        url: Root URL to start crawling from.
        api_key: FireCrawl API key. Defaults to the local-dev placeholder;
            pass a real key for a hosted FireCrawl instance.
        api_url: Base URL of the FireCrawl service.
        limit: Maximum number of pages to crawl.

    Returns:
        A ``(documents, metadatas)`` tuple of parallel lists: the chunk text
        strings and their corresponding metadata dicts.
    """
    loader = FireCrawlLoader(
        url=url,
        api_key=api_key,
        api_url=api_url,
        mode="crawl",
        params={
            "limit": limit,
            "include_paths": ["/.*"],
            "ignore_sitemap": True,
            "poll_interval": 5,
        },
    )

    # loader.load() is eager; iterate so we can show a simple progress
    # indicator (one dot per fetched document).
    docs = []
    for doc in loader.load():
        print(".", end="")
        docs.append(doc)
    print()

    # Chunk by token count (tiktoken encoder) so downstream embedding /
    # retrieval stays within model context limits.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250, chunk_overlap=0
    )
    splits = text_splitter.split_documents(docs)

    documents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    return (documents, metadatas)