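"""Crawl a website with Firecrawl, split the crawled pages into chunks,
and return the chunk texts and metadata for a RAG pipeline."""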

import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader  # used only in the commented-out alternative below
from rag_system.loaders.firecrawl import FireCrawlLoader


def load_web_crawl(url):
    """Crawl `url` with Firecrawl, split the crawled pages into chunks,
    and return (chunk_texts, chunk_metadatas) as parallel lists."""
    documents = []
    metadatas = []

    # The crawl targets a self-hosted Firecrawl instance at localhost:3002.
    # Reading the key from the FIRECRAWL_API_KEY environment variable (with
    # the original "changeme" placeholder as the fallback) is an assumption,
    # not part of the original code.
    loader = FireCrawlLoader(
        url=url,
        api_key=os.environ.get("FIRECRAWL_API_KEY", "changeme"),
        api_url="http://localhost:3002",
        mode="crawl",
        params={
            "limit": 100,  # crawl at most 100 pages
            "include_paths": ["/.*"],  # allow every path under the site root
            "ignore_sitemap": True,  # discover pages by following links, not sitemap.xml
            "poll_interval": 5,  # seconds between crawl-status polls
        },
    )

    # Collect the crawled pages, printing a dot per page as progress feedback.
    docs = []
    for doc in loader.load():
        print(".", end="", flush=True)
        docs.append(doc)
    print()

    # Alternative: load a fixed list of URLs directly instead of crawling.
    # docs = [WebBaseLoader(url).load() for url in urls]
    # docs_list = [item for sublist in docs for item in sublist]

    # Split the documents into ~250-token chunks with no overlap, measuring
    # length with the tiktoken encoder.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250, chunk_overlap=0
    )
    splits = text_splitter.split_documents(docs)

    # Separate each chunk into its text and its metadata.
    for split in splits:
        documents.append(split.page_content)
        metadatas.append(split.metadata)

    return (documents, metadatas)
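

# A minimal usage sketch, not part of the original module: the URL is a
# placeholder, and a Firecrawl instance is assumed reachable at the api_url
# configured above.
if __name__ == "__main__":
    texts, metas = load_web_crawl("https://example.com")
    print(f"Loaded {len(texts)} chunks")
    for text, meta in zip(texts[:3], metas[:3]):
        print(meta.get("source", "<no source>"), "->", text[:80])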