refactored project to use poetry
rag_system/loaders/web_loader.py (new file, 44 additions)
@@ -0,0 +1,44 @@
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from rag_system.loaders.firecrawl import FireCrawlLoader


def load_web_crawl(url):
    """Crawl a site with FireCrawl, split the pages into chunks, and
    return a (page_contents, metadatas) pair of parallel lists."""
    documents = []
    metadatas = []

    loader = FireCrawlLoader(
        url=url,
        api_key="changeme",
        api_url="http://localhost:3002",
        mode="crawl",
        params={
            "limit": 100,
            "include_paths": ["/.*"],
            "ignore_sitemap": True,
            "poll_interval": 5,
        },
    )

    # Collect the crawled pages, printing one dot per document as a
    # simple progress indicator (flush so the dots appear immediately).
    docs = []
    for doc in loader.load():
        print(".", end="", flush=True)
        docs.append(doc)
    print()

    # Alternative path: load documents directly from a list of URLs.
    # WebBaseLoader is imported only for this commented-out variant.
    # docs = [WebBaseLoader(url).load() for url in urls]
    # docs_list = [item for sublist in docs for item in sublist]

    # Initialize a text splitter with the specified chunk size and overlap
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=250, chunk_overlap=0
    )
    # Split the documents into chunks
    splits = text_splitter.split_documents(docs)

    for split in splits:
        documents.append(split.page_content)
        metadatas.append(split.metadata)

    return (documents, metadatas)
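For context, a minimal sketch of how the returned (documents, metadatas) pair could be indexed downstream. Chroma, the collection name, and the example URL are assumptions for illustration and are not taken from this commit:

import chromadb

from rag_system.loaders.web_loader import load_web_crawl

# Hypothetical usage; the URL and collection name are placeholders.
documents, metadatas = load_web_crawl("https://example.com")

client = chromadb.Client()
collection = client.create_collection("web_crawl")
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=[str(i) for i in range(len(documents))],
)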