Refactored and moved all keys and configuration into .env files and provided samples
This commit is contained in:
@@ -1,24 +1,39 @@
|
||||
import json
|
||||
import os
|
||||
import logging
|
||||
|
||||
from langchain_community.document_loaders import WebBaseLoader
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from rag_system.loaders.firecrawl import FireCrawlLoader
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv() # take environment variables from the .env file
|
||||
|
||||
# Configure the logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
|
||||
firecrawl_api_url = os.getenv("FIRECRAWL_API_URL")
|
||||
firecrawl_mode = os.getenv("FIRECRAWL_MODE")
|
||||
firecrawl_params = os.getenv("FIRECRAWL_PARAMS")
|
||||
if firecrawl_params:
|
||||
firecrawl_params = json.loads(firecrawl_params)
|
||||
|
||||
logging.info(f"web_loader firecrawl_api_url: {firecrawl_api_url}")
|
||||
logging.info(f"web_loader firecrawl_mode: {firecrawl_mode}")
|
||||
logging.info(f"web_loader firecrawl_params: {firecrawl_params}")
|
||||
|
||||
|
||||
def load_web_crawl(url):
|
||||
|
||||
documents = []
|
||||
metadatas = []
|
||||
logging.info(f"load_web_crawl url: {url}")
|
||||
|
||||
loader = FireCrawlLoader(
|
||||
url=url,
|
||||
api_key="changeme",
|
||||
api_url="http://localhost:3002",
|
||||
mode="crawl",
|
||||
params={
|
||||
"limit": 100,
|
||||
"include_paths": ["/.*"],
|
||||
"ignore_sitemap": True,
|
||||
"poll_interval": 5,
|
||||
},
|
||||
api_key=firecrawl_api_key,
|
||||
api_url=firecrawl_api_url,
|
||||
mode=firecrawl_mode,
|
||||
params=firecrawl_params,
|
||||
)
|
||||
docs = []
|
||||
docs_lazy = loader.load()
|
||||
@@ -35,10 +50,4 @@ def load_web_crawl(url):
|
||||
chunk_size=250, chunk_overlap=0
|
||||
)
|
||||
# Split the documents into chunks
|
||||
splits = text_splitter.split_documents(docs)
|
||||
|
||||
for split in splits:
|
||||
documents.append(split.page_content)
|
||||
metadatas.append(split.metadata)
|
||||
|
||||
return (documents, metadatas)
|
||||
return text_splitter.split_documents(docs)
|
||||
|
||||
Reference in New Issue
Block a user