Adding AzureSearch AI as vector store
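The commit title targets Azure AI Search as the vector store, but the hunks below only show the FireCrawl and PDF loader cleanups, so the vector-store wiring itself is not visible in this excerpt. For orientation, a minimal sketch of what that wiring typically looks like with langchain_community's AzureSearch class; the endpoint, key, index name, and embedding backend are all placeholder assumptions, not values from this commit:

# Minimal sketch: AzureSearch as a LangChain vector store.
# Requires the azure-search-documents package; all values below are placeholders.
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores.azuresearch import AzureSearch

embeddings = OllamaEmbeddings(model="nomic-embed-text")  # assumed embedding backend

vector_store = AzureSearch(
    azure_search_endpoint="https://<service>.search.windows.net",  # placeholder
    azure_search_key="<admin-key>",                                # placeholder
    index_name="langchain-index",                                  # placeholder
    embedding_function=embeddings.embed_query,
)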
@@ -58,17 +58,11 @@ class FireCrawlLoader(BaseLoader):
 
     def lazy_load(self) -> Iterator[Document]:
         if self.mode == "scrape":
-            firecrawl_docs = [
-                self.firecrawl.scrape_url(
-                    self.url, **self.params
-                )
-            ]
+            firecrawl_docs = [self.firecrawl.scrape_url(self.url, **self.params)]
         elif self.mode == "crawl":
             if not self.url:
                 raise ValueError("URL is required for crawl mode")
-            crawl_response = self.firecrawl.crawl_url(
-                self.url, **self.params
-            )
+            crawl_response = self.firecrawl.crawl_url(self.url, **self.params)
             firecrawl_docs = crawl_response.data or []
         elif self.mode == "map":
             if not self.url:
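This hunk only collapses the multi-line scrape_url and crawl_url calls onto single lines; behavior is unchanged. A sketch of consuming the loader lazily, with the constructor arguments assumed from the third hunk below (the URL, key, and params values are placeholders):

# Sketch: iterate documents one at a time via lazy_load.
from loaders.firecrawl import FireCrawlLoader

loader = FireCrawlLoader(
    url="https://example.com",       # placeholder
    api_key="changeme",              # placeholder, as in the hunk below
    api_url="http://localhost:3002",
    mode="scrape",
    params={},
)
for doc in loader.lazy_load():       # yields Document objects as they arrive
    print(doc.metadata.get("sourceURL", ""), len(doc.page_content))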
@@ -94,9 +88,7 @@ class FireCrawlLoader(BaseLoader):
                 page_content = doc
                 metadata = {}
             else:
-                page_content = (
-                    doc.markdown or doc.html or doc.rawHtml or ""
-                )
+                page_content = doc.markdown or doc.html or doc.rawHtml or ""
                 metadata = doc.metadata or {}
             if not page_content:
                 continue
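The same one-line collapse applied to the content fallback: markdown is preferred, then html, then rawHtml, and the trailing `or ""` guarantees a string so the empty-page check can skip the result. A sketch of how the surrounding loop presumably reads; only the middle lines appear in the hunk, so the string guard and the final yield are assumptions:

# Sketch of the presumed enclosing loop (scaffolding assumed, not in the diff).
from langchain_core.documents import Document

def _to_documents(firecrawl_docs):
    for doc in firecrawl_docs:
        if isinstance(doc, str):      # assumed guard for plain-string results
            page_content = doc
            metadata = {}
        else:
            page_content = doc.markdown or doc.html or doc.rawHtml or ""
            metadata = doc.metadata or {}
        if not page_content:
            continue                  # skip results with no usable content
        yield Document(page_content=page_content, metadata=metadata)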
@@ -1,16 +1,11 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import PyPDFLoader
 
 
 def load_pdf(file_path):
-    loader = PyPDFLoader(file_path)
-    pages = loader.load()
-    print(f"Loaded {len(pages)} documents from {file_path}")
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    splits = splitter.split_documents(pages)
-    documents = []
-    metadatas = []
-
-    for split in splits:
-        documents.append(split.page_content)
-        metadatas.append(split.metadata)
-
-    return (documents, metadatas)
+    loader = PyPDFLoader(file_path)
+    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    documents = loader.load_and_split(splitter)
+    print(f"Loaded and Split into {len(documents)} documents from {file_path}")
+
+    return documents
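The rewritten load_pdf swaps the manual split-and-unzip loop for BaseLoader.load_and_split, so it now returns a list of Document objects (content and metadata kept together) instead of the old (documents, metadatas) tuple; any caller still unpacking a tuple needs updating. A quick usage sketch, with a placeholder path:

docs = load_pdf("manual.pdf")       # placeholder path
print(docs[0].page_content[:80])    # chunks of ~500 chars with 50-char overlap
print(docs[0].metadata)             # PyPDFLoader records source and page number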
@@ -3,23 +3,30 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from loaders.firecrawl import FireCrawlLoader
 
 
 def load_web_crawl(url):
     documents = []
     metadatas = []
 
     loader = FireCrawlLoader(
-        url=url, api_key="changeme", api_url="http://localhost:3002", mode="crawl", params={ "limit": 100, "include_paths": ["/.*"], "ignore_sitemap": True, "poll_interval": 5 }
+        url=url,
+        api_key="changeme",
+        api_url="http://localhost:3002",
+        mode="crawl",
+        params={
+            "limit": 100,
+            "include_paths": ["/.*"],
+            "ignore_sitemap": True,
+            "poll_interval": 5,
+        },
     )
     docs = []
     docs_lazy = loader.load()
     for doc in docs_lazy:
-        print('.', end="")
+        print(".", end="")
         docs.append(doc)
     print()
 
     # Load documents from the URLs
     # docs = [WebBaseLoader(url).load() for url in urls]
     # docs_list = [item for sublist in docs for item in sublist]
@@ -33,5 +40,5 @@ def load_web_crawl(url):
     for split in splits:
         documents.append(split.page_content)
         metadatas.append(split.metadata)
 
-    return (documents, metadatas)
+    return (documents, metadatas)
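This last hunk is effectively a whitespace-only touch: load_web_crawl still returns the (documents, metadatas) pair, merely shifted down by the reformatted constructor above. A sketch, under the same placeholder assumptions as the AzureSearch snippet at the top, of pushing that pair into the store:

# Sketch: index the crawl output (vector_store built as in the first snippet).
documents, metadatas = load_web_crawl("https://docs.example.com")  # placeholder URL
vector_store.add_texts(texts=documents, metadatas=metadatas)       # embeds and indexes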