Adding AzureSearch AI as vector store

2025-05-16 22:01:05 -05:00
parent 226b51a6a1
commit 3beb160c22
18 changed files with 2751 additions and 96 deletions
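The AzureSearch wiring itself is not visible in the excerpts below, so as orientation here is a minimal sketch of how Azure AI Search is typically registered as a LangChain vector store via langchain_community. The endpoint, key, index name, and embedding model are placeholders, not values taken from this commit.

# Minimal sketch, not this commit's actual code: Azure AI Search as the
# vector store. Endpoint, key, index name, and embeddings are placeholders.
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vector_store = AzureSearch(
    azure_search_endpoint="https://<service>.search.windows.net",
    azure_search_key="<admin-key>",
    index_name="documents",
    embedding_function=embeddings.embed_query,
)

# Index text chunks with their metadata, then query by vector similarity.
vector_store.add_texts(
    ["Azure AI Search supports vector and hybrid queries."],
    metadatas=[{"source": "example"}],
)
results = vector_store.similarity_search("vector queries", k=4)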

View File

@@ -58,17 +58,11 @@ class FireCrawlLoader(BaseLoader):
     def lazy_load(self) -> Iterator[Document]:
         if self.mode == "scrape":
-            firecrawl_docs = [
-                self.firecrawl.scrape_url(
-                    self.url, **self.params
-                )
-            ]
+            firecrawl_docs = [self.firecrawl.scrape_url(self.url, **self.params)]
         elif self.mode == "crawl":
             if not self.url:
                 raise ValueError("URL is required for crawl mode")
-            crawl_response = self.firecrawl.crawl_url(
-                self.url, **self.params
-            )
+            crawl_response = self.firecrawl.crawl_url(self.url, **self.params)
             firecrawl_docs = crawl_response.data or []
         elif self.mode == "map":
             if not self.url:
@@ -94,9 +88,7 @@ class FireCrawlLoader(BaseLoader):
                 page_content = doc
                 metadata = {}
             else:
-                page_content = (
-                    doc.markdown or doc.html or doc.rawHtml or ""
-                )
+                page_content = doc.markdown or doc.html or doc.rawHtml or ""
                 metadata = doc.metadata or {}
             if not page_content:
                 continue

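For reference, the loader above is driven by its mode argument; a scrape-mode call looks roughly like this. The api_key and api_url values mirror the placeholder/local defaults used by load_web_crawl later in this commit, and the target URL is illustrative.

# Rough usage sketch of FireCrawlLoader in scrape mode; key/URL values are the
# same placeholders used elsewhere in this commit, not real credentials.
from loaders.firecrawl import FireCrawlLoader

loader = FireCrawlLoader(
    url="https://example.com",
    api_key="changeme",               # placeholder, as in load_web_crawl
    api_url="http://localhost:3002",  # self-hosted Firecrawl instance
    mode="scrape",
    params={},
)

# lazy_load yields Document objects one at a time (see the diff above).
for doc in loader.lazy_load():
    print(len(doc.page_content))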
View File

@@ -1,16 +1,11 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import PyPDFLoader


 def load_pdf(file_path):
-    loader = PyPDFLoader(file_path)
-    pages = loader.load()
-    print(f"Loaded {len(pages)} documents from {file_path}")
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    splits = splitter.split_documents(pages)
-    documents = []
-    metadatas = []
-    for split in splits:
-        documents.append(split.page_content)
-        metadatas.append(split.metadata)
-    return (documents, metadatas)
+    loader = PyPDFLoader(file_path)
+    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    documents = loader.load_and_split(splitter)
+    print(f"Loaded and Split into {len(documents)} documents from {file_path}")
+    return documents

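The rewrite above replaces the manual load/split/unzip loop with PyPDFLoader.load_and_split, which loads the pages and applies the splitter in one pass. Note that the return shape changes: load_pdf now returns a list of Document objects (content and metadata together) instead of the old (documents, metadatas) pair of parallel lists, so callers must adapt. A short sketch of the difference, using a hypothetical sample.pdf:

# Sketch of the new vs. old return shape; "sample.pdf" is a hypothetical file.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = PyPDFLoader("sample.pdf").load_and_split(splitter)

# The old parallel-list shape can still be recovered from the Documents:
texts = [d.page_content for d in documents]
metadatas = [d.metadata for d in documents]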
View File

@@ -3,23 +3,30 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from loaders.firecrawl import FireCrawlLoader


 def load_web_crawl(url):
     documents = []
     metadatas = []
     loader = FireCrawlLoader(
-        url=url, api_key="changeme", api_url="http://localhost:3002", mode="crawl", params={ "limit": 100, "include_paths": ["/.*"], "ignore_sitemap": True, "poll_interval": 5 }
+        url=url,
+        api_key="changeme",
+        api_url="http://localhost:3002",
+        mode="crawl",
+        params={
+            "limit": 100,
+            "include_paths": ["/.*"],
+            "ignore_sitemap": True,
+            "poll_interval": 5,
+        },
     )
     docs = []
     docs_lazy = loader.load()
     for doc in docs_lazy:
-        print('.', end="")
+        print(".", end="")
         docs.append(doc)
     print()
     # Load documents from the URLs
     # docs = [WebBaseLoader(url).load() for url in urls]
     # docs_list = [item for sublist in docs for item in sublist]
@@ -33,5 +40,5 @@ def load_web_crawl(url):
     for split in splits:
         documents.append(split.page_content)
         metadatas.append(split.metadata)
-    return (documents, metadatas)
+    return (documents, metadatas)
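load_web_crawl still returns the (documents, metadatas) pair, which slots directly into a vector store's add_texts. A hypothetical end-to-end wiring, reusing the AzureSearch sketch from the top of this page (the commit's actual indexing code is not shown in these excerpts):

# Hypothetical wiring: index the crawl output into the vector store sketched
# above. vector_store is the AzureSearch instance from the earlier example.
documents, metadatas = load_web_crawl("https://example.com")
vector_store.add_texts(documents, metadatas=metadatas)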