rag-system/loaders/pdf_loader.py
2025-05-01 12:21:47 -05:00

17 lines
553 B
Python

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
def load_pdf(file_path, *, chunk_size: int = 500, chunk_overlap: int = 50) -> tuple:
    """Load a PDF and split its text into overlapping chunks.

    The file is read with ``PyPDFLoader`` and the resulting documents are
    split with ``RecursiveCharacterTextSplitter``.

    Args:
        file_path: Location of the PDF, passed straight to ``PyPDFLoader``.
        chunk_size: Maximum chunk size handed to the splitter. Defaults to
            500, the value that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 50, the value that was previously
            hard-coded.

    Returns:
        A ``(documents, metadatas)`` tuple of two parallel lists:
        ``documents[i]`` is the ``page_content`` of the i-th chunk and
        ``metadatas[i]`` is that same chunk's ``metadata``.
    """
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    print(f"Loaded {len(pages)} documents from {file_path}")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = splitter.split_documents(pages)

    # Two separate comprehensions (rather than zip(*...)) so that an empty
    # split result still yields two empty lists instead of raising.
    documents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    return (documents, metadatas)