17 lines
553 B
Python
17 lines
553 B
Python
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_community.document_loaders import PyPDFLoader
|
|
def load_pdf(file_path, *, chunk_size=500, chunk_overlap=50):
    """Load a PDF, split it into text chunks, and return texts with metadata.

    The file is read with ``PyPDFLoader`` and the loaded documents are split
    with ``RecursiveCharacterTextSplitter``. A one-line summary of how many
    documents were loaded is printed to stdout.

    Args:
        file_path: Location of the PDF, passed straight to ``PyPDFLoader``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value previously hard-coded here.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        A ``(documents, metadatas)`` tuple of two lists of equal length:
        ``documents[i]`` is the ``page_content`` of the i-th chunk and
        ``metadatas[i]`` is that same chunk's ``metadata``.
    """
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    print(f"Loaded {len(pages)} documents from {file_path}")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = splitter.split_documents(pages)

    # Two parallel lists, index-aligned, so callers can pair each text with
    # its metadata (or zip them back together).
    documents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]

    return (documents, metadatas)
|