"""Load the text content of .txt, .pdf and .docx files found in a folder."""

from pathlib import Path

# pypdf and python-docx are only needed for their own file types. Import them
# defensively so a folder of plain-text files can still be loaded when one of
# the packages is not installed; the matching reader raises ImportError on use.
try:
    from pypdf import PdfReader
except ImportError:
    PdfReader = None

try:
    from docx import Document
except ImportError:
    Document = None


def _read_txt(path):
    """Return the contents of a UTF-8 encoded text file."""
    return path.read_text(encoding="utf-8")


def _read_pdf(path):
    """Return the extracted text of every page of a PDF, concatenated."""
    if PdfReader is None:
        raise ImportError("pypdf is required to read .pdf files")
    reader = PdfReader(path)
    # Guard against a page yielding no text (None) so the join cannot fail.
    return "".join(page.extract_text() or "" for page in reader.pages)


def _read_docx(path):
    """Return the paragraphs of a Word document joined by newlines."""
    if Document is None:
        raise ImportError("python-docx is required to read .docx files")
    # python-docx documents a string path (or file-like object) as input.
    doc = Document(str(path))
    return "\n".join(paragraph.text for paragraph in doc.paragraphs)


# Maps a lower-cased file suffix to the function that extracts its text.
_READERS = {
    ".txt": _read_txt,
    ".pdf": _read_pdf,
    ".docx": _read_docx,
}


def load_documents(folder="documents"):
    """Read every supported document directly inside *folder*.

    Supported types are ``.txt`` (decoded as UTF-8), ``.pdf`` and ``.docx``;
    the suffix is matched case-insensitively. Entries with any other suffix,
    and entries that are not regular files, are skipped. Sub-folders are not
    searched.

    Args:
        folder: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        A list of ``{"filename": <name>, "content": <text>}`` dictionaries,
        ordered by path so the result is stable across runs.

    Raises:
        ImportError: If a ``.pdf`` or ``.docx`` file is found but the package
            needed to read it is not installed.
    """
    docs = []
    # glob() order depends on the filesystem; sort for a deterministic result.
    for file in sorted(Path(folder).glob("*")):
        reader = _READERS.get(file.suffix.lower())
        # Skip unsupported suffixes and directories named like documents.
        if reader is None or not file.is_file():
            continue
        docs.append({
            "filename": file.name,
            "content": reader(file),
        })
    return docs


# Manual smoke test: load the default "documents" folder and print the result.
if __name__ == "__main__":
    docs = load_documents()
    print(docs)