DP_PRACA/ingest.py
2026-05-16 08:50:22 +02:00

43 lines
1.0 KiB
Python

from pathlib import Path
from pypdf import PdfReader
from docx import Document
def load_documents(folder="documents"):
    """Load every supported document found directly in *folder* as text.

    Entries are dispatched on their file extension, compared
    case-insensitively:

    * ``.txt``  -- read as UTF-8 text,
    * ``.pdf``  -- text of all pages extracted with ``PdfReader`` and
      concatenated without a separator,
    * ``.docx`` -- paragraph texts from ``Document`` joined with newlines.

    Sub-directories and files with any other extension are skipped. The
    scan is not recursive.

    Args:
        folder: Directory to scan (string or path-like).

    Returns:
        A list of ``{"filename": <name>, "content": <text>}`` dicts, one per
        loaded file, ordered by path so the result is deterministic.
    """
    docs = []
    # glob() yields entries in arbitrary file-system order; sort for
    # reproducible output.
    for file in sorted(Path(folder).glob("*")):
        # A directory may carry a matching suffix (e.g. "notes.txt/");
        # trying to read it would raise, so only regular files are handled.
        if not file.is_file():
            continue

        suffix = file.suffix.lower()
        if suffix == ".txt":
            text = file.read_text(encoding="utf-8")
        elif suffix == ".pdf":
            reader = PdfReader(file)
            # `or ""` guards against a page yielding None/empty text
            # (e.g. image-only pages in some extractor versions), which
            # would otherwise break the concatenation.
            text = "".join(page.extract_text() or "" for page in reader.pages)
        elif suffix == ".docx":
            doc = Document(file)
            text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
        else:
            continue

        docs.append({"filename": file.name, "content": text})
    return docs
# Manual smoke test: when run as a script, load the default folder and
# print the resulting list of documents.
if __name__ == "__main__":
    print(load_documents())