43 lines
1.0 KiB
Python
43 lines
1.0 KiB
Python
from pathlib import Path
|
|
from pypdf import PdfReader
|
|
from docx import Document
|
|
|
|
def load_documents(folder="documents"):
    """Load the text of every supported document directly inside *folder*.

    Supported types are plain text (``.txt``, decoded as UTF-8), PDF
    (``.pdf``, via ``pypdf.PdfReader``) and Word (``.docx``, via
    ``docx.Document``). Extensions are matched case-insensitively. Entries
    that are not regular files, or that have any other extension, are
    skipped. Sub-folders are not searched.

    Args:
        folder: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        A list of ``{"filename": <file name>, "content": <extracted text>}``
        dicts, ordered by path so that repeated runs give the same order.
    """
    docs = []

    # sorted(): glob() order is filesystem-dependent; sort for stable output.
    for file in sorted(Path(folder).glob("*")):
        # A directory named e.g. "archive.txt" must not reach the readers.
        if not file.is_file():
            continue

        # Lower-case so "REPORT.PDF" is handled the same as "report.pdf".
        suffix = file.suffix.lower()

        if suffix == ".txt":
            text = file.read_text(encoding="utf-8")
        elif suffix == ".pdf":
            reader = PdfReader(file)
            # `or ""` guards against a page yielding None instead of a
            # string (seen with some extractor versions); join avoids
            # repeated string concatenation.
            text = "".join(page.extract_text() or "" for page in reader.pages)
        elif suffix == ".docx":
            doc = Document(file)
            text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
        else:
            # Unsupported file type.
            continue

        docs.append({
            "filename": file.name,
            "content": text,
        })

    return docs
|
|
|
|
# Manual smoke test: when run as a script, load the default "documents"
# folder and print the resulting list of records.
if __name__ == "__main__":
    print(load_documents())