184 lines
4.3 KiB
Python
184 lines
4.3 KiB
Python
from pathlib import Path
|
|
import json
|
|
import re
|
|
import frontmatter
|
|
from rich import print
|
|
|
|
|
|
# Wiki checkout location; a relative path, so it is resolved against the
# current working directory when the script runs.
ZPWIKI_ROOT = Path("../zpwiki")
# Directory below the wiki root that is scanned for README.md pages.
PAGES_ROOT = ZPWIKI_ROOT.joinpath("pages")
# Destination of the generated JSON list of chunks.
OUTPUT_FILE = Path("data/chunks.json")

# Upper bound on the length of one chunk, in characters.
MAX_CHARS = 1200
# Number of characters shared by two consecutive chunks of an oversized section.
OVERLAP_CHARS = 200
|
|
|
|
|
|
def json_safe(value):
    """Convert *value* into something ``json.dump`` can serialize.

    Dicts and lists are converted recursively (dict keys are turned into
    strings). ``None``, strings, numbers and booleans are returned unchanged.
    Every other object is replaced by its ``str()`` representation.

    Args:
        value: Arbitrary object, e.g. a value parsed from front matter.

    Returns:
        A structure made only of dict, list, str, int, float, bool and None.
    """
    if isinstance(value, dict):
        return {str(name): json_safe(item) for name, item in value.items()}

    if isinstance(value, list):
        return [json_safe(element) for element in value]

    # Primitives (and None) already have a JSON representation.
    if value is None or isinstance(value, (str, int, float, bool)):
        return value

    # Fallback for everything else (dates, tuples, custom objects, ...).
    return str(value)
|
|
|
|
|
|
def normalize_list(value):
    """Normalize a front-matter value into a list of non-empty strings.

    * ``None`` yields an empty list.
    * A string is split on commas.
    * A list or tuple is converted item by item; ``None`` items are skipped
      so they do not end up as the literal string ``"None"``.
    * Any other value is wrapped as a single-element list of its ``str()``.

    Items coming from a string, list or tuple are stripped of surrounding
    whitespace and dropped when empty.

    Args:
        value: ``None``, a comma-separated string, a list/tuple, or a scalar.

    Returns:
        A list of strings.
    """
    if value is None:
        return []

    if isinstance(value, str):
        candidates = value.split(",")
    elif isinstance(value, (list, tuple)):
        candidates = [str(item) for item in value if item is not None]
    else:
        return [str(value)]

    # Strip each candidate exactly once, then discard the empty ones.
    stripped = (candidate.strip() for candidate in candidates)
    return [item for item in stripped if item]
|
|
|
|
|
|
def clean_markdown(text: str) -> str:
    """Normalize line endings and blank-line runs in Markdown text.

    Windows line endings (CRLF) become LF, any run of three or more
    newlines is collapsed to exactly two, and leading/trailing whitespace
    is removed.

    Args:
        text: Raw Markdown text.

    Returns:
        The cleaned text.
    """
    unix_text = "\n".join(text.split("\r\n"))
    collapsed = re.sub(r"\n{3,}", "\n\n", unix_text)
    return collapsed.strip()
|
|
|
|
|
|
def split_by_headings(text: str) -> list[str]:
    """Split Markdown text into sections, each starting at an ATX heading.

    A heading is a line that begins in column 0 with one to six ``#``
    characters followed by whitespace. Lines inside fenced code blocks
    (delimited by three or more backticks or tildes) are never treated as headings, so
    shell or Python comments such as ``# install`` do not break a code block
    into separate sections.

    Args:
        text: Markdown text; lines are expected to be separated by ``\\n``.

    Returns:
        The sections in document order, stripped of surrounding whitespace,
        with empty sections omitted. Text before the first heading forms its
        own section.
    """
    heading_re = re.compile(r"#{1,6}\s")
    # Fence marker indented by at most three spaces; group 2 is the rest of
    # the line (info string and the trailing newline).
    fence_re = re.compile(r" {0,3}(`{3,}|~{3,})(.*)", re.DOTALL)

    sections = []
    current = []
    open_fence = ""  # marker that opened the current code block, "" if none

    def flush():
        section = "".join(current).strip()
        if section:
            sections.append(section)
        current.clear()

    # Iterate over lines split on "\n" only, keeping the line terminator so
    # that a bare "###" followed by a newline still counts as a heading.
    for line in re.findall(r"[^\n]*\n|[^\n]+", text):
        fence = fence_re.match(line)
        if fence:
            run, rest = fence.groups()
            if open_fence:
                # A closing fence uses the same character, is at least as
                # long as the opening one and carries no info string.
                if (
                    run[0] == open_fence[0]
                    and len(run) >= len(open_fence)
                    and not rest.strip()
                ):
                    open_fence = ""
            elif run[0] == "~" or "`" not in rest:
                # A backtick "fence" whose remainder contains a backtick is
                # inline code (e.g. ```x```), not a block opener.
                open_fence = run
        elif not open_fence and heading_re.match(line):
            flush()
        current.append(line)

    flush()
    return sections
|
|
|
|
|
|
def split_long_text(text: str, max_chars: int = MAX_CHARS, overlap: int = OVERLAP_CHARS) -> list[str]:
    """Cut *text* into overlapping windows of at most *max_chars* characters.

    Text that already fits into *max_chars* is returned unchanged as a
    single-element list. Otherwise consecutive windows start
    ``max_chars - overlap`` characters apart, so neighbouring chunks share
    *overlap* characters. Each window is stripped of surrounding whitespace
    and dropped if nothing remains.

    Args:
        text: The text to split.
        max_chars: Maximum window length in characters.
        overlap: Number of characters repeated between consecutive windows.

    Returns:
        The list of chunks in order.

    Raises:
        ValueError: If the text needs splitting and ``overlap >= max_chars``;
            the window could never advance (previously an endless loop).
    """
    if len(text) <= max_chars:
        return [text]

    step = max_chars - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_chars ({max_chars})"
        )

    chunks = []
    for start in range(0, len(text), step):
        end = start + max_chars
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        # The window reached the end of the text; a further window would
        # only repeat the overlap.
        if end >= len(text):
            break

    return chunks
|
|
|
|
|
|
def chunk_markdown(text: str) -> list[str]:
    """Turn Markdown text into a list of chunks.

    The text is cleaned with ``clean_markdown`` and divided into sections by
    ``split_by_headings``. Sections longer than ``MAX_CHARS`` are further
    divided by ``split_long_text``; shorter ones are kept whole.

    Args:
        text: Raw Markdown content of one document.

    Returns:
        The chunks in document order; an empty list when the cleaned text
        is empty.
    """
    cleaned = clean_markdown(text)
    if not cleaned:
        return []

    pieces: list[str] = []
    for section in split_by_headings(cleaned):
        if len(section) > MAX_CHARS:
            pieces.extend(split_long_text(section))
        else:
            pieces.append(section)

    return pieces
|
|
|
|
|
|
def extract_document(file_path: Path) -> dict:
    """Load one Markdown page and collect its content and metadata.

    The file is parsed with ``frontmatter.load``. Every metadata value is
    passed through ``json_safe``. Categories, tags and author are looked up
    first at the top level of the metadata and then inside the ``taxonomy``
    mapping.

    Args:
        file_path: Path of the Markdown file; must be located under
            ``ZPWIKI_ROOT``.

    Returns:
        A dict with the keys ``path`` (relative to ``ZPWIKI_ROOT``),
        ``title``, ``categories``, ``tags``, ``published``, ``author``,
        ``content`` (stripped body text) and ``metadata``.

    Raises:
        ValueError: If *file_path* is not located under ``ZPWIKI_ROOT``
            (raised by ``Path.relative_to``).
    """
    post = frontmatter.load(file_path)

    metadata = {key: json_safe(value) for key, value in post.metadata.items()}

    # A page may declare ``taxonomy`` as something other than a mapping
    # (empty, a string, a list); treat all of those as "no taxonomy" rather
    # than failing on ``.get``.
    taxonomy = metadata.get("taxonomy")
    if not isinstance(taxonomy, dict):
        taxonomy = {}

    categories = normalize_list(
        metadata.get("category")
        or taxonomy.get("category")
    )

    tags = normalize_list(
        metadata.get("tag")
        or metadata.get("tags")
        or taxonomy.get("tag")
        or taxonomy.get("tags")
    )

    author = metadata.get("author") or taxonomy.get("author")

    relative_path = file_path.relative_to(ZPWIKI_ROOT)

    return {
        "path": str(relative_path),
        "title": metadata.get("title"),
        "categories": categories,
        "tags": tags,
        "published": metadata.get("published"),
        "author": author,
        "content": post.content.strip(),
        "metadata": metadata,
    }
|
|
|
|
|
|
def main():
    """Export every wiki page as JSON chunks and print a short summary.

    Collects all ``README.md`` files below ``PAGES_ROOT`` in sorted order,
    extracts each document, splits its content into chunks and writes the
    resulting list of chunk records to ``OUTPUT_FILE`` as UTF-8 JSON.

    Raises:
        SystemExit: If ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    readme_paths = sorted(PAGES_ROOT.glob("**/README.md"))

    records = []
    processed = 0

    for readme in readme_paths:
        doc = extract_document(readme)
        pieces = chunk_markdown(doc["content"])
        processed += 1

        doc_path = doc["path"]
        for position, piece in enumerate(pieces):
            # Key order is kept stable because it determines the layout of
            # the JSON output.
            record = {
                "chunk_id": f"{doc_path}::chunk-{position}",
                "document_path": doc_path,
                "title": doc["title"],
                "categories": doc["categories"],
                "tags": doc["tags"],
                "author": doc["author"],
                "published": doc["published"],
                "chunk_index": position,
                "text": piece,
                "text_length": len(piece),
            }
            records.append(record)

    # Make sure the output directory exists before writing.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    with OUTPUT_FILE.open("w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)

    print(f"[green]Dokumentov:[/green] {processed}")
    print(f"[green]Chunkov:[/green] {len(records)}")
    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")

    if records:
        print("\n[bold]Ukážka prvého chunku:[/bold]")
        print(records[0])


if __name__ == "__main__":
    main()
|