# dp-zp-agent/scripts/build_chunks.py

from pathlib import Path
import json
import re
import frontmatter
from rich import print


# Root of the zpwiki checkout; a relative path, so it is resolved against
# the current working directory at run time.
ZPWIKI_ROOT = Path("../zpwiki")
# Directory under the wiki root that is searched for README.md pages.
PAGES_ROOT = ZPWIKI_ROOT / "pages"
# Destination JSON file that receives the generated chunks.
OUTPUT_FILE = Path("data/chunks.json")

# Maximum length, in characters, of a single chunk.
MAX_CHARS = 1200
# Number of characters shared by consecutive chunks of an over-long section.
OVERLAP_CHARS = 200


def json_safe(value):
    """Return a JSON-serialisable equivalent of *value*.

    ``None`` and the scalar types ``str``/``int``/``float``/``bool`` are
    returned unchanged.  Lists are converted element by element, dicts are
    converted value by value with their keys coerced to ``str``, and every
    other object is replaced by its ``str()`` representation.
    """
    # Scalars (and None) are already representable in JSON.
    if value is None or isinstance(value, (str, int, float, bool)):
        return value

    if isinstance(value, list):
        return list(map(json_safe, value))

    if isinstance(value, dict):
        return {str(name): json_safe(entry) for name, entry in value.items()}

    # Anything else (dates, tuples, sets, custom objects, ...) is stringified.
    return str(value)


def normalize_list(value):
    """Coerce *value* into a list of strings.

    * ``None`` yields an empty list.
    * A string is split on commas; each piece is stripped and empty pieces
      are dropped.
    * A list has every item converted with ``str()`` and stripped; items
      that end up empty are dropped.
    * Any other value becomes a one-element list holding ``str(value)``.
    """
    if value is None:
        return []

    if isinstance(value, str):
        pieces = (piece.strip() for piece in value.split(","))
        return [piece for piece in pieces if piece]

    if isinstance(value, list):
        entries = (str(entry).strip() for entry in value)
        return [entry for entry in entries if entry]

    return [str(value)]


def clean_markdown(text: str) -> str:
    """Normalise line endings and blank-line runs in *text*.

    CRLF pairs become LF, any run of three or more newlines is collapsed to
    exactly two, and leading/trailing whitespace is removed.
    """
    unix_text = text.replace("\r\n", "\n")
    collapsed = re.sub(r"\n{3,}", "\n\n", unix_text)
    return collapsed.strip()


# Start of an ATX heading: one to six '#' followed by whitespace.
_HEADING_START = re.compile(r"#{1,6}\s+")
# A code-fence line: up to three spaces of indent, then a run of at least
# three backticks or tildes; group 2 is whatever follows the run.
_FENCE_LINE = re.compile(r" {0,3}(`{3,}|~{3,})(.*)")


def split_by_headings(text: str) -> list[str]:
    """Split markdown *text* into sections that each start at a heading.

    A section begins at every line that starts with one to six ``#``
    characters followed by whitespace.  Lines inside fenced code blocks
    (```` ``` ```` or ``~~~``) are never treated as headings, so ``# comment``
    lines in code samples do not cut a code block apart.  An unclosed fence
    extends to the end of the text.

    Args:
        text: Markdown source using ``\\n`` line endings.

    Returns:
        The stripped, non-empty sections in document order.  Text that
        precedes the first heading forms its own leading section.
    """
    boundaries = []
    open_fence = ""  # the fence run that opened the current code block, if any
    offset = 0  # index in `text` of the first character of the current line

    for line in text.split("\n"):
        fence = _FENCE_LINE.match(line)

        if open_fence:
            # A closing fence uses the same character, is at least as long
            # as the opening run and carries no trailing text.
            if (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not fence.group(2).strip()
            ):
                open_fence = ""
        elif fence and not (fence.group(1)[0] == "`" and "`" in fence.group(2)):
            # A backtick run followed by more backticks on the same line is
            # inline code, not a fence opener.
            open_fence = fence.group(1)
        elif _HEADING_START.match(text, offset):
            # Matched against the full text so that, as before, the newline
            # after a bare '#' run counts as the required whitespace.
            boundaries.append(offset)

        offset += len(line) + 1

    cuts = [0, *boundaries, len(text)]
    sections = (text[start:end].strip() for start, end in zip(cuts, cuts[1:]))
    return [section for section in sections if section]


def split_long_text(text: str, max_chars: int = MAX_CHARS, overlap: int = OVERLAP_CHARS) -> list[str]:
    """Cut *text* into overlapping windows of at most *max_chars* characters.

    Text that already fits in one window is returned unchanged as a
    single-element list.  Otherwise consecutive windows start
    ``max_chars - overlap`` characters apart, each window is stripped, and
    windows that are empty after stripping are dropped.

    Args:
        text: The text to split.
        max_chars: Maximum window length in characters.
        overlap: Number of characters shared by consecutive windows.

    Returns:
        The list of chunks in order.

    Raises:
        ValueError: If the text has to be split and ``max_chars`` is not
            positive, or ``overlap`` is not in ``[0, max_chars)``.
    """
    if len(text) <= max_chars:
        return [text]

    # Validation is done only once slicing is actually required, so calls
    # that took the fast path above behave exactly as before.
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if not 0 <= overlap < max_chars:
        # overlap >= max_chars would never advance the window (endless
        # loop); a negative overlap would silently skip text between chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_chars, got {overlap}"
        )

    step = max_chars - overlap
    chunks = []

    for start in range(0, len(text), step):
        end = start + max_chars
        chunk = text[start:end].strip()

        if chunk:
            chunks.append(chunk)

        # The window reached the end of the text; a further window would
        # only repeat the tail.
        if end >= len(text):
            break

    return chunks


def chunk_markdown(text: str) -> list[str]:
    """Turn a markdown document into a list of chunk strings.

    The text is first normalised with ``clean_markdown`` and split into
    sections by ``split_by_headings``.  Sections no longer than
    ``MAX_CHARS`` are kept whole; longer ones are broken up by
    ``split_long_text``.  Text that is empty after cleaning yields an
    empty list.
    """
    cleaned = clean_markdown(text)
    if not cleaned:
        return []

    pieces = []
    for section in split_by_headings(cleaned):
        if len(section) > MAX_CHARS:
            pieces.extend(split_long_text(section))
        else:
            pieces.append(section)

    return pieces


def extract_document(file_path: Path) -> dict:
    """Load one markdown page and return its content plus selected metadata.

    The file is parsed with ``frontmatter.load``; every metadata value is
    passed through ``json_safe``.  Categories, tags and author are read from
    the top level of the metadata first and fall back to the ``taxonomy``
    mapping.

    Args:
        file_path: Path of the markdown file; must lie under ``ZPWIKI_ROOT``.

    Returns:
        A dict with the keys ``path`` (relative to ``ZPWIKI_ROOT``),
        ``title``, ``categories``, ``tags``, ``published``, ``author``,
        ``content`` (stripped body text) and ``metadata`` (all JSON-safe
        front-matter values).

    Raises:
        ValueError: From ``Path.relative_to`` when *file_path* is not
            located under ``ZPWIKI_ROOT``.
    """
    post = frontmatter.load(file_path)

    metadata = {
        key: json_safe(value)
        for key, value in post.metadata.items()
    }

    # A page may carry a malformed `taxonomy` (e.g. a plain string or a
    # list); treat anything that is not a mapping as absent instead of
    # failing on `.get`.
    taxonomy = metadata.get("taxonomy")
    if not isinstance(taxonomy, dict):
        taxonomy = {}

    categories = normalize_list(
        metadata.get("category")
        or taxonomy.get("category")
    )

    tags = normalize_list(
        metadata.get("tag")
        or metadata.get("tags")
        or taxonomy.get("tag")
        or taxonomy.get("tags")
    )

    author = (
        metadata.get("author")
        or taxonomy.get("author")
    )

    relative_path = file_path.relative_to(ZPWIKI_ROOT)

    return {
        "path": str(relative_path),
        "title": metadata.get("title"),
        "categories": categories,
        "tags": tags,
        "published": metadata.get("published"),
        "author": author,
        "content": post.content.strip(),
        "metadata": metadata,
    }


def main():
    """Build the chunk file from every ``README.md`` under ``PAGES_ROOT``.

    Exits via ``SystemExit`` when ``PAGES_ROOT`` does not exist.  Otherwise
    each page is loaded with ``extract_document``, chunked with
    ``chunk_markdown``, and one record per chunk is collected.  The records
    are written as JSON to ``OUTPUT_FILE`` (parent directories are created
    as needed), then a short summary and the first record are printed.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    # Sorted so the output order does not depend on directory listing order.
    readme_paths = sorted(PAGES_ROOT.glob("**/README.md"))

    all_chunks = []

    for readme_path in readme_paths:
        document = extract_document(readme_path)
        document_path = document["path"]

        all_chunks.extend(
            {
                "chunk_id": f"{document_path}::chunk-{position}",
                "document_path": document_path,
                "title": document["title"],
                "categories": document["categories"],
                "tags": document["tags"],
                "author": document["author"],
                "published": document["published"],
                "chunk_index": position,
                "text": body,
                "text_length": len(body),
            }
            for position, body in enumerate(chunk_markdown(document["content"]))
        )

    # Every discovered page has been processed once the loop completes.
    document_count = len(readme_paths)

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    with OUTPUT_FILE.open("w", encoding="utf-8") as output_handle:
        json.dump(all_chunks, output_handle, ensure_ascii=False, indent=2)

    print(f"[green]Dokumentov:[/green] {document_count}")
    print(f"[green]Chunkov:[/green] {len(all_chunks)}")
    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")

    if all_chunks:
        print("\n[bold]Ukážka prvého chunku:[/bold]")
        print(all_chunks[0])


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()