# dp-zp-agent/scripts/build_chunks.py

from __future__ import annotations

import re
import sys
from pathlib import Path

from rich import print


# Directory one level above the folder that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Put that directory first on sys.path (once) before the `scripts.common`
# import that follows; presumably this lets the script run directly without
# the project being installed as a package.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))


from scripts.common import CHUNKS_FILE, PAGES_ROOT, ZPWIKI_ROOT, load_zpwiki_page, write_json


# Default `max_chars` of split_long_text: upper bound, in characters, per chunk.
MAX_CHARS = 1200
# Default `overlap` of split_long_text: characters repeated between neighbours.
OVERLAP_CHARS = 200


def clean_markdown(text: str) -> str:
    """Return *text* with normalized line endings and compacted blank lines.

    Windows line endings (CRLF) become LF, any run of three or more
    newlines is reduced to exactly two, and leading/trailing whitespace is
    removed. A lone carriage return is left untouched.
    """
    blank_run = re.compile(r"\n{3,}")
    unix_text = "\n".join(text.split("\r\n"))
    compacted = blank_run.sub("\n\n", unix_text)
    return compacted.strip()


def split_by_headings(text: str) -> list[str]:
    """Cut *text* into sections, each starting at a Markdown ATX heading.

    A cut is made in front of every line that begins with one to six ``#``
    characters followed by whitespace. Text before the first heading forms
    its own section. Sections are stripped and empty ones are discarded.
    """
    # Zero-width lookahead: the heading line stays with the section it opens.
    heading_boundary = re.compile(r"(?m)(?=^#{1,6}\s+)")

    sections = []
    for raw_section in heading_boundary.split(text):
        trimmed = raw_section.strip()
        if trimmed:
            sections.append(trimmed)
    return sections


def find_split_position(text: str, max_chars: int) -> int:
    """Pick a cut offset so a chunk does not end at an arbitrary character.

    If *text* already fits in *max_chars*, its full length is returned.
    Otherwise the first *max_chars* characters are searched for the last
    occurrence of each separator, tried in order of preference: blank
    line, newline, sentence end (". "), single space. The first separator
    whose last occurrence lies at or beyond 60 % of *max_chars* wins, and
    the returned offset is just past that separator. When none qualifies
    the cut falls back to *max_chars*.
    """
    if len(text) <= max_chars:
        return len(text)

    window = text[:max_chars]
    # Cuts earlier than this would make the chunk too short to be useful.
    earliest_allowed = int(max_chars * 0.6)

    candidates = (
        (window.rfind(separator), len(separator))
        for separator in ("\n\n", "\n", ". ", " ")
    )
    for found_at, width in candidates:
        if found_at >= earliest_allowed:
            return found_at + width

    return max_chars


def split_long_text(
    text: str,
    max_chars: int = MAX_CHARS,
    overlap: int = OVERLAP_CHARS,
) -> list[str]:
    """Split *text* into overlapping chunks of at most *max_chars* characters.

    Each cut is chosen by ``find_split_position``; the next chunk starts
    *overlap* characters before that cut so neighbouring chunks share
    context. Every chunk is stripped and empty chunks are dropped, also
    when the whole text fits into a single chunk.

    Args:
        text: Text to split.
        max_chars: Maximum length of one chunk before stripping.
        overlap: Number of characters repeated at the start of the next
            chunk. Must be non-negative and smaller than *max_chars*.

    Returns:
        The chunks in document order; an empty list for blank input.

    Raises:
        ValueError: If *max_chars* is not greater than *overlap*, or if
            *overlap* is negative.
    """
    if max_chars <= overlap:
        raise ValueError("max_chars musí byť väčšie ako overlap")

    if overlap < 0:
        # A negative overlap would move the next start past the cut and
        # silently drop the text in between.
        raise ValueError("overlap nesmie byť záporný")

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        if text_length - start <= max_chars:
            # The rest fits in one chunk (this also covers short input).
            tail = text[start:].strip()

            if tail:
                chunks.append(tail)

            break

        # find_split_position only inspects the first max_chars characters
        # and whether the text is longer than that, so a window of
        # max_chars + 1 characters gives the same answer without copying
        # the whole remainder on every iteration.
        window = text[start:start + max_chars + 1]
        split_at = find_split_position(window, max_chars)

        if split_at <= overlap:
            # A cut inside the overlap region would restart the next chunk
            # almost where this one began, producing near-duplicate chunks.
            # Fall back to a hard cut, which always exceeds the overlap.
            split_at = max_chars

        chunk = text[start:start + split_at].strip()

        if chunk:
            chunks.append(chunk)

        # split_at > overlap >= 0 here, so this always moves forward and
        # never skips past the cut.
        start += split_at - overlap

    return chunks


def chunk_markdown(text: str) -> list[str]:
    """Split Markdown first by headings and then by length.

    The text is cleaned with ``clean_markdown``, cut into heading sections
    with ``split_by_headings``, and every section is passed through
    ``split_long_text``. Returns an empty list when nothing remains after
    cleaning.
    """
    cleaned = clean_markdown(text)

    if not cleaned:
        return []

    return [
        piece
        for section in split_by_headings(cleaned)
        for piece in split_long_text(section)
    ]


def build_chunks() -> list[dict]:
    """Chunk every wiki page under PAGES_ROOT and pass the result to write_json.

    Every ``README.md`` found recursively below ``PAGES_ROOT`` is loaded
    with ``load_zpwiki_page`` and its ``content`` is split with
    ``chunk_markdown``. Each chunk becomes one record that carries the
    page's ``path``, ``title``, ``categories``, ``tags``, ``author`` and
    ``published`` values together with the chunk index, text and text
    length. The records are handed to ``write_json(CHUNKS_FILE, ...)`` and
    a short summary is printed.

    Returns:
        The list of chunk records, in processing order.

    Raises:
        SystemExit: If ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    records: list[dict] = []
    pages_seen = 0

    # sorted() gives a deterministic processing order for the pages.
    for readme_path in sorted(PAGES_ROOT.glob("**/README.md")):
        page = load_zpwiki_page(readme_path)
        pages_seen += 1

        pieces = chunk_markdown(page["content"])
        records.extend(
            {
                "chunk_id": f"{page['path']}::chunk-{position}",
                "document_path": page["path"],
                "title": page["title"],
                "categories": page["categories"],
                "tags": page["tags"],
                "author": page["author"],
                "published": page["published"],
                "chunk_index": position,
                "text": piece,
                "text_length": len(piece),
            }
            for position, piece in enumerate(pieces)
        )

    write_json(CHUNKS_FILE, records)

    chunk_total = len(records)
    print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
    print(f"[green]Dokumentov:[/green] {pages_seen}")
    print(f"[green]Chunkov:[/green] {chunk_total}")
    print(f"[green]Výstup uložený do:[/green] {CHUNKS_FILE}")

    if records:
        # Show one sample record so the output format can be checked by eye.
        print("\n[bold]Ukážka prvého chunku:[/bold]")
        print(records[0])

    return records


def main() -> None:
    """Script entry point: run the chunk build and discard its return value."""
    build_chunks()


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()