# dp-zp-agent/scripts/build_chunks.py
#
# 149 lines
# 3.6 KiB
# Python

from __future__ import annotations
import re
import sys
from pathlib import Path
from rich import print
# Directory one level above the folder containing this file. It is put at the
# front of sys.path (once) before the `scripts.common` import that follows;
# presumably so the script also works when executed directly.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from scripts.common import CHUNKS_FILE, PAGES_ROOT, ZPWIKI_ROOT, load_zpwiki_page, write_json
# Default upper bound on the length of one chunk, in characters
# (default for `max_chars` in split_long_text).
MAX_CHARS = 1200
# Default number of characters by which the next chunk's window is moved back
# so consecutive chunks overlap (default for `overlap` in split_long_text).
OVERLAP_CHARS = 200
def clean_markdown(text: str) -> str:
    """Normalize Markdown whitespace before chunking.

    Windows line endings become plain newlines, every run of three or more
    newlines is collapsed to exactly two, and leading/trailing whitespace is
    removed.

    Args:
        text: Raw Markdown source.

    Returns:
        The normalized text.
    """
    unix_newlines = "\n".join(text.split("\r\n"))
    collapsed = re.compile(r"\n{3,}").sub("\n\n", unix_newlines)
    return collapsed.strip()
def split_by_headings(text: str) -> list[str]:
    """Split Markdown into sections that each begin at an ATX heading.

    A new section starts at every line that begins with one to six ``#``
    characters followed by whitespace. Lines inside fenced code blocks
    (opened by three or more backticks or tildes) are never treated as
    headings, so comment lines such as ``# install`` inside a code sample do
    not tear the sample apart. An unclosed fence runs to the end of the text.

    Args:
        text: Markdown source that uses newline line endings.

    Returns:
        The sections in document order, each stripped of surrounding
        whitespace; empty sections are dropped. Any text before the first
        heading forms its own section.
    """
    heading = re.compile(r"#{1,6}\s+")
    fence = re.compile(r" {0,3}(`{3,}|~{3,})(.*)")

    cut_points = [0]
    open_fence = ""  # the opening fence marker while inside a code block
    offset = 0  # index in `text` where the current line starts
    for line in text.split("\n"):
        marker = fence.match(line)
        if open_fence:
            # A fence is closed only by the same character, repeated at least
            # as many times as in the opener, with nothing else on the line.
            if (
                marker
                and marker.group(1)[0] == open_fence[0]
                and len(marker.group(1)) >= len(open_fence)
                and not marker.group(2).strip()
            ):
                open_fence = ""
        elif marker and not (marker.group(1)[0] == "`" and "`" in marker.group(2)):
            # A backtick run followed by more backticks on the same line is
            # inline code, not the start of a fenced block.
            open_fence = marker.group(1)
        elif heading.match(text, offset):
            cut_points.append(offset)
        offset += len(line) + 1
    cut_points.append(len(text))

    sections = (text[begin:end].strip() for begin, end in zip(cut_points, cut_points[1:]))
    return [section for section in sections if section]
def find_split_position(text: str, max_chars: int) -> int:
    """Find a better place to cut so a chunk does not end at a random spot.

    Text that already fits within ``max_chars`` is not cut. Otherwise the
    first ``max_chars`` characters are searched for the last occurrence of a
    separator, tried in order of preference: blank line, newline, sentence
    end (". "), space. A separator is accepted only if it lies at or beyond
    60 % of ``max_chars``.

    Args:
        text: Text to be cut.
        max_chars: Maximum length of the leading piece.

    Returns:
        The index just past the accepted separator, ``len(text)`` when the
        text fits, or ``max_chars`` when no separator is acceptable.
    """
    total = len(text)
    if total <= max_chars:
        return total

    window = text[:max_chars]
    earliest_allowed = int(max_chars * 0.6)
    candidates = (
        (window.rfind(mark), len(mark)) for mark in ("\n\n", "\n", ". ", " ")
    )
    return next(
        (found + width for found, width in candidates if found >= earliest_allowed),
        max_chars,
    )
def split_long_text(
    text: str,
    max_chars: int = MAX_CHARS,
    overlap: int = OVERLAP_CHARS,
) -> list[str]:
    """Split text into overlapping chunks of at most ``max_chars`` characters.

    Cut positions are chosen by ``find_split_position``. After each cut the
    window is moved back by ``overlap`` characters so that consecutive chunks
    share context; it always advances by at least one character.

    Every chunk is stripped of surrounding whitespace and empty chunks are
    dropped. This also applies to text that fits into a single chunk, so
    empty or whitespace-only input yields an empty list.

    Args:
        text: Text to split.
        max_chars: Maximum chunk length before stripping.
        overlap: Number of characters repeated from the end of one chunk at
            the start of the next.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``max_chars`` is not greater than ``overlap``, or if
            ``overlap`` is negative.
    """
    if max_chars <= overlap:
        raise ValueError("max_chars musí byť väčšie ako overlap")
    if overlap < 0:
        # A negative overlap would push the window past text that was never
        # emitted, silently dropping content.
        raise ValueError("overlap nesmie byť záporný")

    chunks: list[str] = []
    start = 0
    while start < len(text):
        remaining = text[start:]
        is_last = len(remaining) <= max_chars
        split_at = len(remaining) if is_last else find_split_position(remaining, max_chars)

        chunk = remaining[:split_at].strip()
        if chunk:
            chunks.append(chunk)
        if is_last:
            break

        # max(1, ...) guarantees progress even when overlap >= split_at.
        start += max(1, split_at - overlap)
    return chunks
def chunk_markdown(text: str) -> list[str]:
    """Split Markdown first by headings and then by length.

    The text is normalized with ``clean_markdown``, divided into heading
    sections, and each section is broken up by ``split_long_text``.

    Args:
        text: Raw Markdown source.

    Returns:
        All chunks in document order; an empty list when the cleaned text is
        empty.
    """
    cleaned = clean_markdown(text)
    if not cleaned:
        return []
    return [
        piece
        for section in split_by_headings(cleaned)
        for piece in split_long_text(section)
    ]
def build_chunks() -> list[dict]:
    """Chunk every wiki page and write the result to ``CHUNKS_FILE``.

    Every ``README.md`` found recursively under ``PAGES_ROOT`` is loaded with
    ``load_zpwiki_page`` (in sorted path order) and split by
    ``chunk_markdown``. Each chunk becomes one record carrying the page
    metadata, its index within the page, the text and the text length. The
    records are saved with ``write_json`` and a short summary is printed.

    Returns:
        The list of chunk records that was written.

    Raises:
        SystemExit: If ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    page_files = sorted(PAGES_ROOT.glob("**/README.md"))
    all_chunks = []
    for page_file in page_files:
        page = load_zpwiki_page(page_file)
        for position, piece in enumerate(chunk_markdown(page["content"])):
            record = {
                "chunk_id": f"{page['path']}::chunk-{position}",
                "document_path": page["path"],
            }
            # Page-level metadata is copied unchanged onto every chunk.
            for field in ("title", "categories", "tags", "author", "published"):
                record[field] = page[field]
            record["chunk_index"] = position
            record["text"] = piece
            record["text_length"] = len(piece)
            all_chunks.append(record)
    document_count = len(page_files)

    write_json(CHUNKS_FILE, all_chunks)

    print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
    print(f"[green]Dokumentov:[/green] {document_count}")
    print(f"[green]Chunkov:[/green] {len(all_chunks)}")
    print(f"[green]Výstup uložený do:[/green] {CHUNKS_FILE}")

    if all_chunks:
        print("\n[bold]Ukážka prvého chunku:[/bold]")
        print(all_chunks[0])

    return all_chunks
def main() -> None:
    """Script entry point: build the chunks and discard the returned list."""
    build_chunks()


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()