"""Split ZPWIKI Markdown pages into overlapping text chunks.

Every ``README.md`` found under ``PAGES_ROOT`` is loaded with
``load_zpwiki_page``, cut first at Markdown headings and then by length, and
the resulting chunk records are handed to ``write_json`` for ``CHUNKS_FILE``.
"""

from __future__ import annotations

import re
import sys
from pathlib import Path

from rich import print

# Make the project root importable so that ``scripts.common`` resolves when
# this file is executed directly as a script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from scripts.common import (  # noqa: E402  (must follow the sys.path tweak)
    CHUNKS_FILE,
    PAGES_ROOT,
    ZPWIKI_ROOT,
    load_zpwiki_page,
    write_json,
)

MAX_CHARS = 1200
OVERLAP_CHARS = 200

# Zero-width match at the start of every line that looks like an ATX heading.
_HEADING_RE = re.compile(r"(?m)(?=^#{1,6}\s+)")
# A fence line: up to three spaces of indent, then at least three backticks or
# tildes, then the rest of the line (info string for an opener, blank for a closer).
_FENCE_RE = re.compile(r"(?m)^ {0,3}(`{3,}|~{3,})([^\n]*)$")


def clean_markdown(text: str) -> str:
    """Normalize CRLF line endings, collapse 3+ newlines to two, and strip the ends."""
    text = text.replace("\r\n", "\n")
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _fenced_code_spans(text: str) -> list[tuple[int, int]]:
    """Return ``(start, end)`` offsets of the fenced code blocks found in ``text``.

    A fence is closed by a later fence line that uses the same character, is at
    least as long as the opener, and carries no trailing text. A fence that is
    never closed extends to the end of ``text``.
    """
    spans: list[tuple[int, int]] = []
    open_start: int | None = None
    open_marker = ""

    for match in _FENCE_RE.finditer(text):
        marker, rest = match.group(1), match.group(2)
        if open_start is None:
            # A backtick opener may not contain backticks in its info string;
            # such a line is inline code (e.g. "``` x ```"), not a fence.
            if marker[0] == "`" and "`" in rest:
                continue
            open_start, open_marker = match.start(), marker
        elif (
            marker[0] == open_marker[0]
            and len(marker) >= len(open_marker)
            and not rest.strip()
        ):
            spans.append((open_start, match.end()))
            open_start = None

    if open_start is not None:
        spans.append((open_start, len(text)))
    return spans


def split_by_headings(text: str) -> list[str]:
    """Split ``text`` into sections, each starting at a Markdown heading.

    A heading is a line that begins with one to six ``#`` characters followed
    by whitespace. Lines of that shape inside fenced code blocks (for example
    ``# comment`` in a shell or Python snippet) are not headings and do not
    start a new section. Sections are stripped and empty ones are dropped.
    """
    fences = _fenced_code_spans(text)
    cut_points = [
        match.start()
        for match in _HEADING_RE.finditer(text)
        if not any(start <= match.start() < end for start, end in fences)
    ]

    bounds = [0, *cut_points, len(text)]
    sections = (text[begin:end].strip() for begin, end in zip(bounds, bounds[1:]))
    return [section for section in sections if section]


def find_split_position(text: str, max_chars: int) -> int:
    """Find a cut index for ``text`` so a chunk does not end at a random spot.

    Returns ``len(text)`` when the text already fits into ``max_chars``.
    Otherwise the separators blank line, newline, ``". "`` and space are tried
    in that order; for each, only its last occurrence within the first
    ``max_chars`` characters is considered, and it is accepted when it lies at
    or beyond 60 % of ``max_chars``. The returned index is just past the
    accepted separator. If no separator qualifies, ``max_chars`` is returned.
    """
    if len(text) <= max_chars:
        return len(text)

    search_area = text[:max_chars]
    min_position = int(max_chars * 0.6)

    for separator in ("\n\n", "\n", ". ", " "):
        position = search_area.rfind(separator)
        if position >= min_position:
            return position + len(separator)

    return max_chars


def split_long_text(
    text: str,
    max_chars: int = MAX_CHARS,
    overlap: int = OVERLAP_CHARS,
) -> list[str]:
    """Split ``text`` into chunks of at most ``max_chars`` characters.

    Each following chunk starts ``overlap`` characters before the previous cut
    so neighbouring chunks share context. Text that already fits is returned
    unchanged as a single-element list; otherwise chunks are stripped and empty
    ones are skipped.

    Raises:
        ValueError: if ``overlap`` is negative or ``max_chars <= overlap``.
    """
    # A negative overlap would advance the window past the cut and silently
    # drop the text in between.
    if overlap < 0:
        raise ValueError("overlap nesmie byť záporný")
    if max_chars <= overlap:
        raise ValueError("max_chars musí byť väčšie ako overlap")

    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    start = 0

    while start < len(text):
        remaining = text[start:]

        if len(remaining) <= max_chars:
            chunk = remaining.strip()
            if chunk:
                chunks.append(chunk)
            break

        split_at = find_split_position(remaining, max_chars)
        chunk = remaining[:split_at].strip()
        if chunk:
            chunks.append(chunk)

        # Always move forward by at least one character, even when the cut
        # lands inside the overlap window, so the loop terminates.
        start += max(1, split_at - overlap)

    return chunks


def chunk_markdown(text: str) -> list[str]:
    """Split Markdown first by headings and then by length."""
    text = clean_markdown(text)
    if not text:
        return []

    chunks: list[str] = []
    for part in split_by_headings(text):
        chunks.extend(split_long_text(part))
    return chunks


def build_chunks() -> list[dict]:
    """Chunk every ``README.md`` under ``PAGES_ROOT`` and write the result.

    Each chunk record carries the ``path``, ``title``, ``categories``,
    ``tags``, ``author`` and ``published`` values of the document returned by
    ``load_zpwiki_page``, plus the chunk index, text and text length. The full
    list is passed to ``write_json`` with ``CHUNKS_FILE``, a short summary is
    printed, and the list is returned.

    Raises:
        SystemExit: if ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    all_chunks: list[dict] = []
    document_count = 0

    # Sorted so that the output order is stable between runs.
    for file_path in sorted(PAGES_ROOT.glob("**/README.md")):
        document = load_zpwiki_page(file_path)
        document_count += 1

        for index, text in enumerate(chunk_markdown(document["content"])):
            all_chunks.append(
                {
                    "chunk_id": f"{document['path']}::chunk-{index}",
                    "document_path": document["path"],
                    "title": document["title"],
                    "categories": document["categories"],
                    "tags": document["tags"],
                    "author": document["author"],
                    "published": document["published"],
                    "chunk_index": index,
                    "text": text,
                    "text_length": len(text),
                }
            )

    write_json(CHUNKS_FILE, all_chunks)

    print(f"[green]ZPWIKI_ROOT:[/green] {ZPWIKI_ROOT}")
    print(f"[green]Dokumentov:[/green] {document_count}")
    print(f"[green]Chunkov:[/green] {len(all_chunks)}")
    print(f"[green]Výstup uložený do:[/green] {CHUNKS_FILE}")

    if all_chunks:
        print("\n[bold]Ukážka prvého chunku:[/bold]")
        print(all_chunks[0])

    return all_chunks


def main() -> None:
    """Script entry point: build and save the chunks."""
    build_chunks()


if __name__ == "__main__":
    main()