"""Split wiki Markdown pages into text chunks and store them as JSON.

Every ``README.md`` found below ``PAGES_ROOT`` is parsed for front matter, its
body is cut at Markdown headings (and further by length where a section is too
long), and one JSON record per chunk is written to ``OUTPUT_FILE``.
"""

import json
import re
from pathlib import Path
from typing import Any, Optional

import frontmatter
from rich import print

# Both paths are relative, so they resolve against the current working
# directory of the process.
ZPWIKI_ROOT = Path("../zpwiki")
PAGES_ROOT = ZPWIKI_ROOT / "pages"
OUTPUT_FILE = Path("data/chunks.json")

# Upper bound on chunk size and the number of characters shared by two
# consecutive chunks of an over-long section.
MAX_CHARS = 1200
OVERLAP_CHARS = 200

# Zero-width match in front of every line that starts with 1-6 '#' characters
# followed by whitespace.
_HEADING_SPLIT_RE = re.compile(r"(?m)(?=^#{1,6}\s+)")

# A fenced-code delimiter line: up to three spaces of indent, then at least
# three backticks or tildes, then the rest of the line.
_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$", re.MULTILINE)


def json_safe(value: Any) -> Any:
    """Return *value* converted to something ``json.dump`` can serialise.

    ``None``, strings, numbers and booleans pass through unchanged, lists and
    dicts are converted recursively (dict keys become strings), and any other
    object is replaced by its ``str()`` representation.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, list):
        return [json_safe(item) for item in value]
    if isinstance(value, dict):
        return {str(key): json_safe(val) for key, val in value.items()}
    return str(value)


def normalize_list(value: Any) -> list[str]:
    """Coerce a front-matter field into a list of strings.

    * ``None`` gives an empty list.
    * A list gives its items as stripped strings, blank items dropped.
    * A string is split on commas, items stripped, blank items dropped.
    * Anything else gives a one-element list holding ``str(value)``.
    """
    if value is None:
        return []
    if isinstance(value, list):
        stripped = (str(item).strip() for item in value)
        return [item for item in stripped if item]
    if isinstance(value, str):
        stripped = (item.strip() for item in value.split(","))
        return [item for item in stripped if item]
    return [str(value)]


def clean_markdown(text: str) -> str:
    """Normalise line endings and blank-line runs, then strip the text."""
    # Convert CRLF first so the lone-CR pass below cannot double the breaks.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Collapse three or more consecutive newlines into a single blank line.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _fence_after(text: str, open_fence: Optional[str]) -> Optional[str]:
    """Return the code fence left open after scanning *text*.

    *open_fence* is the delimiter (e.g. three backticks) that is already open
    when *text* starts, or ``None``. A fence is closed by a delimiter line of
    the same character that is at least as long and has nothing after it.
    """
    for match in _FENCE_RE.finditer(text):
        marker, rest = match.groups()
        if open_fence is None:
            open_fence = marker
        elif (
            marker[0] == open_fence[0]
            and len(marker) >= len(open_fence)
            and not rest.strip()
        ):
            open_fence = None
    return open_fence


def split_by_headings(text: str) -> list[str]:
    """Split *text* into sections, each starting at a Markdown heading.

    Lines inside fenced code blocks that merely look like headings (for
    example ``# comment`` in a shell snippet) do not start a new section.
    Sections are stripped and empty ones are dropped.
    """
    sections = []
    pending = []
    open_fence = None
    for piece in _HEADING_SPLIT_RE.split(text):
        pending.append(piece)
        open_fence = _fence_after(piece, open_fence)
        # Emit the section only once every fence opened in it is closed;
        # until then the "heading" that caused the split was really code.
        if open_fence is None:
            sections.append("".join(pending))
            pending = []
    if pending:
        # An unterminated fence: keep the remaining text as one section.
        sections.append("".join(pending))

    stripped = (section.strip() for section in sections)
    return [section for section in stripped if section]


def split_long_text(
    text: str,
    max_chars: int = MAX_CHARS,
    overlap: int = OVERLAP_CHARS,
) -> list[str]:
    """Cut *text* into windows of at most *max_chars* characters.

    Consecutive windows share *overlap* characters. Text that already fits is
    returned unchanged as a single-element list. Each window is stripped and
    windows that are empty after stripping are omitted.

    Raises:
        ValueError: if the text has to be split and the sizes do not satisfy
            ``max_chars > 0`` and ``0 <= overlap < max_chars``.
    """
    if len(text) <= max_chars:
        return [text]

    # Checked only when splitting is actually needed, so calls that used to
    # return early keep doing so. Without this guard a non-positive step
    # would make the loop below run forever.
    if max_chars <= 0 or not 0 <= overlap < max_chars:
        raise ValueError(
            "split_long_text requires max_chars > 0 and "
            f"0 <= overlap < max_chars (got max_chars={max_chars}, "
            f"overlap={overlap})"
        )

    step = max_chars - overlap
    chunks = []
    start = 0
    while True:
        end = start + max_chars
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Stop as soon as a window reaches the end of the text; starting
        # another one would only repeat the already-covered tail.
        if end >= len(text):
            break
        start += step
    return chunks


def chunk_markdown(
    text: str,
    max_chars: int = MAX_CHARS,
    overlap: int = OVERLAP_CHARS,
) -> list[str]:
    """Clean *text* and split it into chunks of at most *max_chars* characters.

    The text is first cut at headings; any section longer than *max_chars* is
    further cut into overlapping windows. Empty input gives an empty list.
    """
    text = clean_markdown(text)
    if not text:
        return []

    chunks = []
    for part in split_by_headings(text):
        # split_long_text returns short sections unchanged as [part].
        chunks.extend(split_long_text(part, max_chars, overlap))
    return chunks


def extract_document(file_path: Path) -> dict:
    """Load one Markdown file and return its content plus normalised metadata.

    Categories, tags and author are read from the top level of the front
    matter first and from a nested ``taxonomy`` mapping as a fallback. The
    returned ``path`` is relative to ``ZPWIKI_ROOT``.
    """
    post = frontmatter.load(file_path)
    metadata = {key: json_safe(value) for key, value in post.metadata.items()}

    # A scalar or list under "taxonomy" has no .get(); treat it as absent
    # instead of failing on the whole document.
    taxonomy = metadata.get("taxonomy")
    if not isinstance(taxonomy, dict):
        taxonomy = {}

    categories = normalize_list(
        metadata.get("category") or taxonomy.get("category")
    )
    tags = normalize_list(
        metadata.get("tag")
        or metadata.get("tags")
        or taxonomy.get("tag")
        or taxonomy.get("tags")
    )
    author = metadata.get("author") or taxonomy.get("author")

    relative_path = file_path.relative_to(ZPWIKI_ROOT)

    return {
        "path": str(relative_path),
        "title": metadata.get("title"),
        "categories": categories,
        "tags": tags,
        "published": metadata.get("published"),
        "author": author,
        "content": post.content.strip(),
        "metadata": metadata,
    }


def main() -> None:
    """Chunk every ``README.md`` under ``PAGES_ROOT`` and write ``OUTPUT_FILE``.

    Raises:
        SystemExit: if ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    # Sorted so the order of the output records is reproducible.
    markdown_files = sorted(PAGES_ROOT.glob("**/README.md"))

    all_chunks = []
    document_count = 0

    for document_count, file_path in enumerate(markdown_files, start=1):
        document = extract_document(file_path)
        chunks = chunk_markdown(document["content"])

        for index, chunk_text in enumerate(chunks):
            all_chunks.append({
                "chunk_id": f"{document['path']}::chunk-{index}",
                "document_path": document["path"],
                "title": document["title"],
                "categories": document["categories"],
                "tags": document["tags"],
                "author": document["author"],
                "published": document["published"],
                "chunk_index": index,
                "text": chunk_text,
                "text_length": len(chunk_text),
            })

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    with OUTPUT_FILE.open("w", encoding="utf-8") as file:
        json.dump(all_chunks, file, ensure_ascii=False, indent=2)

    print(f"[green]Dokumentov:[/green] {document_count}")
    print(f"[green]Chunkov:[/green] {len(all_chunks)}")
    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")

    if all_chunks:
        print("\n[bold]Ukážka prvého chunku:[/bold]")
        print(all_chunks[0])


if __name__ == "__main__":
    main()