"""Export front-matter metadata of the wiki's README.md pages to JSON.

Every ``README.md`` found under ``PAGES_ROOT`` is parsed with
``frontmatter``; one record per page is written to ``OUTPUT_FILE`` and
frequency statistics (metadata keys, categories, tags, authors) are printed.
"""

import json
from collections import Counter
from pathlib import Path
from typing import Any

import frontmatter
from rich import print
from rich.markup import escape

ZPWIKI_ROOT = Path("../zpwiki")
PAGES_ROOT = ZPWIKI_ROOT / "pages"
OUTPUT_FILE = Path("data/documents.json")


def json_safe(value: Any) -> Any:
    """Return *value* converted to something ``json.dump`` can serialize.

    ``None``, strings, numbers and booleans are returned unchanged. Lists and
    dicts are converted recursively (dict keys are stringified). Any other
    object is replaced by its ``str()`` representation.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, list):
        return [json_safe(item) for item in value]
    if isinstance(value, dict):
        return {str(key): json_safe(val) for key, val in value.items()}
    return str(value)


def normalize_list(value: Any) -> list:
    """Return *value* as a list of strings.

    * ``None`` gives an empty list.
    * A list gives its items stringified and stripped, with blank items
      dropped.
    * A string is split on commas, with the parts stripped and blank parts
      dropped.
    * Anything else gives a one-element list holding ``str(value)``.
    """
    if value is None:
        return []
    if isinstance(value, list):
        stripped = (str(item).strip() for item in value)
        return [text for text in stripped if text]
    if isinstance(value, str):
        parts = (part.strip() for part in value.split(","))
        return [part for part in parts if part]
    return [str(value)]


def _build_document(file_path: Path) -> dict:
    """Parse one README.md file and return its exported record."""
    post = frontmatter.load(file_path)

    # Run the whole mapping through json_safe so that top-level keys are
    # stringified exactly like nested ones; YAML keys need not be strings and
    # json.dump rejects keys of unsupported types.
    metadata = json_safe(dict(post.metadata))

    # Front matter may hold a non-mapping "taxonomy" (e.g. a plain string);
    # fall back to an empty dict so the .get() lookups below cannot fail.
    taxonomy = metadata.get("taxonomy")
    if not isinstance(taxonomy, dict):
        taxonomy = {}

    content = post.content.strip()

    categories = normalize_list(
        metadata.get("category") or taxonomy.get("category")
    )
    tags = normalize_list(
        metadata.get("tag")
        or metadata.get("tags")
        or taxonomy.get("tag")
        or taxonomy.get("tags")
    )
    author = metadata.get("author") or taxonomy.get("author")

    return {
        "path": str(file_path.relative_to(ZPWIKI_ROOT)),
        "title": metadata.get("title"),
        "categories": categories,
        "tags": tags,
        "published": metadata.get("published"),
        "author": author,
        "taxonomy": taxonomy,
        "metadata": metadata,
        "content_preview": content[:500],
        "content_length": len(content),
    }


def _print_counter(heading: str, counter: Counter, limit: int) -> None:
    """Print *heading* in bold followed by the *limit* most common entries."""
    print(f"\n[bold]{heading}[/bold]")
    for key, count in counter.most_common(limit):
        # Keys come from page content; escape them so that square brackets
        # are printed literally instead of being parsed as rich markup.
        print(f"{escape(str(key))}: {count}")


def main() -> None:
    """Export all pages to ``OUTPUT_FILE`` and print summary statistics.

    Raises:
        SystemExit: if ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    # Sorted so that the output order is stable between runs.
    markdown_files = sorted(PAGES_ROOT.glob("**/README.md"))

    documents = []
    metadata_keys = Counter()
    categories_counter = Counter()
    tags_counter = Counter()
    authors_counter = Counter()

    for file_path in markdown_files:
        document = _build_document(file_path)
        documents.append(document)

        metadata_keys.update(document["metadata"].keys())
        categories_counter.update(document["categories"])
        tags_counter.update(document["tags"])
        if document["author"]:
            authors_counter[str(document["author"])] += 1

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_FILE.open("w", encoding="utf-8") as file:
        json.dump(documents, file, ensure_ascii=False, indent=2)

    print(f"[green]Našiel som README.md súborov:[/green] {len(markdown_files)}")
    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")

    _print_counter("Najčastejšie metadata kľúče:", metadata_keys, 30)
    _print_counter("Najčastejšie kategórie:", categories_counter, 30)
    _print_counter("Najčastejšie tagy:", tags_counter, 40)
    _print_counter("Najčastejší autori:", authors_counter, 20)

    print("\n[bold]Ukážka prvého dokumentu:[/bold]")
    if documents:
        print(documents[0])


if __name__ == "__main__":
    main()