dp-zp-agent/scripts/scan_zpwiki.py
2026-06-03 21:04:03 +02:00

140 lines
3.6 KiB
Python

from pathlib import Path
from collections import Counter
import json
import frontmatter
from rich import print
# Root of the zpwiki checkout; a relative path, so it is resolved against the
# current working directory at run time.
ZPWIKI_ROOT = Path("../zpwiki")
# Directory below the wiki root that main() searches for README.md pages.
PAGES_ROOT = ZPWIKI_ROOT / "pages"
# JSON file that receives the list of scanned documents (also a relative path).
OUTPUT_FILE = Path("data/documents.json")
def json_safe(value):
    """Return a copy of ``value`` that ``json.dump`` can serialise.

    ``None`` and ``str``/``int``/``float``/``bool`` values are returned as
    they are. Lists are converted element by element, dicts get their keys
    stringified and their values converted recursively, and every other
    object is replaced by its ``str()`` representation.
    """
    if value is None or isinstance(value, (bool, int, float, str)):
        return value
    if isinstance(value, dict):
        return {str(name): json_safe(entry) for name, entry in value.items()}
    if isinstance(value, list):
        return list(map(json_safe, value))
    # Anything else (tuples, bytes, arbitrary objects) falls back to str().
    return str(value)
def normalize_list(value):
    """Normalise a front-matter value into a list of non-empty strings.

    * ``None`` gives an empty list.
    * A list has every element converted with ``str()`` and stripped;
      elements that are ``None`` or blank after stripping are dropped.
    * A string is split on commas; the pieces are stripped and blank
      pieces are dropped.
    * Any other value gives a one-element list holding ``str(value)``.
    """
    if value is None:
        return []
    if isinstance(value, list):
        # An empty list entry parses as None; skip it so it does not turn
        # into the literal string "None".
        candidates = [str(item) for item in value if item is not None]
    elif isinstance(value, str):
        candidates = value.split(",")
    else:
        return [str(value)]
    # Strip each candidate once, then discard the ones that end up empty.
    stripped = (candidate.strip() for candidate in candidates)
    return [text for text in stripped if text]
def _print_counter(heading, counter, limit):
    """Print ``heading`` in bold, then the ``limit`` most common counter entries."""
    from rich.markup import escape

    print(f"\n[bold]{heading}[/bold]")
    for key, count in counter.most_common(limit):
        # Keys come from page front matter. Escape them so text such as
        # "[draft]" is shown literally instead of being parsed as rich markup.
        print(f"{escape(str(key))}: {count}")


def main():
    """Scan every ``README.md`` below ``PAGES_ROOT`` and summarise its front matter.

    For each page the front matter is parsed and made JSON-safe, and the
    categories, tags and author are read from the top level of the metadata
    or, failing that, from its ``taxonomy`` mapping. One record per page is
    written to ``OUTPUT_FILE`` as JSON, and frequency tables for metadata
    keys, categories, tags and authors are printed, followed by the first
    record.

    Raises:
        SystemExit: if ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    # Sorted so that the output file has a stable document order.
    markdown_files = sorted(PAGES_ROOT.glob("**/README.md"))
    documents = []
    metadata_keys = Counter()
    categories_counter = Counter()
    tags_counter = Counter()
    authors_counter = Counter()

    for file_path in markdown_files:
        post = frontmatter.load(file_path)
        metadata = {key: json_safe(value) for key, value in post.metadata.items()}
        content = post.content.strip()

        # Only a mapping can be queried with .get(); a missing, empty or
        # malformed (string/list) taxonomy is treated as having no entries.
        raw_taxonomy = metadata.get("taxonomy")
        taxonomy = raw_taxonomy if isinstance(raw_taxonomy, dict) else {}

        # Top-level keys take precedence over the ones nested in taxonomy.
        categories = normalize_list(
            metadata.get("category")
            or taxonomy.get("category")
        )
        tags = normalize_list(
            metadata.get("tag")
            or metadata.get("tags")
            or taxonomy.get("tag")
            or taxonomy.get("tags")
        )
        author = metadata.get("author") or taxonomy.get("author")

        # Pass the keys view, not the dict itself: Counter.update() would
        # otherwise read the dict's values as counts.
        metadata_keys.update(metadata.keys())
        categories_counter.update(categories)
        tags_counter.update(tags)
        if author:
            authors_counter[str(author)] += 1

        documents.append({
            "path": str(file_path.relative_to(ZPWIKI_ROOT)),
            "title": metadata.get("title"),
            "categories": categories,
            "tags": tags,
            "published": metadata.get("published"),
            "author": author,
            "taxonomy": taxonomy,
            "metadata": metadata,
            "content_preview": content[:500],
            "content_length": len(content),
        })

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_FILE.open("w", encoding="utf-8") as file:
        json.dump(documents, file, ensure_ascii=False, indent=2)

    print(f"[green]Našiel som README.md súborov:[/green] {len(markdown_files)}")
    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")

    _print_counter("Najčastejšie metadata kľúče:", metadata_keys, 30)
    _print_counter("Najčastejšie kategórie:", categories_counter, 30)
    _print_counter("Najčastejšie tagy:", tags_counter, 40)
    _print_counter("Najčastejší autori:", authors_counter, 20)

    print("\n[bold]Ukážka prvého dokumentu:[/bold]")
    if documents:
        print(documents[0])
# Run the scan only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()