140 lines
3.6 KiB
Python
140 lines
3.6 KiB
Python
from pathlib import Path
|
|
from collections import Counter
|
|
import json
|
|
import frontmatter
|
|
from rich import print
|
|
|
|
|
|
# Root directory of the wiki content. The path is relative, so it is
# interpreted against the current working directory when the script runs.
ZPWIKI_ROOT = Path("../zpwiki")

# Sub-directory of the wiki root that is searched for README.md pages.
PAGES_ROOT = ZPWIKI_ROOT.joinpath("pages")

# Destination of the exported JSON document list (also a relative path).
OUTPUT_FILE = Path("data/documents.json")
|
|
|
|
|
|
def json_safe(value):
    """Recursively convert *value* into a structure ``json.dump`` accepts.

    Args:
        value: Any object, typically a value parsed from page front matter.

    Returns:
        ``None`` and ``str``/``int``/``float``/``bool`` values unchanged;
        lists and tuples as lists of converted items; sets as lists of
        converted items ordered by their string form; dicts as dicts with
        string keys and converted values; anything else as ``str(value)``.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value

    # Tuples are sequences too: emit a JSON array instead of falling through
    # to ``str()`` and producing a Python repr such as "(1, 2)".
    if isinstance(value, (list, tuple)):
        return [json_safe(item) for item in value]

    # Sets have no stable iteration order; sort by string form so repeated
    # runs produce the same output.
    if isinstance(value, (set, frozenset)):
        return sorted((json_safe(item) for item in value), key=str)

    if isinstance(value, dict):
        return {str(key): json_safe(val) for key, val in value.items()}

    # Fallback for every other type (for example date objects).
    return str(value)
|
|
|
|
|
|
def normalize_list(value):
    """Normalize a front-matter field into a list of non-empty strings.

    Args:
        value: ``None``, a comma-separated string, a list/tuple of items,
            or any other single value.

    Returns:
        A list of stripped strings. ``None`` yields ``[]``; a string is
        split on commas; a list or tuple is converted item by item; any
        other value becomes a one-element list. Empty or whitespace-only
        entries are dropped, as are ``None`` entries inside a sequence.
    """
    if value is None:
        return []

    if isinstance(value, str):
        candidates = value.split(",")
    elif isinstance(value, (list, tuple)):
        # Skip None entries (e.g. an empty list item in the source file) so
        # they are not turned into the literal string "None".
        candidates = [item for item in value if item is not None]
    else:
        candidates = [value]

    # Strip once per item, then discard whatever ended up empty.
    stripped = (str(item).strip() for item in candidates)
    return [text for text in stripped if text]
|
|
|
|
|
|
def main():
    """Export the front matter of every wiki page to a JSON file.

    Every ``README.md`` found recursively under ``PAGES_ROOT`` is parsed
    with ``frontmatter.load``. One record per page (path relative to
    ``ZPWIKI_ROOT``, title, categories, tags, published flag, author,
    taxonomy, full metadata, a 500-character content preview and the
    content length) is written to ``OUTPUT_FILE``. Frequency summaries of
    metadata keys, categories, tags and authors are then printed, followed
    by the first exported record.

    Raises:
        SystemExit: If ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    # Sorted so the order of the exported records is reproducible.
    markdown_files = sorted(PAGES_ROOT.glob("**/README.md"))

    documents = []
    metadata_keys = Counter()
    categories_counter = Counter()
    tags_counter = Counter()
    authors_counter = Counter()

    for file_path in markdown_files:
        post = frontmatter.load(file_path)

        metadata = {
            key: json_safe(value)
            for key, value in post.metadata.items()
        }

        # A page may declare ``taxonomy`` as something other than a mapping
        # (a bare string, a list, ...). Fall back to an empty mapping so the
        # ``.get`` lookups below cannot raise AttributeError.
        taxonomy = metadata.get("taxonomy")
        if not isinstance(taxonomy, dict):
            taxonomy = {}

        content = post.content.strip()

        # Pass the keys view, not the dict itself: Counter.update() with a
        # mapping would add the mapping's values as counts.
        metadata_keys.update(metadata.keys())

        # Top-level fields take precedence over the ones under ``taxonomy``.
        categories = normalize_list(
            metadata.get("category")
            or taxonomy.get("category")
        )
        tags = normalize_list(
            metadata.get("tag")
            or metadata.get("tags")
            or taxonomy.get("tag")
            or taxonomy.get("tags")
        )
        author = metadata.get("author") or taxonomy.get("author")

        categories_counter.update(categories)
        tags_counter.update(tags)
        if author:
            authors_counter[str(author)] += 1

        relative_path = file_path.relative_to(ZPWIKI_ROOT)

        documents.append({
            "path": str(relative_path),
            "title": metadata.get("title"),
            "categories": categories,
            "tags": tags,
            "published": metadata.get("published"),
            "author": author,
            "taxonomy": taxonomy,
            "metadata": metadata,
            "content_preview": content[:500],
            "content_length": len(content),
        })

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    with OUTPUT_FILE.open("w", encoding="utf-8") as file:
        json.dump(documents, file, ensure_ascii=False, indent=2)

    print(f"[green]Našiel som README.md súborov:[/green] {len(markdown_files)}")
    print(f"[green]Výstup uložený do:[/green] {OUTPUT_FILE}")

    # (heading, counter, number of entries to show) for each summary table.
    summaries = (
        ("\n[bold]Najčastejšie metadata kľúče:[/bold]", metadata_keys, 30),
        ("\n[bold]Najčastejšie kategórie:[/bold]", categories_counter, 30),
        ("\n[bold]Najčastejšie tagy:[/bold]", tags_counter, 40),
        ("\n[bold]Najčastejší autori:[/bold]", authors_counter, 20),
    )
    for heading, counter, limit in summaries:
        print(heading)
        for key, count in counter.most_common(limit):
            print(f"{key}: {count}")

    print("\n[bold]Ukážka prvého dokumentu:[/bold]")
    if documents:
        print(documents[0])
|
|
|
|
|
|
# Script entry point: run the export only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|