97 lines
2.4 KiB
Python
97 lines
2.4 KiB
Python
from __future__ import annotations
|
|
|
|
import sys
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
from rich import print
|
|
|
|
|
|
# Directory one level above the folder that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Put that directory at the front of the import path (once), so the
# ``scripts.common`` import below resolves when this file is run directly.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
|
|
|
|
|
|
from scripts.common import DOCUMENTS_FILE, PAGES_ROOT, ZPWIKI_ROOT, load_zpwiki_page, write_json
|
|
|
|
|
|
def scan_pages() -> list[dict]:
    """Index every ``README.md`` below ``PAGES_ROOT`` and persist the result.

    Each file is parsed with ``load_zpwiki_page``. The page's ``content``
    entry is removed from the stored record and replaced by
    ``content_preview`` (``content[:500]``) and ``content_length``.
    Occurrences of metadata keys, categories, tags and authors are tallied
    along the way; the records are written to ``DOCUMENTS_FILE`` and a
    summary is printed via ``print_summary``.

    Returns:
        The document records, in sorted file-path order.

    Raises:
        SystemExit: If ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    records = []
    key_counts = Counter()
    category_counts = Counter()
    tag_counts = Counter()
    author_counts = Counter()

    readme_paths = sorted(PAGES_ROOT.glob("**/README.md"))
    for readme in readme_paths:
        page = load_zpwiki_page(readme)
        body = page.pop("content", "")

        # Passing an iterator (not the container itself) makes Counter.update
        # count one occurrence per element even when the container is a
        # mapping, which matches a plain ``for`` loop over it.
        key_counts.update(iter(page["metadata"]))
        category_counts.update(iter(page["categories"]))
        tag_counts.update(iter(page["tags"]))

        author = page.get("author")
        if author:
            author_counts[str(author)] += 1

        # Keep the page's own keys first, then append the derived fields.
        record = dict(page)
        record["content_preview"] = body[:500]
        record["content_length"] = len(body)
        records.append(record)

    write_json(DOCUMENTS_FILE, records)
    print_summary(records, key_counts, category_counts, tag_counts, author_counts)

    return records
|
|
|
|
|
|
def print_summary(
    documents: list[dict],
    metadata_keys: Counter,
    categories: Counter,
    tags: Counter,
    authors: Counter,
) -> None:
    """Print a console report for a finished scan.

    The report shows ``ZPWIKI_ROOT``, the number of documents, the output
    file, the most common metadata keys (top 30), categories (top 30),
    tags (top 40) and authors (top 20), and finally the first document
    record when there is one.

    Args:
        documents: Document records produced by the scan.
        metadata_keys: Occurrence counts of metadata keys.
        categories: Occurrence counts of categories.
        tags: Occurrence counts of tags.
        authors: Occurrence counts of authors.
    """
    # This module's ``print`` is ``rich.print``, which parses ``[...]`` in
    # strings as console markup. Values taken from the scanned pages are
    # escaped so that text such as "[draft]" is shown literally instead of
    # being interpreted as a style tag (or raising on a stray closing tag).
    from rich.markup import escape

    print(f"[green]ZPWIKI_ROOT:[/green] {escape(str(ZPWIKI_ROOT))}")
    print(f"[green]Našiel som dokumentov:[/green] {len(documents)}")
    print(f"[green]Výstup uložený do:[/green] {escape(str(DOCUMENTS_FILE))}")

    # (heading, counter, number of entries to show)
    sections = (
        ("\n[bold]Najčastejšie metadata kľúče:[/bold]", metadata_keys, 30),
        ("\n[bold]Najčastejšie kategórie:[/bold]", categories, 30),
        ("\n[bold]Najčastejšie tagy:[/bold]", tags, 40),
        ("\n[bold]Najčastejší autori:[/bold]", authors, 20),
    )
    for heading, counter, limit in sections:
        print(heading)
        for key, count in counter.most_common(limit):
            print(escape(f"{key}: {count}"))

    if documents:
        print("\n[bold]Ukážka prvého dokumentu:[/bold]")
        # Passed as an object rather than a string, as in the original call.
        print(documents[0])
|
|
|
|
|
|
def main() -> None:
    """Script entry point: run ``scan_pages`` and ignore its return value."""
    scan_pages()
|
|
|
|
|
|
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|