# dp-zp-agent/scripts/scan_zpwiki.py
# (repository listing metadata: 97 lines, 2.4 KiB, Python)


from __future__ import annotations
import sys
from collections import Counter
from pathlib import Path
from rich import print
# The project root is two levels above this file (parents[1] of the resolved
# path).  It is put at the front of sys.path, once, so that the
# `scripts.common` import below resolves when this file is run directly.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from scripts.common import DOCUMENTS_FILE, PAGES_ROOT, ZPWIKI_ROOT, load_zpwiki_page, write_json
def scan_pages() -> list[dict]:
    """Scan every ``README.md`` under ``PAGES_ROOT`` and build a document index.

    Each file is loaded with ``load_zpwiki_page``.  The ``"content"`` entry is
    removed from the loaded page and replaced by a 500-character preview and
    the full content length.  While scanning, occurrences of metadata keys,
    categories, tags and authors are tallied.  The resulting list is written
    with ``write_json`` to ``DOCUMENTS_FILE`` and a summary is printed.

    Returns:
        The list of per-page records, in sorted file-path order.

    Raises:
        SystemExit: if ``PAGES_ROOT`` does not exist.
    """
    if not PAGES_ROOT.exists():
        raise SystemExit(f"Neexistuje priečinok: {PAGES_ROOT}")

    documents = []
    metadata_keys = Counter()
    categories = Counter()
    tags = Counter()
    authors = Counter()

    for readme_path in sorted(PAGES_ROOT.glob("**/README.md")):
        page = load_zpwiki_page(readme_path)
        body = page.pop("content", "")

        # iter() forces Counter.update to count one occurrence per element;
        # handing it a mapping directly would add the mapping's values instead.
        metadata_keys.update(iter(page["metadata"]))
        categories.update(iter(page["categories"]))
        tags.update(iter(page["tags"]))

        if page.get("author"):
            authors[str(page["author"])] += 1

        # Copy the remaining page fields, then attach the content summary.
        record = dict(page)
        record["content_preview"] = body[:500]
        record["content_length"] = len(body)
        documents.append(record)

    write_json(DOCUMENTS_FILE, documents)
    print_summary(documents, metadata_keys, categories, tags, authors)
    return documents
def print_summary(
    documents: list[dict],
    metadata_keys: Counter,
    categories: Counter,
    tags: Counter,
    authors: Counter,
) -> None:
    """Print a console report for a finished scan.

    The report shows the wiki root, the number of documents, the output file,
    the most common metadata keys (top 30), categories (top 30), tags (top 40)
    and authors (top 20), and finally the first document when there is one.

    Args:
        documents: Per-page records produced by the scan.
        metadata_keys: Occurrence count per metadata key.
        categories: Occurrence count per category.
        tags: Occurrence count per tag.
        authors: Occurrence count per author.
    """
    # `print` in this module is rich's print, which interprets square brackets
    # as console markup.  Values that come from scanned pages or from the
    # filesystem are escaped so that something like "[draft]" is shown
    # literally instead of being parsed as a style tag.
    from rich.markup import escape

    print(f"[green]ZPWIKI_ROOT:[/green] {escape(str(ZPWIKI_ROOT))}")
    print(f"[green]Našiel som dokumentov:[/green] {len(documents)}")
    print(f"[green]Výstup uložený do:[/green] {escape(str(DOCUMENTS_FILE))}")

    # (heading, counter, number of entries to show)
    sections = (
        ("Najčastejšie metadata kľúče", metadata_keys, 30),
        ("Najčastejšie kategórie", categories, 30),
        ("Najčastejšie tagy", tags, 40),
        ("Najčastejší autori", authors, 20),
    )
    for heading, counter, limit in sections:
        print(f"\n[bold]{heading}:[/bold]")
        for key, count in counter.most_common(limit):
            print(f"{escape(str(key))}: {count}")

    if documents:
        print("\n[bold]Ukážka prvého dokumentu:[/bold]")
        # A non-string object is pretty-printed by rich, not parsed as markup.
        print(documents[0])
def main() -> None:
    """Script entry point: run the page scan; its return value is not used."""
    scan_pages()
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()