# DiplomovaPraca/scripts/adc_scraper/validate_adc_json.py
# 2026-05-14 12:26:11 +02:00
#
# 48 lines
# 1.5 KiB
# Python

"""Validate basic quality of structured ADC JSON."""
from __future__ import annotations
import argparse
import json
from collections import Counter
from pathlib import Path
def main(argv: list[str] | None = None) -> None:
    """Print a basic quality summary for a structured ADC JSON file.

    The file given by ``--input`` must contain a JSON list. For that list the
    function prints the record count, the average ``pil_text`` and
    ``spc_text`` lengths, how many records lack each required field, and how
    often each non-empty section name occurs.

    Malformed entries are reported instead of aborting the run:

    * a list element that is not a JSON object counts as missing every
      required field;
    * a ``pil_text`` / ``spc_text`` value that is not a string contributes
      zero characters to the averages;
    * a ``sections`` value that is not a JSON object contributes no section
      names (it is still only counted as missing when it is falsy).

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: If the top-level JSON value is not a list, or if argparse
            rejects the command line.
    """
    required_keys = ("source_url", "name", "pil_text", "spc_text", "sections")

    parser = argparse.ArgumentParser(description="Validate structured ADC JSON.")
    parser.add_argument("--input", type=Path, required=True)
    args = parser.parse_args(argv)

    data = json.loads(args.input.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise SystemExit("Input must be a JSON list.")

    missing: Counter[str] = Counter()
    section_counter: Counter[str] = Counter()
    total_pil_chars = 0
    total_spc_chars = 0

    for record in data:
        if not isinstance(record, dict):
            # Not an object, so it cannot carry any of the required fields.
            missing.update(required_keys)
            continue

        for key in required_keys:
            # Falsy values (None, "", {}, []) count as missing, not just
            # absent keys.
            if not record.get(key):
                missing[key] += 1

        pil_text = record.get("pil_text")
        if isinstance(pil_text, str):
            total_pil_chars += len(pil_text)

        spc_text = record.get("spc_text")
        if isinstance(spc_text, str):
            total_spc_chars += len(spc_text)

        sections = record.get("sections")
        if isinstance(sections, dict):
            for section_name, section_text in sections.items():
                if section_text:
                    section_counter[section_name] += 1

    # max(..., 1) keeps the averages defined (as 0) for an empty list.
    record_count = max(len(data), 1)

    print(f"Records: {len(data)}")
    print(f"Average PIL chars: {total_pil_chars // record_count}")
    print(f"Average SPC chars: {total_spc_chars // record_count}")
    print("Missing fields:")
    for key in required_keys:
        print(f" {key}: {missing[key]}")
    print("Detected sections:")
    for key, value in section_counter.most_common():
        print(f" {key}: {value}")
# Run the validator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()