"""Validate basic quality of structured ADC JSON.""" from __future__ import annotations import argparse import json from collections import Counter from pathlib import Path def main() -> None: parser = argparse.ArgumentParser(description="Validate structured ADC JSON.") parser.add_argument("--input", type=Path, required=True) args = parser.parse_args() data = json.loads(args.input.read_text(encoding="utf-8")) if not isinstance(data, list): raise SystemExit("Input must be a JSON list.") missing = Counter() section_counter = Counter() total_pil_chars = 0 total_spc_chars = 0 for record in data: for key in ("source_url", "name", "pil_text", "spc_text", "sections"): if not record.get(key): missing[key] += 1 total_pil_chars += len(record.get("pil_text") or "") total_spc_chars += len(record.get("spc_text") or "") for section_name, section_text in (record.get("sections") or {}).items(): if section_text: section_counter[section_name] += 1 print(f"Records: {len(data)}") print(f"Average PIL chars: {total_pil_chars // max(len(data), 1)}") print(f"Average SPC chars: {total_spc_chars // max(len(data), 1)}") print("Missing fields:") for key in ("source_url", "name", "pil_text", "spc_text", "sections"): print(f" {key}: {missing[key]}") print("Detected sections:") for key, value in section_counter.most_common(): print(f" {key}: {value}") if __name__ == "__main__": main()