48 lines
1.5 KiB
Python
48 lines
1.5 KiB
Python
"""Validate basic quality of structured ADC JSON."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> None:
    """Print a basic quality report for a structured ADC JSON file.

    The file named by ``--input`` must contain a JSON list of records.
    The report, written to stdout, contains the number of records, the
    average length of ``pil_text`` and ``spc_text`` (integer division over
    all records), how many records have each required field missing or
    empty, and how many records carry a non-empty text for each section
    name, most common first.

    Malformed data is reported rather than crashing the run:

    * A list element that is not a JSON object is counted as missing every
      required field, and an extra ``Non-object records`` line is printed
      (only when at least one such element exists).
    * A ``pil_text``/``spc_text`` value that is not a string contributes
      zero characters.
    * A ``sections`` value that is not a JSON object contributes no
      sections.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: If the arguments are invalid or the top-level JSON
            value is not a list.
    """
    parser = argparse.ArgumentParser(description="Validate structured ADC JSON.")
    parser.add_argument("--input", type=Path, required=True)
    args = parser.parse_args(argv)

    data = json.loads(args.input.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise SystemExit("Input must be a JSON list.")

    # Single source of truth for both the check and the report order.
    required_fields = ("source_url", "name", "pil_text", "spc_text", "sections")

    missing: Counter[str] = Counter()
    section_counter: Counter[str] = Counter()
    total_pil_chars = 0
    total_spc_chars = 0
    non_object_records = 0

    for record in data:
        if not isinstance(record, dict):
            # Nothing can be read from a non-object element, so every
            # required field is missing for it.
            non_object_records += 1
            missing.update(required_fields)
            continue

        # A field is "missing" when absent or falsy (None, "", {}, ...).
        missing.update(key for key in required_fields if not record.get(key))

        pil_text = record.get("pil_text")
        if isinstance(pil_text, str):
            total_pil_chars += len(pil_text)
        spc_text = record.get("spc_text")
        if isinstance(spc_text, str):
            total_spc_chars += len(spc_text)

        sections = record.get("sections")
        if isinstance(sections, dict):
            section_counter.update(
                section_name
                for section_name, section_text in sections.items()
                if section_text
            )

    # Guard against division by zero for an empty list.
    divisor = max(len(data), 1)

    print(f"Records: {len(data)}")
    if non_object_records:
        print(f"Non-object records: {non_object_records}")
    print(f"Average PIL chars: {total_pil_chars // divisor}")
    print(f"Average SPC chars: {total_spc_chars // divisor}")
    print("Missing fields:")
    for key in required_fields:
        print(f" {key}: {missing[key]}")
    print("Detected sections:")
    for key, value in section_counter.most_common():
        print(f" {key}: {value}")
|
|
|
|
|
|
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|