dp2022/squad_char_counter.py

46 lines
1.1 KiB
Python

import json
DEFAULT_SQUAD_PATH = "squad-v2-dev.json"


def count_squad_chars(squad):
    """Tally item counts and character counts over a parsed SQuAD-style dict.

    The traversal follows exactly the keys this script reads:
    ``squad['data']`` is a list of articles, each article has
    ``'paragraphs'``, each paragraph has a ``'context'`` string and a
    ``'qas'`` list, and each qas entry has a ``'question'`` string and an
    ``'answers'`` list whose items carry a ``'text'`` string.

    Args:
        squad: The decoded JSON document (a dict with a ``'data'`` list).

    Returns:
        A dict with the integer totals ``paragraphs``, ``qas``, ``answers``,
        ``context_chars``, ``question_chars`` and ``answer_chars``.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    stats = {
        "paragraphs": 0,
        "qas": 0,
        "answers": 0,
        "context_chars": 0,
        "question_chars": 0,
        "answer_chars": 0,
    }
    for article in squad["data"]:
        stats["paragraphs"] += len(article["paragraphs"])
        for paragraph in article["paragraphs"]:
            stats["context_chars"] += len(paragraph["context"])
            stats["qas"] += len(paragraph["qas"])
            for qas in paragraph["qas"]:
                stats["question_chars"] += len(qas["question"])
                # An empty 'answers' list simply contributes nothing here.
                stats["answers"] += len(qas["answers"])
                for answer in qas["answers"]:
                    stats["answer_chars"] += len(answer["text"])
    return stats


def main(path=DEFAULT_SQUAD_PATH):
    """Load the JSON file at *path* and print the count report to stdout.

    Args:
        path: Location of the SQuAD-style JSON file; defaults to the file
            name the script has always used.
    """
    with open(path, "r", encoding="utf-8") as f:
        squad = json.load(f)

    # The article count is reported before the traversal, as in the
    # original script, so it is still printed if the walk below fails.
    num_articles = len(squad["data"])
    print(f"total articles: {num_articles}")

    stats = count_squad_chars(squad)
    print(f"total paragraphs: {stats['paragraphs']}")
    print(f"total qas: {stats['qas']}")
    print(f"total answers: {stats['answers']}")
    print(f"chars in contexts: {stats['context_chars']}")
    print(f"chars in questions: {stats['question_chars']}")
    print(f"chars in answers: {stats['answer_chars']}")
    total_chars = (
        stats["context_chars"] + stats["question_chars"] + stats["answer_chars"]
    )
    print(f"total chars: {total_chars}")


if __name__ == "__main__":
    main()