import json


def calculate_chars(squad: dict) -> dict:
    """Print and return size statistics for a SQuAD-style dataset.

    The dataset is walked as ``squad['data']`` -> ``article['paragraphs']``
    -> ``paragraph['qas']`` -> ``qas['answers']``. Character counts are
    ``len()`` of the ``context``, ``question`` and answer ``text`` strings.

    Args:
        squad: Parsed dataset dict with a top-level ``'data'`` list.

    Returns:
        A dict with the keys ``articles``, ``paragraphs``, ``qas``,
        ``answers``, ``context_chars``, ``question_chars``,
        ``answer_chars`` and ``total_chars``. The same numbers are also
        printed to stdout.

    Raises:
        KeyError: If any of the keys accessed above is missing.
    """
    articles = squad['data']
    num_articles = len(articles)
    print(f"total articles: {num_articles}")

    context_chars = 0
    question_chars = 0
    answer_chars = 0
    total_paragraphs = 0
    total_qas = 0
    total_answers = 0

    for article in articles:
        paragraphs = article['paragraphs']
        total_paragraphs += len(paragraphs)
        for paragraph in paragraphs:
            context_chars += len(paragraph['context'])
            total_qas += len(paragraph['qas'])
            for qas in paragraph['qas']:
                question_chars += len(qas['question'])
                total_answers += len(qas['answers'])
                answer_chars += sum(
                    len(answer['text']) for answer in qas['answers']
                )

    total_chars = context_chars + question_chars + answer_chars

    print(f"total paragraphs: {total_paragraphs}")
    print(f"total qas: {total_qas}")
    print(f"total answers: {total_answers}")
    print(f"chars in contexts: {context_chars}")
    print(f"chars in questions: {question_chars}")
    print(f"chars in answers: {answer_chars}")
    print(f"total chars: {total_chars}")

    # Returning the counts (in addition to printing them) lets callers use
    # the numbers programmatically; callers that ignore the result are
    # unaffected.
    return {
        "articles": num_articles,
        "paragraphs": total_paragraphs,
        "qas": total_qas,
        "answers": total_answers,
        "context_chars": context_chars,
        "question_chars": question_chars,
        "answer_chars": answer_chars,
        "total_chars": total_chars,
    }


def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
    """Print article titles and paragraph contexts of a SQuAD-style dataset.

    Args:
        squad: Parsed dataset dict with a top-level ``'data'`` list.
        article_limit: Maximum number of articles to print.
        paragraph_limit: Maximum number of paragraphs printed per article.
        qas_limit: Currently unused. Question/answer printing is disabled;
            the parameter is kept so existing callers keep working.

    Raises:
        KeyError: If ``'data'``, ``'title'``, ``'paragraphs'`` or
            ``'context'`` is missing.
    """
    for article in squad['data'][:article_limit]:
        print("=" * 40)
        print(f"Article title: {article['title']}\n\n")
        for paragraph in article['paragraphs'][:paragraph_limit]:
            print(f"{paragraph['context']}\n")
            # NOTE: per-question output (question text, first answer and its
            # span within the context) was disabled here; re-enable by
            # iterating paragraph['qas'][:qas_limit] if it is needed again.


def main():
    """Load the SQuAD v2 dev set from ./data and print its size statistics."""
    with open("./data/squad-v2-dev.json", "r", encoding="utf-8") as f:
        squad = json.load(f)
    calculate_chars(squad)


if __name__ == "__main__":
    main()