# dp2022/squad_utils.py
#
# 77 lines
# 2.2 KiB
# Python
# Raw Normal View History
#
# 2022-02-20 20:32:55 +00:00
import json
def calculate_chars(squad):
    """Print element counts and character totals for a SQuAD-style dataset.

    Args:
        squad: Mapping with a ``'data'`` list of articles. Each article has a
            ``'paragraphs'`` list; each paragraph has a ``'context'`` string
            and a ``'qas'`` list; each qas entry has a ``'question'`` string
            and an ``'answers'`` list whose items carry a ``'text'`` string.

    Returns:
        None. All results are written to standard output.
    """
    articles = squad['data']
    num_articles = len(articles)
    print(f"total articles: {num_articles}")

    # Flatten the nesting one level at a time:
    # articles -> paragraphs -> qas entries -> answers.
    paragraphs = [
        paragraph
        for article in articles
        for paragraph in article['paragraphs']
    ]
    questions = [qas for paragraph in paragraphs for qas in paragraph['qas']]
    answers = [answer for qas in questions for answer in qas['answers']]

    total_paragraphs = len(paragraphs)
    total_qas = len(questions)
    total_answers = len(answers)

    context_chars = sum(len(paragraph['context']) for paragraph in paragraphs)
    question_chars = sum(len(qas['question']) for qas in questions)
    answer_chars = sum(len(answer['text']) for answer in answers)

    print(f"total paragraphs: {total_paragraphs}")
    print(f"total qas: {total_qas}")
    print(f"total answers: {total_answers}")
    print(f"chars in contexts: {context_chars}")
    print(f"chars in questions: {question_chars}")
    print(f"chars in answers: {answer_chars}")

    total_chars = context_chars + question_chars + answer_chars
    print(f"total chars: {total_chars}")
def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
    """Print article titles and paragraph contexts of a SQuAD-style dataset.

    Args:
        squad: Mapping with a ``'data'`` list of articles; each article has a
            ``'title'`` and a ``'paragraphs'`` list whose items carry a
            ``'context'`` string.
        article_limit: Slice bound applied to the article list.
        paragraph_limit: Slice bound applied to each article's paragraphs.
        qas_limit: Currently unused; it is referenced only by the disabled
            question/answer dump kept as comments at the end of the loop.

    Returns:
        None. Output goes to standard output.
    """
    separator = "="*40
    shown_articles = squad['data'][:article_limit]

    for article in shown_articles:
        print(separator)
        print(f"Article title: {article['title']}\n\n")

        shown_paragraphs = article['paragraphs'][:paragraph_limit]
        for paragraph in shown_paragraphs:
            print(f"{paragraph['context']}\n")

            # Disabled question/answer dump, kept for reference:
            # index = 0
            # for qas in paragraph['qas'][:qas_limit]:
            #     print(f"Question: {qas['question']}")
            #     print(f"Answers:")
            #     answer = qas['answers'][0]
            #     print(f"#{index} @{answer['answer_start']}: \t{answer['text']}")
            #     print(f"#{index} ends @{answer['answer_end']}")
            #     start = answer['answer_start']
            #     end = start + len(answer['text'])
            #     print(f"from context: \t{paragraph['context'][start:end]}")
            #     print("\n")
            #     index += 1
if __name__ == "__main__":
    # Script entry point: load the dataset JSON and print its statistics.
    # The path is relative, so it resolves against the current working
    # directory rather than this file's location.
    with open("./data/squad-v2-dev.json", "r", encoding="utf-8") as f:
        squad = json.load(f)

    calculate_chars(squad)