spring cleaning
This commit is contained in:
parent
f5538d4882
commit
695cef2b32
1661218
squad-test-translated.json
Normal file
1661218
squad-test-translated.json
Normal file
File diff suppressed because one or more lines are too long
@ -1,45 +0,0 @@
|
||||
import json
|
||||
|
||||
# Report size statistics for the SQuAD v2 dev set: how many articles,
# paragraphs, questions and answers it holds, and how many characters of text
# each of those levels contributes.
with open("squad-v2-dev.json", "r", encoding="utf-8") as f:
    squad = json.load(f)

num_articles = len(squad['data'])
print(f"total articles: {num_articles}")

# Flatten the article -> paragraph -> question -> answer nesting once so every
# total below is a single len() or sum() over one flat list.
all_paragraphs = [p for entry in squad['data'] for p in entry['paragraphs']]
all_questions = [q for p in all_paragraphs for q in p['qas']]
all_answers = [a for q in all_questions for a in q['answers']]

total_paragraphs = len(all_paragraphs)
total_qas = len(all_questions)
total_answers = len(all_answers)

context_chars = sum(len(p['context']) for p in all_paragraphs)
question_chars = sum(len(q['question']) for q in all_questions)
answer_chars = sum(len(a['text']) for a in all_answers)

print(f"total paragraphs: {total_paragraphs}")
print(f"total qas: {total_qas}")
print(f"total answers: {total_answers}")

print(f"chars in contexts: {context_chars}")
print(f"chars in questions: {question_chars}")
print(f"chars in answers: {answer_chars}")

total_chars = context_chars + question_chars + answer_chars

print(f"total chars: {total_chars}")
|
||||
|
93
squad_transform.py
Normal file
93
squad_transform.py
Normal file
@ -0,0 +1,93 @@
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
from squad_utils import print_squad
|
||||
|
||||
|
||||
def load(filename):
    """Read a SQuAD-style JSON file and return the parsed object.

    The file is always decoded as UTF-8 rather than with the platform's
    default encoding, so non-ASCII text is read the same way on every system.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialised JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as f:
        squad = json.load(f)

    return squad
|
||||
|
||||
|
||||
def save(filename, squad):
    """Write `squad` to `filename` as JSON indented by two spaces.

    The object is serialised before the file is opened, so a value that
    cannot be serialised raises without truncating an existing file. The file
    is written as UTF-8.

    Args:
        filename: Path of the file to create or overwrite.
        squad: JSON-serialisable object to store.

    Raises:
        TypeError: If `squad` contains a value json cannot serialise.
        OSError: If the file cannot be opened for writing.
    """
    # Serialise first: opening with "w" truncates the target immediately.
    serialized = json.dumps(squad, indent=2)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)
|
||||
|
||||
|
||||
def sort_qas_by_answer_index(squad):
    """Order each paragraph's questions by where their first answer starts.

    For every paragraph, questions that have at least one answer and are not
    flagged `is_impossible` are sorted (stably) by the `answer_start` of their
    first answer, and that first answer gains an `answer_end` key equal to
    `answer_start + len(text)`. All remaining questions are appended
    afterwards in their original relative order.

    A missing `is_impossible` key is treated as "answerable", and an
    answerable question with an empty `answers` list is grouped with the
    unanswerable ones instead of raising. No question is ever dropped.

    Args:
        squad: Parsed SQuAD document; modified in place.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            answerable = []
            unanswerable = []
            for qas in paragraph['qas']:
                if qas.get('is_impossible') or not qas['answers']:
                    unanswerable.append(qas)
                else:
                    answerable.append(qas)

            # list.sort is stable, so ties keep their original order.
            answerable.sort(key=lambda qas: qas['answers'][0]['answer_start'])

            for qas in answerable:
                first = qas['answers'][0]
                first['answer_end'] = first['answer_start'] + len(first['text'])

            paragraph['qas'] = answerable + unanswerable
|
||||
|
||||
|
||||
def transform_squad(squad):
    """Run `add_special_chars_to_paragraph` on every paragraph of `squad`.

    Paragraphs are visited article by article, in document order, and each
    one is handed to the helper, which modifies it in place.

    Args:
        squad: Parsed SQuAD document with a top-level 'data' list of articles.
    """
    every_paragraph = (
        section
        for entry in squad['data']
        for section in entry['paragraphs']
    )
    for section in every_paragraph:
        add_special_chars_to_paragraph(section)
|
||||
|
||||
|
||||
def add_special_chars_to_paragraph(paragraph):
    """Wrap each single-answer question's answer in numbered markers.

    For the question at index `i` of `paragraph['qas']`, the answer text in
    `paragraph['context']` is rewritten as `[i] <answer> [i]`. Questions
    flagged `is_impossible`, and questions that do not have exactly one
    answer, get no markers.

    After each insertion, the `answer_start` / `answer_end` of the first
    answer of every answerable question in the paragraph is moved so that it
    still delimits the same original text:

    * the answer just wrapped points at the text between its own markers;
    * a boundary lying at or after the insertion point of the opening marker
      moves by one marker width, and one lying after the closing marker moves
      by two. This is correct for answers that follow, are nested in, overlap
      or enclose the wrapped one.

    `answer_end` is read when present and otherwise derived as
    `answer_start + len(text)`. Answers beyond the first one of a question
    are not tracked or adjusted.

    Args:
        paragraph: One SQuAD paragraph dict ('context' and 'qas'); modified
            in place.
    """
    def span_end(answer):
        # Exclusive end offset; falls back to start + text length when the
        # answer has not been annotated with 'answer_end' yet.
        return answer.get('answer_end', answer['answer_start'] + len(answer['text']))

    qas_list = paragraph['qas']

    for counter, qas in enumerate(qas_list):
        # Only answerable questions with exactly one answer are marked.
        if qas.get('is_impossible') or len(qas['answers']) != 1:
            continue

        special_char = f"[{counter}]"
        shift = len(special_char) + 1  # marker plus the space separating it

        current = qas['answers'][0]
        start = current['answer_start']
        end = span_end(current)

        # Add special chars to context
        context = paragraph['context']
        paragraph['context'] = (
            f"{context[:start]}{special_char} {context[start:end]} "
            f"{special_char}{context[end:]}"
        )

        # Re-base every other tracked answer onto the new context.
        for other_qas in qas_list:
            if other_qas.get('is_impossible') or not other_qas['answers']:
                continue

            other = other_qas['answers'][0]
            if other is current:
                continue

            other_start = other['answer_start']
            other_end = span_end(other)

            # A start offset moves once per marker inserted at or before it;
            # an exclusive end offset moves once per marker inserted strictly
            # before it.
            other['answer_start'] = other_start + shift * (
                (other_start >= start) + (other_start >= end)
            )
            other['answer_end'] = other_end + shift * (
                (other_end > start) + (other_end > end)
            )

        # The wrapped answer itself sits right after its opening marker.
        current['answer_start'] = start + shift
        current['answer_end'] = end + shift
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the test dataset, order and annotate its
    # questions, mark the answers inside each context, show the result and
    # store it.
    load_dotenv()

    dataset = load('./squad-test.json')

    # Sorting must come first: it adds the 'answer_end' offsets that the
    # marker insertion reads.
    sort_qas_by_answer_index(dataset)
    transform_squad(dataset)
    print_squad(dataset)

    save("./squad-test-translated.json", dataset)
|
||||
|
@ -5,42 +5,6 @@ import six
|
||||
from google.cloud import translate_v2 as translate
|
||||
|
||||
|
||||
def load(filename):
    """Read a SQuAD-style JSON file and return the parsed object.

    The file is always decoded as UTF-8 rather than with the platform's
    default encoding, so non-ASCII text is read the same way on every system.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialised JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as f:
        squad = json.load(f)

    return squad
|
||||
|
||||
|
||||
def save(filename, squad):
    """Write `squad` to `filename` as JSON indented by two spaces.

    The object is serialised before the file is opened, so a value that
    cannot be serialised raises without truncating an existing file. The file
    is written as UTF-8.

    Args:
        filename: Path of the file to create or overwrite.
        squad: JSON-serialisable object to store.

    Raises:
        TypeError: If `squad` contains a value json cannot serialise.
        OSError: If the file cannot be opened for writing.
    """
    # Serialise first: opening with "w" truncates the target immediately.
    serialized = json.dumps(squad, indent=2)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)
|
||||
|
||||
|
||||
def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
    """Print article titles and paragraph contexts of a SQuAD document.

    Each article is introduced by a line of 40 '=' characters and its title;
    every paragraph context follows, each trailed by a blank line.

    Args:
        squad: Parsed SQuAD document with a top-level 'data' list.
        article_limit: Maximum number of articles to print.
        paragraph_limit: Maximum number of paragraphs printed per article.
        qas_limit: Accepted but currently unused; per-question output
            (question, answer offsets and the matching context slice) is
            disabled.
    """
    shown_articles = squad['data'][:article_limit]

    for article in shown_articles:
        print("=" * 40)
        print(f"Article title: {article['title']}\n\n")

        shown_paragraphs = article['paragraphs'][:paragraph_limit]
        for paragraph in shown_paragraphs:
            print(f"{paragraph['context']}\n")
|
||||
|
||||
|
||||
def translate_text(text):
|
||||
"""Translates text into the target language.
|
||||
|
||||
@ -130,12 +94,13 @@ def add_special_chars_to_paragraph(paragraph):
|
||||
if __name__ == "__main__":
|
||||
load_dotenv()
|
||||
|
||||
squad = load('./squad-v2-dev.json')
|
||||
with open("./squad-test.json", "r") as f:
|
||||
squad = json.load(f)
|
||||
|
||||
sort_qas_by_answer_index(squad)
|
||||
transform_squad(squad)
|
||||
print_squad(squad)
|
||||
save("./squad-v2-dev-test-out.json", squad)
|
||||
|
||||
# translate_text("my name is tomas")
|
||||
|
||||
with open("./squad-test-out.json", "w") as f:
|
||||
json.dump(squad, f, indent=2)
|
||||
|
76
squad_utils.py
Normal file
76
squad_utils.py
Normal file
@ -0,0 +1,76 @@
|
||||
import json
|
||||
|
||||
|
||||
def calculate_chars(squad):
    """Print element counts and character totals for a SQuAD document.

    Reports, in this order: number of articles, paragraphs, questions and
    answers; characters in all contexts, questions and answer texts; and the
    sum of those three character totals. Nothing is returned.

    Args:
        squad: Parsed SQuAD document with a top-level 'data' list of articles.
    """
    articles = squad['data']
    num_articles = len(articles)
    print(f"total articles: {num_articles}")

    # Flatten the nesting once; every total is then a len() or sum() over a
    # flat list.
    paragraphs = [section for entry in articles for section in entry['paragraphs']]
    questions = [item for section in paragraphs for item in section['qas']]
    answers = [reply for item in questions for reply in item['answers']]

    total_paragraphs = len(paragraphs)
    total_qas = len(questions)
    total_answers = len(answers)

    context_chars = sum(len(section['context']) for section in paragraphs)
    question_chars = sum(len(item['question']) for item in questions)
    answer_chars = sum(len(reply['text']) for reply in answers)

    print(f"total paragraphs: {total_paragraphs}")
    print(f"total qas: {total_qas}")
    print(f"total answers: {total_answers}")

    print(f"chars in contexts: {context_chars}")
    print(f"chars in questions: {question_chars}")
    print(f"chars in answers: {answer_chars}")

    total_chars = context_chars + question_chars + answer_chars

    print(f"total chars: {total_chars}")
|
||||
|
||||
|
||||
def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
    """Print article titles and paragraph contexts of a SQuAD document.

    Each article is introduced by a line of 40 '=' characters and its title;
    every paragraph context follows, each trailed by a blank line.

    Args:
        squad: Parsed SQuAD document with a top-level 'data' list.
        article_limit: Maximum number of articles to print.
        paragraph_limit: Maximum number of paragraphs printed per article.
        qas_limit: Accepted but currently unused; per-question output
            (question, answer offsets and the matching context slice) is
            disabled.
    """
    shown_articles = squad['data'][:article_limit]

    for article in shown_articles:
        print("=" * 40)
        print(f"Article title: {article['title']}\n\n")

        shown_paragraphs = article['paragraphs'][:paragraph_limit]
        for paragraph in shown_paragraphs:
            print(f"{paragraph['context']}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run as a script: print the size statistics of the SQuAD v2 dev set.
    with open("squad-v2-dev.json", "r", encoding="utf-8") as f:
        squad = json.load(f)

    calculate_chars(squad)
|
||||
|
Loading…
Reference in New Issue
Block a user