2021-11-04 20:07:23 +00:00
|
|
|
import json
|
|
|
|
from dotenv import load_dotenv
|
2022-02-20 22:04:51 +00:00
|
|
|
from tqdm import tqdm
|
2021-11-04 20:07:23 +00:00
|
|
|
|
2022-02-20 20:34:12 +00:00
|
|
|
from squad_utils import print_squad
|
2022-02-20 22:04:51 +00:00
|
|
|
from translate_utils import translate_text
|
2022-02-15 08:14:58 +00:00
|
|
|
|
2021-11-04 20:07:23 +00:00
|
|
|
|
2022-02-15 08:14:58 +00:00
|
|
|
def sort_qas_by_answer_index(squad):
    """Reorder each paragraph's questions by answer position, in place.

    For every paragraph under ``squad['data'][*]['paragraphs']`` the ``qas``
    list is rebuilt so that questions with an answer come first, ordered by
    the ``answer_start`` of their first answer (ties keep their original
    relative order), followed by all remaining questions in their original
    order. The first answer of every sorted question also gets an
    ``answer_end`` key equal to ``answer_start + len(text)``.

    A question is put in the trailing group when its ``is_impossible`` flag
    is truthy or when it has no answers at all; a missing ``is_impossible``
    key is treated as answerable. No question is ever dropped.

    Args:
        squad: Dict with a ``data`` list of articles, each holding a
            ``paragraphs`` list whose items carry a ``qas`` list.

    Returns:
        None. ``squad`` is modified in place.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            answerable = []
            unanswerable = []
            for qas in paragraph['qas']:
                # Nothing to sort on without a first answer, so such
                # questions go to the tail together with impossible ones.
                if qas.get('is_impossible') or not qas.get('answers'):
                    unanswerable.append(qas)
                else:
                    answerable.append(qas)

            # list.sort is stable, so equal start offsets keep input order.
            answerable.sort(key=lambda qas: qas['answers'][0]['answer_start'])

            for qas in answerable:
                first = qas['answers'][0]
                # Exclusive end offset of the answer inside the context.
                first['answer_end'] = first['answer_start'] + len(first['text'])

            paragraph['qas'] = answerable + unanswerable
|
|
|
|
|
|
|
|
|
|
|
|
def transform_squad(squad):
    """Run ``add_special_chars_to_paragraph`` on every paragraph of ``squad``.

    Paragraphs are visited article by article, in list order. The function
    returns nothing; any change is whatever the per-paragraph call does to
    the paragraph dict it receives.
    """
    every_paragraph = (
        paragraph
        for article in squad['data']
        for paragraph in article['paragraphs']
    )
    for paragraph in every_paragraph:
        add_special_chars_to_paragraph(paragraph)
|
|
|
|
|
|
|
|
|
|
|
|
def _shift_span_for_markers(answer, start, end, width):
    """Shift ``answer``'s offsets after markers were inserted at ``start``/``end``.

    ``width`` characters were inserted at context index ``start`` and another
    ``width`` characters at context index ``end`` (both indices refer to the
    context before insertion). ``answer_start`` is inclusive and
    ``answer_end`` exclusive, hence the different comparisons below.
    """
    span_start = answer['answer_start']
    span_end = answer['answer_end']

    if span_start >= end:
        # The whole span lies after the marked region: both insertions
        # precede it.
        start_shift = end_shift = 2 * width
    else:
        start_shift = width if span_start >= start else 0
        # The closing marker only moves the end when the span reaches past
        # the marked region; a span that stays inside moves by one marker.
        end_shift = (width if span_end > start else 0) + (width if span_end > end else 0)

    answer['answer_start'] = span_start + start_shift
    answer['answer_end'] = span_end + end_shift


def add_special_chars_to_paragraph(paragraph):
    """Wrap each single-answer question's answer in ``[i]`` markers, in place.

    For the question at position ``i`` of ``paragraph['qas']`` the answer
    text inside ``paragraph['context']`` is rewritten as ``"[i] text [i]"``.
    Questions flagged ``is_impossible`` and questions that do not have
    exactly one answer are left unmarked.

    After each insertion the ``answer_start``/``answer_end`` of the first
    answer of every *later* answerable question is shifted so it still
    addresses the same text in the grown context, and the current answer's
    offsets are moved past its opening marker so they address the text
    between the two markers.

    Every first answer must already carry an ``answer_end`` key (exclusive
    end offset); ``sort_qas_by_answer_index`` sets it.

    NOTE(review): offsets of questions *earlier* in the list are not
    revisited when a later marker lands inside their span - this matches
    the previous behaviour; confirm whether downstream code relies on them.

    Args:
        paragraph: Dict with a ``context`` string and a ``qas`` list.

    Returns:
        None. ``paragraph`` is modified in place.
    """
    qas_list = paragraph['qas']

    for counter, qas in enumerate(qas_list):
        # Skip if impossible question
        if qas.get('is_impossible'):
            continue

        # Only questions with exactly one answer get markers.
        if len(qas['answers']) != 1:
            continue

        special_char = f"[{counter}]"
        # Each insertion is the marker plus one separating space.
        width = len(special_char) + 1

        current = qas['answers'][0]
        start = current['answer_start']
        end = current['answer_end']

        # Add special chars to context
        context = paragraph['context']
        paragraph['context'] = (
            f"{context[:start]}{special_char} {context[start:end]} {special_char}{context[end:]}"
        )

        # Recalculate indexes of every answer that is still to be processed.
        for q in qas_list[counter + 1:]:
            if q.get('is_impossible') or not q['answers']:
                continue
            _shift_span_for_markers(q['answers'][0], start, end, width)

        # The current answer now sits right after its opening marker.
        current['answer_start'] = start + width
        current['answer_end'] = end + width
|
|
|
|
|
|
|
|
|
2022-02-20 22:04:51 +00:00
|
|
|
def translate_paragraphs(squad):
    """Store a translation of each paragraph's context next to the original.

    Iterates the articles in ``squad["data"]`` (wrapped in ``tqdm``), passes
    every paragraph's ``context`` to ``translate_text`` and saves the result
    under the paragraph's ``translatedContext`` key. The dataset is modified
    in place and nothing is returned.
    """
    articles = squad["data"]
    for article in tqdm(articles):
        for paragraph in article["paragraphs"]:
            paragraph['translatedContext'] = translate_text(paragraph["context"])
|
|
|
|
|
|
|
|
|
2021-11-04 20:07:23 +00:00
|
|
|
if __name__ == "__main__":
    # Load settings from a .env file before any translation call is made.
    load_dotenv()

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open("./data/squad-v2-dev-small.json", "r", encoding="utf-8") as f:
        squad = json.load(f)

    # Order matters: sorting also adds the answer_end offsets that the
    # marker-insertion step reads, and translation must see the marked text.
    sort_qas_by_answer_index(squad)
    transform_squad(squad)
    translate_paragraphs(squad)

    with open("./data/squad-v2-dev-small-translated.json", "w", encoding="utf-8") as f:
        json.dump(squad, f, indent=2)
|
2021-11-04 20:07:23 +00:00
|
|
|
|