dp2022/squad_translate_google.py

142 lines
4.7 KiB
Python
Raw Normal View History

import json
from dotenv import load_dotenv
import six
from google.cloud import translate_v2 as translate
def load(filename):
    """Read a JSON file and return its parsed content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (a dict for SQuAD data).
    """
    # Pin the encoding: without it the platform default is used, which
    # mis-decodes non-ASCII text on systems whose default is not UTF-8.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)
def save(filename, squad):
    """Write *squad* to *filename* as indented, human-readable JSON.

    Args:
        filename: Path of the file to create or overwrite.
        squad: JSON-serialisable object to store.
    """
    # UTF-8 plus ensure_ascii=False keeps accented characters readable in the
    # file instead of turning each one into a \uXXXX escape; the parsed
    # content is identical either way.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(squad, f, indent=2, ensure_ascii=False)
def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
    """Print the title and paragraph contexts of the leading articles.

    At most ``article_limit`` articles are shown, and for each of them at
    most ``paragraph_limit`` paragraph contexts. ``qas_limit`` is accepted
    but has no effect at the moment: the question/answer dump that used it
    is commented out at the end of the loop body.
    """
    separator = "=" * 40
    for article in squad['data'][:article_limit]:
        print(separator)
        title = article['title']
        print(f"Article title: {title}\n\n")
        for paragraph in article['paragraphs'][:paragraph_limit]:
            # One blank line after every context.
            print(paragraph['context'], end="\n\n")
            # Disabled question/answer dump:
            # index = 0
            # for qas in paragraph['qas'][:qas_limit]:
            #     print(f"Question: {qas['question']}")
            #     print(f"Answers:")
            #     answer = qas['answers'][0]
            #     print(f"#{index} @{answer['answer_start']}: \t{answer['text']}")
            #     print(f"#{index} ends @{answer['answer_end']}")
            #     start = answer['answer_start']
            #     end = start + len(answer['text'])
            #     print(f"from context: \t{paragraph['context'][start:end]}")
            #     print("\n")
            #     index += 1
def translate_text(text, target_language="sk"):
    """Translate *text* with the Google Cloud Translation v2 client and print it.

    See https://g.co/cloud/translate/v2/translate-reference#supported_languages

    Args:
        text: A string, UTF-8 encoded bytes, or a sequence of strings.
        target_language: ISO 639-1 code of the language to translate into.
            Defaults to ``"sk"``, the value that used to be hard-coded.

    Returns:
        The value returned by ``Client.translate``: one result dict for a
        single string, or a sequence of result dicts for a sequence input.
    """
    translate_client = translate.Client()

    # On Python 3 (required by the f-strings in this file) ``bytes`` is the
    # same type as ``six.binary_type``.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    result = translate_client.translate(text, target_language=target_language)

    # A sequence input yields one result per element; normalise to a list so
    # the report below works for both shapes instead of failing on a list.
    entries = result if isinstance(result, (list, tuple)) else [result]
    for entry in entries:
        print(u"Text: {}".format(entry["input"]))
        print(u"Translation: {}".format(entry["translatedText"]))
        print(u"Detected source language: {}".format(entry["detectedSourceLanguage"]))
    return result
def sort_qas_by_answer_index(squad):
    """Order every paragraph's questions by where their first answer starts.

    For each paragraph, questions that have at least one answer and are not
    flagged ``is_impossible`` are sorted by the ``answer_start`` of their
    first answer (ties keep their original order), and that first answer
    receives an ``answer_end`` field equal to ``answer_start + len(text)``.
    All remaining questions follow, in their original relative order.

    The dataset is modified in place; nothing is returned.

    Args:
        squad: Parsed SQuAD-style dict with ``data -> paragraphs -> qas``.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            answerable, unanswerable = [], []
            for qas in paragraph['qas']:
                # A missing flag counts as answerable, and a question without
                # any answer cannot be positioned, so it joins the tail
                # instead of raising IndexError in the sort key.
                if qas.get('is_impossible') or not qas.get('answers'):
                    unanswerable.append(qas)
                else:
                    answerable.append(qas)

            answerable.sort(key=lambda qas: qas['answers'][0]['answer_start'])
            for qas in answerable:
                first = qas['answers'][0]
                first['answer_end'] = first['answer_start'] + len(first['text'])

            paragraph['qas'] = answerable + unanswerable
def transform_squad(squad):
    """Run ``add_special_chars_to_paragraph`` on every paragraph, in place.

    Paragraphs are visited article by article, in their stored order.
    """
    every_paragraph = (
        paragraph
        for article in squad['data']
        for paragraph in article['paragraphs']
    )
    for paragraph in every_paragraph:
        add_special_chars_to_paragraph(paragraph)
def add_special_chars_to_paragraph(paragraph):
    """Wrap each single-answer span of *paragraph* in numbered ``[i]`` markers.

    For the question at list index ``i`` the context ``...answer...`` becomes
    ``...[i] answer [i]...``. Questions flagged ``is_impossible`` and
    questions whose ``answers`` list does not hold exactly one entry are left
    unmarked (they still consume their index). After every insertion the
    ``answer_start``/``answer_end`` of the current answer, and of the first
    answer of each later question, are shifted so that they keep delimiting
    the same text in the modified context.

    The paragraph is modified in place; nothing is returned.

    Preconditions (both established by ``sort_qas_by_answer_index``): the
    questions are ordered by ``answer_start`` and the first answer of every
    answerable question carries an ``answer_end`` field.

    Note: answers of questions that come *earlier* in the list are not
    re-adjusted, so when an earlier answer encloses the current one its
    stored ``answer_end`` does not account for the markers placed inside it.

    Args:
        paragraph: Dict with a ``"context"`` string and a ``"qas"`` list.
    """
    for counter, qas in enumerate(paragraph['qas']):
        # Only answerable questions with exactly one answer get markers.
        if qas.get('is_impossible') or len(qas['answers']) != 1:
            continue

        special_char = f"[{counter}]"
        # Each marker is inserted together with one separating space.
        shift = len(special_char) + 1

        current = qas['answers'][0]
        start = current['answer_start']
        end = current['answer_end']

        context = paragraph['context']
        paragraph['context'] = (
            f"{context[:start]}{special_char} {context[start:end]}"
            f" {special_char}{context[end:]}"
        )

        # Re-index the first answer of every later question. Thanks to the
        # ordering precondition none of them starts before ``start``.
        for later in paragraph['qas'][counter + 1:]:
            if later.get('is_impossible') or not later['answers']:
                continue
            other = later['answers'][0]
            if other['answer_start'] >= start and other['answer_end'] <= end:
                # Fully inside the current answer: only the opening marker
                # precedes both of its boundaries.
                other['answer_start'] += shift
                other['answer_end'] += shift
            elif other['answer_start'] < end:
                # Starts inside the current answer but ends after it: the
                # opening marker precedes its start, both markers precede
                # its end.
                other['answer_start'] += shift
                other['answer_end'] += 2 * shift
            else:
                # Entirely after the current answer: both markers precede it.
                other['answer_start'] += 2 * shift
                other['answer_end'] += 2 * shift

        # The current answer now sits right behind its opening marker.
        current['answer_start'] = start + shift
        current['answer_end'] = end + shift
if __name__ == "__main__":
    # Load settings from a .env file first (presumably the Google Cloud
    # credentials translate_text needs -- confirm against the deployment).
    load_dotenv()

    input_path = './squad-v2-dev.json'
    output_path = "./squad-v2-dev-test-out.json"

    squad = load(input_path)
    # Both steps rewrite the dataset in place. Sorting goes first because it
    # adds the answer_end fields that the marker insertion reads.
    sort_qas_by_answer_index(squad)
    transform_squad(squad)
    print_squad(squad)
    save(output_path, squad)
    # translate_text("my name is tomas")