"""Mark the answer spans of a SQuAD-style dataset with inline ``[n]`` markers.

Run as a script, the module loads ``./squad-v2-dev.json``, sorts every
paragraph's questions by answer position, wraps each single-answer span in
``[n] ... [n]`` markers, prints the marked contexts and writes the result to
``./squad-v2-dev-test-out.json``.
"""
import json

# NOTE: the third-party dependencies (python-dotenv, six, google-cloud-translate)
# are imported lazily where they are used, so the pure JSON helpers in this
# module can be imported without the Google Cloud SDK being installed.


def load(filename):
    """Read the JSON file *filename* and return the parsed object."""
    # Explicit UTF-8: do not depend on the platform's locale encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


def save(filename, squad):
    """Write *squad* to *filename* as JSON indented by two spaces."""
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(squad, f, indent=2)


def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
    """Print article titles and paragraph contexts of *squad*.

    Args:
        squad: Parsed dataset with a top-level ``data`` list of articles.
        article_limit: Maximum number of articles to print.
        paragraph_limit: Maximum number of paragraphs to print per article.
        qas_limit: Currently unused; kept so existing callers keep working.
    """
    for article in squad['data'][:article_limit]:
        print("=" * 40)
        print(f"Article title: {article['title']}\n\n")
        for paragraph in article['paragraphs'][:paragraph_limit]:
            print(f"{paragraph['context']}\n")


def translate_text(text, target_language="sk"):
    """Translate *text* with the Google Cloud Translation v2 client and print it.

    Args:
        text: A string (``bytes`` are decoded as UTF-8) or a sequence of
            strings to translate.
        target_language: ISO 639-1 code of the language to translate into.
            See https://g.co/cloud/translate/v2/translate-reference#supported_languages

    Returns:
        Whatever ``Client.translate`` returned: one result dict for a single
        string, or a list of result dicts for a sequence of strings.
    """
    import six
    from google.cloud import translate_v2 as translate

    translate_client = translate.Client()

    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")

    # A sequence of strings yields a list of results, a single string yields
    # one dict; normalise to a list so both shapes can be printed.
    result = translate_client.translate(text, target_language=target_language)
    entries = result if isinstance(result, list) else [result]
    for entry in entries:
        print(u"Text: {}".format(entry["input"]))
        print(u"Translation: {}".format(entry["translatedText"]))
        print(u"Detected source language: {}".format(entry["detectedSourceLanguage"]))
    return result


def _is_answerable(qas):
    """Return True when *qas* is not flagged impossible and has an answer."""
    return not qas.get('is_impossible', False) and bool(qas.get('answers'))


def _answer_end(answer):
    """Return the exclusive end index of *answer*, deriving it if not stored."""
    if 'answer_end' in answer:
        return answer['answer_end']
    return answer['answer_start'] + len(answer['text'])


def _shift_span(start, end, marked_start, marked_end, shift):
    """Map the span ``[start, end)`` across one marker-pair insertion.

    ``shift`` characters are inserted at ``marked_start`` and another ``shift``
    characters at ``marked_end`` (both in pre-insertion coordinates). A start
    index follows the character it points at; an end index follows the last
    character of its span, so a span that straddles a marker grows to contain
    that marker while still covering all of its own text.
    """
    if start >= marked_end:
        new_start = start + 2 * shift
    elif start >= marked_start:
        new_start = start + shift
    else:
        new_start = start

    if end > marked_end:
        new_end = end + 2 * shift
    elif end > marked_start:
        new_end = end + shift
    else:
        new_end = end
    return new_start, new_end


def sort_qas_by_answer_index(squad):
    """Sort each paragraph's questions by the start of their first answer.

    Answerable questions come first, ordered by ``answers[0]['answer_start']``
    (ties keep their original order); questions that are impossible or have no
    answers follow in their original order. For every answerable question,
    ``answers[0]['answer_end']`` is set to ``answer_start + len(text)``.
    The dataset is modified in place.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            answerable, unanswerable = [], []
            for qas in paragraph['qas']:
                bucket = answerable if _is_answerable(qas) else unanswerable
                bucket.append(qas)

            answerable.sort(key=lambda qas: qas['answers'][0]['answer_start'])
            for qas in answerable:
                first = qas['answers'][0]
                first['answer_end'] = first['answer_start'] + len(first['text'])

            paragraph['qas'] = answerable + unanswerable


def transform_squad(squad):
    """Insert answer markers into every paragraph of *squad*, in place."""
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            add_special_chars_to_paragraph(paragraph)


def add_special_chars_to_paragraph(paragraph):
    """Wrap each single-answer span of *paragraph* in ``[n] ... [n]`` markers.

    ``n`` is the question's position in ``paragraph['qas']``. Questions that
    are impossible, have no answers, or have more than one answer get no
    markers. After every insertion the ``answer_start`` / ``answer_end`` of the
    first answer of every answerable question (earlier and later ones alike)
    is remapped so it keeps addressing its text in the updated context. Only
    ``answers[0]`` is tracked; additional answers are left untouched.

    The paragraph is modified in place.
    """
    qas_list = paragraph['qas']
    for counter, qas in enumerate(qas_list):
        if not _is_answerable(qas) or len(qas['answers']) > 1:
            continue

        special_char = f"[{counter}]"
        shift = len(special_char) + 1  # marker plus its separating space

        current = qas['answers'][0]
        start = current['answer_start']
        end = _answer_end(current)

        context = paragraph['context']
        paragraph['context'] = (
            f"{context[:start]}{special_char} {context[start:end]} "
            f"{special_char}{context[end:]}"
        )

        # Every tracked answer (including the current one) is still expressed
        # in pre-insertion coordinates here, so one mapping fits them all.
        for q in qas_list:
            if not _is_answerable(q):
                continue
            other = q['answers'][0]
            other['answer_start'], other['answer_end'] = _shift_span(
                other['answer_start'], _answer_end(other), start, end, shift
            )


def main():
    """Run the marker pipeline on the SQuAD v2 dev set."""
    from dotenv import load_dotenv

    load_dotenv()
    squad = load('./squad-v2-dev.json')
    sort_qas_by_answer_index(squad)
    transform_squad(squad)
    print_squad(squad)
    save("./squad-v2-dev-test-out.json", squad)
    # translate_text("my name is tomas")


if __name__ == "__main__":
    main()