diff --git a/squad-v2-dev-test.json b/squad-v2-dev-test.json new file mode 100644 index 0000000..98ddc90 --- /dev/null +++ b/squad-v2-dev-test.json @@ -0,0 +1,180 @@ +{ + "data": [ + { + "paragraphs": [ + { + "qas": [ + { + "is_impossible": false, + "question": "When did Beyonce start becoming popular?", + "answers": [ + { + "answer_start": 269, + "text": "in the late 1990s" + } + ], + "id": "56be85543aeaaa14008c9063" + }, + { + "is_impossible": false, + "question": "What areas did Beyonce compete in when she was growing up?", + "answers": [ + { + "answer_start": 207, + "text": "singing and dancing" + } + ], + "id": "56be85543aeaaa14008c9065" + }, + { + "is_impossible": false, + "question": "When did Beyonce leave Destiny's Child and become a solo singer?", + "answers": [ + { + "answer_start": 526, + "text": "2003" + } + ], + "id": "56be85543aeaaa14008c9066" + }, + { + "is_impossible": false, + "question": "In what city and state did Beyonce grow up? ", + "answers": [ + { + "answer_start": 166, + "text": "Houston, Texas" + } + ], + "id": "56bf6b0f3aeaaa14008c9601" + }, + { + "is_impossible": false, + "question": "In which decade did Beyonce become famous?", + "answers": [ + { + "answer_start": 276, + "text": "late 1990s" + } + ], + "id": "56bf6b0f3aeaaa14008c9602" + }, + { + "is_impossible": false, + "question": "In what R&B group was she the lead singer?", + "answers": [ + { + "answer_start": 320, + "text": "Destiny's Child" + } + ], + "id": "56bf6b0f3aeaaa14008c9603" + }, + { + "is_impossible": false, + "question": "What album made her a worldwide known artist?", + "answers": [ + { + "answer_start": 505, + "text": "Dangerously in Love" + } + ], + "id": "56bf6b0f3aeaaa14008c9604" + }, + { + "is_impossible": false, + "question": "Who managed the Destiny's Child group?", + "answers": [ + { + "answer_start": 360, + "text": "Mathew Knowles" + } + ], + "id": "56bf6b0f3aeaaa14008c9605" + }, + { + "is_impossible": false, + "question": "When did Beyonc\u00e9 rise to 
fame?", + "answers": [ + { + "answer_start": 276, + "text": "late 1990s" + } + ], + "id": "56d43c5f2ccc5a1400d830a9" + }, + { + "is_impossible": false, + "question": "What role did Beyonc\u00e9 have in Destiny's Child?", + "answers": [ + { + "answer_start": 290, + "text": "lead singer" + } + ], + "id": "56d43c5f2ccc5a1400d830aa" + }, + { + "is_impossible": false, + "question": "What was the first album Beyonc\u00e9 released as a solo artist?", + "answers": [ + { + "answer_start": 505, + "text": "Dangerously in Love" + } + ], + "id": "56d43c5f2ccc5a1400d830ab" + }, + { + "is_impossible": false, + "question": "When did Beyonc\u00e9 release Dangerously in Love?", + "answers": [ + { + "answer_start": 526, + "text": "2003" + } + ], + "id": "56d43c5f2ccc5a1400d830ac" + }, + { + "is_impossible": false, + "question": "How many Grammy awards did Beyonc\u00e9 win for her first solo album?", + "answers": [ + { + "answer_start": 590, + "text": "five" + } + ], + "id": "56d43c5f2ccc5a1400d830ad" + }, + { + "is_impossible": false, + "question": "What was Beyonc\u00e9's role in Destiny's Child?", + "answers": [ + { + "answer_start": 290, + "text": "lead singer" + } + ], + "id": "56d43ce42ccc5a1400d830b4" + }, + { + "is_impossible": false, + "question": "What was the name of Beyonc\u00e9's first solo album?", + "answers": [ + { + "answer_start": 505, + "text": "Dangerously in Love" + } + ], + "id": "56d43ce42ccc5a1400d830b5" + } + ], + "context": "Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. 
Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\"." + } + ], + "title": "Beyonc\u00e9" + } + ], + "version": "v2.0" +} diff --git a/squad_translate_google.py b/squad_translate_google.py index e46ae57..8259298 100644 --- a/squad_translate_google.py +++ b/squad_translate_google.py @@ -1,19 +1,23 @@ import json from dotenv import load_dotenv + import six from google.cloud import translate_v2 as translate -def load_squad(filename): + +def load(filename): with open(filename, "r") as f: squad = json.load(f) return squad -def save_squad(filename, squad): - with open(filename, "w") as f: - json.dump(squad, f) -def print_squad(squad, article_limit=2, paragraph_limit=3, qas_limit=5): +def save(filename, squad): + with open(filename, "w") as f: + json.dump(squad, f, indent=2) + + +def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100): for article in squad['data'][:article_limit]: print("="*40) print(f"Article title: {article['title']}\n\n") @@ -21,16 +25,23 @@ def print_squad(squad, article_limit=2, paragraph_limit=3, qas_limit=5): for paragraph in article['paragraphs'][:paragraph_limit]: print(f"{paragraph['context']}\n") - for qas in paragraph['qas'][:qas_limit]: - print(f"Question: {qas['question']}") + # index = 0 + # for qas in paragraph['qas'][:qas_limit]: + # print(f"Question: {qas['question']}") - print(f"Answers:") - for answer in qas['answers']: - print(f"\t{answer['text']}") + # print(f"Answers:") + # answer = qas['answers'][0] + # print(f"#{index} @{answer['answer_start']}: \t{answer['text']}") + # print(f"#{index} ends @{answer['answer_end']}") + # start = answer['answer_start'] + # end = start + len(answer['text']) + # print(f"from context: \t{paragraph['context'][start:end]}") - print("\n") + # print("\n") + # index += 1 -def 
def translate_text(text, target="sk"):
    """Translate *text* into the target language and print the result.

    Target must be an ISO 639-1 language code.

    Args:
        text: The text handed to ``translate_client.translate``.
        target: Language code passed as ``target_language``. Defaults to
            ``"sk"``, the value that was previously hard-coded here.
    """
    # NOTE(review): the patch elides the lines between the docstring and this
    # point (gap between two hunks), so the construction of
    # `translate_client` is not visible here -- presumably
    # `translate.Client()`; confirm against the full file before applying.

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(text, target_language=target)

    print(u"Text: {}".format(result["input"]))
    print(u"Translation: {}".format(result["translatedText"]))
    print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))


def sort_qas_by_answer_index(squad):
    """Order each paragraph's questions by answer position, in place.

    Answerable questions come first, sorted by the ``answer_start`` of their
    first answer (the sort is stable, so ties keep their original order).
    Answerable questions that carry no answers follow them, and questions
    flagged ``is_impossible`` are moved to the end in their original order.

    Every answer of an answerable question also receives an ``answer_end``
    key equal to ``answer_start + len(text)``.

    Args:
        squad: Parsed SQuAD document (``{"data": [{"paragraphs": [...]}]}``).
    """
    def first_answer_start(qas):
        # Answerable questions without answers cannot be positioned; sort
        # them behind everything that can instead of raising IndexError.
        answers = qas['answers']
        return answers[0]['answer_start'] if answers else float('inf')

    for article in squad['data']:
        for paragraph in article['paragraphs']:
            # Partition on truthiness so that no question is silently dropped
            # when the flag is neither exactly True nor exactly False.
            impossible_qas = [q for q in paragraph['qas'] if q['is_impossible']]
            possible_qas = [q for q in paragraph['qas'] if not q['is_impossible']]
            sorted_qas = sorted(possible_qas, key=first_answer_start)

            for qas in sorted_qas:
                for a in qas['answers']:
                    a['answer_end'] = a['answer_start'] + len(a['text'])

            paragraph['qas'] = sorted_qas + impossible_qas


def transform_squad(squad):
    """Insert answer markers into every paragraph of *squad*, in place.

    Args:
        squad: Parsed SQuAD document (``{"data": [{"paragraphs": [...]}]}``).
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            add_special_chars_to_paragraph(paragraph)


def add_special_chars_to_paragraph(paragraph):
    """Wrap each single-answer span of *paragraph* in ``[i] ... [i]`` markers.

    For every question that is not ``is_impossible`` and has exactly one
    answer, ``"[i] "`` is inserted before the answer span and ``" [i]"``
    after it, where ``i`` is the question's index in ``paragraph['qas']``.
    ``paragraph['context']`` is replaced by the marked-up text and the
    answer's ``answer_start`` / ``answer_end`` are rewritten so that
    ``context[answer_start:answer_end]`` is the text between its own markers.

    All insertions are computed from the original offsets and applied in a
    single pass, so the result does not depend on the order of the questions
    and nested or overlapping answers keep consistent offsets. When several
    markers fall on the same position, closing markers are written before
    opening ones, questions with a lower index open first and close last.

    ``answer_end`` is derived from ``answer_start + len(text)`` when the key
    is missing. Answers of answerable questions that get no marker (zero or
    several answers) are only re-indexed to the new context.

    Args:
        paragraph: One SQuAD paragraph dict with ``context`` and ``qas`` keys.
    """
    closing, opening, closing_empty = 0, 1, 2
    context = paragraph['context']

    # (position, kind, tie-break, marker text, answer) in ORIGINAL coordinates.
    insertions = []
    # Answers that receive no marker but must follow the shifted text.
    unmarked = []

    for counter, qas in enumerate(paragraph['qas']):
        # Skip if impossible question
        if qas["is_impossible"]:
            continue

        answers = qas['answers']
        if len(answers) != 1:
            unmarked.extend(answers)
            continue

        current = answers[0]
        start = current['answer_start']
        end = current.get('answer_end', start + len(current['text']))
        special_char = f"[{counter}]"

        # An empty span must still open before it closes.
        close_kind = closing if end > start else closing_empty
        insertions.append((start, opening, counter, f"{special_char} ", current))
        insertions.append((end, close_kind, -counter, f" {special_char}", current))

    # The answer dicts are not orderable, so sort on the numeric prefix only.
    insertions.sort(key=lambda item: item[:3])

    def shifted(offset, inclusive):
        # Offset in the new context: original offset plus every marker
        # written before it (and at it, when `inclusive`).
        return offset + sum(
            len(marker)
            for position, _, _, marker, _ in insertions
            if position < offset or (inclusive and position == offset)
        )

    for answer in unmarked:
        if 'answer_end' in answer:
            answer['answer_end'] = shifted(answer['answer_end'], False)
        answer['answer_start'] = shifted(answer['answer_start'], True)

    pieces = []
    written = 0  # length of the new context produced so far
    cursor = 0   # how much of the original context has been copied
    for position, kind, _, marker, answer in insertions:
        chunk = context[cursor:position]
        pieces.append(chunk)
        written += len(chunk)
        cursor = max(cursor, position)

        if kind == opening:
            pieces.append(marker)
            written += len(marker)
            answer['answer_start'] = written  # first character after "[i] "
        else:
            answer['answer_end'] = written    # just before " [i]"
            pieces.append(marker)
            written += len(marker)

    pieces.append(context[cursor:])
    paragraph['context'] = "".join(pieces)


if __name__ == "__main__":
    load_dotenv()

    # Load the dataset, order the questions, add the markers, then dump the
    # result both to stdout and to a new JSON file.
    squad = load('./squad-v2-dev.json')
    sort_qas_by_answer_index(squad)
    transform_squad(squad)
    print_squad(squad)
    save("./squad-v2-dev-test-out.json", squad)

    # translate_text("my name is tomas")