dp2022/squad_translate_google.py

import json
from dotenv import load_dotenv

import six
from google.cloud import translate_v2 as translate


def load(filename):
    """Read a JSON file and return the parsed object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content (the rest of
        this script expects a dict with a ``'data'`` list).
    """
    # Decode as UTF-8 explicitly: without ``encoding`` open() falls back to
    # the platform locale, which breaks on non-ASCII text on some systems.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


def save(filename, squad):
    """Write ``squad`` to ``filename`` as JSON indented by two spaces.

    An existing file at that path is overwritten.
    """
    with open(filename, "w") as out_file:
        json.dump(squad, fp=out_file, indent=2)


def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
    """Print article titles and paragraph contexts of a SQuAD-style dict.

    At most ``article_limit`` articles are printed and, for each of them, at
    most ``paragraph_limit`` paragraphs. ``qas_limit`` is accepted, but no
    active code in this function reads it.
    """
    divider = "=" * 40
    shown_articles = squad['data'][:article_limit]

    for entry in shown_articles:
        print(divider)
        print(f"Article title: {entry['title']}\n\n")

        shown_paragraphs = entry['paragraphs'][:paragraph_limit]
        for section in shown_paragraphs:
            print(f"{section['context']}\n")

            # index = 0
            # for qas in paragraph['qas'][:qas_limit]:
            #     print(f"Question: {qas['question']}")

            #     print(f"Answers:")
            #     answer = qas['answers'][0]
            #     print(f"#{index} @{answer['answer_start']}: \t{answer['text']}")
            #     print(f"#{index} ends @{answer['answer_end']}")
            #     start = answer['answer_start']
            #     end = start + len(answer['text'])
            #     print(f"from context: \t{paragraph['context'][start:end]}")

            #     print("\n")
            #     index += 1


def translate_text(text, target_language="sk"):
    """Translate text with the Google Cloud Translation v2 client and print it.

    Args:
        text: A string, UTF-8 encoded bytes, or a sequence of strings.
        target_language: ISO 639-1 code of the language to translate into.
            Defaults to ``"sk"``, the value this function used before the
            parameter existed. See
            https://g.co/cloud/translate/v2/translate-reference#supported_languages

    Returns:
        The value returned by ``Client.translate``: one result dict for a
        single string, or a sequence of result dicts for a sequence of
        strings.
    """

    translate_client = translate.Client()

    # The file is Python 3 only (it uses f-strings), so plain ``bytes`` is
    # the binary type here.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(text, target_language=target_language)

    # Normalise to a list so single and batched calls print the same way.
    entries = [result] if isinstance(result, dict) else result
    for entry in entries:
        print(f"Text: {entry['input']}")
        print(f"Translation: {entry['translatedText']}")
        print(f"Detected source language: {entry['detectedSourceLanguage']}")

    return result


def sort_qas_by_answer_index(squad):
    """Order each paragraph's questions by where their first answer starts.

    For every paragraph in ``squad['data'][*]['paragraphs']`` the ``'qas'``
    list is replaced, in place, by:

    1. the answerable questions, sorted by ``answers[0]['answer_start']``
       (ties keep their original relative order), followed by
    2. every other question in its original order: those flagged
       ``is_impossible`` and those that carry no answers at all.

    The first answer of each sorted question also gets an ``'answer_end'``
    key, computed as ``answer_start + len(text)``.

    A missing ``'is_impossible'`` key is treated as "answerable", so data
    without that flag is accepted instead of raising ``KeyError``.

    Args:
        squad: SQuAD-style dict; modified in place.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            answerable = []
            remaining = []
            for qas in paragraph['qas']:
                # Questions without a usable first answer cannot be sorted
                # by it; keep them at the tail rather than dropping them or
                # failing on ``answers[0]``.
                if qas.get('is_impossible', False) or not qas.get('answers'):
                    remaining.append(qas)
                else:
                    answerable.append(qas)

            # list.sort is stable, so equal start offsets keep input order.
            answerable.sort(key=lambda qas: qas['answers'][0]['answer_start'])

            for qas in answerable:
                first = qas['answers'][0]
                first['answer_end'] = first['answer_start'] + len(first['text'])

            paragraph['qas'] = answerable + remaining


def transform_squad(squad):
    """Run ``add_special_chars_to_paragraph`` on every paragraph of ``squad``.

    Paragraphs are visited article by article, in list order; whatever
    changes that helper makes happen in place on the paragraph dicts.
    """
    every_paragraph = (
        section
        for entry in squad['data']
        for section in entry['paragraphs']
    )
    for section in every_paragraph:
        add_special_chars_to_paragraph(section)


def _shift_answer_span(answer, start, end, shift):
    """Move one answer's offsets past markers inserted at ``start`` and ``end``.

    ``shift`` characters were inserted in front of index ``start`` and again
    in front of index ``end`` of the context the offsets refer to.
    """
    if 'answer_end' not in answer:
        answer['answer_end'] = answer['answer_start'] + len(answer['text'])

    old_start = answer['answer_start']
    old_end = answer['answer_end']

    # A start offset names a character, so it moves for every marker
    # inserted at or before that character.
    new_start = old_start
    if old_start >= start:
        new_start += shift
    if old_start >= end:
        new_start += shift

    # An end offset is exclusive, so it moves only for markers inserted
    # strictly before it.
    new_end = old_end
    if old_end > start:
        new_end += shift
    if old_end > end:
        new_end += shift

    answer['answer_start'] = new_start
    answer['answer_end'] = new_end


def add_special_chars_to_paragraph(paragraph):
    """Wrap single-answer spans of ``paragraph['context']`` in ``[i]`` markers.

    For the question at position ``i`` of ``paragraph['qas']`` the answer
    text is rewritten as ``[i] <answer> [i]``. Questions flagged
    ``is_impossible`` and questions that do not have exactly one answer are
    left unmarked.

    After each insertion the ``answer_start``/``answer_end`` of the first
    answer of every other answerable question is shifted so that it still
    addresses the same text in the modified context. This includes answers
    nested inside, overlapping, or enclosing the marked one, and answers that
    were marked earlier. A span that encloses another span will therefore
    contain that span's markers. The marked answer itself ends up pointing
    at its text between the two markers.

    ``answer_end`` is derived from ``answer_start + len(text)`` when an
    answer does not carry it yet.

    Args:
        paragraph: Dict with ``'context'`` and ``'qas'``; modified in place.
    """
    qas_list = paragraph['qas']

    for counter, qas in enumerate(qas_list):
        # Skip if impossible question
        if qas.get('is_impossible', False):
            continue

        # Only questions with exactly one answer get markers.
        if len(qas['answers']) != 1:
            continue

        current = qas['answers'][0]
        if 'answer_end' not in current:
            current['answer_end'] = current['answer_start'] + len(current['text'])

        start = current['answer_start']
        end = current['answer_end']

        special_char = f"[{counter}]"
        # Each marker is inserted together with one separating space.
        shift = len(special_char) + 1

        # Add special chars to context
        context = paragraph['context']
        paragraph['context'] = (
            f"{context[:start]}{special_char} {context[start:end]}"
            f" {special_char}{context[end:]}"
        )

        # Recalculate the offsets of every other answer, using the position
        # of the current answer from before the insertion.
        for other_qas in qas_list:
            if other_qas is qas:
                continue
            if other_qas.get('is_impossible', False) or not other_qas['answers']:
                continue
            _shift_answer_span(other_qas['answers'][0], start, end, shift)

        # The current answer now sits right behind its opening marker.
        current['answer_start'] += shift
        current['answer_end'] += shift


if __name__ == "__main__":
    # Read settings from a .env file before anything else runs
    # (presumably the Google Cloud credentials - confirm against the setup notes).
    load_dotenv()

    input_path = './squad-v2-dev.json'
    output_path = "./squad-v2-dev-test-out.json"

    dataset = load(input_path)
    # Sorting runs first: it orders the questions and fills in the
    # 'answer_end' values that the marker insertion reads.
    sort_qas_by_answer_index(dataset)
    transform_squad(dataset)
    print_squad(dataset)
    save(output_path, dataset)

    # translate_text("my name is tomas")
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00			`import json`
			`from dotenv import load_dotenv`
working special character addition to context 2022-02-15 08:14:58 +00:00
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00			`import six`
			`from google.cloud import translate_v2 as translate`

working special character addition to context 2022-02-15 08:14:58 +00:00
			`def load(filename):`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00			`with open(filename, "r") as f:`
			`squad = json.load(f)`

			`return squad`

working special character addition to context 2022-02-15 08:14:58 +00:00
			`def save(filename, squad):`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00			`with open(filename, "w") as f:`
working special character addition to context 2022-02-15 08:14:58 +00:00			`json.dump(squad, f, indent=2)`

added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00
working special character addition to context 2022-02-15 08:14:58 +00:00			`def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00			`for article in squad['data'][:article_limit]:`
			`print("="*40)`
			`print(f"Article title: {article['title']}\n\n")`

			`for paragraph in article['paragraphs'][:paragraph_limit]:`
			`print(f"{paragraph['context']}\n")`

working special character addition to context 2022-02-15 08:14:58 +00:00			`# index = 0`
			`# for qas in paragraph['qas'][:qas_limit]:`
			`# print(f"Question: {qas['question']}")`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00
working special character addition to context 2022-02-15 08:14:58 +00:00			`# print(f"Answers:")`
			`# answer = qas['answers'][0]`
			`# print(f"#{index} @{answer['answer_start']}: \t{answer['text']}")`
			`# print(f"#{index} ends @{answer['answer_end']}")`
			`# start = answer['answer_start']`
			`# end = start + len(answer['text'])`
			`# print(f"from context: \t{paragraph['context'][start:end]}")`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00
working special character addition to context 2022-02-15 08:14:58 +00:00			`# print("\n")`
			`# index += 1`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00
working special character addition to context 2022-02-15 08:14:58 +00:00
			`def translate_text(text):`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00			`"""Translates text into the target language.`

			`Target must be an ISO 639-1 language code.`
			`See https://g.co/cloud/translate/v2/translate-reference#supported_languages`
			`"""`

			`translate_client = translate.Client()`

			`if isinstance(text, six.binary_type):`
			`text = text.decode("utf-8")`

			`# Text can also be a sequence of strings, in which case this method`
			`# will return a sequence of results for each text.`
working special character addition to context 2022-02-15 08:14:58 +00:00			`result = translate_client.translate(text, target_language="sk")`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00
			`print(u"Text: {}".format(result["input"]))`
			`print(u"Translation: {}".format(result["translatedText"]))`
			`print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))`


working special character addition to context 2022-02-15 08:14:58 +00:00			`def sort_qas_by_answer_index(squad):`
			`for article in squad['data']:`
			`for paragraph in article['paragraphs']:`
			`impossible_qas = list(filter(lambda qas: qas['is_impossible'] == True, paragraph['qas']))`
			`possible_qas = list(filter(lambda qas: qas['is_impossible'] == False, paragraph['qas']))`
			`sorted_qas = sorted(possible_qas, key=lambda qas: qas['answers'][0]['answer_start'])`

			`for qas in sorted_qas:`
			`a = qas['answers'][0]`
			`a['answer_end'] = a['answer_start'] + len(a['text'])`

			`paragraph['qas'] = sorted_qas + impossible_qas`


			`def transform_squad(squad):`
			`for article in squad['data']:`
			`for paragraph in article['paragraphs']:`
			`add_special_chars_to_paragraph(paragraph)`


			`def add_special_chars_to_paragraph(paragraph):`
			`for counter, qas in enumerate(paragraph['qas']):`
			`# Skip if impossible question`
			`if qas["is_impossible"] == True: continue`

			`special_char = f"[{counter}]"`

			`if len(qas['answers']) > 1 or len(qas['answers']) == 0: continue`

			`current = qas['answers'][0]`

			`# Get start index`
			`start = current['answer_start']`
			`# Calculate end index`
			`end = current['answer_end']`
			`# Add special chars to context`
			`context = paragraph['context']`
			`paragraph['context'] = f"{context[:start]}{special_char} {context[start:end]} {special_char}{context[end:]}"`

			`# Recalculate indexes`
			`for q in paragraph['qas'][counter + 1:]: # Skip all answers before and current one`
			`if q["is_impossible"] == True: continue`

			`other = q['answers'][0]`

			`if other['answer_start'] >= current['answer_start'] and other['answer_end'] <= current["answer_end"]: # Other is being enclosed by current`
			`other['answer_start'] += len(special_char) +1`
			`other['answer_end'] += 2*len(special_char) +2`

			`elif other['answer_start'] < current['answer_end']: # Other is enclosing the current one`
			`other['answer_start'] += len(special_char) +1`
			`other['answer_end'] += len(special_char) +1`

			`else: # Other is after current`
			`other['answer_start'] += 2*len(special_char) +2`
			`other['answer_end'] += 2*len(special_char) +2`

			`# Fix indexes in current answer`
			`other = paragraph['qas'][counter]['answers'][0]`

			`if other == current: # Other answer is the one im working on`
			`other['answer_start'] += len(special_char) +1`
			`other['answer_end'] += len(special_char) +1`


added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00			`if __name__ == "__main__":`
			`load_dotenv()`

working special character addition to context 2022-02-15 08:14:58 +00:00			`squad = load('./squad-v2-dev.json')`
			`sort_qas_by_answer_index(squad)`
			`transform_squad(squad)`
			`print_squad(squad)`
			`save("./squad-v2-dev-test-out.json", squad)`

			`# translate_text("my name is tomas")`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00