dp2022/squad_transform.py

import json
from dotenv import load_dotenv
from tqdm import tqdm

from squad_utils import print_squad
from translate_utils import translate_text


def sort_qas_by_answer_index(squad):
    """Reorder the questions of every paragraph and add ``answer_end`` to answers.

    For each paragraph in ``squad['data'][*]['paragraphs']`` the ``qas`` list is
    replaced, in place, by:

    1. answerable questions that have at least one answer, sorted (stably) by
       the ``answer_start`` of their first answer;
    2. answerable questions whose ``answers`` list is empty, in original order;
    3. questions flagged ``is_impossible``, in original order.

    The first answer of every question in group 1 additionally receives an
    ``answer_end`` key equal to ``answer_start + len(text)`` (exclusive end).

    Args:
        squad: SQuAD-style dict with ``data -> paragraphs -> qas`` nesting.
            It is mutated in place; nothing is returned.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            answered, unanswered, impossible = [], [], []
            for qas in paragraph['qas']:
                if qas['is_impossible']:
                    impossible.append(qas)
                elif qas['answers']:
                    answered.append(qas)
                else:
                    # Answerable but without any answer: it has no start
                    # index to sort by, so keep it instead of crashing.
                    unanswered.append(qas)

            answered.sort(key=lambda qas: qas['answers'][0]['answer_start'])

            for qas in answered:
                first = qas['answers'][0]
                first['answer_end'] = first['answer_start'] + len(first['text'])

            paragraph['qas'] = answered + unanswered + impossible


def transform_squad(squad):
    """Run ``add_special_chars_to_paragraph`` over every paragraph of ``squad``.

    Articles are visited in the order of ``squad['data']`` and, within each
    article, paragraphs in the order of ``article['paragraphs']``. Each
    paragraph dict is modified in place by the helper; nothing is returned.
    """
    every_paragraph = (
        paragraph
        for article in squad['data']
        for paragraph in article['paragraphs']
    )
    for paragraph in every_paragraph:
        add_special_chars_to_paragraph(paragraph)


def add_special_chars_to_paragraph(paragraph):
    """Wrap single-answer spans of ``paragraph['context']`` in ``[i]`` markers.

    For the question at position ``i`` of ``paragraph['qas']`` the answer text
    is rewritten from ``answer`` to ``[i] answer [i]`` inside the context.
    Questions flagged ``is_impossible`` and questions that do not have exactly
    one answer are left unmarked.

    After each insertion the ``answer_start`` / ``answer_end`` of the first
    answer of every *later* question is shifted so that it still addresses the
    same text in the grown context; the current answer is shifted past its own
    opening marker. Questions earlier in the list are not revisited.

    Expects every processed answer to carry ``answer_end`` and the questions
    to be ordered by ``answer_start`` (both are produced by
    ``sort_qas_by_answer_index``, which the ``__main__`` flow calls first).

    Args:
        paragraph: dict with ``context`` (str) and ``qas`` (list of question
            dicts). Mutated in place; nothing is returned.
    """
    for counter, qas in enumerate(paragraph['qas']):
        # Skip if impossible question
        if qas["is_impossible"]:
            continue
        # Only questions with exactly one answer get markers
        if len(qas['answers']) != 1:
            continue

        special_char = f"[{counter}]"
        # Every marker is inserted together with one separating space.
        shift = len(special_char) + 1

        current = qas['answers'][0]
        start = current['answer_start']
        end = current['answer_end']

        # Add special chars to context
        context = paragraph['context']
        paragraph['context'] = (
            f"{context[:start]}{special_char} {context[start:end]} {special_char}{context[end:]}"
        )

        # Recalculate indexes of the answers that are processed later
        for q in paragraph['qas'][counter + 1:]:
            # Impossible or answer-less questions have no indexes to shift
            if q["is_impossible"] or not q['answers']:
                continue

            other = q['answers'][0]

            if other['answer_start'] >= start and other['answer_end'] <= end:
                # Other lies inside current: only the opening marker
                # precedes both of its boundaries.
                other['answer_start'] += shift
                other['answer_end'] += shift
            elif other['answer_start'] < end:
                # Other starts inside current but ends after it: its end is
                # additionally pushed by the closing marker.
                other['answer_start'] += shift
                other['answer_end'] += 2 * shift
            else:
                # Other is entirely after current: both markers precede it.
                other['answer_start'] += 2 * shift
                other['answer_end'] += 2 * shift

        # The current answer itself now starts right after its opening marker
        current['answer_start'] += shift
        current['answer_end'] += shift


def _marker_span(text, marker):
    """Locate the span wrapped by a pair of ``marker`` occurrences in ``text``.

    Returns ``(start, end)`` where ``start`` is the index of the first
    occurrence and ``end`` is the index of the last occurrence minus
    ``len(marker) + 2``, i.e. the boundaries the wrapped text has once the
    opening ``"<marker> "`` and the space before the closing marker are gone.
    Returns ``None`` when fewer than two occurrences are present.
    """
    first = text.find(marker)
    last = text.rfind(marker)
    if first == -1 or first == last:
        return None
    # Clamp so that markers standing closer together than expected never
    # yield an end that precedes the start.
    return first, max(first, last - len(marker) - 2)


def _strip_marker(text, marker):
    """Remove ``marker`` from ``text`` together with its separating space."""
    # Opening form first ("[i] "), then the closing form (" [i]"), which may
    # be followed by punctuation or sit at the very end of the paragraph.
    return text.replace(f"{marker} ", "").replace(f" {marker}", "")


def detransform_squad(squad):
    """Remove the ``[i]`` markers again and rebuild answer indexes from them.

    For every question that is not ``is_impossible`` and has exactly one
    answer (the same questions ``add_special_chars_to_paragraph`` marks), the
    marker ``[i]`` (``i`` = position of the question in ``paragraph['qas']``)
    is looked up in ``paragraph['context']`` and
    ``paragraph['translated_context']``:

    * ``answer_start`` / ``answer_end`` are set from the marker positions in
      the original context (left untouched if the markers are not found);
    * ``translated_answer_start`` / ``translated_answer_end`` are set from the
      translated context, or to ``-1`` if the markers did not survive there;
    * the marker is stripped from both contexts;
    * ``translated_text`` is set to the resulting slice of the translated
      context (empty string if the markers were not found).

    NOTE: indexes are measured while the markers of later questions are still
    present in the text, so answers that contain another marked answer keep
    an end index that includes those later markers.

    Args:
        squad: SQuAD-style dict whose paragraphs carry both ``context`` and
            ``translated_context``. Mutated in place; nothing is returned.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            for counter, qas in enumerate(paragraph['qas']):
                # Skip if impossible question
                if qas["is_impossible"]:
                    continue
                # Questions without exactly one answer were never marked, so
                # there is nothing to look up and their indexes must be kept.
                if len(qas['answers']) != 1:
                    continue

                special_char = f"[{counter}]"
                current = qas['answers'][0]

                # Fix indexes in the original context
                span = _marker_span(paragraph['context'], special_char)
                if span is not None:
                    current['answer_start'], current['answer_end'] = span

                # Fix indexes in the translated context
                translated_span = _marker_span(paragraph['translated_context'], special_char)
                if translated_span is None:
                    start, end = -1, -1
                else:
                    start, end = translated_span
                current['translated_answer_start'] = start
                current['translated_answer_end'] = end

                # Strip the markers from both contexts
                paragraph['context'] = _strip_marker(paragraph['context'], special_char)
                paragraph['translated_context'] = _strip_marker(
                    paragraph['translated_context'], special_char
                )

                # Add translated_text to the answer
                if translated_span is None:
                    current['translated_text'] = ""
                else:
                    current['translated_text'] = paragraph['translated_context'][start:end]


def translate_paragraphs(squad):
    """Store a translation next to every context and question in ``squad``.

    Each paragraph gets a ``translated_context`` key holding
    ``translate_text(paragraph['context'])`` and each of its questions a
    ``translated_question`` key holding ``translate_text(qas['question'])``.
    The context of a paragraph is translated before its questions. Progress
    over the articles is shown with ``tqdm``. The target language is decided
    by ``translate_text``, which is defined outside this file.

    Args:
        squad: SQuAD-style dict; mutated in place, nothing is returned.
    """
    for article in tqdm(squad["data"]):
        for paragraph in article["paragraphs"]:
            # Context first ...
            paragraph['translated_context'] = translate_text(paragraph["context"])

            # ... then every question attached to it
            for question_entry in paragraph['qas']:
                question_entry['translated_question'] = translate_text(question_entry['question'])


if __name__ == "__main__":
    # Load variables from a .env file into the environment before anything
    # else runs (presumably the translation credentials — TODO confirm).
    load_dotenv()

    # JSON files are UTF-8; state it explicitly so reading does not depend on
    # the platform's default locale encoding.
    with open("./data/squad-v2-dev-small.json", "r", encoding="utf-8") as f:
        squad = json.load(f)

    # 1) order questions and add answer_end, 2) insert markers, 3) translate
    sort_qas_by_answer_index(squad)
    transform_squad(squad)
    translate_paragraphs(squad)

    # Snapshot of the marked + translated state, written before markers are removed
    with open("./data/squad-v2-dev-small-transformed.json", "w", encoding="utf-8") as f:
        json.dump(squad, f, indent=2)

    # To re-run only the marker removal from the snapshot above, load it instead:
    # with open("./data/squad-v2-dev-small-transformed.json", "r") as f:
    #     squad = json.load(f)

    detransform_squad(squad)

    with open("./data/squad-v2-dev-small-translated.json", "w", encoding="utf-8") as f:
        json.dump(squad, f, indent=2)
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00			`import json`
			`from dotenv import load_dotenv`
small translation test working 2022-02-20 22:04:51 +00:00			`from tqdm import tqdm`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00
removed load save functions 2022-02-20 20:34:12 +00:00			`from squad_utils import print_squad`
small translation test working 2022-02-20 22:04:51 +00:00			`from translate_utils import translate_text`
working special character addition to context 2022-02-15 08:14:58 +00:00
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00
working special character addition to context 2022-02-15 08:14:58 +00:00			`def sort_qas_by_answer_index(squad):`
			`for article in squad['data']:`
			`for paragraph in article['paragraphs']:`
			`impossible_qas = list(filter(lambda qas: qas['is_impossible'] == True, paragraph['qas']))`
			`possible_qas = list(filter(lambda qas: qas['is_impossible'] == False, paragraph['qas']))`
			`sorted_qas = sorted(possible_qas, key=lambda qas: qas['answers'][0]['answer_start'])`

			`for qas in sorted_qas:`
			`a = qas['answers'][0]`
			`a['answer_end'] = a['answer_start'] + len(a['text'])`

			`paragraph['qas'] = sorted_qas + impossible_qas`


			`def transform_squad(squad):`
			`for article in squad['data']:`
			`for paragraph in article['paragraphs']:`
			`add_special_chars_to_paragraph(paragraph)`


			`def add_special_chars_to_paragraph(paragraph):`
			`for counter, qas in enumerate(paragraph['qas']):`
			`# Skip if impossible question`
			`if qas["is_impossible"] == True: continue`

			`if len(qas['answers']) > 1 or len(qas['answers']) == 0: continue`

added tqdm for visualization, question translation, scripts for removing special chars and recalculating indexes, some variable renaming, etc. 2022-02-21 09:52:56 +00:00			`special_char = f"[{counter}]"`

working special character addition to context 2022-02-15 08:14:58 +00:00			`current = qas['answers'][0]`

			`# Get start index`
			`start = current['answer_start']`
			`# Calculate end index`
			`end = current['answer_end']`
			`# Add special chars to context`
			`context = paragraph['context']`
			`paragraph['context'] = f"{context[:start]}{special_char} {context[start:end]} {special_char}{context[end:]}"`

			`# Recalculate indexes`
			`for q in paragraph['qas'][counter + 1:]: # Skip all answers before and current one`
			`if q["is_impossible"] == True: continue`

			`other = q['answers'][0]`

			`if other['answer_start'] >= current['answer_start'] and other['answer_end'] <= current["answer_end"]: # Other is being enclosed by current`
			`other['answer_start'] += len(special_char) +1`
			`other['answer_end'] += 2*len(special_char) +2`

			`elif other['answer_start'] < current['answer_end']: # Other is enclosing the current one`
			`other['answer_start'] += len(special_char) +1`
			`other['answer_end'] += len(special_char) +1`

			`else: # Other is after current`
			`other['answer_start'] += 2*len(special_char) +2`
			`other['answer_end'] += 2*len(special_char) +2`

			`# Fix indexes in current answer`
			`other = paragraph['qas'][counter]['answers'][0]`

			`if other == current: # Other answer is the one im working on`
			`other['answer_start'] += len(special_char) +1`
			`other['answer_end'] += len(special_char) +1`


added tqdm for visualization, question translation, scripts for removing special chars and recalculating indexes, some variable renaming, etc. 2022-02-21 09:52:56 +00:00			`def detransform_squad(squad):`
			`for article in squad['data']:`
			`for paragraph in article['paragraphs']:`
			`for counter, qas in enumerate(paragraph['qas']):`
			`# Skip if impossible question`
			`if qas["is_impossible"] == True: continue`
			`if len(qas) == 0: continue`
			`if len(qas['answers']) == 0: continue`

			`special_char = f"[{counter}]"`
			`len_special_char = len(special_char)`

			`current = qas['answers'][0]`

			`# Fix english indexes`
			`start = paragraph['context'].find(special_char)`
			`end = paragraph['context'].rfind(special_char) - len_special_char - 2`

			`current['answer_start'] = start`
			`current['answer_end'] = end`

			`# Fix slovak indexes`
			`start = paragraph['translated_context'].find(special_char)`
			`end = paragraph['translated_context'].rfind(special_char) - len_special_char - 2`

			`current['translated_answer_start'] = start`
			`current['translated_answer_end'] = end`

			`# Fix english context`
			`paragraph['context'] = paragraph['context'].replace(f"{special_char} ", "")`
			`# There are possible cases where special char is followed by ,. or is at end of paragraph`
			`paragraph['context'] = paragraph['context'].replace(f" {special_char}", "")`

			`# Fix slovak context`
			`paragraph['translated_context'] = paragraph['translated_context'].replace(f"{special_char} ", "")`
			`# There are possible cases where special char is followed by ,. or is at end of paragraph`
			`paragraph['translated_context'] = paragraph['translated_context'].replace(f" {special_char}", "")`

			`# Add translated_text to qas`
			`start = current['translated_answer_start']`
			`end = current['translated_answer_end']`
			`current['translated_text'] = paragraph['translated_context'][start:end]`



small translation test working 2022-02-20 22:04:51 +00:00			`def translate_paragraphs(squad):`
			`for article in tqdm(squad["data"]):`
			`for paragraph in article["paragraphs"]:`
added tqdm for visualization, question translation, scripts for removing special chars and recalculating indexes, some variable renaming, etc. 2022-02-21 09:52:56 +00:00			`# Translate context`
small translation test working 2022-02-20 22:04:51 +00:00			`translated = translate_text(paragraph["context"])`
added tqdm for visualization, question translation, scripts for removing special chars and recalculating indexes, some variable renaming, etc. 2022-02-21 09:52:56 +00:00			`paragraph['translated_context'] = translated`

			`# Translate questions`
			`for qas in paragraph['qas']:`
			`translated = translate_text(qas['question'])`
			`qas['translated_question'] = translated`
small translation test working 2022-02-20 22:04:51 +00:00

added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00			`if __name__ == "__main__":`
			`load_dotenv()`

even more spring cleaning 2022-02-20 21:02:03 +00:00			`with open("./data/squad-v2-dev-small.json", "r") as f:`
removed load save functions 2022-02-20 20:34:12 +00:00			`squad = json.load(f)`
spring cleaning 2022-02-20 20:32:55 +00:00
working special character addition to context 2022-02-15 08:14:58 +00:00			`sort_qas_by_answer_index(squad)`
			`transform_squad(squad)`
small translation test working 2022-02-20 22:04:51 +00:00			`translate_paragraphs(squad)`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00
added tqdm for visualization, question translation, scripts for removing special chars and recalculating indexes, some variable renaming, etc. 2022-02-21 09:52:56 +00:00			`with open("./data/squad-v2-dev-small-transformed.json", "w") as f:`
			`json.dump(squad, f, indent=2)`

			`# with open("./data/squad-v2-dev-small-transformed.json", "r") as f:`
			`# squad = json.load(f)`

			`detransform_squad(squad)`

even more spring cleaning 2022-02-20 21:02:03 +00:00			`with open("./data/squad-v2-dev-small-translated.json", "w") as f:`
removed load save functions 2022-02-20 20:34:12 +00:00			`json.dump(squad, f, indent=2)`
added initial script with squad loading, printing, saving and with example function for google translation, added markdown file with basic setup instructions 2021-11-04 20:07:23 +00:00