Added tqdm progress bars, question translation, scripts for removing the special marker characters and recalculating answer indexes, some variable renaming, etc.

This commit is contained in:
Tomas Kucharik 2022-02-21 10:52:56 +01:00
parent 5edc0bef7f
commit 8657db10a0
4 changed files with 1990 additions and 1662301 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,180 +0,0 @@
{
"data": [
{
"paragraphs": [
{
"qas": [
{
"is_impossible": false,
"question": "When did Beyonce start becoming popular?",
"answers": [
{
"answer_start": 269,
"text": "in the late 1990s"
}
],
"id": "56be85543aeaaa14008c9063"
},
{
"is_impossible": false,
"question": "What areas did Beyonce compete in when she was growing up?",
"answers": [
{
"answer_start": 207,
"text": "singing and dancing"
}
],
"id": "56be85543aeaaa14008c9065"
},
{
"is_impossible": false,
"question": "When did Beyonce leave Destiny's Child and become a solo singer?",
"answers": [
{
"answer_start": 526,
"text": "2003"
}
],
"id": "56be85543aeaaa14008c9066"
},
{
"is_impossible": false,
"question": "In what city and state did Beyonce grow up? ",
"answers": [
{
"answer_start": 166,
"text": "Houston, Texas"
}
],
"id": "56bf6b0f3aeaaa14008c9601"
},
{
"is_impossible": false,
"question": "In which decade did Beyonce become famous?",
"answers": [
{
"answer_start": 276,
"text": "late 1990s"
}
],
"id": "56bf6b0f3aeaaa14008c9602"
},
{
"is_impossible": false,
"question": "In what R&B group was she the lead singer?",
"answers": [
{
"answer_start": 320,
"text": "Destiny's Child"
}
],
"id": "56bf6b0f3aeaaa14008c9603"
},
{
"is_impossible": false,
"question": "What album made her a worldwide known artist?",
"answers": [
{
"answer_start": 505,
"text": "Dangerously in Love"
}
],
"id": "56bf6b0f3aeaaa14008c9604"
},
{
"is_impossible": false,
"question": "Who managed the Destiny's Child group?",
"answers": [
{
"answer_start": 360,
"text": "Mathew Knowles"
}
],
"id": "56bf6b0f3aeaaa14008c9605"
},
{
"is_impossible": false,
"question": "When did Beyonc\u00e9 rise to fame?",
"answers": [
{
"answer_start": 276,
"text": "late 1990s"
}
],
"id": "56d43c5f2ccc5a1400d830a9"
},
{
"is_impossible": false,
"question": "What role did Beyonc\u00e9 have in Destiny's Child?",
"answers": [
{
"answer_start": 290,
"text": "lead singer"
}
],
"id": "56d43c5f2ccc5a1400d830aa"
},
{
"is_impossible": false,
"question": "What was the first album Beyonc\u00e9 released as a solo artist?",
"answers": [
{
"answer_start": 505,
"text": "Dangerously in Love"
}
],
"id": "56d43c5f2ccc5a1400d830ab"
},
{
"is_impossible": false,
"question": "When did Beyonc\u00e9 release Dangerously in Love?",
"answers": [
{
"answer_start": 526,
"text": "2003"
}
],
"id": "56d43c5f2ccc5a1400d830ac"
},
{
"is_impossible": false,
"question": "How many Grammy awards did Beyonc\u00e9 win for her first solo album?",
"answers": [
{
"answer_start": 590,
"text": "five"
}
],
"id": "56d43c5f2ccc5a1400d830ad"
},
{
"is_impossible": false,
"question": "What was Beyonc\u00e9's role in Destiny's Child?",
"answers": [
{
"answer_start": 290,
"text": "lead singer"
}
],
"id": "56d43ce42ccc5a1400d830b4"
},
{
"is_impossible": false,
"question": "What was the name of Beyonc\u00e9's first solo album?",
"answers": [
{
"answer_start": 505,
"text": "Dangerously in Love"
}
],
"id": "56d43ce42ccc5a1400d830b5"
}
],
"context": "Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\"."
}
],
"title": "Beyonc\u00e9"
}
],
"version": "v2.0"
}

File diff suppressed because it is too large Load Diff

View File

@ -31,10 +31,10 @@ def add_special_chars_to_paragraph(paragraph):
# Skip if impossible question # Skip if impossible question
if qas["is_impossible"] == True: continue if qas["is_impossible"] == True: continue
special_char = f"[{counter}]"
if len(qas['answers']) > 1 or len(qas['answers']) == 0: continue if len(qas['answers']) > 1 or len(qas['answers']) == 0: continue
special_char = f"[{counter}]"
current = qas['answers'][0] current = qas['answers'][0]
# Get start index # Get start index
@ -71,11 +71,62 @@ def add_special_chars_to_paragraph(paragraph):
other['answer_end'] += len(special_char) +1 other['answer_end'] += len(special_char) +1
def _strip_markers(text, marker_ids):
    """Remove answer markers from *text* and locate the spans they enclosed.

    A marker is the literal ``[<n>]`` where ``<n>`` is one of *marker_ids*
    (decimal strings).  A marker is removed together with the space that
    follows it or, when none follows, the space that precedes it -- the same
    text the chained ``replace("[n] ", "")`` / ``replace(" [n]", "")`` calls
    strip.  A marker with no adjacent space is left in the text and ignored.

    Args:
        text: Context string that still contains the markers.
        marker_ids: Container of marker numbers (as ``str``) to strip.

    Returns:
        A ``(clean_text, spans)`` tuple.  ``spans`` maps a marker id to the
        ``(start, end)`` slice of ``clean_text`` enclosed by the first and
        last occurrence of that marker.  Ids seen fewer than two times are
        absent from ``spans``.
    """
    import re  # local import: the file's import block is outside this block

    chunks = []    # pieces of the marker-free text, joined at the end
    length = 0     # length of the marker-free text built so far
    cursor = 0     # first character of ``text`` not yet copied
    offsets = {}   # marker id -> offsets (in the clean text) where it stood
    for match in re.finditer(r"\[([0-9]+)\]", text):
        marker_id = match.group(1)
        if marker_id not in marker_ids:
            continue
        literal = text[cursor:match.start()]
        if literal:
            chunks.append(literal)
            length += len(literal)
        cursor = match.start()
        if text.startswith(" ", match.end()):
            # "[n] " -> drop the marker and the space after it.
            cursor = match.end() + 1
        elif chunks and chunks[-1].endswith(" "):
            # " [n]" before punctuation or at the end of the paragraph ->
            # drop the marker and the space before it.
            chunks[-1] = chunks[-1][:-1]
            if not chunks[-1]:
                chunks.pop()
            length -= 1
            # An offset recorded just past the dropped space must move back.
            for recorded in offsets.values():
                recorded[:] = [min(pos, length) for pos in recorded]
            cursor = match.end()
        else:
            # Marker glued to text on both sides: keep it, as before.
            continue
        offsets.setdefault(marker_id, []).append(length)
    chunks.append(text[cursor:])
    clean = "".join(chunks)

    spans = {}
    for marker_id, recorded in offsets.items():
        if len(recorded) < 2:
            continue
        start, end = recorded[0], recorded[-1]
        # The closing marker is normally preceded by a separating space that
        # is not part of the answer.
        if end > start and clean[end - 1] == " ":
            end -= 1
        spans[marker_id] = (start, end)
    return clean, spans


def detransform_squad(squad):
    """Strip the ``[n]`` answer markers and recompute answer offsets in place.

    For every paragraph, the markers of all answerable questions are removed
    from ``context`` and ``translated_context`` in a single pass, and the
    offsets are expressed relative to the final, marker-free text.  Computing
    them one question at a time would leave stale offsets for any answer
    located after a marker that is only removed in a later iteration.

    For the first answer of each answerable question this sets
    ``answer_start`` / ``answer_end`` (English) and
    ``translated_answer_start`` / ``translated_answer_end`` /
    ``translated_text`` (translation).  When a marker pair cannot be found --
    for example the question was never marked, or the translator dropped the
    marker -- the English fields are left untouched and the translated fields
    are set to ``None`` instead of being filled with values derived from
    ``str.find`` returning ``-1``.

    Args:
        squad: SQuAD-style dict (``data`` -> ``paragraphs`` -> ``qas``) whose
            paragraphs carry ``context`` and ``translated_context``.

    Returns:
        None.  ``squad`` is modified in place.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            # Markers are numbered by the question's position in ``qas``.
            tracked = {}
            for counter, qas in enumerate(paragraph['qas']):
                # Skip impossible questions and questions without answers.
                if qas["is_impossible"] or not qas['answers']:
                    continue
                tracked[str(counter)] = qas['answers'][0]
            if not tracked:
                continue

            context, spans = _strip_markers(paragraph['context'], tracked)
            translated_context, translated_spans = _strip_markers(
                paragraph['translated_context'], tracked
            )
            paragraph['context'] = context
            paragraph['translated_context'] = translated_context

            for marker_id, current in tracked.items():
                # Fix English indexes.
                span = spans.get(marker_id)
                if span is not None:
                    current['answer_start'], current['answer_end'] = span

                # Fix translated indexes and attach the translated answer.
                translated_span = translated_spans.get(marker_id)
                if translated_span is None:
                    current['translated_answer_start'] = None
                    current['translated_answer_end'] = None
                    current['translated_text'] = None
                    continue
                start, end = translated_span
                current['translated_answer_start'] = start
                current['translated_answer_end'] = end
                current['translated_text'] = translated_context[start:end]
def translate_paragraphs(squad):
    """Translate every paragraph context and every question, in place.

    Each paragraph gets a ``translated_context`` entry holding the result of
    ``translate_text`` for its ``context``; each question gets a
    ``translated_question`` entry holding the result for its ``question``.
    Articles are iterated through ``tqdm``.
    """
    for article in tqdm(squad["data"]):
        for paragraph in article["paragraphs"]:
            # Context first, then the questions attached to it.
            paragraph['translated_context'] = translate_text(paragraph["context"])
            for question_entry in paragraph['qas']:
                question_entry['translated_question'] = translate_text(
                    question_entry['question']
                )
if __name__ == "__main__": if __name__ == "__main__":
@ -88,6 +139,14 @@ if __name__ == "__main__":
transform_squad(squad) transform_squad(squad)
translate_paragraphs(squad) translate_paragraphs(squad)
with open("./data/squad-v2-dev-small-transformed.json", "w") as f:
json.dump(squad, f, indent=2)
# with open("./data/squad-v2-dev-small-transformed.json", "r") as f:
# squad = json.load(f)
detransform_squad(squad)
with open("./data/squad-v2-dev-small-translated.json", "w") as f: with open("./data/squad-v2-dev-small-translated.json", "w") as f:
json.dump(squad, f, indent=2) json.dump(squad, f, indent=2)