added tqdm for visualization, question translation, scripts for removing special chars and recalculating indexes, some variable renaming, etc.
This commit is contained in:
parent
5edc0bef7f
commit
8657db10a0
1661218
data/squad-test-translated.json
1661218
data/squad-test-translated.json
File diff suppressed because one or more lines are too long
@ -1,180 +0,0 @@
|
|||||||
{
|
|
||||||
"data": [
|
|
||||||
{
|
|
||||||
"paragraphs": [
|
|
||||||
{
|
|
||||||
"qas": [
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "When did Beyonce start becoming popular?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 269,
|
|
||||||
"text": "in the late 1990s"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56be85543aeaaa14008c9063"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "What areas did Beyonce compete in when she was growing up?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 207,
|
|
||||||
"text": "singing and dancing"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56be85543aeaaa14008c9065"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "When did Beyonce leave Destiny's Child and become a solo singer?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 526,
|
|
||||||
"text": "2003"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56be85543aeaaa14008c9066"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "In what city and state did Beyonce grow up? ",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 166,
|
|
||||||
"text": "Houston, Texas"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56bf6b0f3aeaaa14008c9601"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "In which decade did Beyonce become famous?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 276,
|
|
||||||
"text": "late 1990s"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56bf6b0f3aeaaa14008c9602"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "In what R&B group was she the lead singer?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 320,
|
|
||||||
"text": "Destiny's Child"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56bf6b0f3aeaaa14008c9603"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "What album made her a worldwide known artist?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 505,
|
|
||||||
"text": "Dangerously in Love"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56bf6b0f3aeaaa14008c9604"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "Who managed the Destiny's Child group?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 360,
|
|
||||||
"text": "Mathew Knowles"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56bf6b0f3aeaaa14008c9605"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "When did Beyonc\u00e9 rise to fame?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 276,
|
|
||||||
"text": "late 1990s"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56d43c5f2ccc5a1400d830a9"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "What role did Beyonc\u00e9 have in Destiny's Child?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 290,
|
|
||||||
"text": "lead singer"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56d43c5f2ccc5a1400d830aa"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "What was the first album Beyonc\u00e9 released as a solo artist?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 505,
|
|
||||||
"text": "Dangerously in Love"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56d43c5f2ccc5a1400d830ab"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "When did Beyonc\u00e9 release Dangerously in Love?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 526,
|
|
||||||
"text": "2003"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56d43c5f2ccc5a1400d830ac"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "How many Grammy awards did Beyonc\u00e9 win for her first solo album?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 590,
|
|
||||||
"text": "five"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56d43c5f2ccc5a1400d830ad"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "What was Beyonc\u00e9's role in Destiny's Child?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 290,
|
|
||||||
"text": "lead singer"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56d43ce42ccc5a1400d830b4"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"is_impossible": false,
|
|
||||||
"question": "What was the name of Beyonc\u00e9's first solo album?",
|
|
||||||
"answers": [
|
|
||||||
{
|
|
||||||
"answer_start": 505,
|
|
||||||
"text": "Dangerously in Love"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"id": "56d43ce42ccc5a1400d830b5"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"context": "Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\"."
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"title": "Beyonc\u00e9"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"version": "v2.0"
|
|
||||||
}
|
|
File diff suppressed because it is too large
Load Diff
@ -31,10 +31,10 @@ def add_special_chars_to_paragraph(paragraph):
|
|||||||
# Skip if impossible question
|
# Skip if impossible question
|
||||||
if qas["is_impossible"] == True: continue
|
if qas["is_impossible"] == True: continue
|
||||||
|
|
||||||
special_char = f"[{counter}]"
|
|
||||||
|
|
||||||
if len(qas['answers']) > 1 or len(qas['answers']) == 0: continue
|
if len(qas['answers']) > 1 or len(qas['answers']) == 0: continue
|
||||||
|
|
||||||
|
special_char = f"[{counter}]"
|
||||||
|
|
||||||
current = qas['answers'][0]
|
current = qas['answers'][0]
|
||||||
|
|
||||||
# Get start index
|
# Get start index
|
||||||
@ -71,11 +71,62 @@ def add_special_chars_to_paragraph(paragraph):
|
|||||||
other['answer_end'] += len(special_char) +1
|
other['answer_end'] += len(special_char) +1
|
||||||
|
|
||||||
|
|
||||||
|
def detransform_squad(squad):
    """Strip the ``[n]`` answer markers and recompute answer offsets in place.

    For every paragraph, each answerable question that has exactly one answer
    is expected to be wrapped in a pair of ``[n]`` markers (``n`` being the
    question's position in ``paragraph['qas']``) in both ``context`` and
    ``translated_context``.  All markers of a paragraph are removed in a
    single pass, so the offsets written back always refer to the final,
    marker-free text, even when answers of different questions overlap or
    are nested.

    Fields written on the answer dict:
        answer_start / answer_end: offsets into the cleaned ``context``
            (left untouched when the marker pair is not found there).
        translated_answer_start / translated_answer_end: offsets into the
            cleaned ``translated_context``; both are ``-1`` when the marker
            pair did not survive translation.
        translated_text: the slice of the cleaned ``translated_context``
            between those offsets, or ``""`` when the pair is missing.

    Args:
        squad: SQuAD-style dict (``data`` -> ``paragraphs`` -> ``qas``);
            it is modified in place.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            # Marker -> answer dict for every question that should carry markers.
            marked_answers = {}
            for counter, qas in enumerate(paragraph['qas']):
                # Skip if impossible question
                if qas["is_impossible"]:
                    continue
                # Questions without exactly one answer are skipped here so
                # that their original offsets are not overwritten.
                if len(qas['answers']) != 1:
                    continue
                marked_answers[f"[{counter}]"] = qas['answers'][0]

            if not marked_answers:
                continue

            context, spans = _strip_answer_markers(
                paragraph['context'], marked_answers)
            translated_context, translated_spans = _strip_answer_markers(
                paragraph['translated_context'], marked_answers)

            paragraph['context'] = context
            paragraph['translated_context'] = translated_context

            for marker, answer in marked_answers.items():
                # Fix english indexes
                if marker in spans:
                    answer['answer_start'], answer['answer_end'] = spans[marker]

                # Fix slovak indexes; (-1, -1) flags a marker pair that was
                # lost or mangled by the translation.
                start, end = translated_spans.get(marker, (-1, -1))
                answer['translated_answer_start'] = start
                answer['translated_answer_end'] = end

                # Add translated_text to qas
                answer['translated_text'] = (
                    translated_context[start:end] if start >= 0 else ""
                )


def _strip_answer_markers(text, markers):
    """Remove every known ``[n]`` marker from *text* and locate the answers.

    The first occurrence of a marker is treated as the opening one and is
    removed together with one following space; the last occurrence is
    treated as the closing one and is removed together with one preceding
    space.  A marker that occurs only once is removed but yields no span.

    Args:
        text: marked text.
        markers: container of marker strings (e.g. ``"[3]"``) to remove;
            other bracketed numbers in *text* are left alone.

    Returns:
        ``(clean_text, spans)`` where ``spans`` maps each marker found at
        least twice to its ``(start, end)`` offsets in ``clean_text``.
    """
    import re

    occurrences = [
        match for match in re.finditer(r"\[\d+\]", text)
        if match.group() in markers
    ]
    first_seen, last_seen = {}, {}
    for position, match in enumerate(occurrences):
        first_seen.setdefault(match.group(), position)
        last_seen[match.group()] = position

    pieces = []
    kept = 0      # length of the cleaned text produced so far
    cursor = 0    # index in `text` up to which input has been consumed
    starts, ends = {}, {}
    for position, match in enumerate(occurrences):
        marker = match.group()
        is_pair = first_seen[marker] != last_seen[marker]
        is_closing = is_pair and last_seen[marker] == position

        cut_from, cut_to = match.start(), match.end()
        if is_closing:
            # Drop the separating space before the closing marker, unless a
            # previous removal has already consumed it.
            if cut_from > cursor and text[cut_from - 1] == " ":
                cut_from -= 1
        elif cut_to < len(text) and text[cut_to] == " ":
            # Drop the separating space after an opening (or stray) marker.
            cut_to += 1

        pieces.append(text[cursor:cut_from])
        kept += cut_from - cursor
        cursor = cut_to

        if is_closing:
            ends[marker] = kept
        elif is_pair and first_seen[marker] == position:
            starts[marker] = kept

    pieces.append(text[cursor:])
    spans = {marker: (starts[marker], ends[marker]) for marker in ends}
    return "".join(pieces), spans
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def translate_paragraphs(squad):
    """Attach translations of every context and question to *squad* in place.

    Each paragraph's ``context`` is passed through ``translate_text`` and the
    result is stored under ``translated_context``; each question of that
    paragraph is then passed through ``translate_text`` and the result is
    stored under ``translated_question``.  Progress over the articles is
    reported through ``tqdm``.
    """
    for article in tqdm(squad["data"]):
        for paragraph in article["paragraphs"]:
            # Translate context
            paragraph['translated_context'] = translate_text(paragraph["context"])

            # Translate questions
            for qas in paragraph['qas']:
                qas['translated_question'] = translate_text(qas['question'])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -88,6 +139,14 @@ if __name__ == "__main__":
|
|||||||
transform_squad(squad)
|
transform_squad(squad)
|
||||||
translate_paragraphs(squad)
|
translate_paragraphs(squad)
|
||||||
|
|
||||||
|
with open("./data/squad-v2-dev-small-transformed.json", "w") as f:
|
||||||
|
json.dump(squad, f, indent=2)
|
||||||
|
|
||||||
|
# with open("./data/squad-v2-dev-small-transformed.json", "r") as f:
|
||||||
|
# squad = json.load(f)
|
||||||
|
|
||||||
|
detransform_squad(squad)
|
||||||
|
|
||||||
with open("./data/squad-v2-dev-small-translated.json", "w") as f:
|
with open("./data/squad-v2-dev-small-translated.json", "w") as f:
|
||||||
json.dump(squad, f, indent=2)
|
json.dump(squad, f, indent=2)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user