spring cleaning

2022-02-20 21:32:55 +01:00 · 2022-02-20 21:32:55 +01:00 · 695cef2b32
commit 695cef2b32
parent f5538d4882
9 changed files with 1661392 additions and 85 deletions
--- a/notes/Porovnanie
+++ b/notes/Porovnanie
--- a/notes/Prieskum
+++ b/notes/Prieskum
--- a/notes/google_translate_howto.md
+++ b/notes/google_translate_howto.md
--- a/squad-test-translated.json
+++ b/squad-test-translated.json
--- a/squad-v2-dev-test.json
+++ b/squad-v2-dev-test.json
--- a/squad_char_counter.py
+++ b/squad_char_counter.py
@ -1,45 +0,0 @@
-import json
-
-squad = None
-
-with open("squad-v2-dev.json", "r", encoding="utf-8") as f:
-    squad = json.load(f)
-
-num_articles = len(squad['data'])
-print(f"total articles: {num_articles}")
-
-context_chars = 0
-question_chars = 0
-answer_chars = 0
-
-total_paragraphs = 0
-total_qas = 0
-total_answers = 0
-for article in squad['data']:
-    total_paragraphs += len(article['paragraphs'])
-
-    for paragraph in article['paragraphs']:
-        context_chars += len(paragraph['context'])
-
-        total_qas += len(paragraph['qas'])
-
-        for qas in paragraph['qas']:
-            question_chars += len(qas['question'])
-
-            total_answers += len(qas['answers'])
-
-            for answer in qas['answers']:
-                answer_chars += len(answer['text'])
-
-print(f"total paragraphs: {total_paragraphs}")
-print(f"total qas: {total_qas}")
-print(f"total answers: {total_answers}")
-
-print(f"chars in contexts: {context_chars}")
-print(f"chars in questions: {question_chars}")
-print(f"chars in answers: {answer_chars}")
-
-total_chars = context_chars + question_chars + answer_chars
-
-print(f"total chars: {total_chars}")
-
--- a/squad_transform.py
+++ b/squad_transform.py
@ -0,0 +1,93 @@
+import json
+from dotenv import load_dotenv
+from squad_utils import print_squad
+
+
+def load(filename):
+    """Read a SQuAD-style JSON file and return the parsed object.
+
+    The file is decoded as UTF-8 explicitly, so the result does not depend
+    on the platform's default text encoding.
+
+    Args:
+        filename: Path of the JSON file to read.
+
+    Returns:
+        Whatever ``json.load`` produces for the file's content.
+    """
+    with open(filename, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def save(filename, squad):
+    """Write ``squad`` to ``filename`` as JSON indented by two spaces.
+
+    The file is written as UTF-8 and non-ASCII characters are stored as-is
+    (not as ``\\uXXXX`` escapes), so translated text stays readable in the
+    output file. The result is still valid JSON.
+
+    Args:
+        filename: Path of the file to create or overwrite.
+        squad: JSON-serialisable object to store.
+    """
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(squad, f, indent=2, ensure_ascii=False)
+
+
+def sort_qas_by_answer_index(squad):
+    """Order each paragraph's questions by the start of their first answer.
+
+    For every paragraph in ``squad['data'][*]['paragraphs']`` the ``qas``
+    list is replaced by: all questions that have a first answer, sorted by
+    ``answers[0]['answer_start']``, followed by the remaining questions in
+    their original order. The remaining questions are those flagged
+    ``is_impossible`` and those whose ``answers`` list is empty or missing,
+    because they have no start index to sort by.
+
+    The first answer of every sorted question additionally receives an
+    ``answer_end`` key (``answer_start + len(text)``). The dataset is
+    modified in place; nothing is returned.
+    """
+    for article in squad['data']:
+        for paragraph in article['paragraphs']:
+            answerable = []
+            unanswerable = []
+
+            for qas in paragraph['qas']:
+                # A missing flag is treated as "answerable"; an empty answer
+                # list cannot be sorted, so it is kept with the unanswerable.
+                if qas.get('is_impossible', False) or not qas.get('answers'):
+                    unanswerable.append(qas)
+                else:
+                    answerable.append(qas)
+
+            # list.sort is stable, so ties keep their original order.
+            answerable.sort(key=lambda qas: qas['answers'][0]['answer_start'])
+
+            for qas in answerable:
+                first = qas['answers'][0]
+                first['answer_end'] = first['answer_start'] + len(first['text'])
+
+            paragraph['qas'] = answerable + unanswerable
+
+
+def transform_squad(squad):
+    """Insert answer markers into every paragraph of ``squad``.
+
+    Each paragraph dict found under ``squad['data'][*]['paragraphs']`` is
+    passed to ``add_special_chars_to_paragraph``, which edits it in place.
+    Nothing is returned.
+    """
+    every_paragraph = (
+        paragraph
+        for article in squad['data']
+        for paragraph in article['paragraphs']
+    )
+    for paragraph in every_paragraph:
+        add_special_chars_to_paragraph(paragraph)
+
+
+def add_special_chars_to_paragraph(paragraph):
+    """Wrap single-answer spans of ``paragraph['context']`` in ``[n]`` markers.
+
+    For every question at position ``n`` in ``paragraph['qas']`` that is not
+    flagged ``is_impossible`` and has exactly one answer, the answer span
+    ``context[answer_start:answer_end]`` is rewritten as
+    ``"[n] <span> [n]"``. Questions with zero or several answers are not
+    marked.
+
+    After each insertion the ``answer_start``/``answer_end`` of the first
+    answer of every *later* question are re-based so they keep pointing at
+    the same text; answers of earlier questions are not touched again.
+
+    Every first answer that is read here must already carry ``answer_end``
+    (``sort_qas_by_answer_index`` adds it). The paragraph is modified in
+    place; nothing is returned.
+    """
+    for counter, qas in enumerate(paragraph['qas']):
+        # Unanswerable questions have no span to mark.
+        if qas["is_impossible"]:
+            continue
+
+        # Only questions with exactly one answer are marked.
+        if len(qas['answers']) != 1:
+            continue
+
+        special_char = f"[{counter}]"
+        # Each marker is inserted together with one separating space.
+        shift = len(special_char) + 1
+
+        current = qas['answers'][0]
+        start = current['answer_start']
+        end = current['answer_end']
+
+        context = paragraph['context']
+        paragraph['context'] = f"{context[:start]}{special_char} {context[start:end]} {special_char}{context[end:]}"
+
+        # Re-base the first answer of every later question.
+        for q in paragraph['qas'][counter + 1:]:
+            # Nothing to re-base without a first answer.
+            if q["is_impossible"] or not q['answers']:
+                continue
+
+            other = q['answers'][0]
+            other_start = other['answer_start']
+            other_end = other['answer_end']
+
+            # A span start at or behind an insertion point lands after the
+            # text inserted there.
+            if other_start >= end:
+                start_shift = 2 * shift
+            elif other_start >= start:
+                start_shift = shift
+            else:
+                start_shift = 0
+
+            # An exclusive span end is pushed only by text inserted strictly
+            # before it: a span ending inside the current one moves by the
+            # opening marker only, a span reaching past it by both markers.
+            if other_end > end:
+                end_shift = 2 * shift
+            elif other_end > start:
+                end_shift = shift
+            else:
+                end_shift = 0
+
+            other['answer_start'] = other_start + start_shift
+            other['answer_end'] = other_end + end_shift
+
+        # The current span itself now sits behind its opening marker.
+        current['answer_start'] += shift
+        current['answer_end'] += shift
+
+
+if __name__ == "__main__":
+    load_dotenv()
+
+    # Read the test dataset, run the in-place steps in order (sort the
+    # questions, insert the markers, print the contexts), then write the
+    # result to a new file.
+    squad = load('./squad-test.json')
+
+    for step in (sort_qas_by_answer_index, transform_squad, print_squad):
+        step(squad)
+
+    save("./squad-test-translated.json", squad)
+
--- a/squad_translate_google.py
+++ b/squad_translate_google.py
@ -5,42 +5,6 @@ import six
 from google.cloud import translate_v2 as translate


-def load(filename):
-    with open(filename, "r") as f:
-        squad = json.load(f)
-
-    return squad
-
-
-def save(filename, squad):
-    with open(filename, "w") as f:
-        json.dump(squad, f, indent=2)
-
-
-def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
-    for article in squad['data'][:article_limit]:
-        print("="*40)
-        print(f"Article title: {article['title']}\n\n")
-
-        for paragraph in article['paragraphs'][:paragraph_limit]:
-            print(f"{paragraph['context']}\n")
-
-            # index = 0
-            # for qas in paragraph['qas'][:qas_limit]:
-            #     print(f"Question: {qas['question']}")
-
-            #     print(f"Answers:")
-            #     answer = qas['answers'][0]
-            #     print(f"#{index} @{answer['answer_start']}: \t{answer['text']}")
-            #     print(f"#{index} ends @{answer['answer_end']}")
-            #     start = answer['answer_start']
-            #     end = start + len(answer['text'])
-            #     print(f"from context: \t{paragraph['context'][start:end]}")
-
-            #     print("\n")
-            #     index += 1
-
-
 def translate_text(text):
    """Translates text into the target language.

@ -130,12 +94,13 @@ def add_special_chars_to_paragraph(paragraph):
 if __name__ == "__main__":
+    # print_squad's definition is deleted from this file by this change and
+    # no import for it is visible in the diff, so fetch it from squad_utils
+    # (where the function is added) before it is called below.
+    from squad_utils import print_squad
+
    load_dotenv()

-    squad = load('./squad-v2-dev.json')
+    # Decode explicitly as UTF-8 instead of the platform default encoding.
+    with open("./squad-test.json", "r", encoding="utf-8") as f:
+        squad = json.load(f)
+
    sort_qas_by_answer_index(squad)
    transform_squad(squad)
    print_squad(squad)
-    save("./squad-v2-dev-test-out.json", squad)
-
-    # translate_text("my name is tomas")

+    with open("./squad-test-out.json", "w", encoding="utf-8") as f:
+        json.dump(squad, f, indent=2)

--- a/squad_utils.py
+++ b/squad_utils.py
@ -0,0 +1,76 @@
+import json
+
+
+def calculate_chars(squad):
+    """Print element counts and character totals for a SQuAD-style dataset.
+
+    Walks ``squad['data']`` -> ``paragraphs`` -> ``qas`` -> ``answers`` and
+    prints, one value per line: the number of articles, paragraphs,
+    questions and answers, the summed lengths of all ``context``,
+    ``question`` and answer ``text`` strings, and the sum of those three
+    lengths. Nothing is returned.
+    """
+    articles = squad['data']
+    print(f"total articles: {len(articles)}")
+
+    # Flatten the nesting once; every figure below is derived from these.
+    paragraphs = [p for article in articles for p in article['paragraphs']]
+    questions = [q for p in paragraphs for q in p['qas']]
+    answers = [a for q in questions for a in q['answers']]
+
+    context_chars = sum(len(p['context']) for p in paragraphs)
+    question_chars = sum(len(q['question']) for q in questions)
+    answer_chars = sum(len(a['text']) for a in answers)
+
+    print(f"total paragraphs: {len(paragraphs)}")
+    print(f"total qas: {len(questions)}")
+    print(f"total answers: {len(answers)}")
+
+    print(f"chars in contexts: {context_chars}")
+    print(f"chars in questions: {question_chars}")
+    print(f"chars in answers: {answer_chars}")
+
+    print(f"total chars: {context_chars + question_chars + answer_chars}")
+
+
+def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
+    """Print article titles and paragraph contexts of a SQuAD-style dataset.
+
+    At most ``article_limit`` articles are shown and, per article, at most
+    ``paragraph_limit`` paragraphs. Each article starts with a line of 40
+    ``=`` characters followed by its title; each paragraph's ``context`` is
+    printed followed by a blank line. ``qas_limit`` is accepted but not
+    used by the active code.
+    """
+    separator = "=" * 40
+
+    for shown_article in squad['data'][:article_limit]:
+        print(separator)
+        print(f"Article title: {shown_article['title']}\n\n")
+
+        for shown_paragraph in shown_article['paragraphs'][:paragraph_limit]:
+            print(f"{shown_paragraph['context']}\n")
+
+            # index = 0
+            # for qas in paragraph['qas'][:qas_limit]:
+            #     print(f"Question: {qas['question']}")
+
+            #     print(f"Answers:")
+            #     answer = qas['answers'][0]
+            #     print(f"#{index} @{answer['answer_start']}: \t{answer['text']}")
+            #     print(f"#{index} ends @{answer['answer_end']}")
+            #     start = answer['answer_start']
+            #     end = start + len(answer['text'])
+            #     print(f"from context: \t{paragraph['context'][start:end]}")
+
+            #     print("\n")
+            #     index += 1
+
+
+if __name__ == "__main__":
+    # When run as a script, print the statistics for the SQuAD v2 dev set
+    # expected in the current working directory.
+    with open("squad-v2-dev.json", "r", encoding="utf-8") as dataset_file:
+        squad = json.load(dataset_file)
+
+    calculate_chars(squad)
+