spring cleaning
This commit is contained in:
		
							parent
							
								
									f5538d4882
								
							
						
					
					
						commit
						695cef2b32
					
				
							
								
								
									
										1661218
									
								
								squad-test-translated.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1661218
									
								
								squad-test-translated.json
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| @ -1,45 +0,0 @@ | ||||
| import json | ||||
| 
 | ||||
# Load the SQuAD dev set; everything below only reads from it.
with open("squad-v2-dev.json", "r", encoding="utf-8") as f:
    squad = json.load(f)

num_articles = len(squad['data'])
print(f"total articles: {num_articles}")

# Flatten the article -> paragraph -> qas -> answer nesting level by level so
# each statistic below is a single pass over one flat list.
all_paragraphs = [p for article in squad['data'] for p in article['paragraphs']]
all_qas = [q for p in all_paragraphs for q in p['qas']]
all_answers = [a for q in all_qas for a in q['answers']]

total_paragraphs = len(all_paragraphs)
total_qas = len(all_qas)
total_answers = len(all_answers)

context_chars = sum(len(p['context']) for p in all_paragraphs)
question_chars = sum(len(q['question']) for q in all_qas)
answer_chars = sum(len(a['text']) for a in all_answers)

print(f"total paragraphs: {total_paragraphs}")
print(f"total qas: {total_qas}")
print(f"total answers: {total_answers}")

print(f"chars in contexts: {context_chars}")
print(f"chars in questions: {question_chars}")
print(f"chars in answers: {answer_chars}")

total_chars = context_chars + question_chars + answer_chars

print(f"total chars: {total_chars}")
| 
 | ||||
							
								
								
									
										93
									
								
								squad_transform.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										93
									
								
								squad_transform.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,93 @@ | ||||
| import json | ||||
| from dotenv import load_dotenv | ||||
| from squad_utils import print_squad | ||||
| 
 | ||||
| 
 | ||||
def load(filename):
    """Read a JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)
| 
 | ||||
| 
 | ||||
def save(filename, squad):
    """Write ``squad`` to ``filename`` as JSON indented by two spaces.

    Args:
        filename: Path of the file to create or overwrite.
        squad: JSON-serializable object to store.

    Raises:
        TypeError: If ``squad`` contains a value that JSON cannot encode.
        OSError: If the file cannot be opened for writing.
    """
    # Serialize before opening the target: a serialization error is then
    # raised while any existing file is still intact, instead of leaving a
    # truncated, half-written JSON document behind.
    serialized = json.dumps(squad, indent=2)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)
| 
 | ||||
| 
 | ||||
def sort_qas_by_answer_index(squad):
    """Order each paragraph's questions by the start offset of their first answer.

    For every paragraph in ``squad['data'][*]['paragraphs']`` the ``qas`` list
    is replaced, in place, by: the answerable questions sorted by
    ``answers[0]['answer_start']`` (stable for ties), followed by the remaining
    questions in their original relative order. The first answer of every
    answerable question additionally receives an ``answer_end`` key equal to
    ``answer_start + len(text)``.

    A question counts as unanswerable when its ``is_impossible`` flag is
    truthy or when its ``answers`` list is empty. A missing ``is_impossible``
    key is treated as ``False`` so data without that flag can be processed too.

    Args:
        squad: Decoded SQuAD-style dict; it is mutated in place.
    """
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            answerable, unanswerable = [], []
            for qas in paragraph['qas']:
                # An empty answers list cannot provide a sort key, so such
                # questions are grouped with the impossible ones instead of
                # raising IndexError.
                if qas.get('is_impossible', False) or not qas['answers']:
                    unanswerable.append(qas)
                else:
                    answerable.append(qas)

            answerable.sort(key=lambda qas: qas['answers'][0]['answer_start'])

            for qas in answerable:
                first = qas['answers'][0]
                first['answer_end'] = first['answer_start'] + len(first['text'])

            paragraph['qas'] = answerable + unanswerable
| 
 | ||||
| 
 | ||||
def transform_squad(squad):
    """Apply ``add_special_chars_to_paragraph`` to every paragraph of every article.

    Args:
        squad: Decoded SQuAD-style dict; its paragraphs are modified in place.
    """
    every_paragraph = (
        paragraph
        for article in squad['data']
        for paragraph in article['paragraphs']
    )
    for paragraph in every_paragraph:
        add_special_chars_to_paragraph(paragraph)
| 
 | ||||
| 
 | ||||
def _tracked_answer(qas):
    """Return the first answer of an answerable question, or None if there is none."""
    if qas.get('is_impossible', False) or not qas['answers']:
        return None

    answer = qas['answers'][0]
    # Fill in the end offset when it has not been computed yet. This happens
    # before the answer is shifted for the first time, so start + len(text)
    # still refers to the unmodified context.
    answer.setdefault('answer_end', answer['answer_start'] + len(answer['text']))
    return answer


def _marker_shift(position, start, end, pad, is_end):
    """Return how far ``position`` moves when markers are inserted at ``start`` and ``end``.

    Each insertion is ``pad`` characters long. A start offset that sits exactly
    on an insertion point is pushed behind the inserted text; an end offset
    sitting exactly on one stays in front of it.
    """
    shift = 0
    for insertion_point in (start, end):
        if position > insertion_point or (position == insertion_point and not is_end):
            shift += pad
    return shift


def add_special_chars_to_paragraph(paragraph):
    """Wrap each single-answer question's answer span in ``[i]`` markers.

    For the question at index ``i`` of ``paragraph['qas']`` the answer span in
    ``paragraph['context']`` is rewritten as ``"[i] <answer> [i]"``. Questions
    flagged ``is_impossible`` and questions that do not have exactly one answer
    get no markers.

    After every insertion the ``answer_start``/``answer_end`` offsets of the
    first answer of all answerable questions are moved so they keep pointing
    at the same text, including answers that are nested in, overlap with, or
    enclose the span that was just wrapped. Only the first answer of a
    question is tracked; further answers of multi-answer questions keep their
    original offsets.

    Args:
        paragraph: Dict with ``context`` (str) and ``qas`` (list); both the
            context and the answer offsets are modified in place.
    """
    for counter, qas in enumerate(paragraph['qas']):
        # Only answerable questions with exactly one answer get markers.
        if qas.get('is_impossible', False) or len(qas['answers']) != 1:
            continue

        special_char = f"[{counter}]"
        # Each marker is inserted together with one separating space.
        pad = len(special_char) + 1

        current = _tracked_answer(qas)
        start = current['answer_start']
        end = current['answer_end']

        context = paragraph['context']
        paragraph['context'] = (
            f"{context[:start]}{special_char} {context[start:end]} {special_char}{context[end:]}"
        )

        # Re-align every other tracked answer. Start and end are shifted
        # independently by the number of insertions located before them, which
        # covers answers before, after, inside, overlapping and around the
        # current span uniformly.
        for other_qas in paragraph['qas']:
            other = _tracked_answer(other_qas)
            if other is None or other is current:
                continue

            start_shift = _marker_shift(other['answer_start'], start, end, pad, is_end=False)
            end_shift = _marker_shift(other['answer_end'], start, end, pad, is_end=True)
            other['answer_start'] += start_shift
            other['answer_end'] += end_shift

        # The current answer now sits right behind its opening marker.
        current['answer_start'] += pad
        current['answer_end'] += pad
| 
 | ||||
| 
 | ||||
def main():
    """Load the test fixture, sort and mark its answers, print it and save the result."""
    load_dotenv()

    squad = load('./squad-test.json')

    # Sorting must run first: it adds the 'answer_end' offsets that the
    # marker insertion reads.
    for step in (sort_qas_by_answer_index, transform_squad, print_squad):
        step(squad)

    save("./squad-test-translated.json", squad)


if __name__ == "__main__":
    main()
| 
 | ||||
| @ -5,42 +5,6 @@ import six | ||||
| from google.cloud import translate_v2 as translate | ||||
| 
 | ||||
| 
 | ||||
def load(filename):
    """Read a JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Explicit UTF-8 keeps decoding of non-ASCII text independent of the
    # platform's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)
| 
 | ||||
| 
 | ||||
def save(filename, squad):
    """Write ``squad`` to ``filename`` as JSON indented by two spaces.

    Args:
        filename: Path of the file to create or overwrite.
        squad: JSON-serializable object to store.

    Raises:
        TypeError: If ``squad`` contains a value that JSON cannot encode.
        OSError: If the file cannot be opened for writing.
    """
    # Build the JSON text before touching the file so that a serialization
    # failure cannot leave a truncated document on disk.
    serialized = json.dumps(squad, indent=2)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)
| 
 | ||||
| 
 | ||||
def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
    """Print the title and paragraph contexts of the first articles.

    Args:
        squad: Decoded SQuAD-style dict.
        article_limit: Maximum number of articles to print.
        paragraph_limit: Maximum number of paragraphs to print per article.
        qas_limit: Currently unused; no questions or answers are printed.
    """
    separator = "=" * 40
    shown_articles = squad['data'][:article_limit]

    for article in shown_articles:
        print(separator)
        print(f"Article title: {article['title']}\n\n")

        shown_paragraphs = article['paragraphs'][:paragraph_limit]
        for paragraph in shown_paragraphs:
            print(f"{paragraph['context']}\n")
| 
 | ||||
| 
 | ||||
| def translate_text(text): | ||||
|     """Translates text into the target language. | ||||
| 
 | ||||
| @ -130,12 +94,13 @@ def add_special_chars_to_paragraph(paragraph): | ||||
| if __name__ == "__main__": | ||||
|     load_dotenv() | ||||
| 
 | ||||
|     squad = load('./squad-v2-dev.json') | ||||
|     with open("./squad-test.json", "r") as f: | ||||
|         squad = json.load(f) | ||||
| 
 | ||||
|     sort_qas_by_answer_index(squad) | ||||
|     transform_squad(squad) | ||||
|     print_squad(squad) | ||||
|     save("./squad-v2-dev-test-out.json", squad) | ||||
| 
 | ||||
|     # translate_text("my name is tomas") | ||||
| 
 | ||||
|     with open("./squad-test-out.json", "w") as f: | ||||
|         json.dump(squad, f, indent=2) | ||||
| 
 | ||||
							
								
								
									
										76
									
								
								squad_utils.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										76
									
								
								squad_utils.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,76 @@ | ||||
| import json | ||||
| 
 | ||||
| 
 | ||||
def calculate_chars(squad):
    """Print element counts and character totals for a SQuAD-style dataset.

    Reports the number of articles, paragraphs, questions and answers, then
    the summed lengths of all contexts, questions and answer texts, and
    finally the sum of those three character totals.

    Args:
        squad: Decoded SQuAD-style dict with a top-level ``data`` list.
    """
    articles = squad['data']
    print(f"total articles: {len(articles)}")

    # Flatten the nesting one level at a time so each total is a single pass.
    paragraphs = [p for article in articles for p in article['paragraphs']]
    questions = [q for p in paragraphs for q in p['qas']]
    answers = [a for q in questions for a in q['answers']]

    context_chars = sum(len(p['context']) for p in paragraphs)
    question_chars = sum(len(q['question']) for q in questions)
    answer_chars = sum(len(a['text']) for a in answers)

    print(f"total paragraphs: {len(paragraphs)}")
    print(f"total qas: {len(questions)}")
    print(f"total answers: {len(answers)}")

    print(f"chars in contexts: {context_chars}")
    print(f"chars in questions: {question_chars}")
    print(f"chars in answers: {answer_chars}")

    print(f"total chars: {context_chars + question_chars + answer_chars}")
| 
 | ||||
| 
 | ||||
def print_squad(squad, article_limit=100, paragraph_limit=100, qas_limit=100):
    """Print a separator, the title and the paragraph contexts of each article.

    Args:
        squad: Decoded SQuAD-style dict.
        article_limit: Only the first ``article_limit`` articles are printed.
        paragraph_limit: Only the first ``paragraph_limit`` paragraphs of each
            article are printed.
        qas_limit: Accepted but not used; questions and answers are not printed.
    """
    rule = "=" * 40

    for article in squad['data'][:article_limit]:
        print(rule)
        print(f"Article title: {article['title']}\n\n")

        contexts = (p['context'] for p in article['paragraphs'][:paragraph_limit])
        for context in contexts:
            print(f"{context}\n")
| 
 | ||||
| 
 | ||||
if __name__ == "__main__":
    # Print the statistics for the SQuAD v2 dev set when run as a script.
    with open("squad-v2-dev.json", "r", encoding="utf-8") as dev_file:
        squad = json.load(dev_file)

    calculate_chars(squad)
| 
 | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user