forked from KEMT/zpwiki
		
	
		
			
				
	
	
		
			160 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			160 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| Usage example:
 | |
| $ python scripts.py delete_annot jakub.maruniak ./dataset.jsonl ./new_dataset.jsonl
 | |
| 
 | |
| To see available commands:
 | |
| $ python scripts.py help 
 | |
| 
 | |
| To see available arguments:
 | |
| $ python scripts.py [command] --help 
 | |
| """
 | |
| import spacy
 | |
| import srsly
 | |
| from wasabi import msg
 | |
| import plac
 | |
| import sys
 | |
| import re
 | |
| import itertools
 | |
| 
 | |
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def count(
    dataset_path
):
    """
    Print statistics about Prodigy JSONL dataset.
    Prints number of accepted, rejected and ignored articles.
    Prints number of annotations of each entity type.
    Prints how much annotations were made by each annotator.

    All numbers are plain substring counts over the raw file text, so a
    keyword that also occurs inside an article body is counted as well.
    """
    # Local import keeps this function self-contained without touching the
    # module-level import block.
    from collections import Counter

    # Read from the path passed as an argument (not from sys.argv) so the
    # function also works when called directly, and close the file reliably.
    with open(dataset_path, 'rt', encoding='utf-8') as dataset_file:
        text = dataset_file.read()

    # count articles
    countAccept = text.count('accept')
    countReject = text.count('reject')
    countSkip = text.count('ignore')
    countSpans = text.count('tokens')
    # count entities
    countPER = text.count('PER')
    countLOC = text.count('LOC')
    countORG = text.count('ORG')
    countMISC = text.count('MISC')

    # ANSI escape sequences used to style the terminal output.
    underline = '\033[04m'
    reset = '\033[0m'
    red = '\033[31m'
    green = '\033[32m'
    gray = '\033[37m'

    # Article statistics.
    print(underline + '\nPočet anotovaných článkov:' + reset)
    print(green + "%-15s %-20s" %("Prijatých", countAccept) + reset)
    print(red + "%-15s %-15s" %("Zamietnutých", countReject) + reset)
    print(gray + "%-15s %-15s" %("Preskočených", countSkip) + reset)
    print("%-15s" %("---------------------"))
    print("%-15s %-15s" %("Spolu", countSpans))

    # Entity statistics.
    print(underline + '\nPočet jednotlivých entít:' + reset)
    print("%-10s %-10s" %("Entita:", "Počet:"))
    print("%-10s" %("----------------"))
    print("%-10s %-10s" %("PER", countPER))
    print("%-10s %-10s" %("LOC", countLOC))
    print("%-10s %-10s" %("ORG", countORG))
    print("%-10s %-10s" %("MISC", countMISC))

    # Who annotated how much: tally every captured "_session_id" value.
    # Counter keeps first-seen order, so rows print in order of appearance.
    session_ids = re.findall(r'"_session_id":(.*?),', text)
    frequency = Counter(session_ids)
    print(underline + '\nKto anotoval koľko článkov?' + reset)
    for session_id, annotated in frequency.items():
        print(session_id, annotated)
 | |
| 
 | |
| 
 | |
@plac.annotations(
    annotator=("Keep annotations from this annotator (email address or nickname)", "positional", None, str),
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
    new_dataset_path=("Path to save new dataset(Prodigy JSONL format)", "positional", None, str),
)
def delete_annot(
    annotator, dataset_path, new_dataset_path
):
    """
    Load Prodigy JSONL dataset,
    and keep annotations only from one annotator.

    Every line of dataset_path that contains the annotator string is printed
    and written to new_dataset_path; all other lines are dropped.
    """
    # Use the function arguments rather than sys.argv so the function behaves
    # the same whether it is called directly or through the command line.
    # The with-statement closes both files even if reading or writing fails.
    with open(dataset_path, 'r', encoding='utf-8') as source, \
            open(new_dataset_path, 'w', encoding='utf-8') as target:
        # Stream line by line instead of loading the whole dataset in memory.
        for line in source:
            # Literal substring match: the annotator is a name or e-mail
            # address, so characters such as '.' must not act as regex
            # wildcards.
            if annotator in line:
                print(line)
                target.write(line)
 | |
| 
 | |
def modelinfo(
):
    """
    Print information about trained model (Precision, Recall and F-Score)

    Reads build/train/nerposparser/model-best/meta.json relative to the
    current working directory and prints a fixed slice of its lines.
    """
    # Explicit UTF-8 keeps the read independent of the platform's default
    # encoding, matching the other file accesses in this script.
    with open('build/train/nerposparser/model-best/meta.json', 'rt', encoding='utf-8') as f:
        # islice(f, 31, 54) yields the lines at 0-based indices 31..53.
        for line in itertools.islice(f, 31, 54):
            # Each line keeps its own newline; end=" " adds one space after it.
            print(line, end =" ")
 | |
| 
 | |
def helpme(
):
    # Print an overview of the available commands, one per line.
    # Every entry after the header starts with its own newline; joining with
    # a single space reproduces print()'s default separator between arguments.
    overview = (
        "Available commands:",
        "\ncount - Print statistics about Prodigy JSONL dataset",
        "\ndelete_annot - Create dataset with annotations from only specific annotator",
        "\nmodelinfo - Prints informations about trained model (Precision, Recall and F-Score)",
    )
    print(" ".join(overview))
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # Map each sub-command name to the function that implements it.
    opts = {"count": count,
            "delete_annot": delete_annot,
            "modelinfo": modelinfo,
            "help": helpme}
    # Without a sub-command sys.argv.pop(1) would raise IndexError; report
    # the available commands instead.
    if len(sys.argv) < 2:
        msg.fail("No command given", f"Available: {', '.join(opts)}", exits=1)
    # Remove the sub-command so plac only sees that command's own arguments.
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)