forked from KEMT/zpwiki
		
	
		
			
				
	
	
		
			160 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			160 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""
 | 
						|
Usage example:
$ python scripts.py delete_annot jakub.maruniak ./dataset.jsonl ./new_dataset.jsonl

To see available commands:
$ python scripts.py help

To see available arguments:
$ python scripts.py [command] --help
 | 
						|
"""
 | 
						|
import spacy
 | 
						|
import srsly
 | 
						|
from wasabi import msg
 | 
						|
import plac
 | 
						|
import sys
 | 
						|
import re
 | 
						|
import itertools
 | 
						|
 | 
						|
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def count(
    dataset_path
):
    """
    Print statistics about Prodigy JSONL dataset.
    Prints number of accepted, rejected and ignored articles.
    Prints number of annotations of each entity type.
    Prints how many annotations were made by each annotator.
    """
    # Use the parsed argument (not sys.argv) so the function also works when
    # called directly; the context manager guarantees the file is closed.
    with open(dataset_path, 'rt', encoding='utf-8') as dataset:
        text = dataset.read()

    # NOTE: every figure below is a raw substring count over the whole file,
    # so the same word occurring inside an article's text is counted too.

    # count articles
    count_accept = text.count('accept')
    count_reject = text.count('reject')
    count_skip = text.count('ignore')
    # The number of 'tokens' occurrences is reported as the total.
    count_spans = text.count('tokens')

    # count entities
    count_per = text.count('PER')
    count_loc = text.count('LOC')
    count_org = text.count('ORG')
    count_misc = text.count('MISC')

    # ANSI escape sequences used to style the terminal output
    underline = '\033[04m'
    reset = '\033[0m'
    red = '\033[31m'
    green = '\033[32m'
    gray = '\033[37m'

    # article summary
    print(underline + '\nPočet anotovaných článkov:' + reset)
    print(green + "%-15s %-20s" % ("Prijatých", count_accept) + reset)
    print(red + "%-15s %-15s" % ("Zamietnutých", count_reject) + reset)
    print(gray + "%-15s %-15s" % ("Preskočených", count_skip) + reset)
    print("%-15s" % ("---------------------"))
    print("%-15s %-15s" % ("Spolu", count_spans))

    # entity summary
    print(underline + '\nPočet jednotlivých entít:' + reset)
    print("%-10s %-10s" % ("Entita:", "Počet:"))
    print("%-10s" % ("----------------"))
    print("%-10s %-10s" % ("PER", count_per))
    print("%-10s %-10s" % ("LOC", count_loc))
    print("%-10s %-10s" % ("ORG", count_org))
    print("%-10s %-10s" % ("MISC", count_misc))

    # Who annotated how much? Tally the raw text captured after each
    # "_session_id": key up to the next comma, in first-seen order.
    per_session = {}
    for session_id in re.findall('"_session_id":(.*?),', text):
        per_session[session_id] = per_session.get(session_id, 0) + 1

    print(underline + '\nKto anotoval koľko článkov?' + reset)
    for session_id, annotated in per_session.items():
        print(session_id, annotated)
 | 
						|
 | 
						|
 | 
						|
@plac.annotations(
    annotator=("Keep annotations from this annotator (email address or nickname)", "positional", None, str),
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
    new_dataset_path=("Path to save new dataset(Prodigy JSONL format)", "positional", None, str),
)
def delete_annot(
    annotator, dataset_path, new_dataset_path
):
    """
    Load Prodigy JSONL dataset,
    and keep annotations only from one annotator.
    """
    # Work from the parsed arguments (not sys.argv) so the function can also
    # be called directly; both files are closed even if an error occurs.
    with open(dataset_path, 'r', encoding='utf-8') as source, \
            open(new_dataset_path, 'w', encoding='utf-8') as target:
        for line in source:
            # Literal substring match: the annotator is a name or e-mail
            # address, so characters such as '.' or '+' must not be treated
            # as regular-expression metacharacters.
            if annotator in line:
                # Echo every kept record to the terminal as well.
                print(line)
                target.write(line)
 | 
						|
 | 
						|
def modelinfo(
    meta_path='build/train/nerposparser/model-best/meta.json'
):
    """
    Print information about trained model (Precision, Recall and F-Score)

    meta_path: path to the model's meta.json; defaults to the best model
    of the nerposparser training run.
    """
    # Explicit UTF-8 keeps reading independent of the platform's default
    # encoding, consistent with the other commands in this script.
    with open(meta_path, 'rt', encoding='utf-8') as f:
        # Only lines 32-54 (1-based) of the file are printed; presumably this
        # is where the scores sit in the generated meta.json - TODO confirm.
        for line in itertools.islice(f, 31, 54):
            print(line, end=" ")
 | 
						|
 | 
						|
def helpme(
):
    """Print the available commands, each with a one-line description."""
    overview = (
        "Available commands:",
        "\ncount - Print statistics about Prodigy JSONL dataset",
        "\ndelete_annot - Create dataset with annotations from only specific annotator",
        "\nmodelinfo - Prints informations about trained model (Precision, Recall and F-Score)",
    )
    # Entries are joined with a single space, matching print()'s default
    # separator between positional arguments.
    print(" ".join(overview))
 | 
						|
 | 
						|
 | 
						|
if __name__ == "__main__":
    # Map each command name accepted on the command line to its handler.
    opts = {"count": count,
            "delete_annot": delete_annot,
            "modelinfo": modelinfo,
            "help": helpme}
    # Without a command, sys.argv.pop(1) would raise IndexError; report the
    # problem the same way as an unknown command instead.
    if len(sys.argv) < 2:
        msg.fail("No command given", f"Available: {', '.join(opts)}", exits=1)
    # Remove the command so plac only sees the handler's own arguments.
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)