# zpwiki/pages/students/2016/jakub_maruniak/dp2021/annotation/scripts.py

"""
Usage example:
$ python scripts.py delete_annot jakub.maruniak ./dataset.jsonl ./new_dataset.jsonl

To see available commands:
$ python scripts.py help

To see available arguments:
$ python scripts.py [command] --help
"""
import spacy
import srsly
from wasabi import msg
import plac
import sys
import re
import itertools

@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def count(
    dataset_path
):
    """
    Print statistics about a Prodigy JSONL dataset.

    Prints the number of accepted, rejected and ignored articles, the
    number of annotations of each entity type (PER, LOC, ORG, MISC) and
    how many articles carry each "_session_id" value.

    All figures are raw substring counts over the whole file, not parsed
    JSON fields, so an article whose text itself contains e.g. "accept"
    or "PER" inflates the corresponding number.

    dataset_path: path of the dataset file; it is read as UTF-8.
    """
    from collections import Counter

    # Read the file named by the argument (not sys.argv) so the function
    # also works when called directly; the context manager closes the handle.
    with open(dataset_path, 'rt', encoding='utf-8') as dataset:
        text = dataset.read()

    # count articles
    accepted = text.count('accept')
    rejected = text.count('reject')
    skipped = text.count('ignore')
    # every record is expected to contain the key "tokens" once, so this
    # serves as the total number of records
    total = text.count('tokens')
    # count entities
    per_count = text.count('PER')
    loc_count = text.count('LOC')
    org_count = text.count('ORG')
    misc_count = text.count('MISC')

    # ANSI escape sequences used to style the terminal output
    underline = '\033[04m'
    reset = '\033[0m'
    red = '\033[31m'
    green = '\033[32m'
    gray = '\033[37m'

    print(underline + '\nPočet anotovaných článkov:' + reset)
    print(green + "%-15s %-20s" %("Prijatých", accepted) + reset)
    print(red + "%-15s %-15s" %("Zamietnutých", rejected) + reset)
    print(gray + "%-15s %-15s" %("Preskočených", skipped) + reset)
    print("%-15s" %("---------------------"))
    print("%-15s %-15s" %("Spolu", total))

    print(underline + '\nPočet jednotlivých entít:' + reset)
    print("%-10s %-10s" %("Entita:", "Počet:"))
    print("%-10s" %("----------------"))
    print("%-10s %-10s" %("PER", per_count))
    print("%-10s %-10s" %("LOC", loc_count))
    print("%-10s %-10s" %("ORG", org_count))
    print("%-10s %-10s" %("MISC", misc_count))

    # Who annotated how much?  The captured value is the raw text between
    # '"_session_id":' and the next comma, i.e. it keeps its JSON quotes.
    # Counter preserves first-seen order, so sessions are listed in the
    # order they first appear in the file.
    per_session = Counter(re.findall('"_session_id":(.*?),', text))
    print(underline + '\nKto anotoval koľko článkov?' + reset)
    for session, articles in per_session.items():
        print(session, articles)


@plac.annotations(
    annotator=("Keep annotations from this annotator (email address or nickname)", "positional", None, str),
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
    new_dataset_path=("Path to save new dataset(Prodigy JSONL format)", "positional", None, str),
)
def delete_annot(
    annotator, dataset_path, new_dataset_path
):
    """
    Load Prodigy JSONL dataset,
    and keep annotations only from one annotator.

    Every line of dataset_path that contains annotator is echoed to
    stdout and written unchanged to new_dataset_path; all other lines
    are dropped. Both files are handled as UTF-8.

    annotator: text identifying the annotator (email address or
        nickname). It is matched as a literal substring, so characters
        such as "." or "+" carry no regular-expression meaning.
    dataset_path: path of the dataset to read.
    new_dataset_path: path of the dataset to create (overwritten if it
        already exists).
    """
    # Use the function arguments (not sys.argv) so the function also works
    # when called directly; the context managers close both files even if
    # reading or writing fails part-way.
    with open(dataset_path, 'r', encoding='utf-8') as source, \
            open(new_dataset_path, 'w', encoding='utf-8') as target:
        for line in source:
            if annotator in line:
                print(line)
                target.write(line)

def modelinfo():
    """
    Print lines 32 to 54 of the trained model's meta.json.

    The file is read from build/train/nerposparser/model-best/meta.json,
    relative to the current working directory. That line window is
    presumably where the Precision, Recall and F-Score values sit; this
    depends on the layout of the file.
    """
    first_index = 31  # zero-based index of the first line shown
    stop_index = 54   # zero-based index of the first line no longer shown
    with open('build/train/nerposparser/model-best/meta.json', 'rt') as meta:
        for index, row in enumerate(meta):
            if index >= stop_index:
                break
            if index >= first_index:
                # each row keeps its own newline; a space is appended after it
                print(row, end =" ")

def helpme():
    """
    Print the list of available commands with a short description of each.
    """
    overview = [
        "Available commands:",
        "\ncount - Print statistics about Prodigy JSONL dataset",
        "\ndelete_annot - Create dataset with annotations from only specific annotator",
        "\nmodelinfo - Prints informations about trained model (Precision, Recall and F-Score)",
    ]
    # Joined with single spaces, exactly as print() separates its arguments.
    print(" ".join(overview))


if __name__ == "__main__":
    # Sub-command name -> function implementing it.
    opts = {"count": count,
            "delete_annot": delete_annot,
            "modelinfo": modelinfo,
            "help": helpme}
    # Running the script without a sub-command would make sys.argv.pop(1)
    # raise IndexError; report it like an unknown command instead.
    if len(sys.argv) < 2:
        msg.fail("No command given", f"Available: {', '.join(opts)}", exits=1)
    # Remove the sub-command from sys.argv so that plac only sees the
    # arguments that belong to the selected function.
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)