forked from KEMT/zpwiki
160 lines
5.0 KiB
Python
160 lines
5.0 KiB
Python
"""
Usage example:
$ python scripts.py delete_annot jakub.maruniak ./dataset.jsonl ./new_dataset.jsonl

To see available commands:
$ python scripts.py help

To see available arguments:
$ python scripts.py [command] --help
"""
|
|
import itertools
import re
import sys
from collections import Counter

import plac
import spacy
import srsly
from wasabi import msg
|
|
|
|
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def count(
    dataset_path
):
    """
    Print statistics about Prodigy JSONL dataset.

    Prints number of accepted, rejected and ignored articles.
    Prints number of annotations of each entity type.
    Prints how many annotations were made by each annotator.

    All numbers are raw substring counts over the whole file text (the
    JSON is not parsed), so a keyword that also occurs inside an article
    text is counted as well.

    dataset_path: path of the dataset file; it is read as UTF-8 text.
    """
    # Read from the argument rather than sys.argv so the function also works
    # when called directly; the context manager closes the file afterwards.
    with open(dataset_path, 'rt', encoding='utf-8') as dataset:
        text = dataset.read()

    # count articles
    accepted = text.count('accept')
    rejected = text.count('reject')
    skipped = text.count('ignore')
    # every occurrence of 'tokens' is reported as the overall total
    total = text.count('tokens')

    # count entities
    entity_totals = [(label, text.count(label)) for label in ('PER', 'LOC', 'ORG', 'MISC')]

    # ANSI escape sequences used to decorate the terminal output
    underline = '\033[04m'
    reset = '\033[0m'
    red = '\033[31m'
    green = '\033[32m'
    gray = '\033[37m'

    # number of annotated articles
    print(underline + '\nPočet anotovaných článkov:' + reset)
    print(green + f"{'Prijatých':<15} {accepted:<20}" + reset)
    print(red + f"{'Zamietnutých':<15} {rejected:<15}" + reset)
    print(gray + f"{'Preskočených':<15} {skipped:<15}" + reset)
    print(f"{'---------------------':<15}")
    print(f"{'Spolu':<15} {total:<15}")

    # number of annotations per entity type
    print(underline + '\nPočet jednotlivých entít:' + reset)
    print(f"{'Entita:':<10} {'Počet:':<10}")
    print(f"{'----------------':<10}")
    for label, label_total in entity_totals:
        print(f"{label:<10} {label_total:<10}")

    # Who annotated how much? The text captured after every "_session_id":
    # key (up to the next comma) is used as the annotator identifier; Counter
    # keeps the identifiers in first-seen order.
    sessions = Counter(re.findall(r'"_session_id":(.*?),', text))
    print(underline + '\nKto anotoval koľko článkov?' + reset)
    for session, session_total in sessions.items():
        print(session, session_total)
|
|
|
|
|
|
@plac.annotations(
    annotator=("Keep annotations from this annotator (email address or nickname)", "positional", None, str),
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
    new_dataset_path=("Path to save new dataset(Prodigy JSONL format)", "positional", None, str),
)
def delete_annot(
    annotator, dataset_path, new_dataset_path
):
    """
    Load Prodigy JSONL dataset,
    and keep annotations only from one annotator.

    Every line of the input file that contains the annotator name is
    printed and copied unchanged to the output file; all other lines are
    dropped.

    annotator: text to look for in each line; matched literally, so
        characters such as "." or "+" in an email address are not
        interpreted as regular-expression syntax.
    dataset_path: path of the dataset to read (UTF-8).
    new_dataset_path: path of the dataset to write (UTF-8); an existing
        file is overwritten.
    """
    # Use the function arguments (not sys.argv) so the function also works
    # when called directly; the context managers close both files even if
    # reading or writing fails half-way.
    with open(dataset_path, 'r', encoding='utf-8') as source, \
            open(new_dataset_path, 'w', encoding='utf-8') as target:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in source:
            if annotator in line:
                print(line)
                target.write(line)
|
|
|
|
def modelinfo(
):
    """
    Print information about trained model (Precision, Recall and F-Score)

    Echoes a fixed window of lines (zero-based 31 to 53) from the
    model's meta.json; the scores are expected to be stored on those lines.
    """
    meta_path = 'build/train/nerposparser/model-best/meta.json'
    first_line, stop_line = 31, 54
    with open(meta_path, 'rt') as meta_file:
        for position, row in enumerate(meta_file):
            # past the end of the window: nothing more to show
            if position >= stop_line:
                break
            # still before the window: skip
            if position < first_line:
                continue
            # each row keeps its own newline; a space is appended after it
            print(row, end=" ")
|
|
|
|
def helpme(
):
    """
    Print the list of available commands with a short description of each.
    """
    overview = (
        "Available commands:",
        "\ncount - Print statistics about Prodigy JSONL dataset",
        "\ndelete_annot - Create dataset with annotations from only specific annotator",
        "\nmodelinfo - Prints informations about trained model (Precision, Recall and F-Score)",
    )
    # joined with a single space, exactly like print() does with several arguments
    print(" ".join(overview))
|
|
|
|
|
|
if __name__ == "__main__":
    # Map every sub-command name accepted on the command line to the
    # function that implements it.
    opts = {
        "count": count,
        "delete_annot": delete_annot,
        "modelinfo": modelinfo,
        "help": helpme,
    }
    # Without a command there is nothing to pop from sys.argv; report the
    # problem instead of crashing with an IndexError.
    if len(sys.argv) < 2:
        msg.fail("Missing command", f"Available: {', '.join(opts)}", exits=1)
    # Remove the command name from sys.argv so that the remaining arguments
    # are the ones belonging to the selected command.
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)