Jakub Maruniak 2021-03-21 20:27:06 +00:00
"""Scripts used for training and evaluation of NER models
Usage example:
$ python <script>.py train ./model ./train.jsonl ./eval.jsonl -o ./output_dir -n 15
"""
import spacy
from spacy.cli.train import _load_pretrained_tok2vec
from timeit import default_timer as timer
from pathlib import Path
import srsly
from wasabi import msg
import random
import plac
import sys
import tqdm
def format_data(data):
    """Convert Prodigy annotation tasks into (text, annotations) pairs.

    Only tasks whose ``"answer"`` is ``"accept"`` are kept; rejected,
    ignored, or unanswered tasks are skipped.

    Args:
        data: Iterable of Prodigy task dicts. Each accepted task must have a
            ``"text"`` key and may have a ``"spans"`` list whose items carry
            ``"start"``, ``"end"`` and ``"label"``.

    Returns:
        A ``(examples, labels)`` tuple. ``examples`` is a list of
        ``(text, {"entities": [(start, end, label), ...]})`` tuples and
        ``labels`` is the set of all labels seen in accepted tasks.
    """
    result = []
    labels = set()
    for eg in data:
        # Skip everything that was not explicitly accepted by the annotator.
        if eg.get("answer") != "accept":
            continue
        ents = [(s["start"], s["end"], s["label"]) for s in eg.get("spans", [])]
        labels.update(label for _, _, label in ents)
        result.append((eg["text"], {"entities": ents}))
    return result, labels
@plac.annotations(
    model=("The base model to load or blank:lang", "positional", None, str),
    train_path=("The training data (Prodigy JSONL)", "positional", None, str),
    eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str),
    n_iter=("Number of iterations", "option", "n", int),
    output=("Optional output directory", "option", "o", str),
    tok2vec=("Pretrained tok2vec weights to initialize model", "option", "t2v", str),
)
def train_model(
    model, train_path, eval_path, n_iter=10, output=None, tok2vec=None,
):
    """
    Train a model from Prodigy annotations and optionally save out the best
    model to disk.

    Args:
        model: Name/path of a model to load, or ``blank:<lang>`` to start
            from a blank pipeline of that language.
        train_path: Path to the training data (Prodigy JSONL).
        eval_path: Path to the evaluation data (Prodigy JSONL).
        n_iter: Number of passes over the training data.
        output: Directory the best-scoring model is written to; nothing is
            saved when omitted.
        tok2vec: Path to pretrained tok2vec weights; when given, the NER
            component is initialised with the matching tok2vec settings.
    """
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    # Register every label seen in the training data on a fresh NER pipe.
    ner = nlp.create_pipe("ner")
    for label in labels:
        ner.add_label(label)
    nlp.add_pipe(ner)
    t2v_cfg = {
        "embed_rows": 10000,
        "token_vector_width": 128,
        "conv_depth": 8,
        "nr_feature_tokens": 3,
    }
    optimizer = nlp.begin_training(component_cfg={"ner": t2v_cfg} if tok2vec else {})
    if tok2vec:
        _load_pretrained_tok2vec(nlp, Path(tok2vec))
    batch_size = spacy.util.compounding(1.0, 32.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8, 8, 8)
    msg.row(("#", "L", "P", "R", "F"), widths=row_widths)
    for i in range(n_iter):
        # Present the examples in a different order on every pass.
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annots = zip(*batch)
            nlp.update(texts, annots, drop=0.2, losses=losses)
        # Evaluate (and snapshot) with the averaged parameters swapped in.
        with nlp.use_params(optimizer.averages):
            sc = nlp.evaluate(eval_data)
            if sc.ents_f > best_acc:
                best_acc = sc.ents_f
                if output:
                    best_model = nlp.to_bytes()
        acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}")
        # .get() keeps the row printable when no batch was processed.
        msg.row((i + 1, f"{losses.get('ner', 0.0):.2f}", *acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            # Restore the best snapshot before writing it to disk.
            nlp.from_bytes(best_model)
            nlp.to_disk(output)
        msg.good("Saved model", output)
@plac.annotations(
    model=("The model to evaluate", "positional", None, str),
    eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str),
)
def evaluate_model(model, eval_path):
    """
    Evaluate a trained model on Prodigy annotations and print the accuracy.

    Args:
        model: Name or path of the model to load.
        eval_path: Path to the evaluation data (Prodigy JSONL).
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    data, _ = format_data(srsly.read_jsonl(eval_path))
    sc = nlp.evaluate(data)
    result = [
        ("Precision", f"{sc.ents_p:.3f}"),
        ("Recall", f"{sc.ents_r:.3f}"),
        ("F-Score", f"{sc.ents_f:.3f}"),
    ]
    msg.table(result)
@plac.annotations(
    model=("The model to evaluate", "positional", None, str),
    data=("Raw data as JSONL", "positional", None, str),
)
def wps(model, data):
    """
    Measure the processing speed in words per second. It's recommended to
    use a larger corpus of raw text here (e.g. a few million words).

    Args:
        model: Name or path of the model to load.
        data: Path to a JSONL file whose records have a ``"text"`` field.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    texts = (eg["text"] for eg in srsly.read_jsonl(data))
    n_docs = 0
    n_words = 0
    start_time = timer()
    for doc in nlp.pipe(texts):
        n_docs += 1
        n_words += len(doc)
    elapsed = timer() - start_time
    # Named differently from the function, and guarded against a zero interval.
    words_per_second = int(n_words / elapsed) if elapsed > 0 else 0
    result = [
        ("Docs", f"{n_docs:,}"),
        ("Words", f"{n_words:,}"),
        ("Words/s", f"{words_per_second:,}"),
    ]
    msg.table(result, widths=(7, 12), aligns=("l", "r"))
if __name__ == "__main__":
    # Map the first CLI argument to the command implementing it.
    opts = {"train": train_model, "evaluate": evaluate_model, "wps": wps}
    # Remove the command name so plac only sees the command's own arguments;
    # a missing command is reported the same way as an unknown one.
    cmd = sys.argv.pop(1) if len(sys.argv) > 1 else None
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)

"""
Usage example:
$ python <script>.py delete_annot jakub.maruniak ./dataset.jsonl ./new_dataset.jsonl
To see available commands:
$ python <script>.py help
To see available arguments:
$ python <script>.py [command] --help
"""
import spacy
import srsly
from wasabi import msg
import plac
import sys
import re
import itertools
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def count(dataset_path):
    """
    Print statistics about Prodigy JSONL dataset.
    Prints number of accepted, rejected and ignored articles.
    Prints number of annotations of each entity type.
    Prints how much annotations were made by each annotator.

    All figures are raw substring counts over the file's text (e.g. every
    occurrence of ``accept``), not counts of parsed JSON fields, so article
    bodies containing those substrings inflate the numbers.

    Args:
        dataset_path: Path to the dataset (Prodigy JSONL format).
    """
    from collections import Counter

    # Load the whole dataset as text; the context manager closes the file.
    with open(dataset_path, 'rt', encoding='utf-8') as file:
        text = file.read()

    # Count articles.
    count_accept = text.count('accept')
    count_reject = text.count('reject')
    count_skip = text.count('ignore')
    count_spans = text.count('tokens')

    # Count entities.
    count_per = text.count('PER')
    count_loc = text.count('LOC')
    count_org = text.count('ORG')
    count_misc = text.count('MISC')

    # ANSI escape sequences used to style the terminal output.
    underline = '\033[04m'
    reset = '\033[0m'
    red = '\033[31m'
    # NOTE(review): green/gray were used but not defined in this block; the
    # standard ANSI codes are assumed here - confirm the intended shades.
    green = '\033[32m'
    gray = '\033[90m'

    print(underline + '\nPočet anotovaných článkov:' + reset)
    print(green + "%-15s %-20s" %("Prijatých", count_accept) + reset)
    print(red + "%-15s %-15s" %("Zamietnutých", count_reject) + reset)
    print(gray + "%-15s %-15s" %("Preskočených", count_skip) + reset)
    print("%-15s" %("---------------------"))
    print("%-15s %-15s" %("Spolu", count_spans))

    print(underline + '\nPočet jednotlivých entít:' + reset)
    print("%-10s %-10s" %("Entita:", "Počet:"))
    print("%-10s" %("----------------"))
    print("%-10s %-10s" %("PER", count_per))
    print("%-10s %-10s" %("LOC", count_loc))
    print("%-10s %-10s" %("ORG", count_org))
    print("%-10s %-10s" %("MISC", count_misc))

    # Who annotated how much? Tally the raw "_session_id" values.
    sessions = re.findall(r'"_session_id":(.*?),', text)
    frequency = Counter(sessions)

    print(underline + '\nKto anotoval koľko článkov?' + reset)
    for annotator, n_articles in frequency.items():
        print(annotator, n_articles)
@plac.annotations(
    annotator=("Keep annotations from this annotator (email address or nickname)", "positional", None, str),
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
    new_dataset_path=("Path to save new dataset(Prodigy JSONL format)", "positional", None, str),
)
def delete_annot(annotator, dataset_path, new_dataset_path):
    """
    Load Prodigy JSONL dataset,
    and keep annotations only from one annotator.

    A line is kept when it contains ``annotator`` as a literal substring, so
    characters such as ``.`` in an email address are not treated as regex
    wildcards.

    Args:
        annotator: Email address or nickname identifying the annotator.
        dataset_path: Path to the source dataset (Prodigy JSONL format).
        new_dataset_path: Path the filtered dataset is written to.
    """
    # Both files are closed (and the output flushed) when the block exits.
    with open(dataset_path, 'r', encoding='utf-8') as source, \
            open(new_dataset_path, 'w', encoding='utf-8') as target:
        target.writelines(line for line in source if annotator in line)
def modelinfo():
    """
    Print information about trained model (Precision, Recall and F-Score)

    Prints lines 32-54 of the model's ``meta.json`` verbatim.
    NOTE(review): the fixed line window presumably matches where the accuracy
    section sits in the current meta.json - confirm whenever the file layout
    changes.
    """
    with open('build/train/nerposparser/model-best/meta.json', 'rt', encoding='utf-8') as f:
        # islice indices are 0-based and end-exclusive: lines 32..54 of the file.
        for line in itertools.islice(f, 31, 54):
            print(line, end =" ")
def helpme():
    """Print the list of available commands with a one-line description each."""
    print("Available commands:",
        "\ncount - Print statistics about Prodigy JSONL dataset",
        "\ndelete_annot - Create dataset with annotations from only specific annotator",
        "\nmodelinfo - Prints informations about trained model (Precision, Recall and F-Score)")
if __name__ == "__main__":
    # Map the first CLI argument to the command implementing it.
    opts = {"count": count,
        "delete_annot": delete_annot,
        "modelinfo": modelinfo,
        "help": helpme}
    # Remove the command name so plac only sees the command's own arguments;
    # a missing command is reported the same way as an unknown one.
    cmd = sys.argv.pop(1) if len(sys.argv) > 1 else None
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)

set -e
# TRAINDIR and OUTDIR are not assigned in this script; fail early with a
# clear message instead of letting mkdir/rm run with empty arguments.
: "${TRAINDIR:?TRAINDIR must be set}"
: "${OUTDIR:?OUTDIR must be set}"
# Create working directories
mkdir -p "$TRAINDIR"
mkdir -p "$OUTDIR"
mkdir -p dist
# Delete old training results (":?" guards against expanding to "/*")
rm -rf "${OUTDIR:?}"/*
# Train dependency and POS
spacy train sk "$OUTDIR" ./build/input/slovak-treebank ./build/input/ud-artificial-gapping --n-iter 15 -p tagger,parser
rm -rf "$TRAINDIR/posparser"
mv "$OUTDIR/model-best" "$TRAINDIR/posparser"
# Train NER
# custom script for training, but it takes too long... input is JSONL file (db from Prodigy)
# python train ./build/train/posparser ./train.jsonl ./eval.jsonl -o ./build/train/nerposparser -n 15
spacy train sk "$TRAINDIR/nerposparser" ./ner/experiments/34sknerfull.json ./ner/experiments/34wikiartfull.json --n-iter 15 -p ner
# Package model
spacy package "$TRAINDIR/nerposparser" dist --meta-path ./meta.json --force
cd dist/sk_sk1-0.2.0
# "spacy package" generates a setup.py in the package directory.
python ./setup.py sdist --dist-dir ../

"""
Visualize the data with Streamlit and spaCy.
Usage example:
$ streamlit run <script>.py visualize ./dataset.jsonl
"""
import streamlit as st
from spacy import displacy
import srsly
import sys
import plac
from wasabi import msg
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(dataset_path):
    """Render accepted Prodigy annotations with displaCy inside Streamlit.

    Each accepted example is drawn with its entity spans in the main pane;
    dataset statistics are shown in the sidebar. Every example whose answer
    is not ``"accept"`` is counted as rejected and not rendered.

    Args:
        dataset_path: Path to the dataset (Prodigy JSONL format).
    """
    FILES = [dataset_path]
    HTML_WRAPPER = "<div style='border-bottom: 1px solid #ccc; padding: 20px 0'>{}</div>"
    HTML_WRAPPER1 = "<div style='border-bottom: 2px solid #000; padding: 0 0 20px 0'>{}</div>"
    # displaCy colour overrides are keyed by the entity label string.
    SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {"MISC": "#d1bcff"}}}

    def load_data(filepath):
        """Read every record of the JSONL file into a list."""
        return list(srsly.read_jsonl(filepath))

    st.sidebar.title("Data visualizer")
    st.sidebar.markdown(
        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
        "and view stats about the datasets."
    )
    data_file = st.sidebar.selectbox("Dataset", FILES)
    data = load_data(data_file)
    n_no_ents = 0
    n_total_ents = 0
    accepted = 0
    rejected = 0
    st.header(f"Dataset: {data_file} ({len(data)})")
    st.markdown(HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."), unsafe_allow_html=True)
    for eg in data:
        if eg.get("answer") != "accept":
            # Not rendered: the page shows accepted examples only.
            rejected += 1
            continue
        accepted += 1
        row = {"text": eg["text"], "ents": eg.get("spans", [])}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1
        html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
        st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    # Built line by line so the Markdown table starts at column 0.
    stats_table = "\n".join(
        [
            f"| `{data_file}` | |",
            "| --- | ---: |",
            f"| Total examples | {len(data):,} |",
            f"| Accepted examples | {accepted:,} |",
            f"| Rejected examples | {rejected:,} |",
            f"| Total entities | {n_total_ents:,} |",
            f"| Examples with no entities | {n_no_ents:,} |",
        ]
    )
    st.sidebar.markdown(stats_table, unsafe_allow_html=True)
if __name__ == "__main__":
    # Map the first CLI argument to the command implementing it.
    opts = {"visualize": visualize}
    # Remove the command name so plac only sees the command's own arguments;
    # a missing command is reported the same way as an unknown one.
    cmd = sys.argv.pop(1) if len(sys.argv) > 1 else None
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)