Upload files to "pages/students/2016/jakub_maruniak/dp2021/annotation"

Jakub Maruniak 2021-03-21 20:27:06 +00:00
parent 9de29ed2b0
commit 99f36d3050
4 changed files with 416 additions and 0 deletions

View File

@@ -0,0 +1,152 @@
"""Scripts used for training and evaluation of NER models
Usage example:
$ python custom_train.py train ./model ./train.jsonl ./eval.jsonl -o ./output_dir -n 15
Requirements:
spacy>=2.2.3
https://github.com/explosion/projects/tree/master/ner-drugs
"""
import spacy
from spacy.cli.train import _load_pretrained_tok2vec
from timeit import default_timer as timer
from pathlib import Path
import srsly
from wasabi import msg
import random
import plac
import sys
import tqdm
def format_data(data):
result = []
labels = set()
for eg in data:
if eg["answer"] != "accept":
continue
ents = [(s["start"], s["end"], s["label"]) for s in eg.get("spans", [])]
labels.update([ent[2] for ent in ents])
result.append((eg["text"], {"entities": ents}))
return result, labels
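# Illustrative example (hypothetical record, shown for documentation only):
# a Prodigy JSONL line such as
#   {"text": "Bratislava leží na Dunaji.", "answer": "accept",
#    "spans": [{"start": 0, "end": 10, "label": "LOC"}]}
# is converted by format_data() into the spaCy training tuple
#   ("Bratislava leží na Dunaji.", {"entities": [(0, 10, "LOC")]})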
@plac.annotations(
model=("The base model to load or blank:lang", "positional", None, str),
train_path=("The training data (Prodigy JSONL)", "positional", None, str),
eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str),
n_iter=("Number of iterations", "option", "n", int),
output=("Optional output directory", "option", "o", str),
tok2vec=("Pretrained tok2vec weights to initialize model", "option", "t2v", str),
)
def train_model(
model, train_path, eval_path, n_iter=10, output=None, tok2vec=None,
):
"""
Train a model from Prodigy annotations and optionally save out the best
model to disk.
"""
spacy.util.fix_random_seed(0)
with msg.loading(f"Loading '{model}'..."):
if model.startswith("blank:"):
nlp = spacy.blank(model.replace("blank:", ""))
else:
nlp = spacy.load(model)
msg.good(f"Loaded model '{model}'")
train_data, labels = format_data(srsly.read_jsonl(train_path))
eval_data, _ = format_data(srsly.read_jsonl(eval_path))
ner = nlp.create_pipe("ner")
for label in labels:
ner.add_label(label)
nlp.add_pipe(ner)
t2v_cfg = {
"embed_rows": 10000,
"token_vector_width": 128,
"conv_depth": 8,
"nr_feature_tokens": 3,
}
optimizer = nlp.begin_training(component_cfg={"ner": t2v_cfg} if tok2vec else {})
if tok2vec:
_load_pretrained_tok2vec(nlp, Path(tok2vec))
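        # the weights loaded above are the output of `spacy pretrain` (spaCy v2)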
batch_size = spacy.util.compounding(1.0, 32.0, 1.001)
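    # The batch size compounds from 1 up to 32 (multiplied by 1.001 after each
    # batch), the schedule used in spaCy v2's example training scripts.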
best_acc = 0
best_model = None
row_widths = (2, 8, 8, 8, 8)
msg.row(("#", "L", "P", "R", "F"), widths=row_widths)
for i in range(n_iter):
random.shuffle(train_data)
losses = {}
data = tqdm.tqdm(train_data, leave=False)
for batch in spacy.util.minibatch(data, size=batch_size):
texts, annots = zip(*batch)
nlp.update(texts, annots, drop=0.2, losses=losses)
with nlp.use_params(optimizer.averages):
sc = nlp.evaluate(eval_data)
if sc.ents_f > best_acc:
best_acc = sc.ents_f
if output:
best_model = nlp.to_bytes()
acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}")
msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths)
msg.text(f"Best F-Score: {best_acc:.3f}")
if output and best_model:
with msg.loading("Saving model..."):
nlp.from_bytes(best_model).to_disk(output)
msg.good("Saved model", output)
@plac.annotations(
model=("The model to evaluate", "positional", None, str),
eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str),
)
def evaluate_model(model, eval_path):
"""
Evaluate a trained model on Prodigy annotations and print the accuracy.
"""
with msg.loading(f"Loading model '{model}'..."):
nlp = spacy.load(model)
data, _ = format_data(srsly.read_jsonl(eval_path))
sc = nlp.evaluate(data)
result = [
("Precision", f"{sc.ents_p:.3f}"),
("Recall", f"{sc.ents_r:.3f}"),
("F-Score", f"{sc.ents_f:.3f}"),
]
msg.table(result)
@plac.annotations(
model=("The model to evaluate", "positional", None, str),
data=("Raw data as JSONL", "positional", None, str),
)
def wps(model, data):
"""
Measure the processing speed in words per second. It's recommended to
use a larger corpus of raw text here (e.g. a few million words).
"""
with msg.loading(f"Loading model '{model}'..."):
nlp = spacy.load(model)
texts = (eg["text"] for eg in srsly.read_jsonl(data))
n_docs = 0
n_words = 0
start_time = timer()
for doc in nlp.pipe(texts):
n_docs += 1
n_words += len(doc)
end_time = timer()
wps = int(n_words / (end_time - start_time))
result = [
("Docs", f"{n_docs:,}"),
("Words", f"{n_words:,}"),
("Words/s", f"{wps:,}"),
]
msg.table(result, widths=(7, 12), aligns=("l", "r"))
if __name__ == "__main__":
opts = {"train": train_model, "evaluate": evaluate_model, "wps": wps}
cmd = sys.argv.pop(1)
if cmd not in opts:
msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
try:
plac.call(opts[cmd])
except KeyboardInterrupt:
msg.warn("Stopped.", exits=1)

View File

@@ -0,0 +1,160 @@
"""
Usage example:
$ python scripts.py delete_annot jakub.maruniak ./dataset.jsonl ./new_dataset.jsonl
To see available commands:
$ python scripts.py help
To see available arguments:
$ python scripts.py [command] --help
"""
import json
import re
import sys
import plac
from wasabi import msg
@plac.annotations(
dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def count(
dataset_path
):
"""
Print statistics about Prodigy JSONL dataset.
Prints number of accepted, rejected and ignored articles.
Prints number of annotations of each entity type.
Prints how much annotations were made by each annotator.
"""
# load data
# filename = 'ner/skner/sknerv4spans.jsonl'
file = open(sys.argv[1], 'rt', encoding='utf-8')
text = file.read()
# count articles
countAccept = text.count('accept')
countReject = text.count('reject')
countSkip = text.count('ignore')
    countTotal = text.count('tokens')  # one "tokens" field per record
# count entities
countPER = text.count('PER')
countLOC = text.count('LOC')
countORG = text.count('ORG')
countMISC = text.count('MISC')
    underline = '\033[04m'
    reset = '\033[0m'
    red = '\033[31m'
    green = '\033[32m'
    gray = '\033[37m'
# table v1
#from lib import TableIt
#table1 = [
# ['Prijatých', countAccept],
# ['Zamietnutých', countReject],
# ['Preskočených', countSkip],
# ['------------', '------------'],
# ['Spolu', countSpans]
#]
#
#table = [
# ['Entita', 'Počet'],
# ['PER', countPER],
# ['LOC', countLOC],
# ['ORG', countORG],
# ['MISC', countMISC]
#]
#print('\nPočet anotovaných článkov:')
#TableIt.printTable(table1)
#print('\nPočet jednotlivých entít:')
#TableIt.printTable(table, useFieldNames=True, color=(26, 156, 171))
# table v2
print(underline + '\nPočet anotovaných článkov:' + reset)
print(green + "%-15s %-20s" %("Prijatých", countAccept) + reset)
print(red + "%-15s %-15s" %("Zamietnutých", countReject) + reset)
print(gray + "%-15s %-15s" %("Preskočených", countSkip) + reset)
print("%-15s" %("---------------------"))
print("%-15s %-15s" %("Spolu", countSpans))
print(underline + '\nPočet jednotlivých entít:' + reset)
print("%-10s %-10s" %("Entita:", "Počet:"))
print("%-10s" %("----------------"))
print("%-10s %-10s" %("PER", countPER))
print("%-10s %-10s" %("LOC", countLOC))
print("%-10s %-10s" %("ORG", countORG))
print("%-10s %-10s" %("MISC", countMISC))
    # who annotated how many articles?
    frequency = {}
    # extract the annotator's "_session_id" from every record
    regex1 = '"_session_id":(.*?),'
pattern = re.findall(regex1, text)
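    # Illustrative (hypothetical session id): a record containing
    #   ..., "_session_id": "skner-jakub.maruniak", ...
    # is counted under that annotator id here.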
for word in pattern:
count = frequency.get(word,0)
frequency[word] = count + 1
frequency_list = frequency.keys()
print(underline + '\nKto anotoval koľko článkov?' + reset)
for words in frequency_list:
print(words, frequency[words])
@plac.annotations(
annotator=("Keep annotations from this annotator (email address or nickname)", "positional", None, str),
dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
new_dataset_path=("Path to save the new dataset (Prodigy JSONL format)", "positional", None, str),
)
def delete_annot(
annotator, dataset_path, new_dataset_path
):
"""
Load Prodigy JSONL dataset,
and keep annotations only from one annotator.
"""
    file1 = open(dataset_path, 'r', encoding='utf-8')
    file2 = open(new_dataset_path, 'w', encoding='utf-8')
    for line in file1.readlines():
        x = re.findall(annotator, line)
if x:
print(line)
file2.write(line)
file1.close()
file2.close()
def modelinfo(
):
    """
    Print information about the trained model (Precision, Recall and F-Score)
    """
    with open('build/train/nerposparser/model-best/meta.json', 'rt') as f:
        meta = json.load(f)
    # the "accuracy" block of spaCy v2's meta.json holds the evaluation scores
    for name, value in meta.get("accuracy", {}).items():
        print(f"{name}: {value}")
def helpme(
):
    print("Available commands:",
          "\ncount - Print statistics about a Prodigy JSONL dataset",
          "\ndelete_annot - Create a dataset with annotations from only a specific annotator",
          "\nmodelinfo - Print information about the trained model (Precision, Recall and F-Score)")
if __name__ == "__main__":
opts = {"count": count,
"delete_annot": delete_annot,
"modelinfo": modelinfo,
"help": helpme}
cmd = sys.argv.pop(1)
if cmd not in opts:
msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
try:
plac.call(opts[cmd])
except KeyboardInterrupt:
msg.warn("Stopped.", exits=1)

View File

@@ -0,0 +1,24 @@
#!/bin/bash
set -e
OUTDIR=build/train/output
TRAINDIR=build/train
# Delete old training results (-f so the script does not fail on a clean checkout)
rm -rf $TRAINDIR
mkdir -p $TRAINDIR
mkdir -p $OUTDIR
mkdir -p dist
# Train dependency and POS
spacy train sk $OUTDIR ./build/input/slovak-treebank ./build/input/ud-artificial-gapping --n-iter 15 -p tagger,parser
rm -rf $TRAINDIR/posparser
mv $OUTDIR/model-best $TRAINDIR/posparser
# Train NER
# Custom training script (input is a JSONL export of the Prodigy database);
# disabled because it takes too long:
# python custom_train.py train ./build/train/posparser ./train.jsonl ./eval.jsonl -o ./build/train/nerposparser -n 15
spacy train sk $TRAINDIR/nerposparser ./ner/experiments/34sknerfull.json ./ner/experiments/34wikiartfull.json --n-iter 15 -p ner
# Package model
spacy package $TRAINDIR/nerposparser dist --meta-path ./meta.json --force
cd dist/sk_sk1-0.2.0
python ./setup.py sdist --dist-dir ../
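# The source archive is written to dist/; assuming the version from meta.json,
# the packaged model can then be installed with e.g.:
#   pip install dist/sk_sk1-0.2.0.tar.gz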

View File

@@ -0,0 +1,80 @@
"""
Visualize the data with Streamlit and spaCy.
https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py
Usage example:
$ streamlit run visualizer.py visualize ./dataset.jsonl
"""
import streamlit as st
from spacy import displacy
import srsly
import sys
import plac
from wasabi import msg
@plac.annotations(
dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(
dataset_path
):
    FILES = [dataset_path]
MISC = "MISC"
HTML_WRAPPER = "<div style='border-bottom: 1px solid #ccc; padding: 20px 0'>{}</div>"
HTML_WRAPPER1 = "<div style='border-bottom: 2px solid #000; padding: 0 0 20px 0'>{}</div>"
SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {MISC: "#d1bcff"}}}
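    # With "manual": True, displacy.render() skips spaCy's models and expects
    # pre-extracted data: {"text": ..., "ents": [{"start": ..., "end": ...,
    # "label": ...}]}, the same shape as the Prodigy spans built as `row` below.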
@st.cache(allow_output_mutation=True)
def load_data(filepath):
return list(srsly.read_jsonl(filepath))
st.sidebar.title("Data visualizer")
st.sidebar.markdown(
"Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
"and view stats about the datasets."
)
data_file = st.sidebar.selectbox("Dataset", FILES)
data = load_data(data_file)
n_no_ents = 0
n_total_ents = 0
accepted = 0
rejected = 0
st.header(f"Dataset: {data_file} ({len(data)})")
st.markdown(HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."), unsafe_allow_html=True)
    for eg in data:
        if eg["answer"] == "accept":
            accepted += 1
        else:
            # "reject" and "ignore" answers are both counted as rejected here
            rejected += 1
            continue
        row = {"text": eg["text"], "ents": eg.get("spans", [])}
n_total_ents += len(row["ents"])
if not row["ents"]:
n_no_ents += 1
html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)
st.sidebar.markdown(
f"""
| `{data_file}` | |
| --- | ---: |
| Total examples | {len(data):,} |
| Accepted examples | {accepted:,} |
| Rejected examples | {rejected:,} |
| Total entities | {n_total_ents:,} |
| Examples with no entities | {n_no_ents:,} |
""", unsafe_allow_html=True
)
if __name__ == "__main__":
opts = {"visualize": visualize}
cmd = sys.argv.pop(1)
if cmd not in opts:
msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
try:
plac.call(opts[cmd])
except KeyboardInterrupt:
msg.warn("Stopped.", exits=1)