forked from KEMT/zpwiki
Upload files to "pages/students/2016/jakub_maruniak/dp2021/annotation"
This commit is contained in:
parent 9de29ed2b0
commit 99f36d3050
@@ -0,0 +1,152 @@
"""Scripts used for training and evaluation of NER models

Usage example:
$ python custom_train.py train ./model ./train.jsonl ./eval.jsonl -o ./output_dir -n 15

Requirements:
spacy>=2.2.3
https://github.com/explosion/projects/tree/master/ner-drugs
"""
import spacy
from spacy.cli.train import _load_pretrained_tok2vec
from timeit import default_timer as timer
from pathlib import Path
import srsly
from wasabi import msg
import random
import plac
import sys
import tqdm


def format_data(data):
    result = []
    labels = set()
    for eg in data:
        if eg["answer"] != "accept":
            continue
        ents = [(s["start"], s["end"], s["label"]) for s in eg.get("spans", [])]
        labels.update([ent[2] for ent in ents])
        result.append((eg["text"], {"entities": ents}))
    return result, labels


@plac.annotations(
    model=("The base model to load or blank:lang", "positional", None, str),
    train_path=("The training data (Prodigy JSONL)", "positional", None, str),
    eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str),
    n_iter=("Number of iterations", "option", "n", int),
    output=("Optional output directory", "option", "o", str),
    tok2vec=("Pretrained tok2vec weights to initialize model", "option", "t2v", str),
)
def train_model(
    model, train_path, eval_path, n_iter=10, output=None, tok2vec=None,
):
    """
    Train a model from Prodigy annotations and optionally save out the best
    model to disk.
    """
    spacy.util.fix_random_seed(0)
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith("blank:"):
            nlp = spacy.blank(model.replace("blank:", ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")
    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))
    ner = nlp.create_pipe("ner")
    for label in labels:
        ner.add_label(label)
    nlp.add_pipe(ner)
    t2v_cfg = {
        "embed_rows": 10000,
        "token_vector_width": 128,
        "conv_depth": 8,
        "nr_feature_tokens": 3,
    }
    optimizer = nlp.begin_training(component_cfg={"ner": t2v_cfg} if tok2vec else {})
    if tok2vec:
        _load_pretrained_tok2vec(nlp, Path(tok2vec))
    batch_size = spacy.util.compounding(1.0, 32.0, 1.001)
    best_acc = 0
    best_model = None
    row_widths = (2, 8, 8, 8, 8)
    msg.row(("#", "L", "P", "R", "F"), widths=row_widths)
    for i in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        data = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(data, size=batch_size):
            texts, annots = zip(*batch)
            nlp.update(texts, annots, drop=0.2, losses=losses)
        with nlp.use_params(optimizer.averages):
            sc = nlp.evaluate(eval_data)
            if sc.ents_f > best_acc:
                best_acc = sc.ents_f
                if output:
                    best_model = nlp.to_bytes()
        acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}")
        msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths)
    msg.text(f"Best F-Score: {best_acc:.3f}")
    if output and best_model:
        with msg.loading("Saving model..."):
            nlp.from_bytes(best_model).to_disk(output)
        msg.good("Saved model", output)


@plac.annotations(
    model=("The model to evaluate", "positional", None, str),
    eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str),
)
def evaluate_model(model, eval_path):
    """
    Evaluate a trained model on Prodigy annotations and print the accuracy.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    data, _ = format_data(srsly.read_jsonl(eval_path))
    sc = nlp.evaluate(data)
    result = [
        ("Precision", f"{sc.ents_p:.3f}"),
        ("Recall", f"{sc.ents_r:.3f}"),
        ("F-Score", f"{sc.ents_f:.3f}"),
    ]
    msg.table(result)


@plac.annotations(
    model=("The model to evaluate", "positional", None, str),
    data=("Raw data as JSONL", "positional", None, str),
)
def wps(model, data):
    """
    Measure the processing speed in words per second. It's recommended to
    use a larger corpus of raw text here (e.g. a few million words).
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    texts = (eg["text"] for eg in srsly.read_jsonl(data))
    n_docs = 0
    n_words = 0
    start_time = timer()
    for doc in nlp.pipe(texts):
        n_docs += 1
        n_words += len(doc)
    end_time = timer()
    wps = int(n_words / (end_time - start_time))
    result = [
        ("Docs", f"{n_docs:,}"),
        ("Words", f"{n_words:,}"),
        ("Words/s", f"{wps:,}"),
    ]
    msg.table(result, widths=(7, 12), aligns=("l", "r"))


if __name__ == "__main__":
    opts = {"train": train_model, "evaluate": evaluate_model, "wps": wps}
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)
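For orientation, here is a minimal sketch of the record format format_data() consumes and what it returns. The field names (text, answer, spans with start/end/label) are the ones the function reads above; the sentence and character offsets are invented for illustration:

# A single Prodigy-style record, as yielded by srsly.read_jsonl()
example = {
    "text": "Bratislava je hlavné mesto Slovenska.",
    "answer": "accept",
    "spans": [
        {"start": 0, "end": 10, "label": "LOC"},
        {"start": 27, "end": 36, "label": "LOC"},
    ],
}

train_data, labels = format_data([example])
# train_data == [("Bratislava je hlavné mesto Slovenska.",
#                 {"entities": [(0, 10, "LOC"), (27, 36, "LOC")]})]
# labels == {"LOC"}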
160
pages/students/2016/jakub_maruniak/dp2021/annotation/scripts.py
Normal file
@@ -0,0 +1,160 @@
"""
Usage example:
$ python scripts.py delete_annot jakub.maruniak ./dataset.jsonl ./new_dataset.jsonl

To see available commands:
$ python scripts.py help

To see available arguments:
$ python scripts.py [command] --help
"""
import spacy
import srsly
from wasabi import msg
import plac
import sys
import re
import itertools


@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def count(
    dataset_path
):
    """
    Print statistics about a Prodigy JSONL dataset:
    the number of accepted, rejected and ignored articles,
    the number of annotations of each entity type,
    and how many annotations were made by each annotator.
    """
    # load data
    # filename = 'ner/skner/sknerv4spans.jsonl'
    file = open(dataset_path, 'rt', encoding='utf-8')
    text = file.read()

    # count articles
    countAccept = text.count('accept')
    countReject = text.count('reject')
    countSkip = text.count('ignore')
    countSpans = text.count('tokens')
    # count entities
    countPER = text.count('PER')
    countLOC = text.count('LOC')
    countORG = text.count('ORG')
    countMISC = text.count('MISC')

    underline = '\033[04m'
    reset = '\033[0m'
    red = '\033[31m'
    green = '\033[32m'
    gray = '\033[37m'

    # table v1
    #from lib import TableIt
    #table1 = [
    #    ['Prijatých', countAccept],
    #    ['Zamietnutých', countReject],
    #    ['Preskočených', countSkip],
    #    ['------------', '------------'],
    #    ['Spolu', countSpans]
    #]
    #
    #table = [
    #    ['Entita', 'Počet'],
    #    ['PER', countPER],
    #    ['LOC', countLOC],
    #    ['ORG', countORG],
    #    ['MISC', countMISC]
    #]
    #print('\nPočet anotovaných článkov:')
    #TableIt.printTable(table1)
    #print('\nPočet jednotlivých entít:')
    #TableIt.printTable(table, useFieldNames=True, color=(26, 156, 171))

    # table v2
    print(underline + '\nPočet anotovaných článkov:' + reset)
    print(green + "%-15s %-20s" % ("Prijatých", countAccept) + reset)
    print(red + "%-15s %-15s" % ("Zamietnutých", countReject) + reset)
    print(gray + "%-15s %-15s" % ("Preskočených", countSkip) + reset)
    print("%-15s" % ("---------------------"))
    print("%-15s %-15s" % ("Spolu", countSpans))

    print(underline + '\nPočet jednotlivých entít:' + reset)
    print("%-10s %-10s" % ("Entita:", "Počet:"))
    print("%-10s" % ("----------------"))
    print("%-10s %-10s" % ("PER", countPER))
    print("%-10s %-10s" % ("LOC", countLOC))
    print("%-10s %-10s" % ("ORG", countORG))
    print("%-10s %-10s" % ("MISC", countMISC))

    # who annotated how many articles?
    frequency = {}
    # Open the sample text file in read mode.
    # document_text = open('sample.txt', 'r')
    # convert the string of the document to lowercase and assign it to text_string variable.
    # text = document_text.read().lower()
    regex1 = '"_session_id":(.*?),'
    pattern = re.findall(regex1, text)
    for word in pattern:
        count = frequency.get(word, 0)
        frequency[word] = count + 1
    frequency_list = frequency.keys()
    print(underline + '\nKto anotoval koľko článkov?' + reset)
    for words in frequency_list:
        print(words, frequency[words])


@plac.annotations(
    annotator=("Keep annotations from this annotator (email address or nickname)", "positional", None, str),
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
    new_dataset_path=("Path to save the new dataset (Prodigy JSONL format)", "positional", None, str),
)
def delete_annot(
    annotator, dataset_path, new_dataset_path
):
    """
    Load a Prodigy JSONL dataset and keep only the annotations
    made by one annotator.
    """
    file1 = open(dataset_path, 'r', encoding='utf-8')
    file2 = open(new_dataset_path, 'w', encoding='utf-8')

    for line in file1.readlines():
        x = re.findall(annotator, line)
        if x:
            print(line)
            file2.write(line)

    file1.close()
    file2.close()


def modelinfo(
):
    """
    Print information about the trained model (Precision, Recall and F-Score).
    """
    with open('build/train/nerposparser/model-best/meta.json', 'rt') as f:
        for line in itertools.islice(f, 31, 54):
            print(line, end=" ")


def helpme(
):
    print("Available commands:",
          "\ncount - Print statistics about a Prodigy JSONL dataset",
          "\ndelete_annot - Create a dataset with annotations from only a specific annotator",
          "\nmodelinfo - Print information about the trained model (Precision, Recall and F-Score)")


if __name__ == "__main__":
    opts = {"count": count,
            "delete_annot": delete_annot,
            "modelinfo": modelinfo,
            "help": helpme}
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)
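Note that count() matches substrings in the raw file text, so the totals can over-count (for example, 'PER' also matches inside longer label names or other strings). A possible alternative, not part of this commit, is to parse each JSONL record and tally the fields explicitly with collections.Counter; a minimal sketch under that assumption:

# Hypothetical per-record counting (not in this commit)
import srsly
from collections import Counter

def count_dataset(dataset_path):
    answers = Counter()
    entities = Counter()
    annotators = Counter()
    for eg in srsly.read_jsonl(dataset_path):
        answers[eg.get("answer", "unknown")] += 1
        for span in eg.get("spans", []):
            entities[span["label"]] += 1
        if "_session_id" in eg:
            annotators[eg["_session_id"]] += 1
    return answers, entities, annotators

# answers, entities, annotators = count_dataset("./dataset.jsonl")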
@@ -0,0 +1,24 @@
set -e
OUTDIR=build/train/output
TRAINDIR=build/train

# Delete old training results
rm -rf $TRAINDIR

mkdir -p $TRAINDIR
mkdir -p $OUTDIR
mkdir -p dist
# Delete old training results
rm -rf $OUTDIR/*
# Train dependency and POS
spacy train sk $OUTDIR ./build/input/slovak-treebank ./build/input/ud-artificial-gapping --n-iter 15 -p tagger,parser
rm -rf $TRAINDIR/posparser
mv $OUTDIR/model-best $TRAINDIR/posparser
# Train NER
# custom script for training, but it takes too long... input is a JSONL file (db from Prodigy)
# python custom_train.py train ./build/train/posparser ./train.jsonl ./eval.jsonl -o ./build/train/nerposparser -n 15
spacy train sk $TRAINDIR/nerposparser ./ner/experiments/34sknerfull.json ./ner/experiments/34wikiartfull.json --n-iter 15 -p ner
# Package model
spacy package $TRAINDIR/nerposparser dist --meta-path ./meta.json --force
cd dist/sk_sk1-0.2.0
python ./setup.py sdist --dist-dir ../
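Once spacy package and the sdist step have produced an archive in dist/ (the name dist/sk_sk1-0.2.0.tar.gz is assumed from the dist/sk_sk1-0.2.0 directory above), the model can be installed with pip and loaded by its package name. A minimal smoke-test sketch under those assumptions:

# Assumes the built package was installed first, e.g.:
#   pip install dist/sk_sk1-0.2.0.tar.gz
import spacy

nlp = spacy.load("sk_sk1")  # package name taken from the dist/ path above
doc = nlp("Prezidentka navštívila Košice aj Bratislavu.")  # invented sample sentence
print(nlp.pipe_names)
print([(ent.text, ent.label_) for ent in doc.ents])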
@@ -0,0 +1,80 @@
"""
Visualize the data with Streamlit and spaCy.
https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py

Usage example:
$ streamlit run visualizer.py visualize ./dataset.jsonl
"""
import streamlit as st
from spacy import displacy
import srsly
import sys
import plac
from wasabi import msg


@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(
    dataset_path
):
    FILES = [dataset_path]
    MISC = "MISC"

    HTML_WRAPPER = "<div style='border-bottom: 1px solid #ccc; padding: 20px 0'>{}</div>"
    HTML_WRAPPER1 = "<div style='border-bottom: 2px solid #000; padding: 0 0 20px 0'>{}</div>"
    SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {MISC: "#d1bcff"}}}

    @st.cache(allow_output_mutation=True)
    def load_data(filepath):
        return list(srsly.read_jsonl(filepath))

    st.sidebar.title("Data visualizer")
    st.sidebar.markdown(
        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
        "and view stats about the datasets."
    )
    data_file = st.sidebar.selectbox("Dataset", FILES)
    data = load_data(data_file)
    n_no_ents = 0
    n_total_ents = 0
    accepted = 0
    rejected = 0

    st.header(f"Dataset: {data_file} ({len(data)})")
    st.markdown(HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."), unsafe_allow_html=True)
    for eg in data:
        if eg["answer"] == "accept":
            accepted += 1
        if eg["answer"] != "accept":
            rejected += 1
            continue
        row = {"text": eg["text"], "ents": eg.get("spans", [])}
        answer = {"answer": eg.get("answer", [])}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1
        html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
        st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    st.sidebar.markdown(
        f"""
| `{data_file}` | |
| --- | ---: |
| Total examples | {len(data):,} |
| Accepted examples | {accepted:,} |
| Rejected examples | {rejected:,} |
| Total entities | {n_total_ents:,} |
| Examples with no entities | {n_no_ents:,} |
""", unsafe_allow_html=True
    )


if __name__ == "__main__":
    opts = {"visualize": visualize}
    cmd = sys.argv.pop(1)
    if cmd not in opts:
        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
    try:
        plac.call(opts[cmd])
    except KeyboardInterrupt:
        msg.warn("Stopped.", exits=1)
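For reference, the row dictionary built in the loop above is the "manual" displaCy payload: with manual=True, displacy.render() takes raw text plus character-offset entities instead of a Doc object. A small standalone sketch (sentence and offsets invented for illustration):

from spacy import displacy

row = {
    "text": "Košice ležia na východe Slovenska.",
    "ents": [{"start": 0, "end": 6, "label": "LOC"}],
}
html = displacy.render(row, style="ent", manual=True,
                       options={"colors": {"MISC": "#d1bcff"}})
print(html)  # HTML snippet with the highlighted entity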