From 99f36d30509a191fb263d56088e4baceceab2f73 Mon Sep 17 00:00:00 2001
From: Jakub Maruniak
Date: Sun, 21 Mar 2021 20:27:06 +0000
Subject: [PATCH] Upload files to "pages/students/2016/jakub_maruniak/dp2021/annotation"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../dp2021/annotation/custom_train.py         | 152 +++++++++++++++++
 .../dp2021/annotation/scripts.py              | 155 ++++++++++++++++++
 .../jakub_maruniak/dp2021/annotation/train.sh |  24 +++
 .../dp2021/annotation/visualizer.py           |  78 +++++++++
 4 files changed, 409 insertions(+)
 create mode 100644 pages/students/2016/jakub_maruniak/dp2021/annotation/custom_train.py
 create mode 100644 pages/students/2016/jakub_maruniak/dp2021/annotation/scripts.py
 create mode 100644 pages/students/2016/jakub_maruniak/dp2021/annotation/train.sh
 create mode 100644 pages/students/2016/jakub_maruniak/dp2021/annotation/visualizer.py

diff --git a/pages/students/2016/jakub_maruniak/dp2021/annotation/custom_train.py b/pages/students/2016/jakub_maruniak/dp2021/annotation/custom_train.py
new file mode 100644
index 000000000..f8c3f01fb
--- /dev/null
+++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/custom_train.py
@@ -0,0 +1,152 @@
+"""Scripts used for training and evaluation of NER models
+Usage example:
+$ python custom_train.py train ./model ./train.jsonl ./eval.jsonl -o ./output_dir -n 15
+Requirements:
+spacy>=2.2.3
+https://github.com/explosion/projects/tree/master/ner-drugs
+"""
+import spacy
+from spacy.cli.train import _load_pretrained_tok2vec
+from timeit import default_timer as timer
+from pathlib import Path
+import srsly
+from wasabi import msg
+import random
+import plac
+import sys
+import tqdm
+
+
+def format_data(data):
+    result = []
+    labels = set()
+    for eg in data:
+        if eg["answer"] != "accept":
+            continue
+        ents = [(s["start"], s["end"], s["label"]) for s in eg.get("spans", [])]
+        labels.update([ent[2] for ent in ents])
+        result.append((eg["text"], {"entities": ents}))
+    return result, labels
+
+
+@plac.annotations(
+    model=("The base model to load or blank:lang", "positional", None, str),
+    train_path=("The training data (Prodigy JSONL)", "positional", None, str),
+    eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str),
+    n_iter=("Number of iterations", "option", "n", int),
+    output=("Optional output directory", "option", "o", str),
+    tok2vec=("Pretrained tok2vec weights to initialize model", "option", "t2v", str),
+)
+def train_model(
+    model, train_path, eval_path, n_iter=10, output=None, tok2vec=None,
+):
+    """
+    Train a model from Prodigy annotations and optionally save out the best
+    model to disk.
+ """ + spacy.util.fix_random_seed(0) + with msg.loading(f"Loading '{model}'..."): + if model.startswith("blank:"): + nlp = spacy.blank(model.replace("blank:", "")) + else: + nlp = spacy.load(model) + msg.good(f"Loaded model '{model}'") + train_data, labels = format_data(srsly.read_jsonl(train_path)) + eval_data, _ = format_data(srsly.read_jsonl(eval_path)) + ner = nlp.create_pipe("ner") + for label in labels: + ner.add_label(label) + nlp.add_pipe(ner) + t2v_cfg = { + "embed_rows": 10000, + "token_vector_width": 128, + "conv_depth": 8, + "nr_feature_tokens": 3, + } + optimizer = nlp.begin_training(component_cfg={"ner": t2v_cfg} if tok2vec else {}) + if tok2vec: + _load_pretrained_tok2vec(nlp, Path(tok2vec)) + batch_size = spacy.util.compounding(1.0, 32.0, 1.001) + best_acc = 0 + best_model = None + row_widths = (2, 8, 8, 8, 8) + msg.row(("#", "L", "P", "R", "F"), widths=row_widths) + for i in range(n_iter): + random.shuffle(train_data) + losses = {} + data = tqdm.tqdm(train_data, leave=False) + for batch in spacy.util.minibatch(data, size=batch_size): + texts, annots = zip(*batch) + nlp.update(texts, annots, drop=0.2, losses=losses) + with nlp.use_params(optimizer.averages): + sc = nlp.evaluate(eval_data) + if sc.ents_f > best_acc: + best_acc = sc.ents_f + if output: + best_model = nlp.to_bytes() + acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}") + msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths) + msg.text(f"Best F-Score: {best_acc:.3f}") + if output and best_model: + with msg.loading("Saving model..."): + nlp.from_bytes(best_model).to_disk(output) + msg.good("Saved model", output) + + +@plac.annotations( + model=("The model to evaluate", "positional", None, str), + eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str), +) +def evaluate_model(model, eval_path): + """ + Evaluate a trained model on Prodigy annotations and print the accuracy. + """ + with msg.loading(f"Loading model '{model}'..."): + nlp = spacy.load(model) + data, _ = format_data(srsly.read_jsonl(eval_path)) + sc = nlp.evaluate(data) + result = [ + ("Precision", f"{sc.ents_p:.3f}"), + ("Recall", f"{sc.ents_r:.3f}"), + ("F-Score", f"{sc.ents_f:.3f}"), + ] + msg.table(result) + + +@plac.annotations( + model=("The model to evaluate", "positional", None, str), + data=("Raw data as JSONL", "positional", None, str), +) +def wps(model, data): + """ + Measure the processing speed in words per second. It's recommended to + use a larger corpus of raw text here (e.g. a few million words). 
+ """ + with msg.loading(f"Loading model '{model}'..."): + nlp = spacy.load(model) + texts = (eg["text"] for eg in srsly.read_jsonl(data)) + n_docs = 0 + n_words = 0 + start_time = timer() + for doc in nlp.pipe(texts): + n_docs += 1 + n_words += len(doc) + end_time = timer() + wps = int(n_words / (end_time - start_time)) + result = [ + ("Docs", f"{n_docs:,}"), + ("Words", f"{n_words:,}"), + ("Words/s", f"{wps:,}"), + ] + msg.table(result, widths=(7, 12), aligns=("l", "r")) + + +if __name__ == "__main__": + opts = {"train": train_model, "evaluate": evaluate_model, "wps": wps} + cmd = sys.argv.pop(1) + if cmd not in opts: + msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1) + try: + plac.call(opts[cmd]) + except KeyboardInterrupt: + msg.warn("Stopped.", exits=1) \ No newline at end of file diff --git a/pages/students/2016/jakub_maruniak/dp2021/annotation/scripts.py b/pages/students/2016/jakub_maruniak/dp2021/annotation/scripts.py new file mode 100644 index 000000000..ea6b21544 --- /dev/null +++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/scripts.py @@ -0,0 +1,160 @@ +""" +Usage example: +$ python scripts.py delete_annot jakub.maruniak ./dataset.jsonl ./new_dataset.jsonl + +To see available commands: +$ python scripts.py help + +To see available arguments: +$ python scripts.py [command] --help +""" +import spacy +import srsly +from wasabi import msg +import plac +import sys +import re +import itertools + +@plac.annotations( + dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str), +) +def count( + dataset_path +): + """ + Print statistics about Prodigy JSONL dataset. + Prints number of accepted, rejected and ignored articles. + Prints number of annotations of each entity type. + Prints how much annotations were made by each annotator. 
+ """ + # load data + # filename = 'ner/skner/sknerv4spans.jsonl' + file = open(sys.argv[1], 'rt', encoding='utf-8') + text = file.read() + + # count articles + countAccept = text.count('accept') + countReject = text.count('reject') + countSkip = text.count('ignore') + countSpans = text.count('tokens') + # count entities + countPER = text.count('PER') + countLOC = text.count('LOC') + countORG = text.count('ORG') + countMISC = text.count('MISC') + + underline = '\033[04m' + reset = '\033[0m' + red = '\033[31m' + green='\033[32m' + gray='\033[37m' + + # table v1 + #from lib import TableIt + #table1 = [ + # ['Prijatých', countAccept], + # ['Zamietnutých', countReject], + # ['Preskočených', countSkip], + # ['------------', '------------'], + # ['Spolu', countSpans] + #] + # + #table = [ + # ['Entita', 'Počet'], + # ['PER', countPER], + # ['LOC', countLOC], + # ['ORG', countORG], + # ['MISC', countMISC] + #] + #print('\nPočet anotovaných článkov:') + #TableIt.printTable(table1) + #print('\nPočet jednotlivých entít:') + #TableIt.printTable(table, useFieldNames=True, color=(26, 156, 171)) + + # table v2 + print(underline + '\nPočet anotovaných článkov:' + reset) + print(green + "%-15s %-20s" %("Prijatých", countAccept) + reset) + print(red + "%-15s %-15s" %("Zamietnutých", countReject) + reset) + print(gray + "%-15s %-15s" %("Preskočených", countSkip) + reset) + print("%-15s" %("---------------------")) + print("%-15s %-15s" %("Spolu", countSpans)) + + print(underline + '\nPočet jednotlivých entít:' + reset) + print("%-10s %-10s" %("Entita:", "Počet:")) + print("%-10s" %("----------------")) + print("%-10s %-10s" %("PER", countPER)) + print("%-10s %-10s" %("LOC", countLOC)) + print("%-10s %-10s" %("ORG", countORG)) + print("%-10s %-10s" %("MISC", countMISC)) + + # kto anotoval koľko? + frequency = {} + #Open the sample text file in read mode. + #document_text = open('sample.txt', 'r') + #convert the string of the document in lowercase and assign it to text_string variable. + #text = document_text.read().lower() + regex1 = '"_session_id":(.*?),' + pattern = re.findall(regex1, text) + for word in pattern: + count = frequency.get(word,0) + frequency[word] = count + 1 + frequency_list = frequency.keys() + print(underline + '\nKto anotoval koľko článkov?' + reset) + for words in frequency_list: + print(words, frequency[words]) + + +@plac.annotations( + annotator=("Keep annotations from this annotator (email address or nickname)", "positional", None, str), + dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str), + new_dataset_path=("Path to save new dataset(Prodigy JSONL format)", "positional", None, str), +) +def delete_annot( + annotator, dataset_path, new_dataset_path +): + """ + Load Prodigy JSONL dataset, + and keep annotations only from one annotator. 
+ """ + file1 = open(sys.argv[2], 'r', encoding='utf-8') + file2 = open(sys.argv[3],'w', encoding='utf-8') + + for line in file1.readlines(): + x = re.findall(sys.argv[1], line) + if x: + print(line) + file2.write(line) + + file1.close() + file2.close() + +def modelinfo( +): + """ + Print information about trained model (Precision, Recall and F-Score) + """ + with open('build/train/nerposparser/model-best/meta.json', 'rt') as f: + for line in itertools.islice(f, 31, 54): + print(line, end =" ") + +def helpme( +): + print("Available commands:", + "\ncount - Print statistics about Prodigy JSONL dataset", + "\ndelete_annot - Create dataset with annotations from only specific annotator", + "\nmodelinfo - Prints informations about trained model (Precision, Recall and F-Score)") + + +if __name__ == "__main__": + opts = {"count": count, + "delete_annot": delete_annot, + "modelinfo": modelinfo, + "help": helpme} + cmd = sys.argv.pop(1) + if cmd not in opts: + msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1) + try: + plac.call(opts[cmd]) + except KeyboardInterrupt: + msg.warn("Stopped.", exits=1) \ No newline at end of file diff --git a/pages/students/2016/jakub_maruniak/dp2021/annotation/train.sh b/pages/students/2016/jakub_maruniak/dp2021/annotation/train.sh new file mode 100644 index 000000000..ba731b8d6 --- /dev/null +++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/train.sh @@ -0,0 +1,24 @@ +set -e +OUTDIR=build/train/output +TRAINDIR=build/train + +# Delete old training results +rm -r $TRAINDIR + +mkdir -p $TRAINDIR +mkdir -p $OUTDIR +mkdir -p dist +# Delete old training results +rm -rf $OUTDIR/* +# Train dependency and POS +spacy train sk $OUTDIR ./build/input/slovak-treebank ./build/input/ud-artificial-gapping --n-iter 15 -p tagger,parser +rm -rf $TRAINDIR/posparser +mv $OUTDIR/model-best $TRAINDIR/posparser +# Train NER +# custom script for training, but it takes too long... input is JSONL file (db from Prodigy) +# python custom_train.py train ./build/train/posparser ./train.jsonl ./eval.jsonl -o ./build/train/nerposparser -n 15 +spacy train sk $TRAINDIR/nerposparser ./ner/experiments/34sknerfull.json ./ner/experiments/34wikiartfull.json --n-iter 15 -p ner +# Package model +spacy package $TRAINDIR/nerposparser dist --meta-path ./meta.json --force +cd dist/sk_sk1-0.2.0 +python ./setup.py sdist --dist-dir ../ diff --git a/pages/students/2016/jakub_maruniak/dp2021/annotation/visualizer.py b/pages/students/2016/jakub_maruniak/dp2021/annotation/visualizer.py new file mode 100644 index 000000000..a44fa7ea0 --- /dev/null +++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/visualizer.py @@ -0,0 +1,80 @@ +""" +Visualize the data with Streamlit and spaCy. +https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py + +Usage example: +$ streamlit run visualizer.py visualize ./dataset.jsonl +""" +import streamlit as st +from spacy import displacy +import srsly +import sys +import plac +from wasabi import msg + +@plac.annotations( + dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str), +) +def visualize( + dataset_path +): + FILES = [sys.argv[1]] + MISC = "MISC" + + HTML_WRAPPER = "
+    HTML_WRAPPER1 = "<div style='border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem'>{}</div>"
+    SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {MISC: "#d1bcff"}}}
+
+    @st.cache(allow_output_mutation=True)
+    def load_data(filepath):
+        return list(srsly.read_jsonl(filepath))
+
+    st.sidebar.title("Data visualizer")
+    st.sidebar.markdown(
+        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
+        "and view stats about the datasets."
+    )
+    data_file = st.sidebar.selectbox("Dataset", FILES)
+    data = load_data(data_file)
+    n_no_ents = 0
+    n_total_ents = 0
+    accepted = 0
+    rejected = 0
+
+    st.header(f"Dataset: {data_file} ({len(data)})")
+    st.markdown(HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."), unsafe_allow_html=True)
+    for eg in data:
+        if eg["answer"] != "accept":
+            rejected += 1
+            continue
+        accepted += 1
+        row = {"text": eg["text"], "ents": eg.get("spans", [])}
+        n_total_ents += len(row["ents"])
+        if not row["ents"]:
+            n_no_ents += 1
+        html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
+        st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+
+    st.sidebar.markdown(
+        f"""
+        | `{data_file}` | |
+        | --- | ---: |
+        | Total examples | {len(data):,} |
+        | Accepted examples | {accepted:,} |
+        | Rejected examples | {rejected:,} |
+        | Total entities | {n_total_ents:,} |
+        | Examples with no entities | {n_no_ents:,} |
+        """, unsafe_allow_html=True
+    )
+
+if __name__ == "__main__":
+    opts = {"visualize": visualize}
+    cmd = sys.argv.pop(1)
+    if cmd not in opts:
+        msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
+    try:
+        plac.call(opts[cmd])
+    except KeyboardInterrupt:
+        msg.warn("Stopped.", exits=1)
\ No newline at end of file