Nahrát soubory do „pages/students/2016/jakub_maruniak/dp2021/annotation“

2021-03-21 20:27:06 +00:00 · 2021-03-21 20:27:06 +00:00 · 99f36d3050
commit 99f36d3050
parent 9de29ed2b0
4 changed files with 416 additions and 0 deletions
--- a/pages/students/2016/jakub_maruniak/dp2021/annotation/custom_train.py
+++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/custom_train.py
@ -0,0 +1,152 @@
 """Scripts used for training and evaluation of NER models
 Usage example:
 $ python custom_train.py train ./model ./train.jsonl ./eval.jsonl -o ./output_dir -n 15
 Requirements:
 spacy>=2.2.3
 https://github.com/explosion/projects/tree/master/ner-drugs
 """
 import spacy
 from spacy.cli.train import _load_pretrained_tok2vec
 from timeit import default_timer as timer
 from pathlib import Path
 import srsly
 from wasabi import msg
 import random
 import plac
 import sys
 import tqdm
 def format_data(data):
     """Convert Prodigy examples into spaCy-style training tuples.

     Only examples whose ``answer`` equals ``"accept"`` are kept. Each kept
     example becomes ``(text, {"entities": [(start, end, label), ...]})``,
     built from the example's ``spans`` (a missing ``spans`` key means no
     entities).

     Returns:
         A tuple ``(examples, labels)``: the list of converted examples and
         the set of every entity label seen in the accepted examples.
     """
     examples = []
     seen_labels = set()
     accepted = (eg for eg in data if eg["answer"] == "accept")
     for example in accepted:
         entities = []
         for span in example.get("spans", []):
             entities.append((span["start"], span["end"], span["label"]))
         seen_labels.update(entity[2] for entity in entities)
         examples.append((example["text"], {"entities": entities}))
     return examples, seen_labels
@plac.annotations(
    model=("The base model to load or blank:lang", "positional", None, str),
    train_path=("The training data (Prodigy JSONL)", "positional", None, str),
    eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str),
    n_iter=("Number of iterations", "option", "n", int),
    output=("Optional output directory", "option", "o", str),
    tok2vec=("Pretrained tok2vec weights to initialize model", "option", "t2v", str),
)
def train_model(
    model, train_path, eval_path, n_iter=10, output=None, tok2vec=None,
):
    """
    Train an NER component from Prodigy annotations.

    After every iteration the pipeline is evaluated on the evaluation data;
    when an output directory is given, the state with the highest entity
    F-score is written there at the end.

    Args:
        model: Model name/path to load, or ``blank:<lang>`` for a blank pipeline.
        train_path: Prodigy JSONL file with the training examples.
        eval_path: Prodigy JSONL file with the evaluation examples.
        n_iter: Number of passes over the training data.
        output: Optional directory the best model is saved to.
        tok2vec: Optional path to pretrained tok2vec weights.
    """
    spacy.util.fix_random_seed(0)
    blank_prefix = "blank:"
    with msg.loading(f"Loading '{model}'..."):
        if model.startswith(blank_prefix):
            nlp = spacy.blank(model.replace(blank_prefix, ""))
        else:
            nlp = spacy.load(model)
    msg.good(f"Loaded model '{model}'")

    train_data, labels = format_data(srsly.read_jsonl(train_path))
    eval_data, _ = format_data(srsly.read_jsonl(eval_path))

    # Register a fresh NER component that knows every label in the training set.
    ner = nlp.create_pipe("ner")
    for label in labels:
        ner.add_label(label)
    nlp.add_pipe(ner)

    # The tok2vec hyper-parameters are only passed along when pretrained
    # weights are going to be loaded.
    component_cfg = {}
    if tok2vec:
        component_cfg["ner"] = {
            "embed_rows": 10000,
            "token_vector_width": 128,
            "conv_depth": 8,
            "nr_feature_tokens": 3,
        }
    optimizer = nlp.begin_training(component_cfg=component_cfg)
    if tok2vec:
        _load_pretrained_tok2vec(nlp, Path(tok2vec))

    batch_sizes = spacy.util.compounding(1.0, 32.0, 1.001)
    best_f_score = 0
    best_model_bytes = None
    column_widths = (2, 8, 8, 8, 8)
    msg.row(("#", "L", "P", "R", "F"), widths=column_widths)
    for epoch in range(1, n_iter + 1):
        random.shuffle(train_data)
        losses = {}
        progress = tqdm.tqdm(train_data, leave=False)
        for batch in spacy.util.minibatch(progress, size=batch_sizes):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, losses=losses)
        # Evaluate (and snapshot) with the optimizer's averaged parameters.
        with nlp.use_params(optimizer.averages):
            scores = nlp.evaluate(eval_data)
            if scores.ents_f > best_f_score:
                best_f_score = scores.ents_f
                if output:
                    best_model_bytes = nlp.to_bytes()
        metrics = tuple(
            f"{value:.3f}"
            for value in (scores.ents_p, scores.ents_r, scores.ents_f)
        )
        msg.row((epoch, f"{losses['ner']:.2f}", *metrics), widths=column_widths)
    msg.text(f"Best F-Score: {best_f_score:.3f}")

    # Nothing to save without an output directory or a stored best snapshot.
    if not (output and best_model_bytes):
        return
    with msg.loading("Saving model..."):
        nlp.from_bytes(best_model_bytes).to_disk(output)
    msg.good("Saved model", output)
@plac.annotations(
    model=("The model to evaluate", "positional", None, str),
    eval_path=("The evaluation data (Prodigy JSONL)", "positional", None, str),
)
def evaluate_model(model, eval_path):
    """
    Score a trained model against Prodigy annotations.

    Loads ``model``, evaluates it on the accepted examples from ``eval_path``
    and prints entity precision, recall and F-score as a table.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    examples, _labels = format_data(srsly.read_jsonl(eval_path))
    scores = nlp.evaluate(examples)
    metrics = (
        ("Precision", scores.ents_p),
        ("Recall", scores.ents_r),
        ("F-Score", scores.ents_f),
    )
    msg.table([(name, f"{value:.3f}") for name, value in metrics])
@plac.annotations(
    model=("The model to evaluate", "positional", None, str),
    data=("Raw data as JSONL", "positional", None, str),
)
def wps(model, data):
    """
    Report the processing speed of a model in words per second.

    Every record's ``text`` from the JSONL file is piped through the model;
    the number of documents, the number of tokens and the tokens-per-second
    rate are printed as a table. A larger corpus of raw text (e.g. a few
    million words) is recommended for a meaningful figure.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    texts = (eg["text"] for eg in srsly.read_jsonl(data))
    doc_count = 0
    word_count = 0
    started = timer()
    for doc in nlp.pipe(texts):
        doc_count += 1
        word_count += len(doc)
    elapsed = timer() - started
    words_per_second = int(word_count / elapsed)
    rows = [
        ("Docs", f"{doc_count:,}"),
        ("Words", f"{word_count:,}"),
        ("Words/s", f"{words_per_second:,}"),
    ]
    msg.table(rows, widths=(7, 12), aligns=("l", "r"))
 if __name__ == "__main__":
     # Maps the command given as first CLI argument to the function that runs it.
     opts = {"train": train_model, "evaluate": evaluate_model, "wps": wps}
     # Without a command there is nothing to pop; report the available commands
     # instead of crashing with an IndexError.
     if len(sys.argv) < 2:
         msg.fail("Missing command", f"Available: {', '.join(opts)}", exits=1)
     # Remove the command so plac only parses that command's own arguments.
     cmd = sys.argv.pop(1)
     if cmd not in opts:
         msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
     try:
         plac.call(opts[cmd])
     except KeyboardInterrupt:
         msg.warn("Stopped.", exits=1)
--- a/pages/students/2016/jakub_maruniak/dp2021/annotation/scripts.py
+++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/scripts.py
@ -0,0 +1,160 @@
 """
 Usage example:
 $ python scripts.py delete_annot jakub.maruniak ./dataset.jsonl ./new_dataset.jsonl
 To see available commands:
 $ python scripts.py help 
 To see available arguments:
 $ python scripts.py [command] --help 
 """
 import spacy
 import srsly
 from wasabi import msg
 import plac
 import sys
 import re
 import itertools
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def count(
    dataset_path
):
    """
    Print statistics about a Prodigy JSONL dataset.

    Prints the number of accepted, rejected and ignored articles, the number
    of annotations of each entity type (PER, LOC, ORG, MISC) and how many
    ``_session_id`` entries belong to each annotator.

    All figures are plain substring counts over the raw file contents, so a
    keyword such as ``accept`` or ``PER`` that also occurs inside an article
    text is counted as well.

    Args:
        dataset_path: Path to the dataset file (Prodigy JSONL format).
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    # Read from the declared parameter (not sys.argv) and close the file
    # deterministically.
    with open(dataset_path, 'rt', encoding='utf-8') as dataset_file:
        text = dataset_file.read()

    # count articles
    count_accept = text.count('accept')
    count_reject = text.count('reject')
    count_skip = text.count('ignore')
    count_spans = text.count('tokens')

    # count entities
    count_per = text.count('PER')
    count_loc = text.count('LOC')
    count_org = text.count('ORG')
    count_misc = text.count('MISC')

    # ANSI escape sequences used to style the terminal output
    underline = '\033[04m'
    reset = '\033[0m'
    red = '\033[31m'
    green = '\033[32m'
    gray = '\033[37m'

    print(underline + '\nPočet anotovaných článkov:' + reset)
    print(green + "%-15s %-20s" % ("Prijatých", count_accept) + reset)
    print(red + "%-15s %-15s" % ("Zamietnutých", count_reject) + reset)
    print(gray + "%-15s %-15s" % ("Preskočených", count_skip) + reset)
    print("%-15s" % ("---------------------"))
    print("%-15s %-15s" % ("Spolu", count_spans))

    print(underline + '\nPočet jednotlivých entít:' + reset)
    print("%-10s %-10s" % ("Entita:", "Počet:"))
    print("%-10s" % ("----------------"))
    print("%-10s %-10s" % ("PER", count_per))
    print("%-10s %-10s" % ("LOC", count_loc))
    print("%-10s %-10s" % ("ORG", count_org))
    print("%-10s %-10s" % ("MISC", count_misc))

    # Who annotated how much? Tally the raw value of every "_session_id"
    # field; Counter keeps the order of first appearance.
    session_pattern = '"_session_id":(.*?),'
    sessions = Counter(re.findall(session_pattern, text))
    print(underline + '\nKto anotoval koľko článkov?' + reset)
    for session, n_annotated in sessions.items():
        print(session, n_annotated)
@plac.annotations(
    annotator=("Keep annotations from this annotator (email address or nickname)", "positional", None, str),
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
    new_dataset_path=("Path to save new dataset(Prodigy JSONL format)", "positional", None, str),
)
def delete_annot(
    annotator, dataset_path, new_dataset_path
):
    """
    Load a Prodigy JSONL dataset and keep annotations from one annotator only.

    Every line of ``dataset_path`` that contains ``annotator`` is echoed to
    stdout and written to ``new_dataset_path``; all other lines are dropped.

    Args:
        annotator: Email address or nickname to look for. It is matched as a
            literal substring anywhere in the line, not as a regular
            expression, so characters such as ``.`` only match themselves.
        dataset_path: Path to the source dataset (Prodigy JSONL format).
        new_dataset_path: Path the filtered dataset is written to.
    """
    # Use the declared parameters (not sys.argv) and let the context managers
    # close both files even if an error occurs part-way through.
    with open(dataset_path, 'r', encoding='utf-8') as source, \
            open(new_dataset_path, 'w', encoding='utf-8') as target:
        # Stream line by line instead of loading the whole dataset at once.
        for line in source:
            if annotator in line:
                print(line)
                target.write(line)
 def modelinfo():
     """
     Print information about the trained model (Precision, Recall and F-Score).

     Reads the model's ``meta.json`` and prints its lines 32 through 54
     (1-based), each followed by a space instead of an extra newline. That
     line range is presumably where the accuracy section sits - verify against
     the actual meta.json if the layout changes.
     """
     meta_path = 'build/train/nerposparser/model-best/meta.json'
     with open(meta_path, 'rt') as meta_file:
         selected_rows = itertools.islice(meta_file, 31, 54)
         for row in selected_rows:
             print(row, end=" ")
 def helpme():
     """Print the available commands with a one-line description of each."""
     help_lines = (
         "Available commands:",
         "\ncount - Print statistics about Prodigy JSONL dataset",
         "\ndelete_annot - Create dataset with annotations from only specific annotator",
         "\nmodelinfo - Prints informations about trained model (Precision, Recall and F-Score)",
     )
     # print() joins the pieces with single spaces, exactly as separate arguments.
     print(*help_lines)
 if __name__ == "__main__":
     # Maps the command given as first CLI argument to the function that runs it.
     opts = {"count": count,
             "delete_annot": delete_annot,
             "modelinfo": modelinfo,
             "help": helpme}
     # Without a command there is nothing to pop; report the available commands
     # instead of crashing with an IndexError.
     if len(sys.argv) < 2:
         msg.fail("Missing command", f"Available: {', '.join(opts)}", exits=1)
     # Remove the command so plac only parses that command's own arguments.
     cmd = sys.argv.pop(1)
     if cmd not in opts:
         msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
     try:
         plac.call(opts[cmd])
     except KeyboardInterrupt:
         msg.warn("Stopped.", exits=1)
--- a/pages/students/2016/jakub_maruniak/dp2021/annotation/train.sh
+++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/train.sh
@ -0,0 +1,24 @@
 # Train the Slovak spaCy pipeline (tagger + parser, then NER) and package it.
 # Abort on the first failing command.
 set -e
 OUTDIR=build/train/output
 TRAINDIR=build/train
 # Delete old training results. -f keeps a first run (no build/train yet)
 # from aborting the script under `set -e`.
 rm -rf "$TRAINDIR"
 mkdir -p "$TRAINDIR"
 mkdir -p "$OUTDIR"
 mkdir -p dist
 # Make sure the output directory starts empty
 rm -rf "$OUTDIR"/*
 # Train dependency and POS
 spacy train sk "$OUTDIR" ./build/input/slovak-treebank ./build/input/ud-artificial-gapping  --n-iter 15 -p tagger,parser
 rm -rf "$TRAINDIR/posparser"
 mv "$OUTDIR/model-best" "$TRAINDIR/posparser"
 # Train NER
 # custom script for training, but it takes too long... input is JSONL file (db from Prodigy)
 # python custom_train.py train ./build/train/posparser ./train.jsonl ./eval.jsonl -o ./build/train/nerposparser -n 15
 spacy train sk "$TRAINDIR/nerposparser" ./ner/experiments/34sknerfull.json ./ner/experiments/34wikiartfull.json --n-iter 15 -p ner
 # Package model
 spacy package "$TRAINDIR/nerposparser" dist --meta-path ./meta.json --force
 # The package directory name is fixed here; presumably it follows the name and
 # version in meta.json - keep the two in sync.
 cd dist/sk_sk1-0.2.0
 python ./setup.py sdist --dist-dir ../
--- a/pages/students/2016/jakub_maruniak/dp2021/annotation/visualizer.py
+++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/visualizer.py
@ -0,0 +1,80 @@
 """
 Visualize the data with Streamlit and spaCy.
 https://github.com/explosion/projects/blob/master/ner-drugs/streamlit_visualizer.py
 Usage example:
 $ streamlit run visualizer.py visualize ./dataset.jsonl
 """
 import streamlit as st
 from spacy import displacy
 import srsly
 import sys
 import plac
 from wasabi import msg
@plac.annotations(
    dataset_path=("Path to dataset (Prodigy JSONL format)", "positional", None, str),
)
def visualize(
    dataset_path
):
    """
    Render a Prodigy JSONL dataset in a Streamlit app.

    Every example whose ``answer`` is ``"accept"`` is rendered with displaCy's
    entity visualizer; every other example is only counted as rejected. The
    sidebar shows a small table with totals: examples, accepted, rejected,
    entities, and accepted examples without any entity.

    Args:
        dataset_path: Path to the dataset file (Prodigy JSONL format).
    """
    # Use the declared parameter rather than peeking into sys.argv.
    FILES = [dataset_path]
    MISC = "MISC"
    HTML_WRAPPER = "<div style='border-bottom: 1px solid #ccc; padding: 20px 0'>{}</div>"
    HTML_WRAPPER1 = "<div style='border-bottom: 2px solid #000; padding: 0 0 20px 0'>{}</div>"
    # "manual": True lets displaCy render plain dicts ({"text", "ents"})
    # instead of spaCy Doc objects.
    SETTINGS = {"style": "ent", "manual": True, "options": {"colors": {MISC: "#d1bcff"}}}

    @st.cache(allow_output_mutation=True)
    def load_data(filepath):
        """Read the whole JSONL file into a list (cached by Streamlit)."""
        return list(srsly.read_jsonl(filepath))

    st.sidebar.title("Data visualizer")
    st.sidebar.markdown(
        "Visualize the annotations using [displaCy](https://spacy.io/usage/visualizers) "
        "and view stats about the datasets."
    )
    data_file = st.sidebar.selectbox("Dataset", FILES)
    data = load_data(data_file)
    n_no_ents = 0
    n_total_ents = 0
    accepted = 0
    rejected = 0
    st.header(f"Dataset: {data_file} ({len(data)})")
    st.markdown(HTML_WRAPPER1.format("Visualize only accepted examples and their annotations."), unsafe_allow_html=True)
    for eg in data:
        # Anything that is not an explicit "accept" counts as rejected and is
        # not rendered.
        if eg["answer"] != "accept":
            rejected += 1
            continue
        accepted += 1
        row = {"text": eg["text"], "ents": eg.get("spans", [])}
        n_total_ents += len(row["ents"])
        if not row["ents"]:
            n_no_ents += 1
        html = displacy.render(row, **SETTINGS).replace("\n\n", "\n")
        st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)
    st.sidebar.markdown(
        f"""        
    | `{data_file}` | |
    | --- | ---: |
    | Total examples | {len(data):,} |
    | Accepted examples | {accepted:,} |
    | Rejected examples | {rejected:,} |
    | Total entities | {n_total_ents:,} |
    | Examples with no entities | {n_no_ents:,} |
    """, unsafe_allow_html=True
    )
 if __name__ == "__main__":
     # Maps the command given as first CLI argument to the function that runs it.
     opts = {"visualize": visualize}
     # Without a command there is nothing to pop; report the available commands
     # instead of crashing with an IndexError.
     if len(sys.argv) < 2:
         msg.fail("Missing command", f"Available: {', '.join(opts)}", exits=1)
     # Remove the command so plac only parses that command's own arguments.
     cmd = sys.argv.pop(1)
     if cmd not in opts:
         msg.fail(f"Unknown command: {cmd}", f"Available: {', '.join(opts)}", exits=1)
     try:
         plac.call(opts[cmd])
     except KeyboardInterrupt:
         msg.warn("Stopped.", exits=1)