From 0fcb3d7f2d8a615ddddaa57c8d9e87931d77ca02 Mon Sep 17 00:00:00 2001
From: Jakub Maruniak
Date: Mon, 9 Nov 2020 21:46:12 +0000
Subject: [PATCH] =?UTF-8?q?Nahr=C3=A1t=20soubory=20do=20=E2=80=9Epages/stu?=
 =?UTF-8?q?dents/2016/jakub=5Fmaruniak/dp2021/annotation/train=E2=80=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Amended in review. The blob ids on the index lines below still refer to the
originally committed file contents.

 .../dp2021/annotation/train/prepare.sh        | 22 ++++++++++++++++++++++
 .../dp2021/annotation/train/train.sh          | 22 ++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 pages/students/2016/jakub_maruniak/dp2021/annotation/train/prepare.sh
 create mode 100644 pages/students/2016/jakub_maruniak/dp2021/annotation/train/train.sh

diff --git a/pages/students/2016/jakub_maruniak/dp2021/annotation/train/prepare.sh b/pages/students/2016/jakub_maruniak/dp2021/annotation/train/prepare.sh
new file mode 100644
index 00000000..ade40371
--- /dev/null
+++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/train/prepare.sh
@@ -0,0 +1,22 @@
+# Convert the corpora under ./sources into spaCy training data under ./build/input.
+# Stop at the first failing step, as train.sh does, so later steps never run on missing data.
+set -e
+mkdir -p build
+mkdir -p build/input
+# Prepare Treebank
+mkdir -p build/input/slovak-treebank
+spacy convert ./sources/slovak-treebank/stb.conll ./build/input/slovak-treebank
+# UDAG used as evaluation
+mkdir -p build/input/ud-artificial-gapping
+spacy convert ./sources/ud-artificial-gapping/sk-ud-crawled-orphan.conllu ./build/input/ud-artificial-gapping
+# Prepare skner
+mkdir -p build/input/skner
+# Convert to IOB. Input is redirected rather than piped from cat so that a missing file fails the step.
+python ./sources/bio-to-iob.py < ./sources/skner/wikiann-sk.bio > build/input/skner/wikiann-sk.iob
+# Split to train test
+python ./sources/iob-to-traintest.py ./build/input/skner/wikiann-sk < ./build/input/skner/wikiann-sk.iob
+# Convert train and test
+mkdir -p build/input/skner-train
+spacy convert -n 15 --converter ner ./build/input/skner/wikiann-sk.train ./build/input/skner-train
+mkdir -p build/input/skner-test
+spacy convert -n 15 --converter ner ./build/input/skner/wikiann-sk.test ./build/input/skner-test
diff --git a/pages/students/2016/jakub_maruniak/dp2021/annotation/train/train.sh b/pages/students/2016/jakub_maruniak/dp2021/annotation/train/train.sh
new file mode 100644
index 00000000..a0d1c7cf
--- /dev/null
+++ b/pages/students/2016/jakub_maruniak/dp2021/annotation/train/train.sh
@@ -0,0 +1,22 @@
+# Train the tagger/parser and NER models, then package the result into ./dist.
+set -e
+OUTDIR=build/train/output
+TRAINDIR=build/train
+mkdir -p "$TRAINDIR"
+mkdir -p "$OUTDIR"
+mkdir -p dist
+# Delete old training results. The :? guard aborts if OUTDIR is ever empty instead of letting the glob become /*
+rm -rf "${OUTDIR:?}"/*
+# Train dependency and POS
+spacy train sk "$OUTDIR" ./build/input/slovak-treebank ./build/input/ud-artificial-gapping --n-iter 20 -p tagger,parser
+rm -rf "${TRAINDIR:?}/posparser"
+mv "$OUTDIR/model-best" "$TRAINDIR/posparser"
+# Train NER
+# python ./train.py -t ./train.json -o $TRAINDIR/nerposparser -n 10 -m $TRAINDIR/posparser/
+spacy train sk "$TRAINDIR/nerposparser" ./ner/train.json ./ner/eval.json --n-iter 20 -p ner
+# Package model
+# NOTE(review): the tagger/parser step keeps only model-best, but the whole NER output directory is packaged here - confirm this is intended
+spacy package "$TRAINDIR/nerposparser" dist --meta-path ./meta.json --force
+# NOTE(review): presumably this directory name is derived from the name and version in ./meta.json - keep them in sync
+cd dist/sk_sk1-0.2.0
+python ./setup.py sdist --dist-dir ../