From 417a3c9319ca16e68bc55fe97ec3dbdc8094b9aa Mon Sep 17 00:00:00 2001
From: Darius Lindvai
Date: Fri, 10 Apr 2020 09:34:14 +0200
Subject: [PATCH] update

---
 .../2016/darius_lindvai/dp2021/README.md   |  13 ++
 .../2016/darius_lindvai/dp2021/punc.py     | 221 ++++++++++++++++++
 .../2016/darius_lindvai/dp2021/script1.py  |  11 +
 3 files changed, 245 insertions(+)
 create mode 100644 pages/students/2016/darius_lindvai/dp2021/README.md
 create mode 100644 pages/students/2016/darius_lindvai/dp2021/punc.py
 create mode 100755 pages/students/2016/darius_lindvai/dp2021/script1.py

diff --git a/pages/students/2016/darius_lindvai/dp2021/README.md b/pages/students/2016/darius_lindvai/dp2021/README.md
new file mode 100644
index 000000000..292ee04fe
--- /dev/null
+++ b/pages/students/2016/darius_lindvai/dp2021/README.md
@@ -0,0 +1,13 @@
+## Update 09.04.2020
+- I adapted the sample source code, which originally solved Named-Entity Recognition, so that it restores punctuation instead.
+- It currently works with manually entered training data and manual "tagging", but so far only for the period and the question mark.
+- When I tried data that also contained a question mark, the model inserted a period instead of the question mark.
+
+Explanation of the data format:
+- In the text I replaced the punctuation marks with words, i.e. abbreviations ('.' -> 'PER', ',' -> 'COM', '?' -> 'QUE').
+- Sequences of words are labelled "S"; parts of speech are not distinguished.
+- Punctuation marks are labelled "C" (for comma), "P" (for period) and "Q" (for question mark).
+
+Explanation of the output:
+- The first tensor is the model's prediction before training.
+- The second tensor is the prediction after training.
diff --git a/pages/students/2016/darius_lindvai/dp2021/punc.py b/pages/students/2016/darius_lindvai/dp2021/punc.py
new file mode 100644
index 000000000..dbb5d8a79
--- /dev/null
+++ b/pages/students/2016/darius_lindvai/dp2021/punc.py
@@ -0,0 +1,221 @@
+import torch
+import torch.autograd as autograd
+import torch.nn as nn
+import torch.optim as optim
+
+torch.manual_seed(1)
+
+
+def argmax(vec):
+    # return the argmax as a python int
+    _, idx = torch.max(vec, 1)
+    return idx.item()
+
+
+def prepare_sequence(seq, to_ix):
+    idxs = [to_ix[w] for w in seq]
+    return torch.tensor(idxs, dtype=torch.long)
+
+
+# Compute log sum exp in a numerically stable way for the forward algorithm
+def log_sum_exp(vec):
+    max_score = vec[0, argmax(vec)]
+    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
+    return max_score + \
+        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
+
+
+class BiLSTM_CRF(nn.Module):
+
+    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
+        super(BiLSTM_CRF, self).__init__()
+        self.embedding_dim = embedding_dim
+        self.hidden_dim = hidden_dim
+        self.vocab_size = vocab_size
+        self.tag_to_ix = tag_to_ix
+        self.tagset_size = len(tag_to_ix)
+
+        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
+        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
+                            num_layers=1, bidirectional=True)
+
+        # Maps the output of the LSTM into tag space.
+        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
+
+        # Matrix of transition parameters. Entry i,j is the score of
+        # transitioning *to* i *from* j.
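+        # For example, with the tag set defined below, transitions[tag_to_ix["P"], tag_to_ix["S"]]
+        # is the learned score of placing a period ("P") right after an ordinary word ("S").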
+        self.transitions = nn.Parameter(
+            torch.randn(self.tagset_size, self.tagset_size))
+
+        # These two statements enforce the constraint that we never transfer
+        # to the start tag and we never transfer from the stop tag
+        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
+        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
+
+        self.hidden = self.init_hidden()
+
+    def init_hidden(self):
+        return (torch.randn(2, 1, self.hidden_dim // 2),
+                torch.randn(2, 1, self.hidden_dim // 2))
+
+    def _forward_alg(self, feats):
+        # Forward algorithm to compute the partition function
+        init_alphas = torch.full((1, self.tagset_size), -10000.)
+        # START_TAG has all of the score.
+        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
+
+        # Wrap in a variable so that we will get automatic backprop
+        forward_var = init_alphas
+
+        # Iterate through the sentence
+        for feat in feats:
+            alphas_t = []  # The forward tensors at this timestep
+            for next_tag in range(self.tagset_size):
+                # broadcast the emission score: it is the same regardless of the previous tag
+                emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
+                # the ith entry of trans_score is the score of transitioning to next_tag from i
+                trans_score = self.transitions[next_tag].view(1, -1)
+                # The ith entry of next_tag_var is the value for the edge (i -> next_tag) before we do log-sum-exp
+                next_tag_var = forward_var + trans_score + emit_score
+                # The forward variable for this tag is log-sum-exp of all the scores.
+                alphas_t.append(log_sum_exp(next_tag_var).view(1))
+            forward_var = torch.cat(alphas_t).view(1, -1)
+        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
+        alpha = log_sum_exp(terminal_var)
+        return alpha
+
+    def _get_lstm_features(self, sentence):
+        self.hidden = self.init_hidden()
+        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
+        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
+        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
+        lstm_feats = self.hidden2tag(lstm_out)
+        return lstm_feats
+
+    def _score_sentence(self, feats, tags):
+        # Gives the score of a provided tag sequence
+        score = torch.zeros(1)
+        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
+        for i, feat in enumerate(feats):
+            score = score + \
+                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
+        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
+        return score
+
+    def _viterbi_decode(self, feats):
+        backpointers = []
+
+        # Initialize the viterbi variables in log space
+        init_vvars = torch.full((1, self.tagset_size), -10000.)
+        init_vvars[0][self.tag_to_ix[START_TAG]] = 0
+
+        # forward_var at step i holds the viterbi variables for step i-1
+        forward_var = init_vvars
+        for feat in feats:
+            bptrs_t = []  # holds the backpointers for this step
+            viterbivars_t = []  # holds the viterbi variables for this step
+
+            for next_tag in range(self.tagset_size):
+                # next_tag_var[i] holds the viterbi variable for tag i at the
+                # previous step, plus the score of transitioning
+                # from tag i to next_tag.
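+                # In other words: next_tag_var[0][i] = forward_var[0][i] + transitions[next_tag][i],
+                # and the best previous tag is simply the argmax of that sum.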
+                # We don't include the emission scores here because the max
+                # does not depend on them (we add them in below)
+                next_tag_var = forward_var + self.transitions[next_tag]
+                best_tag_id = argmax(next_tag_var)
+                bptrs_t.append(best_tag_id)
+                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
+            # Now add in the emission scores, and assign forward_var to the set
+            # of viterbi variables we just computed
+            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
+            backpointers.append(bptrs_t)
+
+        # Transition to STOP_TAG
+        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
+        best_tag_id = argmax(terminal_var)
+        path_score = terminal_var[0][best_tag_id]
+
+        # Follow the back pointers to decode the best path.
+        best_path = [best_tag_id]
+        for bptrs_t in reversed(backpointers):
+            best_tag_id = bptrs_t[best_tag_id]
+            best_path.append(best_tag_id)
+        # Pop off the start tag (we don't want to return that to the caller)
+        start = best_path.pop()
+        assert start == self.tag_to_ix[START_TAG]  # Sanity check
+        best_path.reverse()
+        return path_score, best_path
+
+    def neg_log_likelihood(self, sentence, tags):
+        feats = self._get_lstm_features(sentence)
+        forward_score = self._forward_alg(feats)
+        gold_score = self._score_sentence(feats, tags)
+        return forward_score - gold_score
+
+    def forward(self, sentence):
+        # Get the emission scores from the BiLSTM
+        lstm_feats = self._get_lstm_features(sentence)
+
+        # Find the best path, given the features.
+        score, tag_seq = self._viterbi_decode(lstm_feats)
+        return score, tag_seq
+
+
+START_TAG = "<START>"
+STOP_TAG = "<STOP>"
+EMBEDDING_DIM = 5
+HIDDEN_DIM = 4
+
+training_data = [(
+    "hovorí sa ,COM že ľudstvo postihuje nová epidémia ,COM šíriaca sa závratnou rýchlosťou .PER preto je dôležité vedieť čo to je ,COM ako jej predísť alebo ako ju odstrániť .PER".split(),
+    "S S C S S S S S C S S S S P S S S S S S S C S S S S S S S P".split()
+), (
+    "nárast obezity je spôsobený najmä spôsobom života .PER tuky zlepšujú chuť do jedla a dávajú lepší pocit sýtosti ,COM uvedomte si však ,COM že všetky tuky sa Vám ukladajú ,COM pokiaľ ich nespálite .PER".split(),
+    "S S S S S S S P S S S S S S S S S S C S S S C S S S S S S C S S S P".split()
+)]
+
+word_to_ix = {}
+for sentence, tags in training_data:
+    for word in sentence:
+        if word not in word_to_ix:
+            word_to_ix[word] = len(word_to_ix)
+
+tag_to_ix = {"S": 0, "C": 1, "P": 2, "Q": 3, START_TAG: 4, STOP_TAG: 5}
+
+model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
+optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
+
+with torch.no_grad():
+    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
+    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
+    print("Predicted output before training: ", model(precheck_sent))
+
+for epoch in range(300):  # normally you would NOT do 300 epochs, but this is a small dataset
+    for sentence, tags in training_data:
+        # Step 1. Remember that Pytorch accumulates gradients.
+        # We need to clear them out before each instance
+        model.zero_grad()
+
+        # Step 2. Get our inputs ready for the network, that is, turn them into Tensors of word indices.
+        sentence_in = prepare_sequence(sentence, word_to_ix)
+        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
+
+        # Step 3. Run our forward pass.
+        loss = model.neg_log_likelihood(sentence_in, targets)
+
+        # Step 4. Compute the loss, gradients, and update the parameters by
+        # calling optimizer.step()
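+        # (the loss itself was already computed in Step 3; backward() fills in
+        # the gradients and optimizer.step() applies the SGD update)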
+        loss.backward()
+        optimizer.step()
+
+with torch.no_grad():
+    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
+    print("Predicted output after training: ", model(precheck_sent))
diff --git a/pages/students/2016/darius_lindvai/dp2021/script1.py b/pages/students/2016/darius_lindvai/dp2021/script1.py
new file mode 100755
index 000000000..1b9140fbb
--- /dev/null
+++ b/pages/students/2016/darius_lindvai/dp2021/script1.py
@@ -0,0 +1,11 @@
+#!/usr/bin/python
+# coding: utf-8
+
+import codecs
+import sys
+
+with codecs.open(sys.argv[2], 'w', encoding='utf-8') as out_txt:
+    with codecs.open(sys.argv[1], 'r', encoding='utf-8') as text:
+        for line in text:
+            line = line.replace('.', 'PER').replace(',', 'COM').replace('?', 'QUE')
+            out_txt.write(line)
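+
+# Example usage (illustrative file names, not part of the repository):
+#   python script1.py input.txt output.txt
+# reads input.txt, replaces '.', ',' and '?' with 'PER', 'COM' and 'QUE',
+# and writes the result to output.txt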