diff --git a/pages/students/2016/lukas_pokryvka/dp2021/yelp/script.py b/pages/students/2016/lukas_pokryvka/dp2021/yelp/script.py
index 52da23809..b0629a18b 100644
--- a/pages/students/2016/lukas_pokryvka/dp2021/yelp/script.py
+++ b/pages/students/2016/lukas_pokryvka/dp2021/yelp/script.py
@@ -1 +1,748 @@
-# README
\ No newline at end of file
+from argparse import Namespace
+from collections import Counter
+import json
+import os
+import re
+import string
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import Dataset, DataLoader
+from tqdm.notebook import tqdm
+
+
+class Vocabulary(object):
+    """Class to process text and extract vocabulary for mapping"""
+
+    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
+        """
+        Args:
+            token_to_idx (dict): a pre-existing map of tokens to indices
+            add_unk (bool): a flag that indicates whether to add the UNK token
+            unk_token (str): the UNK token to add into the Vocabulary
+        """
+
+        if token_to_idx is None:
+            token_to_idx = {}
+        self._token_to_idx = token_to_idx
+
+        self._idx_to_token = {idx: token
+                              for token, idx in self._token_to_idx.items()}
+
+        self._add_unk = add_unk
+        self._unk_token = unk_token
+
+        self.unk_index = -1
+        if add_unk:
+            self.unk_index = self.add_token(unk_token)
+
+    def to_serializable(self):
+        """ returns a dictionary that can be serialized """
+        return {'token_to_idx': self._token_to_idx,
+                'add_unk': self._add_unk,
+                'unk_token': self._unk_token}
+
+    @classmethod
+    def from_serializable(cls, contents):
+        """ instantiates the Vocabulary from a serialized dictionary """
+        return cls(**contents)
+
+    def add_token(self, token):
+        """Update mapping dicts based on the token.
+
+        Args:
+            token (str): the item to add into the Vocabulary
+        Returns:
+            index (int): the integer corresponding to the token
+        """
+        if token in self._token_to_idx:
+            index = self._token_to_idx[token]
+        else:
+            index = len(self._token_to_idx)
+            self._token_to_idx[token] = index
+            self._idx_to_token[index] = token
+        return index
+
+    def add_many(self, tokens):
+        """Add a list of tokens into the Vocabulary
+
+        Args:
+            tokens (list): a list of string tokens
+        Returns:
+            indices (list): a list of indices corresponding to the tokens
+        """
+        return [self.add_token(token) for token in tokens]
+
+    def lookup_token(self, token):
+        """Retrieve the index associated with the token
+        or the UNK index if the token isn't present.
+
+        Args:
+            token (str): the token to look up
+        Returns:
+            index (int): the index corresponding to the token
+        Notes:
+            `unk_index` needs to be >=0 (having been added into the Vocabulary)
+            for the UNK functionality
+        """
+        if self.unk_index >= 0:
+            return self._token_to_idx.get(token, self.unk_index)
+        else:
+            return self._token_to_idx[token]
+
+    def lookup_index(self, index):
+        """Return the token associated with the index
+
+        Args:
+            index (int): the index to look up
+        Returns:
+            token (str): the token corresponding to the index
+        Raises:
+            KeyError: if the index is not in the Vocabulary
+        """
+        if index not in self._idx_to_token:
+            raise KeyError("the index (%d) is not in the Vocabulary" % index)
+        return self._idx_to_token[index]
+
+    def __str__(self):
+        return "<Vocabulary(size=%d)>" % len(self)
+
+    def __len__(self):
+        return len(self._token_to_idx)
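+
+
+# Illustrative usage of Vocabulary (a minimal sketch, not executed by the
+# script; the token strings below are made up):
+#
+#     vocab = Vocabulary(add_unk=True)
+#     idx = vocab.add_token("pizza")      # assigns the next free index
+#     vocab.lookup_token("pizza")         # -> idx
+#     vocab.lookup_token("unseen-token")  # -> vocab.unk_index
+#     vocab.lookup_index(idx)             # -> "pizza"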
+
+
+class ReviewVectorizer(object):
+    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
+    def __init__(self, review_vocab, rating_vocab):
+        """
+        Args:
+            review_vocab (Vocabulary): maps words to integers
+            rating_vocab (Vocabulary): maps class labels to integers
+        """
+        self.review_vocab = review_vocab
+        self.rating_vocab = rating_vocab
+
+    def vectorize(self, review):
+        """Create a collapsed one-hot vector for the review
+
+        Args:
+            review (str): the review
+        Returns:
+            one_hot (np.ndarray): the collapsed one-hot encoding
+        """
+        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)
+
+        for token in review.split(" "):
+            if token not in string.punctuation:
+                one_hot[self.review_vocab.lookup_token(token)] = 1
+
+        return one_hot
+
+    @classmethod
+    def from_dataframe(cls, review_df, cutoff=25):
+        """Instantiate the vectorizer from the dataset dataframe
+
+        Args:
+            review_df (pandas.DataFrame): the review dataset
+            cutoff (int): the parameter for frequency-based filtering
+        Returns:
+            an instance of the ReviewVectorizer
+        """
+        review_vocab = Vocabulary(add_unk=True)
+        rating_vocab = Vocabulary(add_unk=False)
+
+        # Add ratings
+        for rating in sorted(set(review_df.rating)):
+            rating_vocab.add_token(rating)
+
+        # Add top words if count > provided cutoff
+        word_counts = Counter()
+        for review in review_df.review:
+            for word in review.split(" "):
+                if word not in string.punctuation:
+                    word_counts[word] += 1
+
+        for word, count in word_counts.items():
+            if count > cutoff:
+                review_vocab.add_token(word)
+
+        return cls(review_vocab, rating_vocab)
+
+    @classmethod
+    def from_serializable(cls, contents):
+        """Instantiate a ReviewVectorizer from a serializable dictionary
+
+        Args:
+            contents (dict): the serializable dictionary
+        Returns:
+            an instance of the ReviewVectorizer class
+        """
+        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
+        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])
+
+        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)
+
+    def to_serializable(self):
+        """Create the serializable dictionary for caching
+
+        Returns:
+            contents (dict): the serializable dictionary
+        """
+        return {'review_vocab': self.review_vocab.to_serializable(),
+                'rating_vocab': self.rating_vocab.to_serializable()}
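+
+
+# Illustrative behaviour of vectorize() (a minimal sketch; the words and their
+# indices are made up, not taken from the real Yelp vocabulary):
+#
+#     vectorizer.vectorize("great food")
+#     # -> array of len(review_vocab) zeros with a 1 at the indices of
+#     #    "great" and "food"; unknown words hit the <UNK> index instead.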
+
+
+class ReviewDataset(Dataset):
+    def __init__(self, review_df, vectorizer):
+        """
+        Args:
+            review_df (pandas.DataFrame): the dataset
+            vectorizer (ReviewVectorizer): vectorizer instantiated from dataset
+        """
+        self.review_df = review_df
+        self._vectorizer = vectorizer
+
+        self.train_df = self.review_df[self.review_df.split=='train']
+        self.train_size = len(self.train_df)
+
+        self.val_df = self.review_df[self.review_df.split=='val']
+        self.validation_size = len(self.val_df)
+
+        self.test_df = self.review_df[self.review_df.split=='test']
+        self.test_size = len(self.test_df)
+
+        self._lookup_dict = {'train': (self.train_df, self.train_size),
+                             'val': (self.val_df, self.validation_size),
+                             'test': (self.test_df, self.test_size)}
+
+        self.set_split('train')
+
+    @classmethod
+    def load_dataset_and_make_vectorizer(cls, review_csv):
+        """Load dataset and make a new vectorizer from scratch
+
+        Args:
+            review_csv (str): location of the dataset
+        Returns:
+            an instance of ReviewDataset
+        """
+        review_df = pd.read_csv(review_csv)
+        train_review_df = review_df[review_df.split=='train']
+        return cls(review_df, ReviewVectorizer.from_dataframe(train_review_df))
+
+    @classmethod
+    def load_dataset_and_load_vectorizer(cls, review_csv, vectorizer_filepath):
+        """Load dataset and the corresponding vectorizer.
+        Used in the case the vectorizer has been cached for re-use
+
+        Args:
+            review_csv (str): location of the dataset
+            vectorizer_filepath (str): location of the saved vectorizer
+        Returns:
+            an instance of ReviewDataset
+        """
+        review_df = pd.read_csv(review_csv)
+        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
+        return cls(review_df, vectorizer)
+
+    @staticmethod
+    def load_vectorizer_only(vectorizer_filepath):
+        """a static method for loading the vectorizer from file
+
+        Args:
+            vectorizer_filepath (str): the location of the serialized vectorizer
+        Returns:
+            an instance of ReviewVectorizer
+        """
+        with open(vectorizer_filepath) as fp:
+            return ReviewVectorizer.from_serializable(json.load(fp))
+
+    def save_vectorizer(self, vectorizer_filepath):
+        """saves the vectorizer to disk using json
+
+        Args:
+            vectorizer_filepath (str): the location to save the vectorizer
+        """
+        with open(vectorizer_filepath, "w") as fp:
+            json.dump(self._vectorizer.to_serializable(), fp)
+
+    def get_vectorizer(self):
+        """ returns the vectorizer """
+        return self._vectorizer
+
+    def set_split(self, split="train"):
+        """ selects the split in the dataset using a column in the dataframe
+
+        Args:
+            split (str): one of "train", "val", or "test"
+        """
+        self._target_split = split
+        self._target_df, self._target_size = self._lookup_dict[split]
+
+    def __len__(self):
+        return self._target_size
+
+    def __getitem__(self, index):
+        """the primary entry point method for PyTorch datasets
+
+        Args:
+            index (int): the index to the data point
+        Returns:
+            a dictionary holding the data point's features (x_data) and label (y_target)
+        """
+        row = self._target_df.iloc[index]
+
+        review_vector = \
+            self._vectorizer.vectorize(row.review)
+
+        rating_index = \
+            self._vectorizer.rating_vocab.lookup_token(row.rating)
+
+        return {'x_data': review_vector,
+                'y_target': rating_index}
+
+    def get_num_batches(self, batch_size):
+        """Given a batch size, return the number of batches in the dataset
+
+        Args:
+            batch_size (int)
+        Returns:
+            number of batches in the dataset
+        """
+        return len(self) // batch_size
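+
+
+# Illustrative usage of ReviewDataset (a minimal sketch, not executed here;
+# the CSV path is the one configured further below in `args`):
+#
+#     dataset = ReviewDataset.load_dataset_and_make_vectorizer(
+#         'data/yelp/reviews_with_splits_lite.csv')
+#     dataset.set_split('val')
+#     sample = dataset[0]   # {'x_data': one-hot np.ndarray, 'y_target': int}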
+
+
+def generate_batches(dataset, batch_size, shuffle=True,
+                     drop_last=True, device="cpu"):
+    """
+    A generator function which wraps the PyTorch DataLoader. It will
+    ensure each tensor is on the right device location.
+    """
+    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
+                            shuffle=shuffle, drop_last=drop_last)
+
+    for data_dict in dataloader:
+        out_data_dict = {}
+        for name, tensor in data_dict.items():
+            out_data_dict[name] = tensor.to(device)
+        yield out_data_dict
+
+
+class ReviewClassifier(nn.Module):
+    """ a simple perceptron based classifier """
+    def __init__(self, num_features):
+        """
+        Args:
+            num_features (int): the size of the input feature vector
+        """
+        super(ReviewClassifier, self).__init__()
+        self.fc1 = nn.Linear(in_features=num_features,
+                             out_features=1)
+
+    def forward(self, x_in, apply_sigmoid=False):
+        """The forward pass of the classifier
+
+        Args:
+            x_in (torch.Tensor): an input data tensor.
+                x_in.shape should be (batch, num_features)
+            apply_sigmoid (bool): a flag for the sigmoid activation;
+                should be False if used with the cross-entropy losses
+        Returns:
+            the resulting tensor. tensor.shape should be (batch,)
+        """
+        y_out = self.fc1(x_in).squeeze()
+        if apply_sigmoid:
+            y_out = torch.sigmoid(y_out)
+        return y_out
+
+
+def make_train_state(args):
+    return {'stop_early': False,
+            'early_stopping_step': 0,
+            'early_stopping_best_val': 1e8,
+            'learning_rate': args.learning_rate,
+            'epoch_index': 0,
+            'train_loss': [],
+            'train_acc': [],
+            'val_loss': [],
+            'val_acc': [],
+            'test_loss': -1,
+            'test_acc': -1,
+            'model_filename': args.model_state_file}
+
+def update_train_state(args, model, train_state):
+    """Handle the training state updates.
+
+    Components:
+     - Early Stopping: Prevent overfitting.
+     - Model Checkpoint: Model is saved if the performance improved
+
+    :param args: main arguments
+    :param model: model to train
+    :param train_state: a dictionary representing the training state values
+    :returns:
+        a new train_state
+    """
+
+    # Save at least one model
+    if train_state['epoch_index'] == 0:
+        torch.save(model.state_dict(), train_state['model_filename'])
+        train_state['stop_early'] = False
+
+    # Save model if performance improved
+    elif train_state['epoch_index'] >= 1:
+        loss_tm1, loss_t = train_state['val_loss'][-2:]
+
+        # If loss worsened
+        if loss_t >= train_state['early_stopping_best_val']:
+            # Update step
+            train_state['early_stopping_step'] += 1
+        # Loss decreased
+        else:
+            # Save the best model
+            if loss_t < train_state['early_stopping_best_val']:
+                torch.save(model.state_dict(), train_state['model_filename'])
+            # Track the new best validation loss so early stopping can trigger
+            train_state['early_stopping_best_val'] = loss_t
+
+            # Reset early stopping step
+            train_state['early_stopping_step'] = 0
+
+        # Stop early?
+        train_state['stop_early'] = \
+            train_state['early_stopping_step'] >= args.early_stopping_criteria
+
+    return train_state
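+
+
+# Illustrative shapes for the classifier + loss pairing (a minimal sketch with
+# made-up sizes; the real num_features comes from the review vocabulary):
+#
+#     model = ReviewClassifier(num_features=10)
+#     x = torch.rand(4, 10)                 # (batch, num_features)
+#     logits = model(x)                     # (batch,), no sigmoid applied
+#     loss = nn.BCEWithLogitsLoss()(logits, torch.ones(4))
+#     probs = model(x, apply_sigmoid=True)  # only for inference/inspection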
+
+
+def compute_accuracy(y_pred, y_target):
+    y_target = y_target.cpu()
+    y_pred_indices = (torch.sigmoid(y_pred) > 0.5).cpu().long()
+    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
+    return n_correct / len(y_pred_indices) * 100
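+
+
+# Worked example for compute_accuracy (a minimal sketch with made-up logits):
+#
+#     y_pred   = torch.tensor([2.0, -1.0, 0.3, -0.2])  # raw logits
+#     y_target = torch.tensor([1,    0,   0,    1])
+#     # sigmoid(y_pred) > 0.5 -> [1, 0, 1, 0]; 2 of 4 match the targets
+#     compute_accuracy(y_pred, y_target)  # -> 50.0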
+
+
+def set_seed_everywhere(seed, cuda):
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if cuda:
+        torch.cuda.manual_seed_all(seed)
+
+def handle_dirs(dirpath):
+    if not os.path.exists(dirpath):
+        os.makedirs(dirpath)
+
+
+args = Namespace(
+    # Data and Path information
+    frequency_cutoff=25,
+    model_state_file='model.pth',
+    review_csv='data/yelp/reviews_with_splits_lite.csv',
+    # review_csv='data/yelp/reviews_with_splits_full.csv',
+    save_dir='model_storage/ch3/yelp/',
+    vectorizer_file='vectorizer.json',
+    # No model hyperparameters
+    # Training hyperparameters
+    batch_size=128,
+    early_stopping_criteria=5,
+    learning_rate=0.001,
+    num_epochs=100,
+    seed=1337,
+    # Runtime options
+    catch_keyboard_interrupt=True,
+    cuda=True,
+    expand_filepaths_to_save_dir=True,
+    reload_from_files=False,
+)
+
+if args.expand_filepaths_to_save_dir:
+    args.vectorizer_file = os.path.join(args.save_dir,
+                                        args.vectorizer_file)
+
+    args.model_state_file = os.path.join(args.save_dir,
+                                         args.model_state_file)
+
+    print("Expanded filepaths: ")
+    print("\t{}".format(args.vectorizer_file))
+    print("\t{}".format(args.model_state_file))
+
+# Check CUDA
+if not torch.cuda.is_available():
+    args.cuda = False
+if torch.cuda.device_count() > 1:
+    print("Using", torch.cuda.device_count(), "GPUs!")
+
+args.device = torch.device("cuda" if args.cuda else "cpu")
+
+# Set seed for reproducibility
+set_seed_everywhere(args.seed, args.cuda)
+
+# handle dirs
+handle_dirs(args.save_dir)
+
+
+if args.reload_from_files:
+    # training from a checkpoint
+    print("Loading dataset and vectorizer")
+    dataset = ReviewDataset.load_dataset_and_load_vectorizer(args.review_csv,
+                                                             args.vectorizer_file)
+else:
+    print("Loading dataset and creating vectorizer")
+    # create dataset and vectorizer
+    dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
+    dataset.save_vectorizer(args.vectorizer_file)
+vectorizer = dataset.get_vectorizer()
+
+classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
+
+classifier = nn.DataParallel(classifier)
+classifier = classifier.to(args.device)
+
+loss_func = nn.BCEWithLogitsLoss()
+optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
+scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
+                                                 mode='min', factor=0.5,
+                                                 patience=1)
+
+train_state = make_train_state(args)
+
+epoch_bar = tqdm(desc='training routine',
+                 total=args.num_epochs,
+                 position=0)
+
+dataset.set_split('train')
+train_bar = tqdm(desc='split=train',
+                 total=dataset.get_num_batches(args.batch_size),
+                 position=1,
+                 leave=True)
+dataset.set_split('val')
+val_bar = tqdm(desc='split=val',
+               total=dataset.get_num_batches(args.batch_size),
+               position=1,
+               leave=True)
+
+try:
+    for epoch_index in range(args.num_epochs):
+        train_state['epoch_index'] = epoch_index
+
+        # Iterate over training dataset
+
+        # setup: batch generator, set loss and acc to 0, set train mode on
+        dataset.set_split('train')
+        batch_generator = generate_batches(dataset,
+                                           batch_size=args.batch_size,
+                                           device=args.device)
+        running_loss = 0.0
+        running_acc = 0.0
+        classifier.train()
+
+        for batch_index, batch_dict in enumerate(batch_generator):
+            # the training routine is these 5 steps:
+
+            # --------------------------------------
+            # step 1. zero the gradients
+            optimizer.zero_grad()
+
+            # step 2. compute the output
+            y_pred = classifier(x_in=batch_dict['x_data'].float())
+
+            # step 3. compute the loss
+            loss = loss_func(y_pred, batch_dict['y_target'].float())
+            loss_t = loss.item()
+            running_loss += (loss_t - running_loss) / (batch_index + 1)
+
+            # step 4. use loss to produce gradients
+            loss.backward()
+
+            # step 5. use optimizer to take gradient step
+            optimizer.step()
+            # -----------------------------------------
+            # compute the accuracy
+            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
+            running_acc += (acc_t - running_acc) / (batch_index + 1)
+
+            # update bar
+            train_bar.set_postfix(loss=running_loss,
+                                  acc=running_acc,
+                                  epoch=epoch_index)
+            train_bar.update()
+
+        train_state['train_loss'].append(running_loss)
+        train_state['train_acc'].append(running_acc)
+
+        # Iterate over val dataset
+
+        # setup: batch generator, set loss and acc to 0; set eval mode on
+        dataset.set_split('val')
+        batch_generator = generate_batches(dataset,
+                                           batch_size=args.batch_size,
+                                           device=args.device)
+        running_loss = 0.
+        running_acc = 0.
+        classifier.eval()
+
+        for batch_index, batch_dict in enumerate(batch_generator):
+
+            # compute the output
+            y_pred = classifier(x_in=batch_dict['x_data'].float())
+
+            # compute the loss
+            loss = loss_func(y_pred, batch_dict['y_target'].float())
+            loss_t = loss.item()
+            running_loss += (loss_t - running_loss) / (batch_index + 1)
+
+            # compute the accuracy
+            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
+            running_acc += (acc_t - running_acc) / (batch_index + 1)
+
+            val_bar.set_postfix(loss=running_loss,
+                                acc=running_acc,
+                                epoch=epoch_index)
+            val_bar.update()
+
+        train_state['val_loss'].append(running_loss)
+        train_state['val_acc'].append(running_acc)
+
+        train_state = update_train_state(args=args, model=classifier,
+                                         train_state=train_state)
+
+        scheduler.step(train_state['val_loss'][-1])
+
+        train_bar.n = 0
+        val_bar.n = 0
+        epoch_bar.update()
+
+        if train_state['stop_early']:
+            break
+except KeyboardInterrupt:
+    print("Exiting loop")
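+
+
+# Note: the checkpoint reloaded below was written from the nn.DataParallel
+# wrapper, so its state-dict keys carry the "module." prefix and are loaded
+# back into the same wrapped classifier.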
+
+
+classifier.load_state_dict(torch.load(train_state['model_filename']))
+classifier = classifier.to(args.device)
+
+dataset.set_split('test')
+batch_generator = generate_batches(dataset,
+                                   batch_size=args.batch_size,
+                                   device=args.device)
+running_loss = 0.
+running_acc = 0.
+classifier.eval()
+
+for batch_index, batch_dict in enumerate(batch_generator):
+    # compute the output
+    y_pred = classifier(x_in=batch_dict['x_data'].float())
+
+    # compute the loss
+    loss = loss_func(y_pred, batch_dict['y_target'].float())
+    loss_t = loss.item()
+    running_loss += (loss_t - running_loss) / (batch_index + 1)
+
+    # compute the accuracy
+    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
+    running_acc += (acc_t - running_acc) / (batch_index + 1)
+
+train_state['test_loss'] = running_loss
+train_state['test_acc'] = running_acc
+
+print("Test loss: {:.3f}".format(train_state['test_loss']))
+print("Test Accuracy: {:.2f}".format(train_state['test_acc']))
+
+
+def preprocess_text(text):
+    text = text.lower()
+    text = re.sub(r"([.,!?])", r" \1 ", text)
+    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
+    return text
+
+
+def predict_rating(review, classifier, vectorizer, decision_threshold=0.5):
+    """Predict the rating of a review
+
+    Args:
+        review (str): the text of the review
+        classifier (ReviewClassifier): the trained model
+        vectorizer (ReviewVectorizer): the corresponding vectorizer
+        decision_threshold (float): the numerical boundary which separates the rating classes
+    Returns:
+        the predicted rating class
+    """
+    review = preprocess_text(review)
+
+    vectorized_review = torch.tensor(vectorizer.vectorize(review))
+    result = classifier(vectorized_review.view(1, -1))
+
+    probability_value = torch.sigmoid(result).item()
+    index = 1
+    if probability_value < decision_threshold:
+        index = 0
+
+    return vectorizer.rating_vocab.lookup_index(index)
+
+
+test_review = "this is a pretty awesome book"
+
+# Unwrap the nn.DataParallel container before moving to CPU so that the plain
+# ReviewClassifier (and its fc1 layer) is used for inference and inspection.
+if isinstance(classifier, nn.DataParallel):
+    classifier = classifier.module
+classifier = classifier.cpu()
+prediction = predict_rating(test_review, classifier, vectorizer, decision_threshold=0.5)
+print("{} -> {}".format(test_review, prediction))
+
+
+# Sort weights
+fc1_weights = classifier.fc1.weight.detach()[0]
+_, indices = torch.sort(fc1_weights, dim=0, descending=True)
+indices = indices.numpy().tolist()
+
+# Top 20 positive words
+print("Influential words in Positive Reviews:")
+print("--------------------------------------")
+for i in range(20):
+    print(vectorizer.review_vocab.lookup_index(indices[i]))
+
+print("====\n\n\n")
+
+# Top 20 negative words
+print("Influential words in Negative Reviews:")
+print("--------------------------------------")
+indices.reverse()
+for i in range(20):
+    print(vectorizer.review_vocab.lookup_index(indices[i]))
\ No newline at end of file