commit 5ace552b69874082a916468a7183031eb121088f
Author: Mannohargs
Date:   Fri Jul 8 14:38:52 2022 +0530

    Added Streamlit app

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e22f25a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# Hate-Speech-Detector
+ML and DL models for hate speech detection, served through a Streamlit web app that classifies custom inputs.
diff --git a/streamlit_hateXplain.py b/streamlit_hateXplain.py
new file mode 100644
index 0000000..b69bfbf
--- /dev/null
+++ b/streamlit_hateXplain.py
@@ -0,0 +1,351 @@
+import streamlit as st
+
+import torch
+from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
+
+torch.manual_seed(1)
+
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+import base64
+import re
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.model_selection import train_test_split
+from sklearn import metrics
+from sklearn.metrics import classification_report
+from sklearn.neural_network import MLPClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.feature_extraction.text import CountVectorizer
+from scipy.special import softmax
+
+import nltk
+from nltk.corpus import stopwords
+
+nltk.download('wordnet')
+nltk.download('omw-1.4')  # needed by WordNetLemmatizer on newer NLTK releases
+nltk.download('stopwords')
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+@st.cache
+def loadData():
+    # Davidson et al. dataset: 'tweet' holds the raw text and 'class' the label
+    # (0 = hate speech, 1 = offensive language, 2 = neither).
+    data = pd.read_csv('train/labeled_data.csv')
+    column = 'tweet'
+    cleaned_column_ml = 'cleaned_tweet_ml'
+    cleaned_column_deep = 'cleaned_tweet_deep'
+    data = clean_csv(data, column, cleaned_column_ml, cleaned_column_deep)
+    return data
+
+
+def clean_csv(data, column, cleaned_column_ml, cleaned_column_deep):
+    cleaned_sents_ml = []
+    cleaned_sents_deep = []
+    for sent in data[column]:
+        cleaned_ml, cleaned_deep = clean(sent)
+        cleaned_sents_ml.append(cleaned_ml)
+        cleaned_sents_deep.append(cleaned_deep)
+    data[cleaned_column_ml] = cleaned_sents_ml
+    data[cleaned_column_deep] = cleaned_sents_deep
+    return data
+
+
+def clean(sent):
+    # Produces two variants of each tweet: one with stopwords removed (for the
+    # bag-of-words ML models) and one with stopwords kept (for the deep model).
+    stopw = stopwords.words('english')
+    lem = nltk.wordnet.WordNetLemmatizer()
+    tokens = sent.split(' ')
+    final_tokens_ml = []
+    final_tokens_deep = []
+    for token in tokens:
+        # Drop URLs, mentions, hashtags, retweet markers, and HTML-entity noise.
+        if not token.startswith(('http', '"@', '#', '!', '&', '@', 'RT')):
+            final_tokens_deep.append(lem.lemmatize(token))
+            if token not in stopw:
+                final_tokens_ml.append(lem.lemmatize(token))
+    cleaned_ml = " ".join(final_tokens_ml).lower()
+    cleaned_ml = re.sub(r'[^\w\s]', '', cleaned_ml)
+    cleaned_ml = ''.join([s for s in cleaned_ml if not s.isdigit()])
+    cleaned_deep = " ".join(final_tokens_deep).lower()
+    cleaned_deep = re.sub(r'[^\w\s]', '', cleaned_deep)
+    cleaned_deep = ''.join([s for s in cleaned_deep if not s.isdigit()])
+    return cleaned_ml, cleaned_deep
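+
+
+# Illustrative check (hypothetical input, not part of the original app): given
+# NLTK's English stopword list, clean("you are a nice person") should return
+# roughly ("nice person", "you are a nice person"). Note the stopword test runs
+# before lowercasing, so capitalized stopwords such as "You" slip through.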
".join(final_tokens_ml).lower() + cleaned_ml = re.sub(r'[^\w\s]', '', cleaned_ml) + cleaned_ml = ''.join([s for s in cleaned_ml if not s.isdigit()]) + cleaned_deep = " ".join(final_tokens_deep).lower() + cleaned_deep = re.sub(r'[^\w\s]', '', cleaned_deep) + cleaned_deep = ''.join([s for s in cleaned_deep if not s.isdigit()]) + return cleaned_ml, cleaned_deep + + +def split(data, model_type): + x_column = 'cleaned_tweet_deep' if model_type == 'BiLSTM' else 'cleaned_tweet_ml' + X = data[x_column] + y = data['class'] + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0) + vectorizer = CountVectorizer() + vectorizer.fit(X_train) + X_train = vectorizer.transform(X_train) + X_test = vectorizer.transform(X_test) + + return X_train, X_test, y_train, y_test, vectorizer + + +@st.cache(suppress_st_warning=True) +def decisionTree(X_train, X_test, y_train, y_test): + tree = DecisionTreeClassifier(max_leaf_nodes=10, random_state=0) + tree.fit(X_train, y_train) + y_pred = tree.predict(X_test) + score = metrics.accuracy_score(y_test, y_pred) * 100 + report = classification_report(y_test, y_pred, output_dict = True) + + return score, report, tree, y_pred + + +@st.cache(suppress_st_warning=True) +def neuralNet(X_train, X_test, y_train, y_test): + scaler = StandardScaler(with_mean = False) + scaler.fit(X_train) + X_train = scaler.transform(X_train) + X_test = scaler.transform(X_test) + clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) + clf.fit(X_train, y_train) + y_pred = clf.predict(X_test) + score1 = metrics.accuracy_score(y_test, y_pred) * 100 + report = classification_report(y_test, y_pred, output_dict = True) + + return score1, report, clf, y_pred + + +@st.cache(suppress_st_warning=True) +def Knn_Classifier(X_train, X_test, y_train, y_test): + clf = KNeighborsClassifier(n_neighbors=10) + clf.fit(X_train, y_train) + y_pred = clf.predict(X_test) + score = metrics.accuracy_score(y_test, y_pred) * 100 + report = classification_report(y_test, y_pred, output_dict = True) + + return score, report, clf, y_pred + +# @st.cache(suppress_st_warning=True) +# def BiLSTM(X_train, X_test, y_train, y_test): +# return score, report, clf, y_pred +# @st.cache(suppress_st_warning=True) + +def load_model_and_tokenizer(): + tokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain") + model = AutoModelForSequenceClassification.from_pretrained( + "Hate-speech-CNERG/bert-base-uncased-hatexplain", + num_labels = 3, + output_attentions = True, + output_hidden_states = False + ) + model.to(device) + return model, tokenizer + + +def predict_for_user_input(text, model, tokenizer): + + sentences = [text] + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + sent, + add_special_tokens = True, + max_length = 64, + pad_to_max_length = True, + return_attention_mask = True, + return_tensors = 'pt', + ) + + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + input_ids = torch.cat(input_ids, dim=0) + attention_masks = torch.cat(attention_masks, dim=0) + batch_size = 1 + prediction_data = TensorDataset(input_ids, attention_masks)#, labels) + prediction_sampler = SequentialSampler(prediction_data) + prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size) + model.to(device) + model.eval() + predictions = [] + + for batch in prediction_dataloader: + batch = tuple(t.to(device) for 
+
+
+def get_table_download_link(df):
+    # Embed the CSV report as a base64 data URI so st.markdown can render it
+    # as a download link; the anchor text is "here".
+    csv = df.to_csv(index=False)
+    b64 = base64.b64encode(csv.encode()).decode()
+    href = f'<a href="data:file/csv;base64,{b64}" download>here</a>'
+    return href
+
+
+def plot_confidence(pred_proba, model, col='g'):
+    model = [model]
+    plt.rcParams.update({'font.size': 13})
+    fig, ax = plt.subplots(figsize=(10, 0.5))
+    ax.barh(model, pred_proba, align='center', color=col)
+    ax.set_xlabel('Confidence')
+    ax.tick_params(
+        axis='y',
+        which='both',
+        left=False,
+        right=False,
+        labelleft=False)
+
+    ax.tick_params(
+        axis='x',
+        which='major',
+        top=False,
+        bottom=True)
+
+    plt.xlim(0, 1)
+    st.pyplot(fig)
+
+
+def print_model_characteristics(model, report, f1macro):
+    st.header(model)
+    df = pd.DataFrame(report).transpose()
+    seq = ['Hate Speech', 'Offensive Speech', 'Normal', 'accuracy', 'macro avg', 'weighted avg']
+    df.insert(0, '', seq)
+    st.write("Model trained on [this data](https://github.com/t-davidson/hate-speech-and-offensive-language); it achieved a macro F1 score of {:.4f}.".format(f1macro))
+    st.markdown("Download the complete report " + get_table_download_link(df) + '.', unsafe_allow_html=True)
+
+
+def get_print_results(vectorizer, clf, model_name, model_type='ml'):
+    st.text("")
+    user_prediction_data_txt = st.text_input("Enter the text:")
+    user_prediction_data_ml, user_prediction_data_deep = clean(user_prediction_data_txt)
+    user_prediction_data = vectorizer.transform([user_prediction_data_deep]) if model_type == 'deep' else vectorizer.transform([user_prediction_data_ml])
+    pred = clf.predict_proba(user_prediction_data)
+    pred_class = np.argmax(pred)
+    pred_proba = pred[0][pred_class]
+    pred_color = 'g' if pred_proba > 0.75 else 'r' if pred_proba < 0.5 else 'orange'
+    # Davidson dataset label order: 0 = hate speech, 1 = offensive, 2 = neither.
+    pred_text = 'Hate Speech' if pred_class == 0 else ('Offensive Speech' if pred_class == 1 else 'Normal')
+    if user_prediction_data_txt != "":
+        st.subheader(f"The Predicted Class is: {pred_text}")
+        plot_confidence(pred_proba, model_name, pred_color)
+        display_legend("ml")
+
+
+def print_attentions(user_prediction_data, tokenizer, attention_vectors):
+    if user_prediction_data != "":
+        st.subheader("Attention visualization:")
+        tokens = tokenizer.tokenize(user_prediction_data)
+        tokens_colors = []
+        sent_length = len(tokens)
+        for i, token in enumerate(tokens):
+            # Offset by 1 to skip the [CLS] position; scale by sentence length
+            # so the color thresholds are roughly length-independent.
+            if attention_vectors[i + 1] * sent_length >= 0.75:
+                tokens_colors.append('#FF0000')  # Red
+            elif attention_vectors[i + 1] * sent_length >= 0.5:
+                tokens_colors.append('#FF8700')  # Orange
+            elif attention_vectors[i + 1] * sent_length >= 0.25:
+                tokens_colors.append('#FFEB00')  # Yellow
+            else:
+                tokens_colors.append('#A7FF00')  # Green
+        text = ""
+        for i, token in enumerate(tokens):
+            # Render each token in its attention color via inline HTML.
+            text += f'<span style="color: {tokens_colors[i]}">{token}</span> '
+        st.markdown(text, unsafe_allow_html=True)
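+
+
+# Color conventions used above (as implemented): attention highlighting runs
+# red (high attention) -> orange -> yellow -> green (low), while the confidence
+# bar is green above 0.75, orange between 0.5 and 0.75, and red below 0.5.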
+
+
+def display_legend(model='ml'):
+    if model == "BERT":
+        st.markdown("Info:<br>"
+                    "Attention visualization on words: Red (large attention) > Orange > Yellow > Green (less attention)<br>"
+                    "Confidence score: 0.0 to 1.0 - low prediction confidence to high prediction confidence<br>",
+                    unsafe_allow_html=True)
+    else:
+        st.markdown("Info:<br>"
+                    "Confidence score: 0.0 to 1.0 - low prediction confidence to high prediction confidence<br>",
+                    unsafe_allow_html=True)
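+
+
+# To run the app locally (assuming Streamlit is installed and the Davidson CSV
+# is available at train/labeled_data.csv):
+#   streamlit run streamlit_hateXplain.py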
", unsafe_allow_html = True) + + + +def main(): + st.title("Hate Speech Detector") + data = loadData() + X_train, X_test, y_train, y_test, vectorizer = split(data, 'ml') + + choose_model = st.sidebar.selectbox("Choose the Model", + ["BERT", "K-Nearest Neighbours", "Multi-Layer Perceptron", "Decision Tree"]) + + if (choose_model == "BERT"): + st.header("BERT") + st.write("Model trained using [this data](https://github.com/punyajoy/HateXplain)") + mod, tok = load_model_and_tokenizer() + user_prediction_data = st.text_input("Enter the text:") + pred, attention_vectors = predict_for_user_input(user_prediction_data, mod, tok) + + print_attentions(user_prediction_data, tok, attention_vectors) + probs = softmax(pred) + pred_class = np.argmax(probs) + pred_proba = probs[0][pred_class] + pred_color = 'g' if pred_proba > 0.75 else 'r' if pred_proba < 0.5 else 'orange' + pred_text = 'Hate Speech' if pred_class==0 else ('Normal' if pred_class==1 else 'Offensive Speech') + if user_prediction_data != "": + st.subheader(f"The Predicted Class is: {pred_text}") + plot_confidence(pred_proba, "Decision Tree", pred_color) + display_legend("BERT") + + elif(choose_model == "Decision Tree"): + score, report, tree, y_pred = decisionTree(X_train, X_test, y_train, y_test) + + f1macro = metrics.f1_score(y_test, y_pred, average = 'macro') + print_model_characteristics("Decision Tree", report, f1macro) + + get_print_results(vectorizer, tree) + + elif(choose_model == "Multi-Layer Perceptron"): + score, report, clf, y_pred = neuralNet(X_train, X_test, y_train, y_test) + + f1macro = metrics.f1_score(y_test, y_pred, average = 'macro') + print_model_characteristics("Multi-Layer Perceptron", report, f1macro) + + get_print_results(vectorizer, clf) + + elif(choose_model == "K-Nearest Neighbours"): + score, report, clf, y_pred = Knn_Classifier(X_train, X_test, y_train, y_test) + + f1macro = metrics.f1_score(y_test, y_pred, average = 'macro') + print_model_characteristics("K-Nearest Neighbours", report, f1macro) + + get_print_results(vectorizer, clf) + + # if st.button('Info'): + # display_legend(choose_model) + footnote = "



















Made by Divyanshu Sheth" + st.sidebar.markdown(footnote, unsafe_allow_html = True) + +if __name__ == "__main__": + main() \ No newline at end of file