first step py and hs code py run and tested with datasets commit
This commit is contained in:
parent
4c100d4029
commit
7b19a84976
152
.gitignore
vendored
152
.gitignore
vendored
@ -1 +1,151 @@
|
||||
*.log
|
||||
# Created by .ignore support plugin (hsz.mobi)
|
||||
### Python template
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
env/
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*,cover
|
||||
.hypothesis/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# IPython Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# celery beat schedule file
|
||||
celerybeat-schedule
|
||||
|
||||
# dotenv
|
||||
.env
|
||||
|
||||
# virtualenv
|
||||
venv/
|
||||
ENV/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
### VirtualEnv template
|
||||
# Virtualenv
|
||||
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
|
||||
.Python
|
||||
[Bb]in
|
||||
[Ii]nclude
|
||||
[Ll]ib
|
||||
[Ll]ib64
|
||||
[Ll]ocal
|
||||
[Ss]cripts
|
||||
pyvenv.cfg
|
||||
.venv
|
||||
pip-selfcheck.json
|
||||
### JetBrains template
|
||||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
|
||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||
|
||||
# User-specific stuff:
|
||||
.idea/workspace.xml
|
||||
.idea/tasks.xml
|
||||
.idea/dictionaries
|
||||
.idea/vcs.xml
|
||||
.idea/jsLibraryMappings.xml
|
||||
|
||||
# Sensitive or high-churn files:
|
||||
.idea/dataSources.ids
|
||||
.idea/dataSources.xml
|
||||
.idea/dataSources.local.xml
|
||||
.idea/sqlDataSources.xml
|
||||
.idea/dynamic.xml
|
||||
.idea/uiDesigner.xml
|
||||
|
||||
# Gradle:
|
||||
.idea/gradle.xml
|
||||
.idea/libraries
|
||||
|
||||
# Mongo Explorer plugin:
|
||||
.idea/mongoSettings.xml
|
||||
|
||||
.idea/
|
||||
|
||||
## File-based project format:
|
||||
*.iws
|
||||
|
||||
## Plugin-specific files:
|
||||
|
||||
# IntelliJ
|
||||
/out/
|
||||
|
||||
# mpeltonen/sbt-idea plugin
|
||||
.idea_modules/
|
||||
|
||||
# JIRA plugin
|
||||
atlassian-ide-plugin.xml
|
||||
|
||||
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||
com_crashlytics_export_strings.xml
|
||||
crashlytics.properties
|
||||
crashlytics-build.properties
|
||||
fabric.properties
|
||||
|
3
.idea/.gitignore
vendored
Normal file
3
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
10
.idea/hatespeech-twitterdataset.iml
Normal file
10
.idea/hatespeech-twitterdataset.iml
Normal file
@ -0,0 +1,10 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
4
.idea/misc.xml
Normal file
4
.idea/misc.xml
Normal file
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (hatespeech-twitterdataset)" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/hatespeech-twitterdataset.iml" filepath="$PROJECT_DIR$/.idea/hatespeech-twitterdataset.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
Can't render this file because it is too large.
|
26954
datasets/twitter_test.csv
Normal file
26954
datasets/twitter_test.csv
Normal file
File diff suppressed because it is too large
Load Diff
72
firststep.py
Normal file
72
firststep.py
Normal file
@ -0,0 +1,72 @@
|
||||
from nltk.util import pr
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
import re
|
||||
import nltk
|
||||
stemmer = nltk.SnowballStemmer("english")
|
||||
from nltk.corpus import stopwords
|
||||
import string
|
||||
stopword=set(stopwords.words('english'))
|
||||
data = pd.read_csv("datasets//twitter.csv")
|
||||
|
||||
# Step 2: add a new "labels" column to the dataset containing one of three values: "Hate Speech", "Offensive Language", or "No Hate and Offensive".
|
||||
|
||||
# Translate the numeric class codes into human-readable label names.
class_names = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "No Hate and Offensive",
}
data["labels"] = data["class"].map(class_names)

# Only the tweet text and its label are used from here on.
data = data[["tweet", "labels"]]
|
||||
|
||||
|
||||
#Step 3 Now I will only select the tweet and labels columns for the rest of the task of training a hate speech detection model:
|
||||
|
||||
def clean(text):
    """Normalize a tweet before it is turned into bag-of-words features.

    The input is coerced to ``str`` and lower-cased, then stripped of
    square-bracketed spans, URLs, angle-bracketed tags, ASCII punctuation,
    newlines and every token that contains a digit. The remaining
    space-separated words are filtered against the module-level
    ``stopword`` set and reduced with the module-level ``stemmer``.

    Args:
        text: The raw tweet; any object is accepted and converted with ``str``.

    Returns:
        The stemmed, stop-word-free words joined with a space.
    """
    text = str(text).lower()
    # Raw strings keep regex escapes (\[, \S, \w, \d) from being parsed as
    # invalid string escapes, which raise SyntaxWarning on current Python.
    text = re.sub(r'\[.*?\]', '', text)
    # URLs are removed before punctuation so that "://" and "." still match.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Drop stop words and stem the rest in a single pass; splitting on a
    # literal space (not arbitrary whitespace) matches the original tokenizing.
    words = [stemmer.stem(word) for word in text.split(' ') if word not in stopword]
    return " ".join(words)
|
||||
data["tweet"] = data["tweet"].apply(clean)
|
||||
|
||||
#Step 5 Now I will create a function to clean the texts in the tweet column
|
||||
|
||||
|
||||
# Step 6: split the dataset into training and test sets and train a machine
# learning model for the task of hate speech detection.
x = np.array(data["tweet"])
y = np.array(data["labels"])

# Turn the cleaned tweets into token-count features.
cv = CountVectorizer()
X = cv.fit_transform(x)  # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# The vectorizer and classifier are fitted exactly once.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Step 7: test this machine learning model to see if it detects hate speech.
sample = "Let's unite and kill all the people who are protesting against the government"
# Run the sample through the same clean() preprocessing as the training
# tweets so its tokens line up with the fitted vocabulary, and keep the
# features in their own name instead of rebinding the `data` DataFrame.
sample_features = cv.transform([clean(sample)]).toarray()
print(clf.predict(sample_features))
|
||||
|
||||
|
@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
|
||||
data = pd.read_csv("twitter.csv")
|
||||
data = pd.read_csv("datasets//twitter.csv")
|
||||
#print(data.head())
|
||||
|
||||
data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Language", 2: "No Hate and Offensive"})
|
||||
|
BIN
notepad/HateSpeechDetecyion.docx
Normal file
BIN
notepad/HateSpeechDetecyion.docx
Normal file
Binary file not shown.
112
notepad/git cmds.txt
Normal file
112
notepad/git cmds.txt
Normal file
@ -0,0 +1,112 @@
|
||||
git init
|
||||
|
||||
git add --all - stage all changes (new, modified and deleted files)
|
||||
|
||||
git status
|
||||
|
||||
git commit -m "Message" -- commit the staged changes to the repository
|
||||
|
||||
git diff
|
||||
|
||||
git diff --staged
|
||||
|
||||
git commit -a -m "Message" -- direct commit without staging
|
||||
|
||||
git log
|
||||
%**************************************%
|
||||
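git rm -rf <path> - force-remove files/folders from the working tree and the index (to delete the repository itself, remove the .git folder: rm -rf .git)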
|
||||
|
||||
git rm <filename> - remove the file
|
||||
|
||||
git mv <filename> <renamefilename>
|
||||
|
||||
%**************************************%
|
||||
Git Skipping
|
||||
git rm --cached <filename> - untrack the file (the file stays on disk)
|
||||
%**************************************%
|
||||
|
||||
%**************************************%
|
||||
Git Log: Viewing & Changing Commits
|
||||
|
||||
Create new Repo.,
|
||||
git clone <gitpath> <foldername>
|
||||
|
||||
git log -- to see all history
|
||||
|
||||
git log --stat
|
||||
|
||||
git log --pretty=oneline
|
||||
git log --pretty=short
|
||||
git log --pretty=full
|
||||
|
||||
git log --since=2.days
|
||||
git log --since=2.weeks
|
||||
git log --since=2.years
|
||||
|
||||
git log --pretty=format:
|
||||
git log --pretty=format:"%h --%ae"
|
||||
More :
|
||||
https://git-scm.com/docs/git-log
|
||||
|
||||
|
||||
if you want to change commit message and update the changes again,
|
||||
git log -p -1
|
||||
git commit --amend
|
||||
press i and write and exit,
|
||||
staging and commit again
|
||||
|
||||
%**************************************%
|
||||
|
||||
|
||||
Stage - git add . or git add -A ->(all)
|
||||
Unstage - git restore --staged <filename>
|
||||
|
||||
|
||||
change git checkout -- .gitignore
|
||||
|
||||
git checkout -f
|
||||
|
||||
without changes get back
|
||||
do not use this often -- it discards staged changes and recent uncommitted changes
|
||||
|
||||
%**************************************%
|
||||
|
||||
git remote
|
||||
|
||||
git remote add origin <link>
|
||||
|
||||
git checkout -b <branch>
|
||||
|
||||
45) To create new branch
|
||||
$ git checkout -b <new-branch-name>
|
||||
|
||||
46) To switch into different branch
|
||||
$ git checkout <branch-name>
|
||||
|
||||
47) To check all the branches in git
|
||||
$ git branch
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user