first step py and hs code py run and tested with datasets commit

This commit is contained in:
Manohar Gowdru Shridhara 2022-01-07 19:00:29 +05:30
parent 4c100d4029
commit 7b19a84976
13 changed files with 27327 additions and 2 deletions

150
.gitignore vendored
View File

@ -1 +1,151 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log *.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff:
.idea/workspace.xml
.idea/tasks.xml
.idea/dictionaries
.idea/vcs.xml
.idea/jsLibraryMappings.xml
# Sensitive or high-churn files:
.idea/dataSources.ids
.idea/dataSources.xml
.idea/dataSources.local.xml
.idea/sqlDataSources.xml
.idea/dynamic.xml
.idea/uiDesigner.xml
# Gradle:
.idea/gradle.xml
.idea/libraries
# Mongo Explorer plugin:
.idea/mongoSettings.xml
.idea/
## File-based project format:
*.iws
## Plugin-specific files:
# IntelliJ
/out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

3
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (hatespeech-twitterdataset)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/hatespeech-twitterdataset.iml" filepath="$PROJECT_DIR$/.idea/hatespeech-twitterdataset.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

Can't render this file because it is too large.

26954
datasets/twitter_test.csv Normal file

File diff suppressed because it is too large Load Diff

72
firststep.py Normal file
View File

@ -0,0 +1,72 @@
from nltk.util import pr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import re
import nltk
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))
data = pd.read_csv("datasets//twitter.csv")
#Step 2 I will add a new column to this dataset as labels which will contain the values as: Hate Speech O. ffensive Language No Hate and Offensive
data["labels"] = data["class"].map({0: "Hate Speech",
1: "Offensive Language",
2: "No Hate and Offensive"})
data = data[["tweet", "labels"]]
#Step 3 Now I will only select the tweet and labels columns for the rest of the task of training a hate speech detection model:
def clean(text):
text = str(text).lower()
text = re.sub('\[.*?\]', '', text)
text = re.sub('https?://\S+|www\.\S+', '', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
text = re.sub('\n', '', text)
text = re.sub('\w*\d\w*', '', text)
text = [word for word in text.split(' ') if word not in stopword]
text=" ".join(text)
text = [stemmer.stem(word) for word in text.split(' ')]
text=" ".join(text)
return text
data["tweet"] = data["tweet"].apply(clean)
#Step 5 Now I will create a function to clean the texts in the tweet column
x = np.array(data["tweet"])
y = np.array(data["labels"])
cv = CountVectorizer()
X = cv.fit_transform(x) # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
#Step 6 Now lets split the dataset into training and test sets and train a machine learning model for the task of hate speech detection
x = np.array(data["tweet"])
y = np.array(data["labels"])
cv = CountVectorizer()
X = cv.fit_transform(x) # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
#print(data.head())
#Step 7 Now lets test this machine learning model to see if it detects hate speech or not
sample = "Let's unite and kill all the people who are protesting against the government"
data = cv.transform([sample]).toarray()
print(clf.predict(data))

View File

@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier
data = pd.read_csv("twitter.csv") data = pd.read_csv("datasets//twitter.csv")
#print(data.head()) #print(data.head())
data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Language", 2: "No Hate and Offensive"}) data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Language", 2: "No Hate and Offensive"})

Binary file not shown.

112
notepad/git cmds.txt Normal file
View File

@ -0,0 +1,112 @@
git init
git add --a - add to stage
git status
git commit -m "Message" -- add .git to commit
git diff
git diff --staged
git commit -a -m "Message" -- direct commit without staging
git log
%**************************************%
git rm -rf - remove full .git from the dir
git rm <filename> - remove the file
git mv <filename> <renamefilename>
%**************************************%
Git Skipping
git rm --cached <filename> for untrack the file
%**************************************%
%**************************************%
Git Log: Viewing & Changing Commits
Create new Repo.,
git clone <gitpath> <foldername>
git log -- to see all history
git log --stat
git log --pretty=oneline
git log --pretty=short
git log --pretty=full
git log --since=2.days
git log --since=2.weeks
git log --since=2.years
git log --pretty=format:
git log --pretty=format: "%h --%ae"
More :
https://git-scm.com/docs/git-log
if you want to change commit message and update the changes again,
git log -p -l
git commit --amend
press i and write and exit,
staging and commit again
%**************************************%
Stage - git add . or git add -a ->(all)
Unstage - git restore --staged <filename>
change git checkout -- .gitignore
git checkout --f
without changes get back
do not use much -- it remove staging and remove recent changes
%**************************************%
git remote
git remote add orgin <link>
git checkout -b <branch>
45) To create new branch
$ git checkout -b <new-branch-name>
46) To switch into different branch
$ git checkout <branch-name>
47) TO check all the branches in git
$ git branch