first step py and hs code py run and tested with datasets commit

2022-01-07 19:00:29 +05:30 · 2022-01-07 19:00:29 +05:30 · 7b19a84976
commit 7b19a84976
parent 4c100d4029
13 changed files with 27327 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,151 @@
-*.log
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+### VirtualEnv template
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+.Python
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+.venv
+pip-selfcheck.json
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+.idea/vcs.xml
+.idea/jsLibraryMappings.xml
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/dataSources.local.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+# Gradle:
+.idea/gradle.xml
+.idea/libraries
+
+# Mongo Explorer plugin:
+.idea/mongoSettings.xml
+
+.idea/
+
+## File-based project format:
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
--- a/.idea/hatespeech-twitterdataset.iml
+++ b/.idea/hatespeech-twitterdataset.iml
@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (hatespeech-twitterdataset)" project-jdk-type="Python SDK" />
+</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/hatespeech-twitterdataset.iml" filepath="$PROJECT_DIR$/.idea/hatespeech-twitterdataset.iml" />
+    </modules>
+  </component>
+</project>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
--- a/datasets/twitter.csv
+++ b/datasets/twitter.csv
--- a/datasets/twitter_test.csv
+++ b/datasets/twitter_test.csv
--- a/firststep.py
+++ b/firststep.py
@ -0,0 +1,72 @@
+from nltk.util import pr
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+import re
+import nltk
+stemmer = nltk.SnowballStemmer("english")
+from nltk.corpus import stopwords
+import string
+stopword=set(stopwords.words('english'))
+data = pd.read_csv("datasets//twitter.csv")
+
+#Step 2 I will add a new column to this dataset as labels, which will contain the values: Hate Speech, Offensive Language, and No Hate and Offensive
+
+data["labels"] = data["class"].map({0: "Hate Speech",
+                                    1: "Offensive Language",
+                                    2: "No Hate and Offensive"})
+data = data[["tweet", "labels"]]
+
+
+#Step 3 Only the tweet and labels columns (selected just above) are kept for the rest of the task of training a hate speech detection model.
+
+def clean(text):
+    text = str(text).lower()
+    text = re.sub('\[.*?\]', '', text)
+    text = re.sub('https?://\S+|www\.\S+', '', text)
+    text = re.sub('<.*?>+', '', text)
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+    text = re.sub('\n', '', text)
+    text = re.sub('\w*\d\w*', '', text)
+    text = [word for word in text.split(' ') if word not in stopword]
+    text=" ".join(text)
+    text = [stemmer.stem(word) for word in text.split(' ')]
+    text=" ".join(text)
+    return text
+data["tweet"] = data["tweet"].apply(clean)
+
+#Step 5 Now I will create a function to clean the texts in the tweet column
+
+
+x = np.array(data["tweet"])
+y = np.array(data["labels"])
+
+cv = CountVectorizer()
+X = cv.fit_transform(x) # Fit the Data
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
+
+clf = DecisionTreeClassifier()
+clf.fit(X_train,y_train)
+
+#Step 6 Now let’s split the dataset into training and test sets and train a machine learning model for the task of hate speech detection
+
+x = np.array(data["tweet"])
+y = np.array(data["labels"])
+
+cv = CountVectorizer()
+X = cv.fit_transform(x) # Fit the Data
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
+
+clf = DecisionTreeClassifier()
+clf.fit(X_train,y_train)
+#print(data.head())
+
+#Step 7 Now let’s test this machine learning model to see if it detects hate speech or not
+
+sample = "Let's unite and kill all the people who are protesting against the government"
+data = cv.transform([sample]).toarray()
+print(clf.predict(data))
+
+
--- a/hs_code.py
+++ b/hs_code.py
@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.model_selection import train_test_split
 from sklearn.tree import DecisionTreeClassifier

-data = pd.read_csv("twitter.csv")
+data = pd.read_csv("datasets//twitter.csv")
 #print(data.head())

 data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Language", 2: "No Hate and Offensive"})
--- a/notepad/HateSpeechDetecyion.docx
+++ b/notepad/HateSpeechDetecyion.docx
--- a/notepad/git
+++ b/notepad/git
@ -0,0 +1,112 @@
+git init
+
+git add --all   - add all changes to the staging area
+
+git status 
+
+git commit -m "Message" -- commit the staged changes to the repository (.git)
+
+git diff 
+
+git diff --staged
+
+git commit -a -m "Message" -- direct commit without staging 
+
+git log
+%**************************************%
+rm -rf .git - remove the whole .git directory from the dir (note: git rm -rf <path> removes tracked files instead)
+
+git rm <filename> - remove the file 
+
+git mv <filename> <renamefilename>
+
+%**************************************%
+Git Skipping
+git rm --cached <filename> - untrack the file (the file itself stays on disk)
+%**************************************%
+
+%**************************************%
+Git Log: Viewing & Changing Commits
+
+Create new Repo.,
+git clone <gitpath> <foldername>
+
+git log -- to see all history 
+
+git log --stat
+
+git log --pretty=oneline
+git log --pretty=short
+git log --pretty=full
+
+git log --since=2.days
+git log --since=2.weeks
+git log --since=2.years
+
+git log --pretty=format:"<format string>"
+git log --pretty=format:"%h --%ae"
+More :
+https://git-scm.com/docs/git-log
+
+
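+If you want to change the last commit message and add further changes to that commit:
+git log -p -1
+git commit --amend
+in the editor (vim): press i, edit the message, then press Esc and type :wq to save and exit
+to include more changes, stage them and commit (amend) again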
+
+%**************************************%
+
+
+Stage - git add . or git add -A   ->(all)
+Unstage - git restore --staged <filename>
+
+
+discard the changes to one file: git checkout -- .gitignore
+
+git checkout -f
+
+get back to the last commit without your changes
+do not use it much -- it removes staging and removes recent changes
+
+%**************************************%
+
+git remote
+
+git remote add origin <link>
+
+git checkout -b <branch>
+
+45) To create new branch
+$ git checkout -b <new-branch-name>
+ 
+46) To switch into different branch
+$ git checkout <branch-name>
+
+47) To list all the branches in git
+$ git branch
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+