first step py and hs code py run and tested with datasets commit

2022-01-07 19:00:29 +05:30 · 2022-01-07 19:00:29 +05:30 · 7b19a84976
commit 7b19a84976
parent 4c100d4029
13 changed files with 27327 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,151 @@
 # Created by .ignore support plugin (hsz.mobi)
 ### Python template
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 env/
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 *.egg-info/
 .installed.cfg
 *.egg
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *,cover
 .hypothesis/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 target/
 # IPython Notebook
 .ipynb_checkpoints
 # pyenv
 .python-version
 # celery beat schedule file
 celerybeat-schedule
 # dotenv
 .env
 # virtualenv
 venv/
 ENV/
 # Spyder project settings
 .spyderproject
 # Rope project settings
 .ropeproject
 ### VirtualEnv template
 # Virtualenv
 # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
 .Python
 [Bb]in
 [Ii]nclude
 [Ll]ib
 [Ll]ib64
 [Ll]ocal
 [Ss]cripts
 pyvenv.cfg
 .venv
 pip-selfcheck.json
 ### JetBrains template
 # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
 # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
 # User-specific stuff:
 .idea/workspace.xml
 .idea/tasks.xml
 .idea/dictionaries
 .idea/vcs.xml
 .idea/jsLibraryMappings.xml
 # Sensitive or high-churn files:
 .idea/dataSources.ids
 .idea/dataSources.xml
 .idea/dataSources.local.xml
 .idea/sqlDataSources.xml
 .idea/dynamic.xml
 .idea/uiDesigner.xml
 # Gradle:
 .idea/gradle.xml
 .idea/libraries
 # Mongo Explorer plugin:
 .idea/mongoSettings.xml
 .idea/
 ## File-based project format:
 *.iws
 ## Plugin-specific files:
 # IntelliJ
 /out/
 # mpeltonen/sbt-idea plugin
 .idea_modules/
 # JIRA plugin
 atlassian-ide-plugin.xml
 # Crashlytics plugin (for Android Studio and IntelliJ)
 com_crashlytics_export_strings.xml
 crashlytics.properties
 crashlytics-build.properties
 fabric.properties
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,3 @@
 # Default ignored files
 /shelf/
 /workspace.xml
--- a/.idea/hatespeech-twitterdataset.iml
+++ b/.idea/hatespeech-twitterdataset.iml
@ -0,0 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@ -0,0 +1,6 @@
 <component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
 </component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (hatespeech-twitterdataset)" project-jdk-type="Python SDK" />
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/hatespeech-twitterdataset.iml" filepath="$PROJECT_DIR$/.idea/hatespeech-twitterdataset.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
 </project>
--- a/datasets/twitter.csv
+++ b/datasets/twitter.csv
--- a/datasets/twitter_test.csv
+++ b/datasets/twitter_test.csv
--- a/firststep.py
+++ b/firststep.py
@ -0,0 +1,72 @@
 from nltk.util import pr
 import pandas as pd
 import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.model_selection import train_test_split
 from sklearn.tree import DecisionTreeClassifier
 import re
 import nltk
 stemmer = nltk.SnowballStemmer("english")
 from nltk.corpus import stopwords
 import string
 stopword=set(stopwords.words('english'))
 # Step 1: read the tweet dataset from the datasets folder.
 data = pd.read_csv("datasets//twitter.csv")
 # Step 2: add a "labels" column that spells out the numeric "class" codes as
 # Hate Speech, Offensive Language, or No Hate and Offensive.
 data = data.assign(
     labels=data["class"].map(
         {
             0: "Hate Speech",
             1: "Offensive Language",
             2: "No Hate and Offensive",
         }
     )
 )
 # Step 3: keep only the tweet and labels columns for the rest of the task of
 # training a hate speech detection model.
 data = data.loc[:, ["tweet", "labels"]]
 def clean(text):
    """Normalise one tweet into a lowercase, stemmed, space-joined string.

    The input is coerced with ``str()`` and lowercased, then stripped of
    bracketed spans, URLs, angle-bracket tags, ASCII punctuation, newline
    characters and any word-character run that contains a digit.  The
    remaining text is split on single spaces, words present in the
    module-level ``stopword`` set are dropped, and each surviving word is
    stemmed with the module-level ``stemmer``.

    Returns the processed words joined by single spaces.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape handling, which otherwise warns about invalid escapes.
    # The substitutions run in this exact order.
    for pattern in (
        r'\[.*?\]',                               # [bracketed] spans
        r'https?://\S+|www\.\S+',                 # URLs
        r'<.*?>+',                                # <tag>-like spans
        '[%s]' % re.escape(string.punctuation),   # ASCII punctuation
        r'\n',                                    # newline characters
        r'\w*\d\w*',                              # runs containing a digit
    ):
        text = re.sub(pattern, '', text)
    # Splitting on a single space (not arbitrary whitespace) is deliberate:
    # it keeps the empty tokens produced by consecutive spaces, so the
    # spacing of the result is preserved.
    words = [stemmer.stem(word) for word in text.split(' ') if word not in stopword]
    return " ".join(words)
 # Step 4: run every tweet through clean() before vectorising.
 data["tweet"] = data["tweet"].apply(clean)
 # Step 5: turn the cleaned tweets into token-count features.
 x = np.array(data["tweet"])
 y = np.array(data["labels"])
 cv = CountVectorizer()
 X = cv.fit_transform(x)  # learn the vocabulary and build the count matrix
 # Step 6: split the dataset into training and test sets and train the
 # classifier. This is done once; repeating the vectorise/split/fit sequence
 # with the same inputs only repeats the work.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
 clf = DecisionTreeClassifier()
 clf.fit(X_train, y_train)
 # Step 7: test the trained model on a hand-written sample sentence.
 sample = "Let's unite and kill all the people who are protesting against the government"
 # Use a separate name for the sample's feature vector so the `data`
 # DataFrame is not overwritten.
 sample_features = cv.transform([sample]).toarray()
 print(clf.predict(sample_features))
--- a/hs_code.py
+++ b/hs_code.py
@ -5,7 +5,7 @@ from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.model_selection import train_test_split
 from sklearn.tree import DecisionTreeClassifier
-data = pd.read_csv("twitter.csv")
+data = pd.read_csv("datasets//twitter.csv")
 #print(data.head())
 data["labels"] = data["class"].map({0: "Hate Speech", 1: "Offensive Language", 2: "No Hate and Offensive"})
--- a/notepad/HateSpeechDetecyion.docx
+++ b/notepad/HateSpeechDetecyion.docx
--- a/notepad/git
+++ b/notepad/git
@ -0,0 +1,112 @@
 git init
 git add --a   - add to stage
 git status 
 git commit -m "Message" -- commit the staged changes to the repository
 git diff 
 git diff --staged
 git commit -a -m "Message" -- direct commit without staging 
 git log
 %**************************************%
 rm -rf .git - remove the full .git directory from the dir (git rm -rf <path> only removes tracked files)
 git rm <filename> - remove the file 
 git mv <filename> <renamefilename>
 %**************************************%
 Git Skipping
 git rm --cached <filename> - untrack the file (the file itself stays on disk)
 %**************************************%
 %**************************************%
 Git Log: Viewing & Changing Commits
 Create new Repo.,
 git clone <gitpath> <foldername>
 git log -- to see all history 
 git log --stat
 git log --pretty=oneline
 git log --pretty=short
 git log --pretty=full
 git log --since=2.days
 git log --since=2.weeks
 git log --since=2.years
 git log --pretty=format: 
 git log --pretty=format:"%h -- %ae"
 More :
 https://git-scm.com/docs/git-log
 if you want to change commit message and update the changes again,
 git log -p -1
 git commit --amend
 press i and write and exit, 
 staging and commit again 
 %**************************************%
 Stage - git add . or git add -A   ->(all)
 Unstage - git restore --staged <filename>
 change git checkout -- .gitignore
 git checkout --f
 discards local changes and gets back to the last commit
 use sparingly -- it removes staged and recent uncommitted changes
 %**************************************%
 git remote
 git remote add origin <link>
 git checkout -b <branch>
 45) To create new branch
 $ git checkout -b <new-branch-name>
 46) To switch into different branch
 $ git checkout <branch-name>
 47) To check all the branches in git
 $ git branch