annotation app finished
This commit is contained in:
commit d0dc4fa0f4
27  .gitignore  vendored  Normal file
@@ -0,0 +1,27 @@
# preprocessing ignore

/preprocessing/*.jsonl
/preprocessing/*.pickle

/preprocessing/__pycache__
/preprocessing/classified_data
/preprocessing/clustered_jsonl
/preprocessing/json_data
/preprocessing/json_data_id
/preprocessing/jsonl_data
/preprocessing/.DS_Store

# harvesting ignore
/harvester/Facebook/inputs
/harvester/Facebook/outputs
/harvester/Facebook/.*
/harvester/Facebook/__pycache__
/harvester/__pycache__
/harvester/.DS_Store

# annotation_app
/annotation_app/.env
/annotation_app/__pycache__
/annotation_app/.DS_Store
2  annotation_app/.dockerignore  Normal file
@@ -0,0 +1,2 @@
get_data.py
/instance
13  annotation_app/Dockerfile  Normal file
@@ -0,0 +1,13 @@
FROM python:3.9

WORKDIR /app

COPY . /app/

RUN pip install --no-cache-dir -r requirements.txt

RUN python initial.py

EXPOSE 5050

CMD ["python3", "app.py"]
140  annotation_app/app.py  Normal file
@@ -0,0 +1,140 @@
from flask import Flask, render_template, request, redirect, flash, session, url_for
from models import db, Users, Annotations, Samples
from dotenv import load_dotenv
from sqlalchemy.orm import aliased
import sqlalchemy
import os

import logging

load_dotenv()

app = Flask(__name__)

app.config['SQLALCHEMY_DATABASE_URI'] = os.getenv('DB_URI')
app.secret_key = os.getenv('SECRET_KEY')

db.init_app(app)


@app.route('/', methods=['GET'])
def home():
    session.pop('id_user', None)
    return render_template('home.html')


@app.route('/login', methods=['POST'])
def login():
    if request.method == 'POST':
        email = request.form['email']
        if '@' in email:
            try:
                splitted = email.split('@')
                name, surname = splitted[0].split('.')[:2]
                domain = splitted[1]
                if 'tuke' not in domain:
                    raise ValueError
            except ValueError:
                flash('Nie je validný TUKE email')
                return redirect('/')
            try:
                db.session.add(Users(name, surname, email))
                db.session.commit()
            except sqlalchemy.exc.IntegrityError:
                # email is already registered; reuse the existing account
                db.session.rollback()
                logging.info('Logged existing email')
            user = Users.query.filter_by(email=email).first()
            session['email'] = email
            session['id_user'] = user._id
            return redirect('/anot')
        flash('Nie je validný TUKE email')
        return redirect('/')


@app.route('/anot', methods=['GET'])
def anot():
    if 'id_user' in session:
        try:
            annotated_count = Annotations.query.filter_by(user_id=session['id_user']).count()

            # query = text(
            #     f'''
            #     SELECT samples._id, text
            #     FROM samples
            #     LEFT JOIN annotations
            #     ON samples._id = annotations.sample_id
            #     WHERE samples._id NOT IN (
            #         SELECT sample_id
            #         FROM annotations
            #         GROUP BY sample_id
            #         HAVING COUNT(sample_id) > 5
            #     )
            #     AND samples._id NOT IN (
            #         SELECT samples._id
            #         FROM samples
            #         LEFT JOIN annotations
            #         ON samples._id = annotations.sample_id
            #         WHERE annotations.user_id IS {session['id_user']}
            #     )
            #     ORDER BY samples._id ASC
            #     LIMIT 1;
            #     '''
            # )

            annotations_alias = aliased(Annotations)

            # Construct the query: the lowest-id sample with at most 5
            # annotations that this user has not annotated yet
            query = (
                db.session.query(Samples._id, Samples.text)
                .outerjoin(annotations_alias, Samples._id == annotations_alias.sample_id)
                .filter(
                    ~Samples._id.in_(
                        db.session.query(Annotations.sample_id)
                        .group_by(Annotations.sample_id)
                        .having(db.func.count(Annotations.sample_id) > 5)
                    ),
                    ~Samples._id.in_(
                        db.session.query(Samples._id)
                        .outerjoin(Annotations, Samples._id == Annotations.sample_id)
                        .filter(Annotations.user_id == session['id_user'])
                    )
                )
                .order_by(Samples._id.asc())
                .limit(1)
            )
            sample_id, sample_text = query.one_or_none()

            data = {
                'email': session.get('email'),
                'text': sample_text,
                'sample_id': sample_id,
                'annotated_count': annotated_count
            }

        except (sqlalchemy.exc.OperationalError, TypeError) as err:
            # OperationalError: tables not created yet;
            # TypeError: one_or_none() returned no row to unpack
            print(err)
            logging.info('Annotations started')
            data = {
                'email': session.get('email'),
                'text': Samples.query.order_by(Samples._id.asc()).first().text,
                'sample_id': Samples.query.order_by(Samples._id.asc()).first()._id,
                'annotated_count': annotated_count
            }
        return render_template('anot.html', **data)
    return redirect('/')


@app.route('/process_anot', methods=['POST'])
def process():
    if request.method == 'POST':
        data = request.get_json()
        print(data)
        db.session.add(Annotations(
            user_id=session['id_user'],
            sample_id=data['sample_id'],
            label=data['value']
        ))
        db.session.commit()
    return redirect(url_for('anot'))


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5050)
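The /anot query above encodes the sampling rule: serve the lowest-id sample that has at most 5 annotations and that the logged-in user has not labelled yet. A minimal, framework-free sketch of that rule (not part of the commit; the in-memory samples and annotations lists are hypothetical stand-ins for the DB tables):

def next_sample(samples, annotations, user_id, cap=5):
    """Return the first sample with at most `cap` annotations that
    `user_id` has not annotated yet, or None if none is left."""
    counts = {}
    seen_by_user = set()
    for ann_user, ann_sample in annotations:        # one tuple per annotation
        counts[ann_sample] = counts.get(ann_sample, 0) + 1
        if ann_user == user_id:
            seen_by_user.add(ann_sample)
    for sample_id, text in sorted(samples):         # ascending id, like the query
        if counts.get(sample_id, 0) <= cap and sample_id not in seen_by_user:
            return sample_id, text
    return None

# Example: sample 1 already has 6 annotations, sample 2 was done by user 7
samples = [(1, 'a'), (2, 'b'), (3, 'c')]
annotations = [(u, 1) for u in range(6)] + [(7, 2)]
assert next_sample(samples, annotations, user_id=7) == (3, 'c')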
57140  annotation_app/dataset/final_id_v2.jsonl  Normal file
File diff suppressed because it is too large
54  annotation_app/get_data.py  Normal file
@@ -0,0 +1,54 @@
from models import db, Samples, Users, Annotations
from app import app
from sqlalchemy import text


if __name__ == '__main__':
    with app.app_context():
        # AND annotations.user_id <> '{id_user}'
        id_user = 4

        query = text(
            f'''
            SELECT samples.id, text
            FROM samples
            LEFT JOIN annotations
            ON samples.id = annotations.sample_id
            WHERE samples.id NOT IN (
                SELECT sample_id
                FROM annotations
                GROUP BY sample_id
                HAVING COUNT(sample_id) > 5
            )
            AND samples.id NOT IN (
                SELECT samples.id
                FROM samples
                LEFT JOIN annotations
                ON samples.id = annotations.sample_id
                WHERE annotations.user_id IS {id_user}
            )
            ORDER BY samples.id ASC
            LIMIT 1;
            '''
        )

        # query = text(
        #     '''
        #     SELECT samples.id
        #     FROM samples
        #     LEFT JOIN annotations
        #     ON samples.id = annotations.sample_id
        #     WHERE annotations.user_id IS NOT 1
        #     '''
        # )

        result = db.session.execute(query)

        print(result.fetchall())

        annotations = Annotations.query.all()
        print(len(annotations))
        # for annotation in annotations:
        #     print(annotation.user_id)
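The script above interpolates id_user into the SQL with an f-string. A sketch (not part of the commit) of the same query with a bound parameter, which avoids quoting problems and SQL injection; the second subquery is written in a simplified but equivalent form:

from sqlalchemy import text

query = text(
    '''
    SELECT samples.id, text
    FROM samples
    WHERE samples.id NOT IN (
        SELECT sample_id
        FROM annotations
        GROUP BY sample_id
        HAVING COUNT(sample_id) > 5
    )
    AND samples.id NOT IN (
        SELECT sample_id
        FROM annotations
        WHERE annotations.user_id = :id_user
    )
    ORDER BY samples.id ASC
    LIMIT 1;
    '''
).bindparams(id_user=4)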
26  annotation_app/initial.py  Normal file
@@ -0,0 +1,26 @@
from models import db, Samples
from app import app
import json
import os


if __name__ == '__main__':
    with app.app_context():
        # db.init_app(app)

        # creating database
        db.create_all()

        try:
            with open(os.path.join('dataset', 'final_id_v2.jsonl'), encoding='utf-8') as file:
                data = [json.loads(line) for line in file]

            for sample in data:
                db.session.add(Samples(sample['text']))
            db.session.commit()
            print('Data successfully inserted')
        except FileNotFoundError as err:
            print(err)
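initial.py adds the ~57k samples one session.add() at a time. A sketch (not part of the commit) of the same seeding step as a single bulk insert, using the SQLAlchemy 2.0 executemany style supported by the pinned SQLAlchemy 2.0.28 / Flask-SQLAlchemy 3.1.1:

import json
import os

from sqlalchemy import insert

from models import db, Samples
from app import app

with app.app_context():
    db.create_all()
    with open(os.path.join('dataset', 'final_id_v2.jsonl'), encoding='utf-8') as file:
        rows = [{'text': json.loads(line)['text']} for line in file]
    db.session.execute(insert(Samples), rows)  # one bulk INSERT instead of 57k adds
    db.session.commit()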
BIN  annotation_app/instance/anot_db.db  Normal file
Binary file not shown.
40  annotation_app/models.py  Normal file
@@ -0,0 +1,40 @@
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class Annotations(db.Model):
    __tablename__ = 'annotations'

    _id = db.Column("id", db.Integer, primary_key=True)
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'), nullable=False)
    sample_id = db.Column(db.Integer, db.ForeignKey('samples.id'), nullable=False)
    label = db.Column(db.String(32), nullable=False)

    def __init__(self, user_id, sample_id, label):
        self.user_id = user_id
        self.sample_id = sample_id
        self.label = label

class Users(db.Model):
    __tablename__ = 'users'

    _id = db.Column("id", db.Integer, primary_key=True)
    name = db.Column(db.String(32), nullable=False)
    surname = db.Column(db.String(32), nullable=False)
    email = db.Column(db.String(64), unique=True, nullable=False)
    annotations = db.relationship('Annotations', backref='user', lazy=True)  # one user has many annotations (removed uselist=False, which made this one-to-one)

    def __init__(self, name, surname, email):
        self.name = name
        self.surname = surname
        self.email = email

class Samples(db.Model):
    __tablename__ = 'samples'

    _id = db.Column("id", db.Integer, primary_key=True)
    text = db.Column(db.String(512), nullable=False)
    annotations = db.relationship('Annotations', lazy=True, backref='sample')  # corrected relationship and added backref

    def __init__(self, text):
        self.text = text
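A short sketch (not part of the commit) showing how the three models relate through the foreign keys and backrefs declared above, against a throwaway in-memory SQLite database:

from flask import Flask
from models import db, Users, Samples, Annotations

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite://'   # throwaway in-memory DB
db.init_app(app)

with app.app_context():
    db.create_all()

    user = Users('Jane', 'Doe', 'jane.doe@student.tuke.sk')   # hypothetical user
    sample = Samples('some text to annotate')
    db.session.add_all([user, sample])
    db.session.commit()

    db.session.add(Annotations(user._id, sample._id, 'offensive'))
    db.session.commit()

    # backrefs declared in models.py link the rows both ways
    assert sample.annotations[0].user is user
    assert user.annotations[0].sample is sample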
13  annotation_app/requirements.txt  Normal file
@@ -0,0 +1,13 @@
blinker==1.7.0
click==8.1.7
Flask==3.0.2
Flask-SQLAlchemy==3.1.1
importlib_metadata==7.0.2
itsdangerous==2.1.2
Jinja2==3.1.3
MarkupSafe==2.1.5
python-dotenv==1.0.1
SQLAlchemy==2.0.28
typing_extensions==4.10.0
Werkzeug==3.0.1
zipp==3.18.1
76  annotation_app/static/stylesheets/styles.css  Normal file
@@ -0,0 +1,76 @@
body, html {
    height: 100%;
    margin: 0;
    display: flex;
    justify-content: center;
    align-items: center;
    background-color: #FEFFFF;
}

table {
    border-radius: 8px;
}

.container {
    position: absolute;
    top: 50%;
    left: 50%;
    transform: translate(-50%, -50%);
    text-align: center;
}

.btn {
    background-color: #3AAFA9;
    border: 1px solid #3AAFA9;
}

.logout-btn {
    background-color: #454d55;
    margin-top: 7.5%;
}

#top-info {
    margin-top: 5%;
    position: fixed;
    top: 5%;
}

.anot {
    border: 1px solid #000000;
    border-radius: 10px;
    padding: 10px;
}

.anot-text {
    padding: 2.5%;
}

.form-control {
    margin-bottom: 5px;
}

#login {
    position: absolute;
    top: -20vh;
    left: 50%;
    transform: translate(-50%, 50%);
    border: 1px solid #000000;
    border-radius: 8px;
    padding: 4vh;
    width: 500px;
}

.top-info {
    width: 100%;
    border-collapse: collapse; /* Optional: collapse border */
}

.top-info td {
    border: 1px solid #000; /* Add border to table cells */
    padding: 8px; /* Optional: Add padding */
}

h3 {
    margin-bottom: 3%;
}
66  annotation_app/templates/anot.html  Normal file
@@ -0,0 +1,66 @@
{% extends "base.html" %}

{% block title %} annotation {% endblock %}

{% block content %}

<div class="container" id="top-info">
    <table class="table top-info">
        <thead class="thead-dark">
            <tr>
                <th>Email</th>
                <th>Počet anotovaných jednotiek</th>
            </tr>
        </thead>
        <tr>
            <td>{{ email }}</td>
            <td>{{ annotated_count }}</td>
        </tr>
    </table>
</div>

<div class="container">
    <div class="anot">
        <p class="anot-text">{{ text }}</p>
        <button id="post" class="btn btn-primary" onclick="postBcknd('offensive', {{ sample_id }})">Ofenzívny</button>
        <button id="post" class="btn btn-primary" onclick="postBcknd('not_offensive', {{ sample_id }})">Neofenzívny</button>
        <button id="post" class="btn btn-primary" onclick="postBcknd('dont_know', {{ sample_id }})">Neviem</button>
    </div>

    <button id="get" class="btn btn-primary logout-btn" onclick="logout()"> Odhlásiť sa</button>

    <script>
        function postBcknd(value, sample_id){
            var xhr = new XMLHttpRequest();
            xhr.open('POST', '/process_anot', true);
            xhr.setRequestHeader('Content-Type', 'application/json');

            xhr.onload = function () {
                if (xhr.status === 200) {
                    console.log('request sent successfully');
                    window.location.href = '/anot';
                } else {
                    console.log('request failed');
                }
            };
            xhr.send(JSON.stringify({value: value, sample_id: sample_id}));
        }

        function logout() {
            var xhr = new XMLHttpRequest();
            xhr.open('GET', '/', true);

            xhr.onload = function () {
                if (xhr.status === 200) {
                    console.log('Logout successful');
                    window.location.href = '/';
                } else {
                    console.log('Logout request failed');
                }
            }

            xhr.send(); // Send the request
        }
    </script>
</div>

{% endblock %}
17  annotation_app/templates/base.html  Normal file
@@ -0,0 +1,17 @@
<!doctype html>
<html>
<head>
    {% block head %}
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
    <link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='stylesheets/styles.css') }}" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}{% endblock %}</title>
    {% endblock %}
</head>
<body>
    <div id="content">
        {% block content %}
        {% endblock %}
    </div>
</body>
</html>
19  annotation_app/templates/home.html  Normal file
@@ -0,0 +1,19 @@
{% extends "base.html" %}
{% block title %} Welcome {% endblock %}

{% block content %}

<div class="container">
    <form action="/login" id="login" method="post">
        <h3>Login anotačnej aplikácie</h3>
        <input type="text" name="email" placeholder="meno.priezvisko@student.tuke.sk" class="form-control">
        <button id="post" class="btn btn-primary login-btn">Prihlásiť sa</button>
        {% with messages = get_flashed_messages() %}
        {% if messages %}
        <p style="margin-top: 2%;"> {{ messages[0] }} </p>
        {% endif %}
        {% endwith %}
    </form>
</div>

{% endblock %}
134  harvester/Facebook/facebook_crawler.py  Normal file
@@ -0,0 +1,134 @@
import os
import sys
import time
import json

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import selenium.common.exceptions
from Facebook.facebook_parser import FacebookParser

from crawler import Crawler


class FacebookCrawler(Crawler, FacebookParser):

    def __init__(self, base_url: str, file_name: str):
        super().__init__(base_url, file_name)

        try:
            with open(os.path.join('locators.json')) as file:
                self.locators = json.load(file)

            with open(os.path.join('Facebook', 'inputs', self.filename)) as file:
                self.URLS = tuple(file.readlines())
        except FileNotFoundError:
            print(os.path.join('Facebook', 'inputs', self.filename))
            print("Invalid input value")
            sys.exit(1)

    # crawling part of the code
    def crawl(self):
        counter = len(self.URLS)
        for idx, url in enumerate(self.URLS):
            # redirect and wait for the page to load
            self.driver.get(url)
            self.driver.implicitly_wait(4)

            if 'videos' in url:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except selenium.common.exceptions.NoSuchElementException:
                    self.driver.find_element(By.XPATH, "//span[contains(@class, 'x6ikm8r x10wlt62 xlyipyv')]").click()
                    if self.driver.find_element(By.XPATH, "//div[contains(@class, 'x78zum5 xdt5ytf x1iyjqo2 x7ywyr2')]") is not None:
                        print("Can't crawl comments section")
                        continue

                self.close_censorship('Newest')
            else:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except:
                    pass

                self.close_censorship('All comments')

            self.driver.implicitly_wait(3)
            print('continue scraping')

            # clicking features
            self.view_more_comments()
            self.show_replies()
            self.click_see_more()

            # parsing part of the code

            # Dictionary of classes; if Facebook changes any class, rewrite this dict
            if '/videos/' in url:
                self.class_dict = self.locators['facebook_video_locators']
            elif '/posts/' in url:
                self.class_dict = self.locators['facebook_post_locators']

            self.parse(self.driver.page_source, self.class_dict, self.filename)
            print(f'Done: [{idx + 1}/{counter}]')

    def view_more_comments(self):
        elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
        while elements:
            try:
                self.driver.execute_script("arguments[0].click();", elements[0])
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
                self.driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
            except selenium.common.exceptions.StaleElementReferenceException:
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")

    # method for showing hidden replies
    def show_replies(self):

        repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")
        while repl_elements:

            try:
                for element in repl_elements:
                    self.driver.execute_script("arguments[0].click();", element)
                    time.sleep(0.5)
            except selenium.common.exceptions.StaleElementReferenceException:
                pass
            repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")

    # method for expanding comments
    def click_see_more(self):

        elements = self.driver.find_elements(By.XPATH, "//*[text()='See more']")

        for element in elements:
            self.driver.execute_script("arguments[0].click();", element)


    # method for switching the 'Most relevant' comment filter (e.g. to 'Newest')
    def close_censorship(self, classification: str):
        self.driver.implicitly_wait(3)
        try:
            dropdown = self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest']")
            self.driver.execute_script("arguments[0].click();", dropdown)  # clicking on it

            newest_comments = self.driver.find_element(By.XPATH, f"//*[text()='{classification}']")
            self.driver.execute_script("arguments[0].click();", newest_comments)  # clicking on it
        except:
            self.close_censorship(classification)


    def close(self):
        print('Scraping ended successfully')
        self.driver.quit()
        sys.exit(0)
129  harvester/Facebook/facebook_parser.py  Normal file
@@ -0,0 +1,129 @@
from bs4 import BeautifulSoup
from print_dict import pd
import json
import sys
import os

class FacebookParser:

    def parse(self, html, clsDict, fname='final_dataset.json'):

        self.soup = BeautifulSoup(html, 'lxml')
        self.outFileName = fname
        self.outFileName = f"outputs/parts/{self.outFileName.split('.')[0]}_data.json"

        # dict for data about the facebook post
        self.post_data = {
            'publisher': None,
            'title': None,
            'comments': [],
            'post_reactions': None
        }

        # dict for comments
        self.comment_data = {
            'publisher': None,
            'text': None,
            'replies': []
        }

        # reply data
        self.reply_data = {
            'publisher': None,
            'text': None
        }

        # post info
        self.name = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})

        if clsDict['TOP_LABEL'] == 'message':
            self.top = self.soup.find('div', {'data-ad-comet-preview': clsDict['TOP_LABEL']})
        else:
            self.top = self.soup.find('div', {'class': clsDict['TOP_LABEL']})
            if self.top is None:
                self.top = self.soup.find('div', {'class': 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'})

        self.title_likes = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})
        try:
            self.tmp_strings = self.top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
            self.title = ''
            for x in self.tmp_strings:
                try:
                    self.title += x.text + '. '
                except:
                    pass
        except:
            self.title = None


        self.post_data = {
            'publisher': self.name.text if self.name is not None else None,
            'title': self.title,
            'post_reactions': self.title_likes.text if self.title_likes is not None else None,
            'comments': []
        }

        if self.post_data['publisher'] is None:
            return

        # comment info
        self.all_comments = self.soup.find_all("div", {"aria-label": lambda x: x and x.endswith("ago")})  # arr with all comments under the post
        # print(len(self.all_comments))
        for item in self.all_comments:
            self.publisher = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})  # fixed: was a set literal {'class', ...}
            self.txt = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                tmp_type = item.get('aria-label').split(' ')[0]
                tmp_class = item.find('div', {'class': lambda x: x and x.startswith(clsDict['TMP_COMMENTS_CLASS'])}).get('class')[-1]
                if tmp_type == "Comment":
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'].append(self.comment_data)


                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER']:
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'][-1]['replies'].append(self.comment_data)

                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER_2']:
                    self.reply_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                    }
                    self.post_data['comments'][-1]['replies'][-1]['replies'].append(self.reply_data)

            except:
                pass


        if os.path.exists(self.outFileName):
            with open(self.outFileName, 'r+', encoding="utf-8") as file:
                tmp = json.load(file)
                tmp.append(self.post_data)
                file.seek(0)
                json.dump(tmp, file, indent=4, separators=(',', ': '))
        else:
            with open(self.outFileName, 'w', encoding="utf-8") as file:
                json.dump([self.post_data], file, indent=4, separators=(',', ': '))

        # read URLS from a .txt
        try:
            with open(os.path.join('Facebook', 'inputs', self.filename), 'r+') as file:
                lines = file.readlines()
                # move the file pointer to the beginning of the file
                file.seek(0)
                # truncate the file
                file.truncate()
                # write all lines back except the first one (already processed)
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)
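The parser appends each post by re-reading, extending, and rewriting the whole JSON output file. A sketch (not part of the commit) of a JSON Lines append instead, which writes one object per line and avoids the rewrite; the preprocessing notebooks convert these outputs to JSONL later anyway. The file name is hypothetical:

import json

def append_post(post_data, out_file='outputs/parts/example_data.jsonl'):
    # 'a' mode appends a single line; no need to load the existing file
    with open(out_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(post_data, ensure_ascii=False) + '\n')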
155  harvester/Facebook/linkCollector.py  Normal file
@@ -0,0 +1,155 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

import os
import sys
import time
import argparse


# parse args
parser = argparse.ArgumentParser(description="Facebook scraper")
parser.add_argument("URL", help='URL of the facebook page / profile')  # no need to specify type (default is string)
args = parser.parse_args()


# helper for waiting on pages to load
def wait_for_url(driver, url):
    # waiting for the main page to load
    try:
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
        print('Successful!')
    except:
        print('Connection error')
        driver.quit()
        sys.exit(1)


# web driver init
def webdriver_setup():

    driver_path = r'C:\Users\vlferko\Desktop\projects\jarvis_scraper\chromedriver.exe'  # local chromedriver path (unused)

    chrome_options = Options()
    chrome_options.add_argument("accept-language=en-US")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=OFF")
    # fixed: selenium 4.10 removed the positional executable_path argument,
    # so the downloaded driver path is wrapped in a Service object
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    driver.get("https://www.facebook.com/")
    return driver


# login to a facebook account
def login(driver):
    print('Logging in')
    # dismiss the cookie banner
    try:
        driver.find_element(By.XPATH, "//button[contains(string(), 'Decline optional cookies')]").click()
    except:
        pass

    # insert login data
    driver.find_element(By.NAME, "email").send_keys(os.environ['FB_EMAIL'])  # type email
    driver.find_element(By.NAME, "pass").send_keys(os.environ['FB_PASSWD'])  # type password

    # click -> log in
    driver.find_element(By.NAME, "login").click()
    time.sleep(5)


# scrolling to the bottom of the page
def crawl_for_links(driver, url):
    print('Crawling')
    i = 1
    driver.get(url)
    time.sleep(2)
    name = driver.find_elements(By.TAG_NAME, 'h2')[-1].text

    for _ in range(0, 3):
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(50):
            # Scroll down to the bottom
            driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")

            # Wait for the page to load
            time.sleep(3)

            # Calculate the new scroll height and compare it with the last one
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        os.system('clear||cls')
        print(f'Iteration num: {i}')
        i += 1

    return driver.page_source, name


# parse HTML
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')

    timeline = soup.find('div', {'class': 'x9f619 x1n2onr6 x1ja2u2z xeuugli xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj xu9j1y6 x7ep2pv'})
    posts = timeline.find_all('div', {'class': 'x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z'})
    arr = []
    for post in posts:
        try:
            commentsWidget = post.find('span', {'class': 'x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa'})
            if approveComments(commentsWidget.text):
                links = post.find_all('a', {'role': 'link'})
                arr.append(extractPostLink(links))
        except AttributeError:
            pass

    return arr


def extractPostLink(links):
    for link in links:
        if '/videos/' in link['href'] or '/posts/' in link['href']:
            return link['href']


# check if a post has more than 50 comments
def approveComments(text):
    nComments = text.split(' ')[0]
    try:
        num = int(nComments)
        return num > 50
    except ValueError:
        # fixed: the original "return 'K' or 'M' in nComments" was always truthy;
        # counts like '1.2K' or '3M' should pass, anything else should not
        return 'K' in nComments or 'M' in nComments


# write all the links to the .txt
def write_out(arr, name):

    with open(f"{os.getcwd()}/inputs/{name.strip().replace(' ', '_').lower()}.txt", 'w') as f:
        for item in arr:
            try:
                f.write(item + '\n')
            except TypeError:
                pass


if __name__ == '__main__':
    # driver init
    driver = webdriver_setup()
    wait_for_url(driver, 'https://www.facebook.com/')

    # login
    login(driver)
    wait_for_url(driver, 'https://www.facebook.com/')

    # crawl
    html, name = crawl_for_links(driver, args.URL)
    driver.close()

    # parsing HTML
    arr = parse_html(html)

    # write out
    write_out(arr, name)
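A quick sanity check (not part of the commit) of the corrected approveComments logic:

assert approveComments('120 comments')        # numeric count above 50
assert not approveComments('12 comments')     # numeric count below 50
assert approveComments('1.2K comments')       # abbreviated counts pass via 'K'/'M'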
24  harvester/locators.json  Normal file
@@ -0,0 +1,24 @@
{
    "facebook_post_locators": {
        "POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
        "TOP_LABEL": "xjkvuk6 xuyqlj2 x1odjw0f",
        "TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
        "STATUS_STRINGS": "text-align: start;",
        "COMMENT_AUTHOR": "x3nfvp2",
        "COMMENT_STR": "x1lliihq xjkvuk6 x1iorvi4",
        "TMP_COMMENTS_CLASS": "xqcrz7y",
        "REPLY_DIVIDER": "x1k70j0n",
        "REPLY_DIVIDER_2": "x1n2onr6"
    },
    "facebook_video_locators": {
        "POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
        "TOP_LABEL": "message",
        "TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
        "STATUS_STRINGS": "text-align: start;",
        "COMMENT_AUTHOR": "x3nfvp2",
        "COMMENT_STR": "xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs",
        "TMP_COMMENTS_CLASS": "xqcrz7y",
        "REPLY_DIVIDER": "x1k70j0n",
        "REPLY_DIVIDER_2": "x1n2onr6"
    }
}
30  harvester/main.py  Normal file
@@ -0,0 +1,30 @@
import argparse

from Facebook.facebook_crawler import FacebookCrawler
from Reddit.reddit_crawler import RedditCrawler

FACEBOOK_URL = 'https://www.facebook.com/'
REDDIT_URL = 'https://www.reddit.com/'

if __name__ == '__main__':

    # parsing arguments
    parser = argparse.ArgumentParser(description="Facebook scraper")
    parser.add_argument("file_name", help='Name of the .txt file with URLS')
    args = parser.parse_args()


    user_input = input('Hello, do you want to scrape Facebook or Reddit? [F/r]: ')

    while user_input.upper() not in ['F', 'R']:
        user_input = input('Do you want to scrape Facebook or Reddit? [F/r]: ')


    if user_input.upper() == 'F':  # fixed: also accept a lowercase 'f'
        facebook = FacebookCrawler(FACEBOOK_URL, args.file_name)
        facebook.allow_cookies()
        facebook.login()
        facebook.crawl()
    else:
        reddit = RedditCrawler(REDDIT_URL, args.file_name)
        print(reddit)
4  harvester/requirements.txt  Normal file
@@ -0,0 +1,4 @@
beautifulsoup4==4.12.2
print_dict==0.1.19
selenium==4.10.0
webdriver_manager==3.8.6
176  preprocessing/chunking.ipynb  Normal file
@@ -0,0 +1,176 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data chunking for effectiveness\n",
    "\n",
    "In our data, the Facebook user Robert Fico accounts for a very large number of samples.\n",
    "For efficiency, this notebook splits that data into 4 chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### JSONL file loading and creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_jsonl(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        return [json.loads(line) for line in file]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    with open(f'{filename}l', 'w') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "fico = load_jsonl('jsonl_data/robert_fico_data.jsonl')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split data into 4 equal parts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135155"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_samples = len(fico)\n",
    "chunk_size = int(num_samples / 4)\n",
    "\n",
    "num_samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_size * 4 == num_samples  # False: 135155 is not divisible by 4, so the last 3 samples are dropped"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Actual chunking algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunk_arr = []\n",
    "for chunks in range(0, 4):\n",
    "    chunk_arr.append(\n",
    "        fico[chunk_size * chunks: chunk_size * (chunks + 1)]\n",
    "    )"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write chunked data to disk in a for loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "for index, data in enumerate(chunk_arr):\n",
    "    create_jsonl(f'jsonl_data/fico_chunk_{index}.json', data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
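The chunking cell above drops the 3 samples past chunk_size * 4. A sketch (not part of the notebook) of a remainder-preserving split, in case losing samples matters:

def split_evenly(data, n_chunks=4):
    # distribute the remainder over the first r chunks so nothing is dropped
    k, r = divmod(len(data), n_chunks)
    chunks, start = [], 0
    for i in range(n_chunks):
        end = start + k + (1 if i < r else 0)  # first r chunks get one extra item
        chunks.append(data[start:end])
        start = end
    return chunks

chunks = split_evenly(list(range(135155)))
assert sum(len(c) for c in chunks) == 135155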
1742  preprocessing/clustered_processing.ipynb  Normal file
File diff suppressed because one or more lines are too long
389  preprocessing/clustering.ipynb  Normal file
@@ -0,0 +1,389 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# This notebook clusters samples based on their semantic similarity\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# imports\n",
    "\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "import torch\n",
    "import warnings\n",
    "import json\n",
    "import os\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model init\n",
    "\n",
    "This clustering process uses the TUKE-DeutscheTelekom/slovakbert-skquad-mnlr model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SentenceTransformer('TUKE-DeutscheTelekom/slovakbert-skquad-mnlr')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data manipulation in the file system"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_jsonl(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        return [json.loads(line) for line in file]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline functions"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embedding creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_embeddings(jsonl_file):\n",
    "    sentences = [item['text'] for item in jsonl_file]\n",
    "    return model.encode(sentences), sentences"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clustering algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cluster_data(embeddings, sentences):\n",
    "    embeddings_np = np.array(embeddings)\n",
    "\n",
    "    similarity_threshold = 0.65\n",
    "\n",
    "    long_enough_mask = np.array([len(sentence) > 20 for sentence in sentences])\n",
    "\n",
    "    cosine_sim_matrix = util.pytorch_cos_sim(torch.tensor(embeddings_np), torch.tensor(embeddings_np)).numpy()\n",
    "\n",
    "    below_threshold_mask = cosine_sim_matrix < similarity_threshold\n",
    "\n",
    "    filtered_mask = np.logical_and(below_threshold_mask, np.outer(long_enough_mask, long_enough_mask))\n",
    "\n",
    "    non_spam_indices = np.where(filtered_mask)\n",
    "\n",
    "    filtered_sentences = list(set([sentences[i] for i in non_spam_indices[0]]))\n",
    "\n",
    "    return filtered_sentences"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare data to write it to JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_null_text(json_list):\n",
    "    filtered_list = [obj for obj in json_list if \"text\" in obj and obj[\"text\"] is not None]\n",
    "    return filtered_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl_format(filtered, jsonl_file):\n",
    "\n",
    "    return [\n",
    "        {\n",
    "            'id': item['id'],\n",
    "            'author': item['author'],\n",
    "            'text': item['text']\n",
    "        }\n",
    "        for item in jsonl_file if item['text'] in filtered\n",
    "    ]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write out JSONL file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_jsonl(filename, data):\n",
    "    with open(filename, 'w') as f:\n",
    "        for item in data:\n",
    "            json.dump(item, f)\n",
    "            f.write('\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pipeline execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def execute_pipeline(jsonl_file):\n",
    "    embeddings, sentences = create_embeddings(jsonl_file)\n",
    "    filtered_data = cluster_data(embeddings, sentences)\n",
    "    return create_jsonl_format(filtered_data, jsonl_file)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pipeline use case"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Prepare the data for clustering in a loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['aktuality_data.jsonl',\n",
       " 'denník_n_data.jsonl',\n",
       " 'televízia_joj_data.jsonl',\n",
       " 'fakty_data.jsonl',\n",
       " 'erik_kaliňák_data.jsonl',\n",
       " 'zomri_data.jsonl',\n",
       " 'igor_matovic_data.jsonl',\n",
       " 'peter_marcin_data.jsonl',\n",
       " 'ján_koleník_data.jsonl',\n",
       " 'eva_-_hriešne_dobrá_data.jsonl',\n",
       " 'emefka_data.jsonl',\n",
       " 'marek_hamsik_data.jsonl',\n",
       " 'hetrik_data.jsonl',\n",
       " 'peter_sagan_data.jsonl',\n",
       " 'marian_čekovský_data.jsonl',\n",
       " 'zuzana_čaputová_data.jsonl',\n",
       " 'sajfa_data.jsonl',\n",
       " 'marian_kotleba_data.jsonl',\n",
       " 'fico_chunk_3.jsonl',\n",
       " 'fico_chunk_1.jsonl',\n",
       " 'šport_v_rtvs_data.jsonl',\n",
       " 'dominika_cibulkova_data.jsonl',\n",
       " 'šport24_data.jsonl',\n",
       " 'niké_liga_data.jsonl',\n",
       " 'fico_chunk_0.jsonl',\n",
       " 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
       " 'fico_chunk_2.jsonl']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_to_cluster = [x for x in os.listdir('jsonl_data')]\n",
    "\n",
    "data_to_cluster.remove('robert_fico_data.jsonl')\n",
    "\n",
    "data_to_cluster"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Executing the actual pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] \n"
     ]
    }
   ],
   "source": [
    "for dataset_name in tqdm(data_to_cluster):\n",
    "    dataset = load_jsonl(f'jsonl_data/{dataset_name}')\n",
    "    dataset = filter_null_text(dataset)\n",
    "    write_jsonl(f'clustered_jsonl/{dataset_name}', execute_pipeline(dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['aktuality_data.jsonl',\n",
       " 'denník_n_data.jsonl',\n",
       " 'televízia_joj_data.jsonl',\n",
       " '.DS_Store',\n",
       " 'fakty_data.jsonl',\n",
       " 'erik_kaliňák_data.jsonl',\n",
       " 'zomri_data.jsonl',\n",
       " 'igor_matovic_data.jsonl',\n",
       " 'peter_marcin_data.jsonl',\n",
       " 'ján_koleník_data.jsonl',\n",
       " 'eva_-_hriešne_dobrá_data.jsonl',\n",
       " 'emefka_data.jsonl',\n",
       " 'marek_hamsik_data.jsonl',\n",
       " 'hetrik_data.jsonl',\n",
       " 'peter_sagan_data.jsonl',\n",
       " 'marian_čekovský_data.jsonl',\n",
       " 'zuzana_čaputová_data.jsonl',\n",
       " 'sajfa_data.jsonl',\n",
       " 'marian_kotleba_data.jsonl',\n",
       " 'fico_chunk_3.jsonl',\n",
       " 'fico_chunk_1.jsonl',\n",
       " 'šport_v_rtvs_data.jsonl',\n",
       " 'dominika_cibulkova_data.jsonl',\n",
       " 'šport24_data.jsonl',\n",
       " 'niké_liga_data.jsonl',\n",
       " 'fico_chunk_0.jsonl',\n",
       " 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
       " 'fico_chunk_2.jsonl']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.listdir('jsonl_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
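A toy illustration (not part of the notebook) of the thresholding step inside cluster_data, using the same util.pytorch_cos_sim call on hand-made 2-D "embeddings" instead of model output:

import torch
from sentence_transformers import util

emb = torch.tensor([[1.0, 0.0],
                    [1.0, 0.1],   # near-duplicate of the first vector
                    [0.0, 1.0]])  # clearly different

sim = util.pytorch_cos_sim(emb, emb)
below = sim < 0.65                # same threshold as the notebook
print(below)
# pair (0,1) stays above the threshold, so it is treated as a near-duplicate;
# pairs (0,2) and (1,2) fall below it and are kept as distinct sentences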
181
preprocessing/create_jsonl.ipynb
Normal file
181
preprocessing/create_jsonl.ipynb
Normal file
@ -0,0 +1,181 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# JSON to JSONL file converter\n",
|
||||||
|
"This notebook turns structured JSON file to a simplier form as a JSONL for easier data manipulation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# imports \n",
|
||||||
|
"import json\n",
|
||||||
|
"import os"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Open JSON data, then write it as JSONL"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def open_json(filename):\n",
|
||||||
|
" # Read the JSON file\n",
|
||||||
|
" with open(filename, 'r') as json_file:\n",
|
||||||
|
" return json.load(json_file)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    # appending 'l' turns the incoming '.json' filename into '.jsonl'\n",
    "    with open(f'{filename}l', 'w') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loop through the dataset, create a flat list of dictionaries, drop the duplicated nested data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def traverse_dataset(dataset):\n",
    "    # Flatten posts, comments, replies and second-level replies into one list\n",
    "    new_dataset = []\n",
    "    for post in dataset:\n",
    "        new_dataset.append(post)\n",
    "        for comment in post['comments']:\n",
    "            new_dataset.append(comment)\n",
    "            try:\n",
    "                for reply in comment['replies']:\n",
    "                    new_dataset.append(reply)\n",
    "\n",
    "                    for sec_reply in reply['replies']:\n",
    "                        new_dataset.append(sec_reply)\n",
    "            except KeyError:\n",
    "                # NOTE: a reply without 'replies' also skips the comment's remaining replies\n",
    "                pass\n",
    "\n",
    "    return new_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def drop_keywords(dataset):\n",
    "    # Remove the nested copies so each flattened record stands alone\n",
    "    for item in dataset:\n",
    "        try:\n",
    "            del item['comments']\n",
    "        except KeyError:\n",
    "            pass\n",
    "        try:\n",
    "            del item['replies']\n",
    "        except KeyError:\n",
    "            pass\n",
    "\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_dataset(dataset):\n",
    "    # Keep only the id, author and text of each record\n",
    "    cleaned_dataset = []\n",
    "    for data in dataset:\n",
    "\n",
    "        cleaned_data = {}\n",
    "        if 'id' in data:\n",
    "            cleaned_data['id'] = data.get('id')\n",
    "\n",
    "        if 'publisher' in data:\n",
    "            cleaned_data['author'] = data.get('publisher')\n",
    "\n",
    "        if 'text' in data:\n",
    "            cleaned_data['text'] = data.get('text')\n",
    "        elif 'title' in data:\n",
    "            cleaned_data['text'] = data.get('title')\n",
    "\n",
    "        cleaned_dataset.append(cleaned_data)\n",
    "\n",
    "    return cleaned_dataset"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Execution of functions defined above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset_name in os.listdir('json_data_id/'):\n",
    "    dataset = open_json(f'json_data_id/{dataset_name}')\n",
    "\n",
    "    new_dataset = traverse_dataset(dataset)\n",
    "    new_dataset = drop_keywords(new_dataset)\n",
    "    new_dataset = clean_dataset(new_dataset)\n",
    "\n",
    "    create_jsonl(f'jsonl_data/{dataset_name}', new_dataset)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
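Taken together, the cells of create_jsonl.ipynb flatten each harvested JSON tree into independent records and write one JSON object per line. As a minimal sketch of consuming that output (assuming the jsonl_data/ layout and the id/author/text fields produced above; read_jsonl is a hypothetical helper, not part of the commit):

import json

def read_jsonl(path):
    # Each line of a JSONL file is a standalone JSON object
    with open(path, 'r') as jsonl_file:
        return [json.loads(line) for line in jsonl_file]

# e.g. one of the files listed by os.listdir('jsonl_data') earlier
records = read_jsonl('jsonl_data/sajfa_data.jsonl')
print(len(records), records[0].get('text'))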
1058
preprocessing/dataProcessing.ipynb
Normal file
File diff suppressed because one or more lines are too long
103
preprocessing/id_addition.ipynb
Normal file
@ -0,0 +1,103 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_json(filename):\n",
    "    with open(filename, 'r') as json_file:\n",
    "        return json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_ids(json_file):\n",
    "    # Assign sequential ids depth-first: each post, then its comments,\n",
    "    # their replies and second-level replies\n",
    "    id_counter = 1\n",
    "    for post in json_file:\n",
    "        post[\"id\"] = id_counter\n",
    "        id_counter += 1\n",
    "        if 'comments' in post:\n",
    "            for comment in post['comments']:\n",
    "                comment[\"id\"] = id_counter\n",
    "                id_counter += 1\n",
    "                if 'replies' in comment:\n",
    "                    for reply in comment['replies']:\n",
    "                        reply[\"id\"] = id_counter\n",
    "                        id_counter += 1\n",
    "                        if 'replies' in reply:\n",
    "                            for sec_reply in reply['replies']:\n",
    "                                sec_reply[\"id\"] = id_counter\n",
    "                                id_counter += 1\n",
    "    return json_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_json(filename, data):\n",
    "    with open(filename, 'w', encoding=\"utf-8\") as file:\n",
    "        json.dump(data, file, indent=4, separators=(',', ': '))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "for json_file in os.listdir(\"json_data\"):\n",
    "    data = open_json(f'json_data/{json_file}')\n",
    "    data = add_ids(data)\n",
    "    create_json(f'json_data_id/{json_file}', data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
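The add_ids function above enumerates exactly two levels of replies with nested loops. Should deeper threads ever appear in the harvested data, a recursive variant can number any depth; the following is a sketch under that assumption (add_ids_recursive is hypothetical, not part of the commit), reusing the same 'comments'/'replies' keys:

def add_ids_recursive(items, id_counter=1):
    # Depth-first numbering: an item first, then all of its descendants
    for item in items:
        item["id"] = id_counter
        id_counter += 1
        for key in ('comments', 'replies'):
            if key in item:
                id_counter = add_ids_recursive(item[key], id_counter)
    return id_counter

# usage mirroring the loop above:
# data = open_json('json_data/some_file.json'); add_ids_recursive(data)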
14847
preprocessing/name_extraction.ipynb
Normal file
File diff suppressed because it is too large