annotation app finished
commit d0dc4fa0f4
27
.gitignore
vendored
Normal file
@@ -0,0 +1,27 @@
# preprocessing ignore

/preprocessing/*.jsonl
/preprocessing/*.pickle

/preprocessing/__pycache__
/preprocessing/classified_data
/preprocessing/clustered_jsonl
/preprocessing/json_data
/preprocessing/json_data_id
/preprocessing/jsonl_data
/preprocessing/.DS_Store

# harvesting ignore
/harvester/Facebook/inputs
/harvester/Facebook/outputs
/harvester/Facebook/.*
/harvester/Facebook/__pycache__
/harvester/__pycache__
/harvester/.DS_Store

# annotation_app
/annotation_app/.env
/annotation_app/__pycache__
/annotation_app/.DS_Store
2
annotation_app/.dockerignore
Normal file
@@ -0,0 +1,2 @@
get_data.py
/instance
13
annotation_app/Dockerfile
Normal file
@@ -0,0 +1,13 @@
FROM python:3.9

WORKDIR /app

COPY . /app/

RUN pip install --no-cache-dir -r requirements.txt

RUN python initial.py

EXPOSE 5050

CMD ["python3", "app.py"]
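Usage sketch (assumed commands, not recorded in this commit): the image would typically be built with `docker build -t annotation_app .` and started with `docker run -p 5050:5050 --env-file .env annotation_app`; the image name and the use of `--env-file` to supply DB_URI and SECRET_KEY are assumptions based on app.py reading them from the environment.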
140
annotation_app/app.py
Normal file
@@ -0,0 +1,140 @@
from flask import Flask, render_template, request, redirect, flash, session, url_for
from models import db, Users, Annotations, Samples
from dotenv import load_dotenv
from sqlalchemy.orm import aliased
import sqlalchemy
import os

import logging

load_dotenv()

app = Flask(__name__)

app.config['SQLALCHEMY_DATABASE_URI'] = os.getenv('DB_URI')
app.secret_key = os.getenv('SECRET_KEY')

db.init_app(app)


@app.route('/', methods=['GET'])
def home():
    session.pop('id_user', None)
    return render_template('home.html')

@app.route('/login', methods=['POST'])
def login():
    if request.method == 'POST':
        email = request.form['email']
        if '@' in email:
            try:
                splitted = email.split('@')
                name, surname = splitted[0].split('.')[:2]
                domain = splitted[1]
                if 'tuke' not in domain:
                    raise ValueError
            except ValueError:
                flash('Nie je validný TUKE email')
                return redirect('/')
            try:
                db.session.add(Users(name, surname, email))
                db.session.commit()
            except sqlalchemy.exc.IntegrityError as err:
                db.session.rollback()
                logging.info('Logged existing email')
            user = Users.query.filter_by(email=email).first()
            session['email'] = email
            session['id_user'] = user._id
            return redirect('/anot')
        flash('Nie je validný TUKE email')
        return redirect('/')

@app.route('/anot', methods=['GET'])
def anot():
    if 'id_user' in session:
        try:
            annotated_count = Annotations.query.filter_by(user_id=session['id_user']).count()

            # query = text(
            #     f'''
            #     SELECT samples._id, text
            #     FROM samples
            #     LEFT JOIN annotations
            #     ON samples._id = annotations.sample_id
            #     WHERE samples._id NOT IN (
            #         SELECT sample_id
            #         FROM annotations
            #         GROUP BY sample_id
            #         HAVING COUNT(sample_id) > 5
            #     )
            #     AND samples._id NOT IN (
            #         SELECT samples._id
            #         FROM samples
            #         LEFT JOIN annotations
            #         ON samples._id = annotations.sample_id
            #         WHERE annotations.user_id IS {session['id_user']}
            #     )
            #     ORDER BY samples._id ASC
            #     LIMIT 1;
            #     '''
            # )

            annotations_alias = aliased(Annotations)

            # Construct the query
            query = (
                db.session.query(Samples._id, Samples.text)
                .outerjoin(annotations_alias, Samples._id == annotations_alias.sample_id)
                .filter(
                    ~Samples._id.in_(
                        db.session.query(Annotations.sample_id)
                        .group_by(Annotations.sample_id)
                        .having(db.func.count(Annotations.sample_id) > 5)
                    ),
                    ~Samples._id.in_(
                        db.session.query(Samples._id)
                        .outerjoin(Annotations, Samples._id == Annotations.sample_id)
                        .filter(Annotations.user_id == session['id_user'])
                    )
                )
                .order_by(Samples._id.asc())
                .limit(1)
            )
            sample_id, sample_text = query.one_or_none()

            data = {
                'email': session.get('email'),
                'text': sample_text,
                'sample_id': sample_id,
                'annotated_count': annotated_count
            }

        except (sqlalchemy.exc.OperationalError) as err:
            print(err)
            logging.info('Annotations started')
            data = {
                'email': session.get('email'),
                'text': Samples.query.order_by(Samples._id.asc()).first().text,
                'sample_id': Samples.query.order_by(Samples._id.asc()).first()._id,
                'annotated_count': annotated_count
            }
        return render_template('anot.html', **data)
    return redirect('/')

@app.route('/process_anot', methods=['POST'])
def process():
    if request.method == 'POST':
        data = request.get_json()
        print(data)
        db.session.add(Annotations(
            user_id=session['id_user'],
            sample_id=data['sample_id'],
            label=data['value']
        ))
        db.session.commit()
        return redirect(url_for('anot'))

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5050)
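A minimal sketch (not part of the commit) of exercising the /process_anot endpoint above with Flask's test client; it assumes the .env supplies DB_URI and SECRET_KEY, that initial.py has seeded the samples table, and that a user with id 1 exists:

from app import app

with app.test_client() as client:
    # pre-populate the session the way /login would
    with client.session_transaction() as sess:
        sess['id_user'] = 1      # assumed existing user id
        sess['email'] = 'meno.priezvisko@student.tuke.sk'
    # the label values match the buttons in anot.html
    resp = client.post('/process_anot', json={'sample_id': 1, 'value': 'offensive'})
    print(resp.status_code)      # 302: redirect back to /anot on success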
57140
annotation_app/dataset/final_id_v2.jsonl
Normal file
File diff suppressed because it is too large
54
annotation_app/get_data.py
Normal file
@@ -0,0 +1,54 @@
from models import db, Samples, Users, Annotations
from app import app
from sqlalchemy import text


if __name__ == '__main__':
    with app.app_context():
        # AND annotations.user_id <> '{id_user}'
        id_user = 4

        query = text(
            f'''
            SELECT samples.id, text
            FROM samples
            LEFT JOIN annotations
            ON samples.id = annotations.sample_id
            WHERE samples.id NOT IN (
                SELECT sample_id
                FROM annotations
                GROUP BY sample_id
                HAVING COUNT(sample_id) > 5
            )
            AND samples.id NOT IN (
                SELECT samples.id
                FROM samples
                LEFT JOIN annotations
                ON samples.id = annotations.sample_id
                WHERE annotations.user_id IS {id_user}
            )
            ORDER BY samples.id ASC
            LIMIT 1;
            '''
        )

        # query = text(
        #     '''
        #     SELECT samples.id
        #     FROM samples
        #     LEFT JOIN annotations
        #     ON samples.id = annotations.sample_id
        #     WHERE annotations.user_id IS NOT 1
        #     '''
        # )

        result = db.session.execute(query)

        print(result.fetchall())

        annotations = Annotations.query.all()
        print(len(annotations))
        # for annotation in annotations:
        #     print(annotation.user_id)
26
annotation_app/initial.py
Normal file
@@ -0,0 +1,26 @@
from models import db, Samples
from app import app
import json
import os



if __name__ == '__main__':
    with app.app_context():
        # db.init_app(app)

        # creating database
        db.create_all()

        try:
            with open(os.path.join('dataset', 'final_id_v2.jsonl'), encoding='utf-8') as file:
                data = [json.loads(line) for line in file]

            for sample in data:
                db.session.add(Samples(sample['text']))
            db.session.commit()
            print('Data successfully inserted')
        except FileNotFoundError as err:
            print(err)
BIN
annotation_app/instance/anot_db.db
Normal file
Binary file not shown.
40
annotation_app/models.py
Normal file
@@ -0,0 +1,40 @@
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class Annotations(db.Model):
    __tablename__ = 'annotations'

    _id = db.Column("id", db.Integer, primary_key=True)
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'), nullable=False)
    sample_id = db.Column(db.Integer, db.ForeignKey('samples.id'), nullable=False)
    label = db.Column(db.String(32), nullable=False)

    def __init__(self, user_id, sample_id, label):
        self.user_id = user_id
        self.sample_id = sample_id
        self.label = label

class Users(db.Model):
    __tablename__ = 'users'

    _id = db.Column("id", db.Integer, primary_key=True)
    name = db.Column(db.String(32), nullable=False)
    surname = db.Column(db.String(32), nullable=False)
    email = db.Column(db.String(64), unique=True, nullable=False)
    annotations = db.relationship('Annotations', uselist=False, backref='user', lazy=True)

    def __init__(self, name, surname, email):
        self.name = name
        self.surname = surname
        self.email = email

class Samples(db.Model):
    __tablename__ = 'samples'

    _id = db.Column("id", db.Integer, primary_key=True)
    text = db.Column(db.String(512), nullable=False)
    annotations = db.relationship('Annotations', lazy=True, backref='sample')  # corrected relationship and added backref

    def __init__(self, text):
        self.text = text
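A minimal sketch (not part of the commit) of querying these models through the ORM, counting how many annotations each sample already has; it assumes the app context from app.py and mirrors the aggregation done in the raw SQL of get_data.py:

from app import app
from models import db, Annotations

with app.app_context():
    # number of annotations recorded per sample
    per_sample = (
        db.session.query(Annotations.sample_id, db.func.count(Annotations.sample_id))
        .group_by(Annotations.sample_id)
        .all()
    )
    for sample_id, n in per_sample:
        print(sample_id, n)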
13
annotation_app/requirements.txt
Normal file
@@ -0,0 +1,13 @@
blinker==1.7.0
click==8.1.7
Flask==3.0.2
Flask-SQLAlchemy==3.1.1
importlib_metadata==7.0.2
itsdangerous==2.1.2
Jinja2==3.1.3
MarkupSafe==2.1.5
python-dotenv==1.0.1
SQLAlchemy==2.0.28
typing_extensions==4.10.0
Werkzeug==3.0.1
zipp==3.18.1
76
annotation_app/static/stylesheets/styles.css
Normal file
@@ -0,0 +1,76 @@
body, html {
    height: 100%;
    margin: 0;
    display: flex;
    justify-content: center;
    align-items: center;
    background-color: #FEFFFF;
}

table {
    border-radius: 8px;
}

.container {
    position: absolute;
    top: 50%;
    left: 50%;
    transform: translate(-50%, -50%);
    text-align: center;
}

.btn {
    background-color: #3AAFA9;
    border: 1px solid #3AAFA9;
}

.logout-btn {
    background-color: #454d55;
    margin-top: 7.5%;
}

#top-info {
    margin-top: 5%;
    position: fixed;
    top: 5%;
}

.anot {
    border: 1px solid #000000;
    border-radius: 10px;
    padding: 10px;
}

.anot-text {
    padding: 2.5%;
}

.form-control {
    margin-bottom: 5px;
}

#login {
    position: absolute;
    top: -20vh;
    left: 50%;
    transform: translate(-50%, 50%);
    border: 1px solid #000000;
    border-radius: 8px;
    padding: 4vh;
    width: 500px;
}

.top-info {
    width: 100%;
    border-collapse: collapse; /* Optional: collapse border */
}

.top-info td {
    border: 1px solid #000; /* Add border to table cells */
    padding: 8px; /* Optional: Add padding */
}

h3 {
    margin-bottom: 3%;
}
66
annotation_app/templates/anot.html
Normal file
@@ -0,0 +1,66 @@
{% extends "base.html" %}

{% block title %} annotation {% endblock %}

{% block content %}

<div class="container" id="top-info">
    <table class="table top-info">
        <thead class="thead-dark">
            <tr>
                <th>Email</th>
                <th>Počet anotovaných jednotiek</th>
            </tr>
        </thead>
        <tr>
            <td>{{ email }}</td>
            <td>{{ annotated_count }}</td>
        </tr>
    </table>
</div>

<div class="container">
    <div class="anot">
        <p class="anot-text">{{ text }}</p>
        <button id="post" class="btn btn-primary" onclick="postBcknd('offensive', {{ sample_id }})">Ofenzívny</button>
        <button id="post" class="btn btn-primary" onclick="postBcknd('not_offensive', {{ sample_id }})">Neofenzívny</button>
        <button id="post" class="btn btn-primary" onclick="postBcknd('dont_know', {{ sample_id }})">Neviem</button>
    </div>

    <button id="get" class="btn btn-primary logout-btn" onclick="logout()"> Odhlásiť sa</button>

    <script>
        function postBcknd(value, sample_id){
            var xhr = new XMLHttpRequest();
            xhr.open('POST', '/process_anot', true);
            xhr.setRequestHeader('Content-Type', 'application/json');

            xhr.onload = function () {
                if(xhr.status === 200) {
                    console.log('request sent successfully');
                    window.location.href = '/anot';
                } else {
                    console.log('request failed');
                }
            };
            xhr.send(JSON.stringify({value: value, sample_id: sample_id}));
        }

        function logout() {
            var xhr = new XMLHttpRequest();
            xhr.open('GET', '/', true);

            xhr.onload = function () {
                if (xhr.status === 200) {
                    console.log('Logout successful');
                    window.location.href = '/';
                } else {
                    console.log('Logout request failed');
                }
            }

            xhr.send(); // Send the request
        }
    </script>
</div>

{% endblock %}
17
annotation_app/templates/base.html
Normal file
@@ -0,0 +1,17 @@
<!doctype html>
<html>
<head>
    {% block head %}
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
    <link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='stylesheets/styles.css') }}" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}{% endblock %}</title>
    {% endblock %}
</head>
<body>
    <div id="content">
        {% block content %}
        {% endblock %}
    </div>
</body>
</html>
19
annotation_app/templates/home.html
Normal file
@@ -0,0 +1,19 @@
{% extends "base.html" %}
{% block title %} Welcome {% endblock %}

{% block content %}

<div class="container">
    <form action="/login" id="login" method="post">
        <h3>Login anotačnej aplikácie</h3>
        <input type="text" name="email" placeholder="meno.priezvisko@student.tuke.sk" class="form-control">
        <button id="post" class="btn btn-primary login-btn">Prihlásiť sa</button>
        {% with messages = get_flashed_messages() %}
        {% if messages %}
            <p style="margin-top: 2%;"> {{ messages[0] }} </p>
        {% endif %}
        {% endwith %}
    </form>
</div>

{% endblock %}
134
harvester/Facebook/facebook_crawler.py
Normal file
@@ -0,0 +1,134 @@
import os
import sys
import time
import json

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import selenium.common.exceptions
from Facebook.facebook_parser import FacebookParser

from crawler import Crawler


class FacebookCrawler(Crawler, FacebookParser):

    def __init__(self, base_url: str, file_name: str):
        super().__init__(base_url, file_name)

        try:
            with open(os.path.join('locators.json')) as file:
                self.locators = json.load(file)

            with open(os.path.join('Facebook', 'inputs', self.filename)) as file:
                self.URLS = tuple(file.readlines())
        except FileNotFoundError:
            print(os.path.join('Facebook', 'inputs', self.filename))
            print("Invalid input value")
            sys.exit(1)

    # crawling part of the code
    def crawl(self):
        counter = len(self.URLS)
        for idx, url in enumerate(self.URLS):
            # redirect and wait for page to load
            self.driver.get(url)
            self.driver.implicitly_wait(4)

            if 'videos' in url:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except selenium.common.exceptions.NoSuchElementException:
                    self.driver.find_element(By.XPATH, "//span[contains(@class, 'x6ikm8r x10wlt62 xlyipyv')]").click()
                    if self.driver.find_element(By.XPATH, "//div[contains(@class, 'x78zum5 xdt5ytf x1iyjqo2 x7ywyr2')]") is not None:
                        print("Can't crawl comments section")
                        continue

                self.close_censorship('Newest')
            else:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except:
                    pass

                self.close_censorship('All comments')

            self.driver.implicitly_wait(3)
            print('continue scraping')

            # clicking features
            self.view_more_comments()
            self.show_replies()
            self.click_see_more()

            # parsing part of the code

            # Dictionary of classes, if facebook changes any class, rewrite this DICT
            if '/videos/' in url:
                self.class_dict = self.locators['facebook_video_locators']
            elif '/posts/' in url:
                self.class_dict = self.locators['facebook_post_locators']

            self.parse(self.driver.page_source, self.class_dict, self.filename)
            print(f'Done: [{idx + 1}/{counter}]')

    def view_more_comments(self):
        elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
        while elements:
            try:
                self.driver.execute_script("arguments[0].click();", elements[0])
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
                self.driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
            except selenium.common.exceptions.StaleElementReferenceException:
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")


    # method for showing hidden replies
    def show_replies(self):

        repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")
        i = 1
        while repl_elements:

            try:
                for element in repl_elements:
                    self.driver.execute_script("arguments[0].click();", element)
                    time.sleep(0.5)

            except selenium.common.exceptions.StaleElementReferenceException:
                pass
            repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")

    # method for expanding comments
    def click_see_more(self):

        elements = self.driver.find_elements(By.XPATH, "//*[text()='See more']")

        for element in elements:
            self.driver.execute_script("arguments[0].click();", element)


    # method for switching the 'Most relevant' comment filter to 'Newest'
    def close_censorship(self, classification: str):
        self.driver.implicitly_wait(3)
        try:
            dropdown = self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest']")
            self.driver.execute_script("arguments[0].click();", dropdown)  # clicking on it

            newest_comments = self.driver.find_element(By.XPATH, f"//*[text()='{classification}']")
            self.driver.execute_script("arguments[0].click();", newest_comments)  # clicking on it
        except:
            self.close_censorship(classification)


    def close(self):
        print('Scraping ended successfully')
        self.driver.quit()
        sys.exit(0)
129
harvester/Facebook/facebook_parser.py
Normal file
@@ -0,0 +1,129 @@
from bs4 import BeautifulSoup
from print_dict import pd
import json
import sys
import os

class FacebookParser:

    def parse(self, html, clsDict, fname='final_dataset.json'):

        self.soup = BeautifulSoup(html, 'lxml')
        self.outFileName = fname
        self.outFileName = f"outputs/parts/{self.outFileName.split('.')[0]}_data.json"

        # dict for data about facebook post
        self.post_data = {
            'publisher': None,
            'title': None,
            'comments': [],
            'post_reactions': None
        }

        # dict for comments
        self.comment_data = {
            'publisher': None,
            'text': None,
            'replies': []
        }

        # reply data
        self.reply_data = {
            'publisher': None,
            'text': None
        }

        # post info
        self.name = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})

        if clsDict['TOP_LABEL'] == 'message':
            self.top = self.soup.find('div', {'data-ad-comet-preview': clsDict['TOP_LABEL']})
        else:
            self.top = self.soup.find('div', {'class': clsDict['TOP_LABEL']})
            if self.top is None:
                self.top = self.soup.find('div', {'class': 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'})

        self.title_likes = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})
        try:
            self.tmp_strings = self.top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
            self.title = ''
            for x in self.tmp_strings:
                try:
                    self.title += x.text + '. '
                except:
                    pass
        except:
            self.title = None


        self.post_data = {
            'publisher': self.name.text if self.name is not None else None,
            'title': self.title,
            'post_reactions': self.title_likes.text if self.title_likes is not None else None,
            'comments': []
        }

        if self.post_data['publisher'] is None:
            return

        # comment info
        self.all_comments = self.soup.find_all("div", {"aria-label": lambda x: x and x.endswith("ago")})  # arr with all comments under the post
        # print(len(self.all_comments))
        for item in self.all_comments:
            self.publisher = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})
            self.txt = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                tmp_type = item.get('aria-label').split(' ')[0]
                tmp_class = item.find('div', {'class': lambda x: x and x.startswith(clsDict['TMP_COMMENTS_CLASS'])}).get('class')[-1]
                if tmp_type == "Comment":
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'].append(self.comment_data)

                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER']:
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'][-1]['replies'].append(self.comment_data)

                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER_2']:
                    self.reply_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                    }
                    self.post_data['comments'][-1]['replies'][-1]['replies'].append(self.reply_data)

            except:
                pass


        if os.path.exists(self.outFileName):
            with open(self.outFileName, 'r+', encoding="utf-8") as file:
                tmp = json.load(file)
                tmp.append(self.post_data)
                file.seek(0)
                json.dump(tmp, file, indent=4, separators=(',', ': '))
        else:
            with open(self.outFileName, 'w', encoding="utf-8") as file:
                json.dump([self.post_data], file, indent=4, separators=(',', ': '))

        # read URLs from the input .txt
        try:
            with open(os.path.join('Facebook', 'inputs', self.filename), 'r+') as file:
                lines = file.readlines()
                # move file pointer to the beginning of the file
                file.seek(0)
                # truncate the file
                file.truncate()
                # write the lines back, except the first line
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)
155
harvester/Facebook/linkCollector.py
Normal file
@@ -0,0 +1,155 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

import os
import sys
import time
import argparse


# parse args
parser = argparse.ArgumentParser(description="Facebook scraper")
parser.add_argument("URL", help='URL of the facebook page / profile')  # no need to specify type (default is string)
args = parser.parse_args()


# method for waiting on pages to load
def wait_for_url(driver, url):
    # waiting for the main page to load
    try:
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
        print('Successful!')
    except:
        print('Connection error')
        driver.quit()
        sys.exit(1)

# web driver init
def webdriver_setup():

    driver_path = r'C:\Users\vlferko\Desktop\projects\jarvis_scraper\chromedriver.exe'

    chrome_options = Options()
    chrome_options.add_argument("accept-language=en-US")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=OFF")
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

    driver.get("https://www.facebook.com/")
    return driver

# login to a facebook acc
def login(driver):
    print('Logging in')
    # allow cookies
    try:
        driver.find_element(By.XPATH, "//button[contains(string(), 'Decline optional cookies')]").click()
    except:
        pass


    # insert login data
    driver.find_element(By.NAME, "email").send_keys(os.environ['FB_EMAIL'])  # type email
    driver.find_element(By.NAME, "pass").send_keys(os.environ['FB_PASSWD'])  # type password

    # click -> log in
    driver.find_element(By.NAME, "login").click()
    time.sleep(5)

# scrolling to the bottom of the page
def crawl_for_links(driver, url):
    print('Crawling')
    i = 1
    driver.get(url)
    time.sleep(2)
    name = driver.find_elements(By.TAG_NAME, 'h2')[-1].text


    for _ in range(0, 3):
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(50):
            # Scroll down to bottom
            driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")

            # Wait to load page
            time.sleep(3)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        os.system('clear||cls')
        print(f'Iteration num: {i}')
        i += 1

    return driver.page_source, name

# parse HTML
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')

    timeline = soup.find('div', {'class': 'x9f619 x1n2onr6 x1ja2u2z xeuugli xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj xu9j1y6 x7ep2pv'})
    posts = timeline.find_all('div', {'class': 'x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z'})
    arr = []
    for post in posts:
        try:
            commentsWidget = post.find('span', {'class': 'x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa'})
            if approveComments(commentsWidget.text):
                links = post.find_all('a', {'role': 'link'})
                arr.append(extractPostLink(links))
        except AttributeError:
            pass

    return arr

def extractPostLink(links):
    for link in links:
        if '/videos/' in link['href'] or '/posts/' in link['href']:
            return link['href']

# check if the post has more than 50 comments
def approveComments(text):
    nComments = text.split(' ')[0]
    try:
        num = int(nComments)
        return int(num > 50)
    except ValueError:
        return 'K' in nComments or 'M' in nComments

# write all the links to the .txt
def write_out(arr, name):

    with open(f"{os.getcwd()}/inputs/{name.strip().replace(' ', '_').lower()}.txt", 'w') as f:
        for item in arr:
            try:
                f.write(item + '\n')
            except TypeError:
                pass


if __name__ == '__main__':
    # driver init
    driver = webdriver_setup()
    wait_for_url(driver, 'https://www.facebook.com/')

    # login
    login(driver)
    wait_for_url(driver, 'https://www.facebook.com/')

    # crawl
    html, name = crawl_for_links(driver, args.URL)
    driver.close()

    # parsing HTML
    arr = parse_html(html)

    # write out
    write_out(arr, name)
24
harvester/locators.json
Normal file
@@ -0,0 +1,24 @@
{
    "facebook_post_locators": {
        "POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
        "TOP_LABEL": "xjkvuk6 xuyqlj2 x1odjw0f",
        "TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
        "STATUS_STRINGS": "text-align: start;",
        "COMMENT_AUTHOR": "x3nfvp2",
        "COMMENT_STR": "x1lliihq xjkvuk6 x1iorvi4",
        "TMP_COMMENTS_CLASS": "xqcrz7y",
        "REPLY_DIVIDER": "x1k70j0n",
        "REPLY_DIVIDER_2": "x1n2onr6"
    },
    "facebook_video_locators": {
        "POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
        "TOP_LABEL": "message",
        "TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
        "STATUS_STRINGS": "text-align: start;",
        "COMMENT_AUTHOR": "x3nfvp2",
        "COMMENT_STR": "xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs",
        "TMP_COMMENTS_CLASS": "xqcrz7y",
        "REPLY_DIVIDER": "x1k70j0n",
        "REPLY_DIVIDER_2": "x1n2onr6"
    }
}
30
harvester/main.py
Normal file
@@ -0,0 +1,30 @@
import argparse

from Facebook.facebook_crawler import FacebookCrawler
from Reddit.reddit_crawler import RedditCrawler

FACEBOOK_URL = 'https://www.facebook.com/'
REDDIT_URL = 'https://www.reddit.com/'

if __name__ == '__main__':

    # parsing arguments
    parser = argparse.ArgumentParser(description="Facebook scraper")
    parser.add_argument("file_name", help='Name of the .txt file with URLS')
    args = parser.parse_args()


    user_input = input('Hello, do you want to scrape Facebook or reddit? [F/r]: ')

    while user_input.upper() not in ['F', 'R']:
        user_input = input('Do you want to scrape Facebook or reddit? [F/r]: ')


    if user_input.upper() == 'F':
        facebook = FacebookCrawler(FACEBOOK_URL, args.file_name)
        facebook.allow_cookies()
        facebook.login()
        facebook.crawl()
    else:
        reddit = RedditCrawler(REDDIT_URL, args.file_name)
        print(reddit)
4
harvester/requirements.txt
Normal file
@@ -0,0 +1,4 @@
beautifulsoup4==4.12.2
print_dict==0.1.19
selenium==4.10.0
webdriver_manager==3.8.6
176
preprocessing/chunking.ipynb
Normal file
@@ -0,0 +1,176 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data chunking for effectiveness\n",
    "\n",
    "In our data, the facebook user Robert Fico has a lot of samples.\n",
    "For efficiency, this notebook chunks that data into 4 parts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### JSONL file loading and creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_jsonl(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        return [json.loads(line) for line in file]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    with open(f'{filename}l', 'w') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "fico = load_jsonl('jsonl_data/robert_fico_data.jsonl')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split data into 4 equal parts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135155"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_samples = len(fico)\n",
    "chunk_size = int(num_samples / 4)\n",
    "\n",
    "num_samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_size * 4 == num_samples # the last few samples are dropped because the dataset size is not divisible by 4"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Actual chunking algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunk_arr = []\n",
    "for chunks in range(0, 4):\n",
    "    chunk_arr.append(\n",
    "        fico[chunk_size * chunks: chunk_size * (chunks + 1)]\n",
    "    )"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write chunked data to disk in a for loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "for index, data in enumerate(chunk_arr):\n",
    "    create_jsonl(f'jsonl_data/fico_chunk_{index}.json', data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
1742
preprocessing/clustered_processing.ipynb
Normal file
File diff suppressed because one or more lines are too long
389
preprocessing/clustering.ipynb
Normal file
@@ -0,0 +1,389 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# This notebook clusters samples based on their semantic similarity.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# imports\n",
    "\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "import torch\n",
    "import warnings\n",
    "import json\n",
    "import os\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model init\n",
    "\n",
    "This clustering process uses the TUKE-DeutscheTelekom/slovakbert-skquad-mnlr model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SentenceTransformer('TUKE-DeutscheTelekom/slovakbert-skquad-mnlr')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data manipulation in the file system"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_jsonl(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        return [json.loads(line) for line in file]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline functions"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embedding creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_embeddings(jsonl_file):\n",
    "    sentences = [item['text'] for item in jsonl_file]\n",
    "    return model.encode(sentences), sentences"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clustering algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cluster_data(embeddings, sentences):\n",
    "    embeddings_np = np.array(embeddings)\n",
    "\n",
    "    similarity_threshold = 0.65\n",
    "\n",
    "    long_enough_mask = np.array([len(sentence) > 20 for sentence in sentences])\n",
    "\n",
    "    cosine_sim_matrix = util.pytorch_cos_sim(torch.tensor(embeddings_np), torch.tensor(embeddings_np)).numpy()\n",
    "\n",
    "    below_threshold_mask = cosine_sim_matrix < similarity_threshold\n",
    "\n",
    "    filtered_mask = np.logical_and(below_threshold_mask, np.outer(long_enough_mask, long_enough_mask))\n",
    "\n",
    "    non_spam_indices = np.where(filtered_mask)\n",
    "\n",
    "    filtered_sentences = list(set([sentences[i] for i in non_spam_indices[0]]))\n",
    "\n",
    "    return filtered_sentences"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare data to write it to JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_null_text(json_list):\n",
    "    filtered_list = [obj for obj in json_list if \"text\" in obj and obj[\"text\"] is not None]\n",
    "    return filtered_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl_format(filtered, jsonl_file):\n",
    "\n",
    "    return [\n",
    "        {\n",
    "            'id': item['id'],\n",
    "            'author': item['author'],\n",
    "            'text': item['text']\n",
    "        }\n",
    "        for item in jsonl_file if item['text'] in filtered\n",
    "    ]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write out JSONL file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_jsonl(filename, data):\n",
    "    with open(filename, 'w') as f:\n",
    "        for item in data:\n",
    "            json.dump(item, f)\n",
    "            f.write('\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pipeline execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def execute_pipeline(jsonl_file):\n",
    "    embeddings, sentences = create_embeddings(jsonl_file)\n",
    "    filtered_data = cluster_data(embeddings, sentences)\n",
    "    return create_jsonl_format(filtered_data, jsonl_file)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pipeline use case"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Prepare data for clustering in a loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['aktuality_data.jsonl',\n",
       " 'denník_n_data.jsonl',\n",
       " 'televízia_joj_data.jsonl',\n",
       " 'fakty_data.jsonl',\n",
       " 'erik_kaliňák_data.jsonl',\n",
       " 'zomri_data.jsonl',\n",
       " 'igor_matovic_data.jsonl',\n",
       " 'peter_marcin_data.jsonl',\n",
       " 'ján_koleník_data.jsonl',\n",
       " 'eva_-_hriešne_dobrá_data.jsonl',\n",
       " 'emefka_data.jsonl',\n",
       " 'marek_hamsik_data.jsonl',\n",
       " 'hetrik_data.jsonl',\n",
       " 'peter_sagan_data.jsonl',\n",
       " 'marian_čekovský_data.jsonl',\n",
       " 'zuzana_čaputová_data.jsonl',\n",
       " 'sajfa_data.jsonl',\n",
       " 'marian_kotleba_data.jsonl',\n",
       " 'fico_chunk_3.jsonl',\n",
       " 'fico_chunk_1.jsonl',\n",
       " 'šport_v_rtvs_data.jsonl',\n",
       " 'dominika_cibulkova_data.jsonl',\n",
       " 'šport24_data.jsonl',\n",
       " 'niké_liga_data.jsonl',\n",
       " 'fico_chunk_0.jsonl',\n",
       " 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
       " 'fico_chunk_2.jsonl']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_to_cluster = [x for x in os.listdir('jsonl_data')]\n",
    "\n",
    "data_to_cluster.remove('robert_fico_data.jsonl')\n",
    "\n",
    "data_to_cluster"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Executing the actual pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] \n"
     ]
    }
   ],
   "source": [
    "for dataset_name in tqdm(data_to_cluster):\n",
    "    dataset = load_jsonl(f'jsonl_data/{dataset_name}')\n",
    "    dataset = filter_null_text(dataset)\n",
    "    write_jsonl(f'clustered_jsonl/{dataset_name}', execute_pipeline(dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['aktuality_data.jsonl',\n",
       " 'denník_n_data.jsonl',\n",
       " 'televízia_joj_data.jsonl',\n",
       " '.DS_Store',\n",
       " 'fakty_data.jsonl',\n",
       " 'erik_kaliňák_data.jsonl',\n",
       " 'zomri_data.jsonl',\n",
       " 'igor_matovic_data.jsonl',\n",
       " 'peter_marcin_data.jsonl',\n",
       " 'ján_koleník_data.jsonl',\n",
       " 'eva_-_hriešne_dobrá_data.jsonl',\n",
       " 'emefka_data.jsonl',\n",
       " 'marek_hamsik_data.jsonl',\n",
       " 'hetrik_data.jsonl',\n",
       " 'peter_sagan_data.jsonl',\n",
       " 'marian_čekovský_data.jsonl',\n",
       " 'zuzana_čaputová_data.jsonl',\n",
       " 'sajfa_data.jsonl',\n",
       " 'marian_kotleba_data.jsonl',\n",
       " 'fico_chunk_3.jsonl',\n",
       " 'fico_chunk_1.jsonl',\n",
       " 'šport_v_rtvs_data.jsonl',\n",
       " 'dominika_cibulkova_data.jsonl',\n",
       " 'šport24_data.jsonl',\n",
       " 'niké_liga_data.jsonl',\n",
       " 'fico_chunk_0.jsonl',\n",
       " 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
       " 'fico_chunk_2.jsonl']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.listdir('jsonl_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
181
preprocessing/create_jsonl.ipynb
Normal file
@@ -0,0 +1,181 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# JSON to JSONL file converter\n",
    "This notebook turns a structured JSON file into a simpler JSONL form for easier data manipulation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "import json\n",
    "import os"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Open JSON data, then write it as JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_json(filename):\n",
    "    # Read the JSON file\n",
    "    with open(filename, 'r') as json_file:\n",
    "        return json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    with open(f'{filename}l', 'w') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loop through the dataset, create a new list of dictionaries, drop duplicate data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def traverse_dataset(dataset):\n",
    "    new_dataset = []\n",
    "    for post in dataset:\n",
    "        new_dataset.append(post)\n",
    "        for comment in post['comments']:\n",
    "            new_dataset.append(comment)\n",
    "            try:\n",
    "                for reply in comment['replies']:\n",
    "                    new_dataset.append(reply)\n",
    "\n",
    "                    for sec_reply in reply['replies']:\n",
    "                        new_dataset.append(sec_reply)\n",
    "            except KeyError:\n",
    "                pass\n",
    "\n",
    "    return new_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def drop_keywords(dataset):\n",
    "    for item in dataset:\n",
    "        try:\n",
    "            del item['comments']\n",
    "        except KeyError:\n",
    "            pass\n",
    "        try:\n",
    "            del item['replies']\n",
    "        except KeyError:\n",
    "            pass\n",
    "\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_dataset(dataset):\n",
    "    cleaned_dataset = []\n",
    "    for data in dataset:\n",
    "\n",
    "        cleaned_data = {}\n",
    "        if 'id' in data:\n",
    "            cleaned_data['id'] = data.get('id')\n",
    "\n",
    "        if 'publisher' in data:\n",
    "            cleaned_data['author'] = data.get('publisher')\n",
    "\n",
    "        if 'text' in data:\n",
    "            cleaned_data['text'] = data.get('text')\n",
    "        elif 'title' in data:\n",
    "            cleaned_data['text'] = data.get('title')\n",
    "\n",
    "        cleaned_dataset.append(cleaned_data)\n",
    "\n",
    "    return cleaned_dataset"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Execution of the functions defined above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset_name in os.listdir('json_data_id/'):\n",
    "    dataset = open_json(f'json_data_id/{dataset_name}')\n",
    "\n",
    "    new_dataset = traverse_dataset(dataset)\n",
    "    new_dataset = drop_keywords(new_dataset)\n",
    "    new_dataset = clean_dataset(new_dataset)\n",
    "\n",
    "    create_jsonl(f'jsonl_data/{dataset_name}', new_dataset)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
1058
preprocessing/dataProcessing.ipynb
Normal file
File diff suppressed because one or more lines are too long
103
preprocessing/id_addition.ipynb
Normal file
@@ -0,0 +1,103 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_json(filename):\n",
    "    with open(filename, 'r') as json_file:\n",
    "        return json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_ids(json_file):\n",
    "    id_counter = 1\n",
    "    for post in json_file:\n",
    "        post[\"id\"] = id_counter\n",
    "        id_counter += 1\n",
    "        if 'comments' in post:\n",
    "            for comment in post['comments']:\n",
    "                comment[\"id\"] = id_counter\n",
    "                id_counter += 1\n",
    "                if 'replies' in comment:\n",
    "                    for reply in comment['replies']:\n",
    "                        reply[\"id\"] = id_counter\n",
    "                        id_counter += 1\n",
    "                        if 'replies' in reply:\n",
    "                            for sec_reply in reply['replies']:\n",
    "                                sec_reply[\"id\"] = id_counter\n",
    "                                id_counter += 1\n",
    "    return json_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_json(filename, data):\n",
    "    with open(filename, 'w', encoding=\"utf-8\") as file:\n",
    "        json.dump(data, file, indent=4, separators=(',', ': '))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "for json_file in os.listdir(\"json_data\"):\n",
    "    data = open_json(f'json_data/{json_file}')\n",
    "    data = add_ids(data)\n",
    "    create_json(f'json_data_id/{json_file}', data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
14847
preprocessing/name_extraction.ipynb
Normal file
File diff suppressed because it is too large