vladimir.ferko
1 month ago
commit
d0dc4fa0f4
27 changed files with 76605 additions and 0 deletions
@ -0,0 +1,27 @@ |
|||
# preprocessing ignore |
|||
|
|||
/preprocessing/*.jsonl |
|||
/preprocessing/*.pickle |
|||
|
|||
/preprocessing/__pycache__ |
|||
/preprocessing/classified_data |
|||
/preprocessing/clustered_jsonl |
|||
/preprocessing/json_data |
|||
/preprocessing/json_data_id |
|||
/preprocessing/jsonl_data |
|||
/preprocessing/.DS_Store |
|||
|
|||
# harvesting ignore |
|||
/harvester/Facebook/inputs |
|||
/harvester/Facebook/outputs |
|||
/harvester/Facebook/.* |
|||
/harvester/Facebook/__pycache__ |
|||
/harvester/__pycache__ |
|||
/harvester/.DS_Store |
|||
|
|||
# annotation_app |
|||
/annotation_app/.env |
|||
/annotation_app/__pycache__ |
|||
/annotation_app/.DS_Store |
|||
|
|||
|
@ -0,0 +1,2 @@ |
|||
get_data.py |
|||
/instance |
@ -0,0 +1,13 @@ |
|||
# Image for the Flask annotation app; the app listens on port 5050.
FROM python:3.9

WORKDIR /app

# Copy only the dependency manifest first so the (slow) pip layer is cached
# and reused when application code changes but requirements.txt does not.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Application code goes in after the dependencies are installed.
COPY . /app/

# Runs initial.py at build time, exactly as before.
# NOTE(review): presumably this is the DB create/seed script - confirm.
RUN python initial.py

EXPOSE 5050

CMD ["python3", "app.py"]
@ -0,0 +1,140 @@ |
|||
from flask import Flask, render_template, request, redirect, flash, session, url_for |
|||
from models import db, Users, Annotations, Samples |
|||
from dotenv import load_dotenv |
|||
from sqlalchemy.orm import aliased |
|||
import sqlalchemy |
|||
import os |
|||
|
|||
|
|||
import logging |
|||
|
|||
# Load variables from a .env file into the process environment before they
# are read below.
load_dotenv()

app = Flask(__name__)

# Database URI and session-signing key come from the environment;
# os.getenv returns None when a variable is not set.
app.config['SQLALCHEMY_DATABASE_URI'] = os.getenv('DB_URI')
app.secret_key = os.getenv('SECRET_KEY')

# Bind the SQLAlchemy instance imported from `models` to this application.
db.init_app(app)
|||
|
|||
|
|||
|
|||
@app.route('/', methods=['GET'])
def home():
    """Render the login page and log the current user out.

    Visiting ``/`` always ends the logged-in state: both the user id and
    the e-mail address are removed from the session.
    """
    # Pop the keys one by one instead of calling session.clear(): Flask keeps
    # flashed messages in the session, and the login view flashes an error
    # right before redirecting here.
    session.pop('id_user', None)
    session.pop('email', None)
    return render_template('home.html')
|||
|
|||
@app.route('/login', methods=['POST'])
def login():
    """Log a user in by TUKE e-mail address, creating the account on first use.

    Expects a form field ``email`` shaped like ``name.surname@<...>tuke.sk``.
    On success the user's id and e-mail are stored in the session and the
    browser is redirected to ``/anot``. On any validation failure an error
    message is flashed and the browser is redirected to ``/``.
    """
    email = request.form['email']

    try:
        # Exactly one '@' is required; zero or several raise ValueError here.
        local_part, domain = email.split('@')
        # A local part without a dot raises ValueError on unpacking.
        name, surname = local_part.split('.')[:2]
        domain = domain.lower()
        # Match the real domain suffix; a plain substring test would also
        # accept hosts such as "tuke.example.com".
        if not (domain == 'tuke.sk' or domain.endswith('.tuke.sk')):
            raise ValueError
        if not name or not surname:
            raise ValueError
    except ValueError:
        flash('Nie je validný TUKE email')
        return redirect('/')

    # Insert first and rely on the unique e-mail constraint to detect a
    # returning user.
    try:
        db.session.add(Users(name, surname, email))
        db.session.commit()
    except sqlalchemy.exc.IntegrityError:
        db.session.rollback()
        logging.info('Logged existing email')

    user = Users.query.filter_by(email=email).first()
    if user is None:
        # The insert failed for a reason other than "already registered".
        flash('Nie je validný TUKE email')
        return redirect('/')

    session['email'] = email
    session['id_user'] = user._id
    return redirect('/anot')
|||
|
|||
@app.route('/anot', methods=['GET'])
def anot():
    """Show the next sample the logged-in user should annotate.

    The next sample is the one with the lowest id that
      * does not yet have more than five annotations, and
      * has not been annotated by the current user.

    Redirects to ``/`` when nobody is logged in. When no sample is left for
    this user, a message is flashed and the browser is redirected to ``/``
    (which also logs the user out).
    """
    if 'id_user' not in session:
        return redirect('/')

    user_id = session['id_user']
    # Bound before the try block so the fallback path can always use it.
    annotated_count = 0

    try:
        annotated_count = Annotations.query.filter_by(user_id=user_id).count()

        # Sample ids that already collected more than five annotations.
        saturated_ids = (
            db.session.query(Annotations.sample_id)
            .group_by(Annotations.sample_id)
            .having(db.func.count(Annotations.sample_id) > 5)
        )
        # Sample ids this user has already annotated.
        seen_ids = (
            db.session.query(Annotations.sample_id)
            .filter(Annotations.user_id == user_id)
        )
        next_sample = (
            db.session.query(Samples._id, Samples.text)
            .filter(~Samples._id.in_(saturated_ids), ~Samples._id.in_(seen_ids))
            .order_by(Samples._id.asc())
            .first()
        )
    except sqlalchemy.exc.OperationalError as err:
        logging.warning('Sample selection failed: %s', err)
        logging.info('Annotationss started')
        db.session.rollback()
        # Fall back to the very first sample, fetched with a single query.
        next_sample = (
            db.session.query(Samples._id, Samples.text)
            .order_by(Samples._id.asc())
            .first()
        )

    if next_sample is None:
        # Nothing left to annotate; unpacking None used to raise TypeError.
        flash('Žiadne ďalšie vzorky na anotovanie')
        return redirect('/')

    sample_id, sample_text = next_sample
    data = {
        'email': session.get('email'),
        'text': sample_text,
        'sample_id': sample_id,
        'annotated_count': annotated_count,
    }
    return render_template('anot.html', **data)
|||
|
|||
@app.route('/process_anot', methods=['POST'])
def process():
    """Store one annotation posted as JSON by the annotation page.

    Expected body: ``{"value": <label>, "sample_id": <id>}`` where the label
    is one of the three values sent by the buttons in ``anot.html``.

    Returns:
        A redirect to ``/anot`` on success, an empty 401 response when no
        user is logged in, or an empty 400 response for a malformed body.
    """
    allowed_labels = ('offensive', 'not_offensive', 'dont_know')

    if 'id_user' not in session:
        return '', 401

    # silent=True yields None instead of raising on a missing or invalid body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return '', 400
    if 'sample_id' not in payload or payload.get('value') not in allowed_labels:
        return '', 400

    db.session.add(Annotations(
        user_id=session['id_user'],
        sample_id=payload['sample_id'],
        label=payload['value'],
    ))
    db.session.commit()
    return redirect(url_for('anot'))
|||
|
|||
# Start Flask's built-in server when the file is executed directly.
# Binding to 0.0.0.0 makes it listen on all network interfaces.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5050)
File diff suppressed because it is too large
@ -0,0 +1,54 @@ |
|||
from models import db, Samples, Users, Annotations |
|||
from app import app |
|||
from sqlalchemy import text |
|||
|
|||
|
|||
|
|||
# Ad-hoc check of the "next sample for a user" query against the app database.
if __name__ == '__main__':
    with app.app_context():
        id_user = 4

        # The user id is passed as a bound parameter instead of being
        # formatted into the SQL string, and compared with "=" because
        # "IS <integer>" is only accepted by SQLite.
        query = text(
            '''
            SELECT samples.id, text
            FROM samples
            LEFT JOIN annotations
            ON samples.id = annotations.sample_id
            WHERE samples.id NOT IN (
                SELECT sample_id
                FROM annotations
                GROUP BY sample_id
                HAVING COUNT(sample_id) > 5
            )
            AND samples.id NOT IN (
                SELECT samples.id
                FROM samples
                LEFT JOIN annotations
                ON samples.id = annotations.sample_id
                WHERE annotations.user_id = :id_user
            )
            ORDER BY samples.id ASC
            LIMIT 1;
            '''
        )

        result = db.session.execute(query, {'id_user': id_user})
        print(result.fetchall())

        # Count in the database rather than loading every row to call len().
        print(Annotations.query.count())
@ -0,0 +1,26 @@ |
|||
from models import db, Samples |
|||
from app import app |
|||
import json |
|||
import os |
|||
|
|||
|
|||
|
|||
|
|||
# Create the database tables and load the samples from the JSONL dataset.
if __name__ == '__main__':
    with app.app_context():
        # creating database
        db.create_all()

        if Samples.query.first() is not None:
            # Running the script twice used to insert every sample again.
            print('Samples already present, skipping import')
        else:
            try:
                dataset_path = os.path.join('dataset', 'final_id_v2.jsonl')
                with open(dataset_path, encoding='utf-8') as file:
                    # Stream the file line by line; blank lines (for example
                    # a trailing newline) are skipped instead of breaking
                    # json.loads.
                    for line in file:
                        if line.strip():
                            db.session.add(Samples(json.loads(line)['text']))
                db.session.commit()
                print('Data sucessfully inserted')
            except FileNotFoundError as err:
                print(err)
Binary file not shown.
@ -0,0 +1,40 @@ |
|||
from flask_sqlalchemy import SQLAlchemy |
|||
|
|||
db = SQLAlchemy() |
|||
|
|||
class Annotations(db.Model):
    """One label given by one user to one sample."""
    __tablename__ = 'annotations'

    # Primary key; stored in the "id" column, exposed in Python as `_id`.
    _id = db.Column("id", db.Integer, primary_key=True)
    # Author of the annotation (references users.id).
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'), nullable=False)
    # Annotated sample (references samples.id).
    sample_id = db.Column(db.Integer, db.ForeignKey('samples.id'), nullable=False)
    # Label text, at most 32 characters.
    label = db.Column(db.String(32), nullable=False)

    def __init__(self, user_id, sample_id, label):
        """Create an annotation linking `user_id` and `sample_id` with `label`."""
        self.user_id = user_id
        self.sample_id = sample_id
        self.label = label
|||
|
|||
class Users(db.Model):
    """An annotator account, identified by a unique e-mail address."""
    __tablename__ = 'users'

    # Primary key; stored in the "id" column, exposed in Python as `_id`.
    _id = db.Column("id", db.Integer, primary_key=True)
    name = db.Column(db.String(32), nullable=False)
    surname = db.Column(db.String(32), nullable=False)
    email = db.Column(db.String(64), unique=True, nullable=False)
    # One user owns many annotations (Annotations.user_id references
    # users.id), so the relationship has to be list-valued. With
    # uselist=False SQLAlchemy loads a single object and warns as soon as a
    # user has more than one annotation.
    annotations = db.relationship('Annotations', backref='user', lazy=True)

    def __init__(self, name, surname, email):
        """Create a user from a first name, surname and e-mail address."""
        self.name = name
        self.surname = surname
        self.email = email
|||
|
|||
class Samples(db.Model):
    """A text sample to be annotated."""
    __tablename__ = 'samples'

    # Primary key; stored in the "id" column, exposed in Python as `_id`.
    _id = db.Column("id", db.Integer, primary_key=True)
    # Sample text, declared with a maximum length of 512 characters.
    text = db.Column(db.String(512), nullable=False)
    # All annotations of this sample; each Annotations row gets a `.sample`
    # attribute through the backref.
    annotations = db.relationship('Annotations', lazy=True, backref='sample')  # corrected relationship and added backref

    def __init__(self, text):
        """Create a sample holding `text`."""
        self.text = text
@ -0,0 +1,13 @@ |
|||
blinker==1.7.0 |
|||
click==8.1.7 |
|||
Flask==3.0.2 |
|||
Flask-SQLAlchemy==3.1.1 |
|||
importlib_metadata==7.0.2 |
|||
itsdangerous==2.1.2 |
|||
Jinja2==3.1.3 |
|||
MarkupSafe==2.1.5 |
|||
python-dotenv==1.0.1 |
|||
SQLAlchemy==2.0.28 |
|||
typing_extensions==4.10.0 |
|||
Werkzeug==3.0.1 |
|||
zipp==3.18.1 |
@ -0,0 +1,76 @@ |
|||
body, html { |
|||
height: 100%; |
|||
margin: 0; |
|||
display: flex; |
|||
justify-content: center; |
|||
align-items: center; |
|||
background-color: #FEFFFF; |
|||
} |
|||
|
|||
table { |
|||
border-radius: 8px; |
|||
} |
|||
.container{ |
|||
position: absolute; |
|||
top: 50%; |
|||
left: 50%; |
|||
transform: translate(-50%, -50%); |
|||
text-align: center; |
|||
} |
|||
|
|||
.btn { |
|||
background-color: #3AAFA9; |
|||
border: 1px solid #3AAFA9; |
|||
} |
|||
|
|||
.logout-btn{ |
|||
background-color: #454d55; |
|||
margin-top: 7.5%; |
|||
} |
|||
|
|||
#top-info{ |
|||
margin-top: 5%; |
|||
position: fixed; |
|||
top: 5%; |
|||
} |
|||
|
|||
.anot{ |
|||
border: 1px solid #000000; |
|||
border-radius: 10px; |
|||
padding: 10px; |
|||
} |
|||
|
|||
.anot-text { |
|||
padding: 2.5%; |
|||
} |
|||
|
|||
|
|||
.form-control{ |
|||
margin-bottom: 5px; |
|||
} |
|||
|
|||
#login{ |
|||
position: absolute; |
|||
top: -20vh; |
|||
left: 50%; |
|||
transform: translate(-50%, 50%); |
|||
border: 1px solid #000000; |
|||
border-radius: 8px; |
|||
padding: 4vh; |
|||
width: 500px; |
|||
} |
|||
|
|||
|
|||
.top-info { |
|||
width: 100%; |
|||
border-collapse: collapse; /* Optional: collapse border */ |
|||
} |
|||
|
|||
.top-info td { |
|||
border: 1px solid #000; /* Add border to table cells */ |
|||
padding: 8px; /* Optional: Add padding */ |
|||
} |
|||
|
|||
h3{ |
|||
margin-bottom: 3%; |
|||
} |
@ -0,0 +1,66 @@ |
|||
{% extends "base.html" %} |
|||
|
|||
{% block title %} annotation {% endblock %} |
|||
|
|||
{% block content%} |
|||
|
|||
<div class="container" id="top-info"> |
|||
<table class="table top-info"> |
|||
<thead class="thead-dark"> |
|||
<tr> |
|||
<th>Email</th> |
|||
<th>Počet anotovaných jednotiek</th> |
|||
</tr> |
|||
</thead> |
|||
<tr> |
|||
<td>{{ email }}</td> |
|||
<td>{{ annotated_count }}</td> |
|||
</tr> |
|||
</table> |
|||
</div> |
|||
|
|||
<div class="container"> |
|||
<div class="anot"> |
|||
<p class="anot-text">{{ text }}</p> |
|||
<button id="post" class="btn btn-primary" onclick="postBcknd('offensive', {{ sample_id }})">Ofenzívny</button> |
|||
<button id="post" class="btn btn-primary" onclick="postBcknd('not_offensive', {{ sample_id }})">Neofenzívny</button> |
|||
<button id="post" class="btn btn-primary" onclick="postBcknd('dont_know', {{ sample_id }})">Neviem</button> |
|||
</div> |
|||
|
|||
<button id="get" class="btn btn-primary logout-btn" onclick="logout()"> Odhlásiť sa</button> |
|||
|
|||
<script> |
|||
// Send one annotation (label + sample id) to the backend as JSON and reload
// the annotation page once the server answers with HTTP 200.
function postBcknd(value, sample_id){
    var request = new XMLHttpRequest();
    var payload = JSON.stringify({value: value, sample_id: sample_id});

    request.onload = function () {
        if (request.status !== 200) {
            console.log('request failed');
            return;
        }
        console.log('request sent succesfully');
        window.location.href = '/anot';
    };

    request.open('POST', '/process_anot', true);
    request.setRequestHeader('Content-Type', 'application/json');
    request.send(payload);
}
|||
|
|||
// Log out by navigating to the home page. The server-side handler for
// GET / removes the user from the session, so a preliminary XHR to the
// same URL only duplicated the request.
function logout() {
    window.location.href = '/';
}
|||
</script> |
|||
|
|||
{% endblock %} |
@ -0,0 +1,17 @@ |
|||
<!doctype html> |
|||
<html> |
|||
<head> |
|||
{% block head %} |
|||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css"> |
|||
<link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='stylesheets/styles.css') }}" /> |
|||
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|||
<title>{% block title %}{% endblock %}</title> |
|||
{% endblock %} |
|||
</head> |
|||
<body> |
|||
<div id="content"> |
|||
{% block content %} |
|||
{% endblock %} |
|||
</div> |
|||
</body> |
|||
</html> |
@ -0,0 +1,19 @@ |
|||
{% extends "base.html" %} |
|||
{% block title %} Welcome {% endblock %} |
|||
|
|||
{% block content %} |
|||
|
|||
<div class="container"> |
|||
<form action="/login" id="login" method="post" > |
|||
<h3>Login anotačnej aplikácie</h3> |
|||
<input type="text" name="email" placeholder="meno.priezvisko@student.tuke.sk" class="form-control"> |
|||
<button id="post" class="btn btn-primary login-btn">Prihlásiť sa</button> |
|||
{% with messages = get_flashed_messages() %} |
|||
{% if messages %} |
|||
<p style="margin-top: 2%;"> {{ messages[0] }} </p> |
|||
{% endif %} |
|||
{% endwith %} |
|||
</form> |
|||
</div> |
|||
|
|||
{% endblock %} |
@ -0,0 +1,134 @@ |
|||
import os |
|||
import sys |
|||
import time |
|||
import json |
|||
|
|||
from selenium import webdriver |
|||
from selenium.webdriver.common.by import By |
|||
from selenium.webdriver.chrome.options import Options |
|||
from selenium.webdriver.support.ui import WebDriverWait |
|||
from selenium.webdriver.support import expected_conditions as EC |
|||
from selenium.webdriver.common.keys import Keys |
|||
from selenium.webdriver.common.action_chains import ActionChains |
|||
import selenium.common.exceptions |
|||
from Facebook.facebook_parser import FacebookParser |
|||
|
|||
from crawler import Crawler |
|||
|
|||
|
|||
class FacebookCrawler(Crawler, FacebookParser):
    """Open Facebook post/video URLs, expand their comments and parse them.

    URLs are read from ``Facebook/inputs/<file_name>`` and CSS-class locators
    from ``locators.json``. For every URL the comment section is expanded in
    the browser and the rendered page is handed to ``parse``.

    NOTE(review): ``self.driver`` and ``self.filename`` are not assigned in
    this class; presumably ``Crawler.__init__`` sets them - confirm.
    """

    # XPath selectors that are used in more than one place.
    _SORT_DROPDOWN_XPATH = "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']"
    _MORE_COMMENTS_XPATH = "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]"
    _REPLIES_XPATH = "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]"

    def __init__(self, base_url: str, file_name: str):
        """Load the locator table and the list of URLs to crawl.

        Exits the process with status 1 when either file is missing.
        """
        super().__init__(base_url, file_name)

        try:
            with open(os.path.join('locators.json')) as file:
                self.locators = json.load(file)

            with open(os.path.join('Facebook', 'inputs', self.filename)) as file:
                self.URLS = tuple(file.readlines())
        except FileNotFoundError:
            print(os.path.join('Facebook', 'inputs', self.filename))
            print("Invalid input value")
            sys.exit(1)

    def _locators_for(self, url: str):
        """Return the locator dict matching the URL type, or None if unknown."""
        if '/videos/' in url:
            # The locator file seen next to this module spells the key
            # "facebok_video_locators"; accept both spellings.
            return self.locators.get(
                'facebook_video_locators',
                self.locators.get('facebok_video_locators'),
            )
        if '/posts/' in url:
            return self.locators.get('facebook_post_locators')
        return None

    # crawling part of the code
    def crawl(self):
        """Visit every input URL, expand its comments and parse the page."""
        total = len(self.URLS)
        for idx, raw_url in enumerate(self.URLS):
            # readlines() keeps the trailing newline; blank lines are skipped.
            url = raw_url.strip()
            if not url:
                continue

            class_dict = self._locators_for(url)
            if class_dict is None:
                # Previously this case reused the locators of the preceding
                # URL or failed on a missing attribute / key.
                print(f'No locators for URL, skipping: {url}')
                continue
            self.class_dict = class_dict

            # redirect and wait for page to load
            self.driver.get(url)
            self.driver.implicitly_wait(4)

            if '/videos/' in url:
                if not self.driver.find_elements(By.XPATH, self._SORT_DROPDOWN_XPATH):
                    # Sort dropdown not present yet: click the element that
                    # the original code clicked in this situation.
                    try:
                        self.driver.find_element(By.XPATH, "//span[contains(@class, 'x6ikm8r x10wlt62 xlyipyv')]").click()
                    except selenium.common.exceptions.NoSuchElementException:
                        print('Cant crawl comments section')
                        continue
                    # find_element never returns None (it raises), so the
                    # presence test has to use find_elements.
                    if self.driver.find_elements(By.XPATH, "//div[contains(@class, 'x78zum5 xdt5ytf x1iyjqo2 x7ywyr2')]"):
                        print('Cant crawl comments section')
                        continue

                self.close_censorship('Newest')
            else:
                # Only waits (up to the implicit timeout) for the sort
                # dropdown to show up; the result itself is not needed.
                self.driver.find_elements(By.XPATH, self._SORT_DROPDOWN_XPATH)
                self.close_censorship('All comments')

            self.driver.implicitly_wait(3)
            print('continue scraping')

            # clicking features
            self.view_more_comments()
            self.show_replies()
            self.click_see_more()

            # parsing part of the code
            self.parse(self.driver.page_source, self.class_dict, self.filename)
            print(f'Done: [{idx + 1}/{total}]')

    def view_more_comments(self):
        """Click "View ... more comments" until no such link is left."""
        elements = self.driver.find_elements(By.XPATH, self._MORE_COMMENTS_XPATH)
        while elements:
            try:
                self.driver.execute_script("arguments[0].click();", elements[0])
                elements = self.driver.find_elements(By.XPATH, self._MORE_COMMENTS_XPATH)
                # Scroll to the bottom of the page after each click.
                self.driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
            except selenium.common.exceptions.StaleElementReferenceException:
                # The DOM changed under us; look the links up again.
                elements = self.driver.find_elements(By.XPATH, self._MORE_COMMENTS_XPATH)

    # function, for showing hidden replies
    def show_replies(self):
        """Click every matching "... repl..." link until none is left."""
        repl_elements = self.driver.find_elements(By.XPATH, self._REPLIES_XPATH)
        while repl_elements:
            try:
                for element in repl_elements:
                    self.driver.execute_script("arguments[0].click();", element)
                    time.sleep(0.5)
            except selenium.common.exceptions.StaleElementReferenceException:
                # Elements were replaced while clicking; re-query below.
                pass
            repl_elements = self.driver.find_elements(By.XPATH, self._REPLIES_XPATH)

    # method for expanding comments
    def click_see_more(self):
        """Click every "See more" element currently on the page."""
        for element in self.driver.find_elements(By.XPATH, "//*[text()='See more']"):
            try:
                self.driver.execute_script("arguments[0].click();", element)
            except selenium.common.exceptions.StaleElementReferenceException:
                # The element vanished after an earlier click; skip it.
                continue

    # method for switching the comment filter (e.g. from "Most relevant" to "Newest")
    def close_censorship(self, classification: str, max_attempts: int = 5):
        """Open the comment-sort dropdown and pick the entry `classification`.

        Args:
            classification: Visible text of the entry to select.
            max_attempts: How many times to retry before giving up. The
                previous implementation retried by unbounded recursion.

        Returns:
            True when the entry was clicked, False after `max_attempts`
            failed attempts.
        """
        self.driver.implicitly_wait(3)
        for _ in range(max_attempts):
            try:
                dropdown = self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest']")
                self.driver.execute_script("arguments[0].click();", dropdown)  # clicking on it

                option = self.driver.find_element(By.XPATH, f"//*[text()='{classification}']")
                self.driver.execute_script("arguments[0].click();", option)  # clicking on it
                return True
            except selenium.common.exceptions.WebDriverException:
                continue

        print(f"Could not switch comment filter to '{classification}'")
        return False

    def close(self):
        """Quit the browser and terminate the process with status 0."""
        print('Scraping ended succesffuly')
        self.driver.quit()
        sys.exit(0)
@ -0,0 +1,129 @@ |
|||
from bs4 import BeautifulSoup |
|||
from print_dict import pd |
|||
import json |
|||
import sys |
|||
import os |
|||
|
|||
class FacebookParser:
    """Turn the rendered HTML of a Facebook post into a nested dict
    (post -> comments -> replies) and append it to a JSON file."""

    def parse(self, html, clsDict, fname = 'final_dataset.json'):
        """Parse one post page and append the result to the output file.

        Args:
            html: Page source of the post.
            clsDict: Locator table with the keys POST_AUTHOR, TOP_LABEL,
                TITLE_LIKES, STATUS_STRINGS, COMMENT_AUTHOR, COMMENT_STR,
                TMP_COMMENTS_CLASS, REPLY_DIVIDER and REPLY_DIVIDER_2.
            fname: Input file name; its stem names the output file
                ``outputs/parts/<stem>_data.json``.

        Side effects:
            Sets ``self.soup``, ``self.outFileName`` and ``self.post_data``.
            When a post author is found, appends the post to the output
            file and removes the first line of
            ``Facebook/inputs/<self.filename>``. Returns early, without
            writing anything, when no post author is found.
        """
        self.soup = BeautifulSoup(html, 'lxml')
        self.outFileName = f"outputs/parts/{fname.split('.')[0]}_data.json"

        # post info
        author = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})
        reactions = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})

        self.post_data = {
            'publisher': author.text if author is not None else None,
            'title': self._extract_title(clsDict),
            'post_reactions': reactions.text if reactions is not None else None,
            'comments': []
        }

        if self.post_data['publisher'] is None:
            return

        self._collect_comments(clsDict)
        self._append_post_to_output()
        self._drop_first_input_url()

    def _extract_title(self, clsDict):
        """Return the post text joined with '. ', or None if no container is found."""
        if clsDict['TOP_LABEL'] == 'message':
            top = self.soup.find('div', {'data-ad-comet-preview': clsDict['TOP_LABEL']})
        else:
            top = self.soup.find('div', {'class': clsDict['TOP_LABEL']})
            # NOTE(review): indentation was lost in the reviewed copy; this
            # fallback is assumed to belong to the class-based branch only.
            if top is None:
                top = self.soup.find('div', {'class': 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'})

        if top is None:
            return None
        parts = top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
        return ''.join(f'{part.text}. ' for part in parts)

    def _collect_comments(self, clsDict):
        """Fill self.post_data['comments'] with comments and nested replies."""
        comment_prefix = clsDict['TMP_COMMENTS_CLASS']
        reply_divider = clsDict['REPLY_DIVIDER']
        nested_reply_divider = clsDict['REPLY_DIVIDER_2']
        comments = self.post_data['comments']

        # every div whose aria-label ends with "ago" is a comment or a reply
        for item in self.soup.find_all("div", {"aria-label": lambda x: x and x.endswith("ago")}):
            # attrs must be a dict; the former {'class', value} was a set literal
            publisher = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})
            txt = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                kind = item.get('aria-label').split(' ')[0]
                divider = item.find('div', {'class': lambda x: x and x.startswith(comment_prefix)}).get('class')[-1]

                if kind == "Comment":
                    comments.append({
                        'publisher': publisher.text,
                        'text': txt.text if txt is not None else None,
                        'replies': []
                    })
                elif kind == "Reply" and divider == reply_divider:
                    comments[-1]['replies'].append({
                        'publisher': publisher.text,
                        'text': txt.text if txt is not None else None,
                        'replies': []
                    })
                elif kind == "Reply" and divider == nested_reply_divider:
                    comments[-1]['replies'][-1]['replies'].append({
                        'publisher': publisher.text,
                        'text': txt.text if txt is not None else None,
                    })
            except (AttributeError, IndexError, TypeError):
                # Best effort: skip items with a missing author/marker and
                # replies that have no parent to attach to.
                continue

    def _append_post_to_output(self):
        """Append self.post_data to the JSON list stored in self.outFileName."""
        # Create outputs/parts on first use so a long crawl is not lost.
        os.makedirs(os.path.dirname(self.outFileName), exist_ok=True)

        posts = []
        if os.path.exists(self.outFileName):
            with open(self.outFileName, 'r', encoding="utf-8") as file:
                posts = json.load(file)
        posts.append(self.post_data)

        # Rewrite the whole file; mode 'w' truncates any previous content.
        with open(self.outFileName, 'w', encoding="utf-8") as file:
            json.dump(posts, file, indent=4, separators=(',', ': '))

    def _drop_first_input_url(self):
        """Remove the first line of the input URL file; exit(1) if it is missing."""
        try:
            with open(os.path.join('Facebook', 'inputs', self.filename), 'r+') as file:
                lines = file.readlines()
                # move file pointer to the beginning of the file
                file.seek(0)
                # truncate the file
                file.truncate()
                # write back every line except the first one
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)
@ -0,0 +1,155 @@ |
|||
from selenium import webdriver |
|||
from selenium.webdriver.common.by import By |
|||
from selenium.webdriver.chrome.options import Options |
|||
from selenium.webdriver.support.ui import WebDriverWait |
|||
from selenium.webdriver.support import expected_conditions as EC |
|||
from webdriver_manager.chrome import ChromeDriverManager |
|||
|
|||
from bs4 import BeautifulSoup |
|||
|
|||
import os |
|||
import sys |
|||
import time |
|||
import argparse |
|||
|
|||
|
|||
# parse args |
|||
# Command-line interface: a single positional argument `URL`.
# These are module-level statements, so the arguments are parsed as soon as
# the module is executed or imported.
parser = argparse.ArgumentParser(description = "Facebook scraper")
parser.add_argument("URL", help = 'URL of the facebook page / profile') # dont need to specify type (default is string)
args = parser.parse_args()
|||
|
|||
|
|||
# method for waiting on pages to load |
|||
def wait_for_url(driver, url):
    """Wait up to 10 seconds until the browser's current URL equals `url`.

    Prints a success message when the URL is reached. On any error (for
    example a timeout) prints 'Connection error', quits the driver and exits
    the process with status 1.
    """
    # waiting to load main page
    try:
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
    except Exception:
        # `except Exception` instead of a bare except, so Ctrl-C and
        # SystemExit are no longer reported as a connection error.
        print('Connection error')
        driver.quit()
        sys.exit(1)
    else:
        print('Succesful !')
|||
|
|||
# web driver init |
|||
def webdriver_setup():
    """Start a headless Chrome session and open the Facebook start page.

    Returns:
        The Selenium WebDriver instance.
    """
    # Imported here because the module does not import Service at the top.
    from selenium.webdriver.chrome.service import Service

    chrome_options = Options()
    chrome_options.add_argument("accept-language=en-US")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=OFF")

    # Selenium 4 takes the driver binary through a Service object; passing
    # the path as the first positional argument is no longer supported.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    driver.get("https://www.facebook.com/")
    return driver
|||
|
|||
# login to a facebook acc |
|||
def login(driver):
    """Log in to Facebook with the credentials from the environment.

    Reads ``FB_EMAIL`` and ``FB_PASSWD`` from ``os.environ`` (KeyError if
    unset), submits the login form and then sleeps for five seconds.
    """
    # Imported here because the module does not import it at the top.
    from selenium.common.exceptions import WebDriverException

    print('Logging in')

    # Dismiss the cookie dialog by declining optional cookies. Best effort:
    # the dialog may not be shown, so Selenium errors are ignored here.
    try:
        driver.find_element(By.XPATH, "//button[contains(string(), 'Decline optional cookies')]").click()
    except WebDriverException:
        pass

    # insert login data
    driver.find_element(By.NAME, "email").send_keys(os.environ['FB_EMAIL'])  # type email
    driver.find_element(By.NAME, "pass").send_keys(os.environ['FB_PASSWD'])  # type password

    # click -> log in
    driver.find_element(By.NAME, "login").click()
    time.sleep(5)
|||
|
|||
# scrolling to the bottom of the page |
|||
def crawl_for_links(driver, url):
    """Open `url` and keep scrolling to the bottom so more content loads.

    Up to three passes are made; each pass scrolls at most 50 times and ends
    early once the page height stops growing.

    Returns:
        A tuple ``(page_source, name)`` where `name` is the text of the last
        ``<h2>`` element found right after the page was opened.
    """
    scroll_to_bottom = "var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;"
    read_height = "return document.body.scrollHeight"

    print('Crawling')
    driver.get(url)
    time.sleep(2)
    name = driver.find_elements(By.TAG_NAME, 'h2')[-1].text

    iteration = 1
    for _pass in range(3):
        previous_height = driver.execute_script(read_height)
        for _step in range(50):
            driver.execute_script(scroll_to_bottom)

            # give the page time to load the next batch of content
            time.sleep(3)

            current_height = driver.execute_script(read_height)
            if current_height == previous_height:
                # nothing new was loaded; end this pass
                break
            previous_height = current_height
            os.system('clear||cls')
            print(f'Iteration num: {iteration}')
            iteration += 1

    return driver.page_source, name
|||
|
|||
# parse HTML |
|||
def parse_html(html):
    """Collect links to posts whose comment counter passes `approveComments`.

    Returns:
        A list of href strings. The list is empty when the timeline
        container cannot be found in `html`.
    """
    soup = BeautifulSoup(html, 'lxml')

    timeline = soup.find('div', {'class': 'x9f619 x1n2onr6 x1ja2u2z xeuugli xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj xu9j1y6 x7ep2pv'})
    if timeline is None:
        # The class list no longer matches the page; the lookup used to fail
        # with an uncaught AttributeError at this point.
        return []

    arr = []
    for post in timeline.find_all('div', {'class': 'x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z'}):
        commentsWidget = post.find('span', {'class': 'x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa'})
        if commentsWidget is None:
            # post without a comment counter
            continue
        try:
            if approveComments(commentsWidget.text):
                link = extractPostLink(post.find_all('a', {'role': 'link'}))
                # keep only posts for which a post/video link was found
                if link is not None:
                    arr.append(link)
        except AttributeError:
            pass

    return arr
|||
|
|||
def extractPostLink(links):
    """Return the first href that points at a post or a video.

    Args:
        links: Iterable of anchor-like objects offering ``.get('href')``.

    Returns:
        The href string, or None when no link contains '/videos/' or
        '/posts/'. Anchors without an href attribute are skipped.
    """
    for link in links:
        href = link.get('href')
        if href and ('/videos/' in href or '/posts/' in href):
            return href
    return None
|||
|
|||
# check if post has more than 50 comments
def approveComments(text):
    """Tell whether a comment-counter label reports more than 50 comments.

    Args:
        text: Counter label whose first space-separated token is the count,
            e.g. "123 comments" or "1.2K comments".

    Returns:
        True for a plain number above 50 and for abbreviated counts that
        start with a digit and end in 'K' or 'M'; False otherwise.
    """
    nComments = text.split(' ')[0]
    try:
        return int(nComments) > 50
    except ValueError:
        # The former `'K' or 'M' in nComments` always evaluated to 'K', so
        # every non-numeric label was accepted.
        return nComments[-1:] in ('K', 'M') and nComments[:1].isdigit()
|||
|
|||
# write all the links to the .txt |
|||
def write_out(arr, name):
    """Write the collected links, one per line, to ``inputs/<name>.txt``.

    The file name is `name` stripped, lower-cased and with spaces replaced
    by underscores. The ``inputs`` directory is created in the current
    working directory when it does not exist yet. Items that are not
    strings (e.g. None) are skipped.
    """
    inputs_dir = os.path.join(os.getcwd(), 'inputs')
    # Without this a missing directory made the write fail after the crawl.
    os.makedirs(inputs_dir, exist_ok=True)

    file_name = f"{name.strip().replace(' ', '_').lower()}.txt"
    with open(os.path.join(inputs_dir, file_name), 'w', encoding='utf-8') as f:
        f.writelines(f'{item}\n' for item in arr if isinstance(item, str))
|||
|
|||
|
|||
# Script entry point: log in, crawl the page given on the command line and
# store the links of its posts.
if __name__ == '__main__':
    # driver init
    driver = webdriver_setup()
    wait_for_url(driver, 'https://www.facebook.com/')

    # login
    login(driver)
    wait_for_url(driver, 'https://www.facebook.com/')

    # crawl
    html, name = crawl_for_links(driver, args.URL)
    # quit() ends the whole browser session and the driver process;
    # close() only closes the current window.
    driver.quit()

    # parsing HTML
    arr = parse_html(html)

    # write out
    write_out(arr, name)
@ -0,0 +1,24 @@ |
|||
{ |
|||
"facebook_post_locators": { |
|||
"POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f", |
|||
"TOP_LABEL": "xjkvuk6 xuyqlj2 x1odjw0f", |
|||
"TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq", |
|||
"STATUS_STRINGS": "text-align: start;", |
|||
"COMMENT_AUTHOR": "x3nfvp2", |
|||
"COMMENT_STR": "x1lliihq xjkvuk6 x1iorvi4", |
|||
"TMP_COMMENTS_CLASS": "xqcrz7y", |
|||
"REPLY_DIVIDER": "x1k70j0n", |
|||
"REPLY_DIVIDER_2": "x1n2onr6" |
|||
}, |
|||
"facebook_video_locators": { |
|||
"POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f", |
|||
"TOP_LABEL": "message", |
|||
"TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq", |
|||
"STATUS_STRINGS": "text-align: start;", |
|||
"COMMENT_AUTHOR": "x3nfvp2", |
|||
"COMMENT_STR": "xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs", |
|||
"TMP_COMMENTS_CLASS": "xqcrz7y", |
|||
"REPLY_DIVIDER": "x1k70j0n", |
|||
"REPLY_DIVIDER_2": "x1n2onr6" |
|||
} |
|||
} |
@ -0,0 +1,30 @@ |
|||
import argparse |
|||
|
|||
from Facebook.facebook_crawler import FacebookCrawler |
|||
from Reddit.reddit_crawler import RedditCrawler |
|||
|
|||
FACEBOOK_URL = 'https://www.facebook.com/' |
|||
REDDIT_URL = 'https://www.reddit.com/' |
|||
|
|||
# Entry point of the harvester: ask which site to scrape and run its crawler.
if __name__ == '__main__':
    # parsing arguments
    parser = argparse.ArgumentParser(description = "Facebook scraper")
    parser.add_argument("file_name", help = 'Name of the .txt file with URLS')
    args = parser.parse_args()

    # Normalise the answer once so that "f" and "F" select the same branch;
    # a lowercase "f" used to pass the loop and then start the Reddit branch.
    user_input = input('Hello, do you want to scraper Facebook or reddit? [F/r]: ').strip().upper()
    while user_input not in ['F', 'R']:
        user_input = input('Do you want to scrape Facebook or reddit? [F/r]: ').strip().upper()

    if user_input == 'F':
        facebook = FacebookCrawler(FACEBOOK_URL, args.file_name)
        facebook.allow_cookies()
        facebook.login()
        facebook.crawl()
    else:
        reddit = RedditCrawler(REDDIT_URL, args.file_name)
        print(reddit)
@ -0,0 +1,4 @@ |
|||
beautifulsoup4==4.12.2 |
|||
print_dict==0.1.19 |
|||
selenium==4.10.0 |
|||
webdriver_manager==3.8.6 |
@ -0,0 +1,176 @@ |
|||
{ |
|||
"cells": [ |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"# Data chunking for efficiency\n", |
|||
"\n", |
|||
"In our data, the Facebook user Robert Fico has a very large number of samples.\n", |
|||
"For efficiency, this notebook splits that data into 4 chunks." |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 1, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"import json" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### JSONL file loading and creation" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 2, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def load_jsonl(file_path):\n", |
|||
" with open(file_path, 'r', encoding='utf-8') as file:\n", |
|||
" return [json.loads(line) for line in file]" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 3, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def create_jsonl(filename, new_dataset):\n", |
|||
" with open(f'{filename}l', 'w') as jsonl_file:\n", |
|||
" for item in new_dataset:\n", |
|||
" jsonl_file.write(json.dumps(item) + '\\n')" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 4, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"fico = load_jsonl('jsonl_data/robert_fico_data.jsonl')" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Split data into 4 equal parts" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 5, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"data": { |
|||
"text/plain": [ |
|||
"135155" |
|||
] |
|||
}, |
|||
"execution_count": 5, |
|||
"metadata": {}, |
|||
"output_type": "execute_result" |
|||
} |
|||
], |
|||
"source": [ |
|||
"num_samples = len(fico)\n", |
|||
"chunk_size = int(num_samples / 4)\n", |
|||
"\n", |
|||
"num_samples" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 6, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"data": { |
|||
"text/plain": [ |
|||
"False" |
|||
] |
|||
}, |
|||
"execution_count": 6, |
|||
"metadata": {}, |
|||
"output_type": "execute_result" |
|||
} |
|||
], |
|||
"source": [ |
|||
"chunk_size * 4 == num_samples # False: int() truncation leaves the last 3 samples (135155 % 4 == 3) outside the 4 chunks" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Actual chunking algorithm" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 18, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"chunk_arr = []\n", |
|||
"for chunks in range(0, 4):\n", |
|||
" chunk_arr.append(\n", |
|||
" fico[chunk_size * chunks: chunk_size * (chunks + 1)]\n", |
|||
" )" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Write chunked data to disk in a for loop" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 20, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"for index, data in enumerate(chunk_arr):\n", |
|||
" create_jsonl(f'jsonl_data/fico_chunk_{index}.json', data)" |
|||
] |
|||
} |
|||
], |
|||
"metadata": { |
|||
"kernelspec": { |
|||
"display_name": "sentiment", |
|||
"language": "python", |
|||
"name": "python3" |
|||
}, |
|||
"language_info": { |
|||
"codemirror_mode": { |
|||
"name": "ipython", |
|||
"version": 3 |
|||
}, |
|||
"file_extension": ".py", |
|||
"mimetype": "text/x-python", |
|||
"name": "python", |
|||
"nbconvert_exporter": "python", |
|||
"pygments_lexer": "ipython3", |
|||
"version": "3.9.18" |
|||
}, |
|||
"orig_nbformat": 4 |
|||
}, |
|||
"nbformat": 4, |
|||
"nbformat_minor": 2 |
|||
} |
File diff suppressed because one or more lines are too long
@ -0,0 +1,389 @@ |
|||
{ |
|||
"cells": [ |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"# This notebook is clustering samples based on their semantic similarity.\n" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 1, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"name": "stderr", |
|||
"output_type": "stream", |
|||
"text": [ |
|||
"/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", |
|||
" from .autonotebook import tqdm as notebook_tqdm\n" |
|||
] |
|||
} |
|||
], |
|||
"source": [ |
|||
"# imports \n", |
|||
"\n", |
|||
"from sentence_transformers import SentenceTransformer, util\n", |
|||
"from tqdm import tqdm\n", |
|||
"import numpy as np\n", |
|||
"import torch\n", |
|||
"import numpy as np\n", |
|||
"import warnings\n", |
|||
"import json\n", |
|||
"import os\n", |
|||
"\n", |
|||
"warnings.filterwarnings(\"ignore\")" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Model init\n", |
|||
"\n", |
|||
"This clustering process uses the TUKE-DeutscheTelekom/slovakbert-skquad-mnlr model." |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 2, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"model = SentenceTransformer('TUKE-DeutscheTelekom/slovakbert-skquad-mnlr')" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Data manipulation in file system" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 3, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def load_jsonl(file_path):\n", |
|||
" with open(file_path, 'r', encoding='utf-8') as file:\n", |
|||
" return [json.loads(line) for line in file]" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"## Pipeline functions" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Embedding creation" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 4, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def create_embeddings(jsonl_file):\n", |
|||
" sentences = [item['text'] for item in jsonl_file]\n", |
|||
" return model.encode(sentences), sentences" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Clustering algorithm" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 5, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def cluster_data(embeddings, sentences):\n", |
|||
" embeddings_np = np.array(embeddings)\n", |
|||
"\n", |
|||
" similarity_threshold = 0.65\n", |
|||
"\n", |
|||
" long_enough_mask = np.array([len(sentence) > 20 for sentence in sentences])\n", |
|||
"\n", |
|||
" cosine_sim_matrix = util.pytorch_cos_sim(torch.tensor(embeddings_np), torch.tensor(embeddings_np)).numpy()\n", |
|||
"\n", |
|||
" below_threshold_mask = cosine_sim_matrix < similarity_threshold\n", |
|||
"\n", |
|||
" filtered_mask = np.logical_and(below_threshold_mask, np.outer(long_enough_mask, long_enough_mask))\n", |
|||
"\n", |
|||
" non_spam_indices = np.where(filtered_mask)\n", |
|||
"\n", |
|||
" filtered_sentences = list(set([sentences[i] for i in non_spam_indices[0]]))\n", |
|||
"\n", |
|||
" return filtered_sentences" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Prepare data to write it to JSONL" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 12, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def filter_null_text(json_list):\n", |
|||
" filtered_list = [obj for obj in json_list if \"text\" in obj and obj[\"text\"] is not None]\n", |
|||
" return filtered_list" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 6, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def create_jsonl_format(filtered, jsonl_file):\n", |
|||
"\n", |
|||
" return [\n", |
|||
" {\n", |
|||
" 'id': item['id'],\n", |
|||
" 'author': item['author'],\n", |
|||
" 'text': item['text']\n", |
|||
" }\n", |
|||
" for item in jsonl_file if item['text'] in filtered\n", |
|||
" ]" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Write out JSONL file" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 7, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def write_jsonl(filename, data):\n", |
|||
" with open(filename, 'w') as f:\n", |
|||
" for item in data:\n", |
|||
" json.dump(item, f)\n", |
|||
" f.write('\\n')" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Pipeline execution" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 8, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def execute_pipeline(jsonl_file):\n", |
|||
" embeddings, sentences = create_embeddings(jsonl_file)\n", |
|||
" filtered_data = cluster_data(embeddings, sentences)\n", |
|||
" return create_jsonl_format(filtered_data, jsonl_file)" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"# Pipeline use case" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"prepare data for clustering in a loop" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 9, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"data": { |
|||
"text/plain": [ |
|||
"['aktuality_data.jsonl',\n", |
|||
" 'denník_n_data.jsonl',\n", |
|||
" 'televízia_joj_data.jsonl',\n", |
|||
" 'fakty_data.jsonl',\n", |
|||
" 'erik_kaliňák_data.jsonl',\n", |
|||
" 'zomri_data.jsonl',\n", |
|||
" 'igor_matovic_data.jsonl',\n", |
|||
" 'peter_marcin_data.jsonl',\n", |
|||
" 'ján_koleník_data.jsonl',\n", |
|||
" 'eva_-_hriešne_dobrá_data.jsonl',\n", |
|||
" 'emefka_data.jsonl',\n", |
|||
" 'marek_hamsik_data.jsonl',\n", |
|||
" 'hetrik_data.jsonl',\n", |
|||
" 'peter_sagan_data.jsonl',\n", |
|||
" 'marian_čekovský_data.jsonl',\n", |
|||
" 'zuzana_čaputová_data.jsonl',\n", |
|||
" 'sajfa_data.jsonl',\n", |
|||
" 'marian_kotleba_data.jsonl',\n", |
|||
" 'fico_chunk_3.jsonl',\n", |
|||
" 'fico_chunk_1.jsonl',\n", |
|||
" 'šport_v_rtvs_data.jsonl',\n", |
|||
" 'dominika_cibulkova_data.jsonl',\n", |
|||
" 'šport24_data.jsonl',\n", |
|||
" 'niké_liga_data.jsonl',\n", |
|||
" 'fico_chunk_0.jsonl',\n", |
|||
" 'ok,ale_ideš_prvý_:d_data.jsonl',\n", |
|||
" 'fico_chunk_2.jsonl']" |
|||
] |
|||
}, |
|||
"execution_count": 9, |
|||
"metadata": {}, |
|||
"output_type": "execute_result" |
|||
} |
|||
], |
|||
"source": [ |
|||
"data_to_cluster = [x for x in os.listdir('jsonl_data')]\n", |
|||
"\n", |
|||
"data_to_cluster.remove('robert_fico_data.jsonl')\n", |
|||
"\n", |
|||
"data_to_cluster" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"Executing the actual pipeline" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 13, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"name": "stderr", |
|||
"output_type": "stream", |
|||
"text": [ |
|||
"100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] \n" |
|||
] |
|||
} |
|||
], |
|||
"source": [ |
|||
"for dataset_name in tqdm(data_to_cluster):\n", |
|||
" dataset = load_jsonl(f'jsonl_data/{dataset_name}')\n", |
|||
" dataset = filter_null_text(dataset)\n", |
|||
" write_jsonl(f'clustered_jsonl/{dataset_name}', execute_pipeline(dataset))\n", |
|||
" " |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": null, |
|||
"metadata": {}, |
|||
"outputs": [ |
|||
{ |
|||
"data": { |
|||
"text/plain": [ |
|||
"['aktuality_data.jsonl',\n", |
|||
" 'denník_n_data.jsonl',\n", |
|||
" 'televízia_joj_data.jsonl',\n", |
|||
" '.DS_Store',\n", |
|||
" 'fakty_data.jsonl',\n", |
|||
" 'erik_kaliňák_data.jsonl',\n", |
|||
" 'zomri_data.jsonl',\n", |
|||
" 'igor_matovic_data.jsonl',\n", |
|||
" 'peter_marcin_data.jsonl',\n", |
|||
" 'ján_koleník_data.jsonl',\n", |
|||
" 'eva_-_hriešne_dobrá_data.jsonl',\n", |
|||
" 'emefka_data.jsonl',\n", |
|||
" 'marek_hamsik_data.jsonl',\n", |
|||
" 'hetrik_data.jsonl',\n", |
|||
" 'peter_sagan_data.jsonl',\n", |
|||
" 'marian_čekovský_data.jsonl',\n", |
|||
" 'zuzana_čaputová_data.jsonl',\n", |
|||
" 'sajfa_data.jsonl',\n", |
|||
" 'marian_kotleba_data.jsonl',\n", |
|||
" 'fico_chunk_3.jsonl',\n", |
|||
" 'fico_chunk_1.jsonl',\n", |
|||
" 'šport_v_rtvs_data.jsonl',\n", |
|||
" 'dominika_cibulkova_data.jsonl',\n", |
|||
" 'šport24_data.jsonl',\n", |
|||
" 'niké_liga_data.jsonl',\n", |
|||
" 'fico_chunk_0.jsonl',\n", |
|||
" 'ok,ale_ideš_prvý_:d_data.jsonl',\n", |
|||
" 'fico_chunk_2.jsonl']" |
|||
] |
|||
}, |
|||
"execution_count": 11, |
|||
"metadata": {}, |
|||
"output_type": "execute_result" |
|||
} |
|||
], |
|||
"source": [ |
|||
"os.listdir('jsonl_data')" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": null, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [] |
|||
} |
|||
], |
|||
"metadata": { |
|||
"kernelspec": { |
|||
"display_name": "sentiment", |
|||
"language": "python", |
|||
"name": "python3" |
|||
}, |
|||
"language_info": { |
|||
"codemirror_mode": { |
|||
"name": "ipython", |
|||
"version": 3 |
|||
}, |
|||
"file_extension": ".py", |
|||
"mimetype": "text/x-python", |
|||
"name": "python", |
|||
"nbconvert_exporter": "python", |
|||
"pygments_lexer": "ipython3", |
|||
"version": "3.9.16" |
|||
}, |
|||
"orig_nbformat": 4 |
|||
}, |
|||
"nbformat": 4, |
|||
"nbformat_minor": 2 |
|||
} |
@ -0,0 +1,181 @@ |
|||
{ |
|||
"cells": [ |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"# JSON to JSONL file converter\n", |
|||
"This notebook converts a structured JSON file into a simpler JSONL form for easier data manipulation." |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 27, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"# imports \n", |
|||
"import json\n", |
|||
"import os" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Open JSON data, then write it as JSONL" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 28, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def open_json(filename):\n", |
|||
" # Read the JSON file\n", |
|||
" with open(filename, 'r') as json_file:\n", |
|||
" return json.load(json_file)" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 29, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def create_jsonl(filename, new_dataset):\n", |
|||
" with open(f'{filename}l', 'w') as jsonl_file:\n", |
|||
" for item in new_dataset:\n", |
|||
" jsonl_file.write(json.dumps(item) + '\\n')" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Loop through dataset, create new list of dictionaries, drop duplicate data" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 30, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def traverse_dataset(dataset):\n", |
|||
" new_dataset = []\n", |
|||
" for post in dataset:\n", |
|||
" new_dataset.append(post)\n", |
|||
" for comment in post['comments']:\n", |
|||
" new_dataset.append(comment)\n", |
|||
" try:\n", |
|||
" for reply in comment['replies']:\n", |
|||
" new_dataset.append(reply)\n", |
|||
"\n", |
|||
" for sec_reply in reply['replies']:\n", |
|||
" new_dataset.append(sec_reply)\n", |
|||
" except KeyError:\n", |
|||
" pass\n", |
|||
" \n", |
|||
" return new_dataset" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 31, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def drop_keywords(dataset):\n", |
|||
" for item in dataset:\n", |
|||
" try:\n", |
|||
" del item['comments']\n", |
|||
" except KeyError:\n", |
|||
" pass\n", |
|||
" try:\n", |
|||
" del item['replies']\n", |
|||
" except KeyError:\n", |
|||
" pass\n", |
|||
" \n", |
|||
" return dataset" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 37, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def clean_dataset(dataset):\n", |
|||
" cleaned_dataset = []\n", |
|||
" for data in dataset:\n", |
|||
"\n", |
|||
" cleaned_data = {}\n", |
|||
" if 'id' in data:\n", |
|||
" cleaned_data['id'] = data.get('id')\n", |
|||
" \n", |
|||
" if 'publisher' in data:\n", |
|||
" cleaned_data['author'] = data.get('publisher')\n", |
|||
" \n", |
|||
" if 'text' in data:\n", |
|||
" cleaned_data['text'] = data.get('text')\n", |
|||
" elif 'title' in data:\n", |
|||
" cleaned_data['text'] = data.get('title')\n", |
|||
"\n", |
|||
" cleaned_dataset.append(cleaned_data)\n", |
|||
"\n", |
|||
" return cleaned_dataset" |
|||
] |
|||
}, |
|||
{ |
|||
"attachments": {}, |
|||
"cell_type": "markdown", |
|||
"metadata": {}, |
|||
"source": [ |
|||
"### Execution of functions defined above" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 38, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"for dataset_name in os.listdir('json_data_id/'):\n", |
|||
" dataset = open_json(f'json_data_id/{dataset_name}')\n", |
|||
"\n", |
|||
" new_dataset = traverse_dataset(dataset)\n", |
|||
" new_dataset = drop_keywords(new_dataset)\n", |
|||
" new_dataset = clean_dataset(new_dataset)\n", |
|||
"\n", |
|||
" create_jsonl(f'jsonl_data/{dataset_name}', new_dataset)" |
|||
] |
|||
} |
|||
], |
|||
"metadata": { |
|||
"kernelspec": { |
|||
"display_name": "sentiment", |
|||
"language": "python", |
|||
"name": "python3" |
|||
}, |
|||
"language_info": { |
|||
"codemirror_mode": { |
|||
"name": "ipython", |
|||
"version": 3 |
|||
}, |
|||
"file_extension": ".py", |
|||
"mimetype": "text/x-python", |
|||
"name": "python", |
|||
"nbconvert_exporter": "python", |
|||
"pygments_lexer": "ipython3", |
|||
"version": "3.9.16" |
|||
}, |
|||
"orig_nbformat": 4 |
|||
}, |
|||
"nbformat": 4, |
|||
"nbformat_minor": 2 |
|||
} |
File diff suppressed because one or more lines are too long
@ -0,0 +1,103 @@ |
|||
{ |
|||
"cells": [ |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 3, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"import json\n", |
|||
"import os" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 2, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def open_json(filename):\n", |
|||
" with open(filename, 'r') as json_file:\n", |
|||
" return json.load(json_file)" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 16, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def add_ids(json_file):\n", |
|||
" id_counter = 1\n", |
|||
" for post in json_file:\n", |
|||
" post[\"id\"] = id_counter\n", |
|||
" id_counter += 1\n", |
|||
" if 'comments' in post:\n", |
|||
" for comment in post['comments']:\n", |
|||
" comment[\"id\"] = id_counter\n", |
|||
" id_counter += 1\n", |
|||
" if 'replies' in comment:\n", |
|||
" for reply in comment['replies']:\n", |
|||
" reply[\"id\"] = id_counter\n", |
|||
" id_counter += 1\n", |
|||
" if 'replies' in reply:\n", |
|||
" for sec_reply in reply['replies']:\n", |
|||
" sec_reply[\"id\"] = id_counter\n", |
|||
" id_counter += 1\n", |
|||
" return json_file" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 21, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"def create_json(filename, data):\n", |
|||
" with open(filename, 'w', encoding = \"utf-8\", ) as file:\n", |
|||
" json.dump(data, file, indent=4, separators=(',',': '))" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": 17, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [ |
|||
"for json_file in os.listdir(\"json_data\"):\n", |
|||
" data = open_json(f'json_data/{json_file}')\n", |
|||
" data = add_ids(data)\n", |
|||
" create_json(f'json_data_id/{json_file}', data)" |
|||
] |
|||
}, |
|||
{ |
|||
"cell_type": "code", |
|||
"execution_count": null, |
|||
"metadata": {}, |
|||
"outputs": [], |
|||
"source": [] |
|||
} |
|||
], |
|||
"metadata": { |
|||
"kernelspec": { |
|||
"display_name": "sentiment", |
|||
"language": "python", |
|||
"name": "python3" |
|||
}, |
|||
"language_info": { |
|||
"codemirror_mode": { |
|||
"name": "ipython", |
|||
"version": 3 |
|||
}, |
|||
"file_extension": ".py", |
|||
"mimetype": "text/x-python", |
|||
"name": "python", |
|||
"nbconvert_exporter": "python", |
|||
"pygments_lexer": "ipython3", |
|||
"version": "3.9.16" |
|||
}, |
|||
"orig_nbformat": 4 |
|||
}, |
|||
"nbformat": 4, |
|||
"nbformat_minor": 2 |
|||
} |
File diff suppressed because it is too large
Loading…
Reference in new issue