annotation app finished

This commit is contained in:
vladimir.ferko 2024-04-09 15:39:11 +02:00
commit d0dc4fa0f4
27 changed files with 76605 additions and 0 deletions

27
.gitignore vendored Normal file
View File

@ -0,0 +1,27 @@
# preprocessing ignore
/preprocessing/*.jsonl
/preprocessing/*.pickle
/preprocessing/__pycache__
/preprocessing/classified_data
/preprocessing/clustered_jsonl
/preprocessing/json_data
/preprocessing/json_data_id
/preprocessing/jsonl_data
/preprocessing/.DS_Store
# harvesting ignore
/harvester/Facebook/inputs
/harvester/Facebook/outputs
/harvester/Facebook/.*
/harvester/Facebook/__pycache__
/harvester/__pycache__
/harvester/.DS_Store
# annotation_app
/annotation_app/.env
/annotation_app/__pycache__
/annotation_app/.DS_Store

View File

@ -0,0 +1,2 @@
get_data.py
/instance

13
annotation_app/Dockerfile Normal file
View File

@ -0,0 +1,13 @@
FROM python:3.9
WORKDIR /app
COPY . /app/
RUN pip install --no-cache-dir -r requirements.txt
RUN python initial.py
EXPOSE 5050
CMD ["python3", "app.py"]

140
annotation_app/app.py Normal file
View File

@ -0,0 +1,140 @@
from flask import Flask, render_template, request, redirect, flash, session, url_for
from models import db, Users, Annotations, Samples
from dotenv import load_dotenv
from sqlalchemy.orm import aliased
import sqlalchemy
import os
import logging

load_dotenv()

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = os.getenv('DB_URI')
app.secret_key = os.getenv('SECRET_KEY')
db.init_app(app)


@app.route('/', methods=['GET'])
def home():
    # drop any previous annotator session before showing the login page
    session.pop('id_user', None)
    return render_template('home.html')


@app.route('/login', methods=['POST'])
def login():
    if request.method == 'POST':
        email = request.form['email']
        if '@' in email:
            # only addresses of the form meno.priezvisko@...tuke... are accepted
            try:
                splitted = email.split('@')
                name, surname = splitted[0].split('.')[:2]
                domain = splitted[1]
                if 'tuke' not in domain:
                    raise ValueError
            except ValueError:
                flash('Nie je validný TUKE email')
                return redirect('/')
            try:
                db.session.add(Users(name, surname, email))
                db.session.commit()
            except sqlalchemy.exc.IntegrityError:
                # the e-mail already exists, treat it as a returning user
                db.session.rollback()
                logging.info('Logged existing email')
            user = Users.query.filter_by(email=email).first()
            session['email'] = email
            session['id_user'] = user._id
            return redirect('/anot')
        flash('Nie je validný TUKE email')
        return redirect('/')


@app.route('/anot', methods=['GET'])
def anot():
    if 'id_user' in session:
        try:
            annotated_count = Annotations.query.filter_by(user_id=session['id_user']).count()
            # Raw-SQL equivalent of the ORM query below, kept for reference:
            # SELECT samples._id, text
            # FROM samples
            # LEFT JOIN annotations ON samples._id = annotations.sample_id
            # WHERE samples._id NOT IN (
            #     SELECT sample_id
            #     FROM annotations
            #     GROUP BY sample_id
            #     HAVING COUNT(sample_id) > 5
            # )
            # AND samples._id NOT IN (
            #     SELECT samples._id
            #     FROM samples
            #     LEFT JOIN annotations ON samples._id = annotations.sample_id
            #     WHERE annotations.user_id = :id_user
            # )
            # ORDER BY samples._id ASC
            # LIMIT 1;
            annotations_alias = aliased(Annotations)
            # Pick the first sample that has fewer than six annotations overall
            # and has not yet been annotated by the current user.
            query = (
                db.session.query(Samples._id, Samples.text)
                .outerjoin(annotations_alias, Samples._id == annotations_alias.sample_id)
                .filter(
                    ~Samples._id.in_(
                        db.session.query(Annotations.sample_id)
                        .group_by(Annotations.sample_id)
                        .having(db.func.count(Annotations.sample_id) > 5)
                    ),
                    ~Samples._id.in_(
                        db.session.query(Samples._id)
                        .outerjoin(Annotations, Samples._id == Annotations.sample_id)
                        .filter(Annotations.user_id == session['id_user'])
                    )
                )
                .order_by(Samples._id.asc())
                .limit(1)
            )
            # note: one_or_none() yields None once no un-annotated sample is left; that case is not handled here
            sample_id, sample_text = query.one_or_none()
            data = {
                'email': session.get('email'),
                'text': sample_text,
                'sample_id': sample_id,
                'annotated_count': annotated_count
            }
        except sqlalchemy.exc.OperationalError as err:
            # fall back to the very first sample, e.g. before any annotation exists
            print(err)
            logging.info('Annotations started')
            first_sample = Samples.query.order_by(Samples._id.asc()).first()
            data = {
                'email': session.get('email'),
                'text': first_sample.text,
                'sample_id': first_sample._id,
                'annotated_count': 0
            }
        return render_template('anot.html', **data)
    return redirect('/')


@app.route('/process_anot', methods=['POST'])
def process():
    if request.method == 'POST':
        data = request.get_json()
        print(data)
        db.session.add(Annotations(
            user_id=session['id_user'],
            sample_id=data['sample_id'],
            label=data['value']
        ))
        db.session.commit()
        return redirect(url_for('anot'))


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5050)
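app.py reads its configuration through python-dotenv, so a minimal annotation_app/.env is needed; the variable names DB_URI and SECRET_KEY come from app.py, while the SQLite path below is only an illustrative assumption (the repository's .gitignore hints at a Flask /instance folder):

DB_URI=sqlite:///instance/annotations.db
SECRET_KEY=change-me-to-a-long-random-string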

File diff suppressed because it is too large

View File

@ -0,0 +1,54 @@
from models import db, Samples, Users, Annotations
from app import app
from sqlalchemy import text

if __name__ == '__main__':
    with app.app_context():
        # AND annotations.user_id <> '{id_user}'
        id_user = 4
        query = text(
            f'''
            SELECT samples.id, text
            FROM samples
            LEFT JOIN annotations
                ON samples.id = annotations.sample_id
            WHERE samples.id NOT IN (
                SELECT sample_id
                FROM annotations
                GROUP BY sample_id
                HAVING COUNT(sample_id) > 5
            )
            AND samples.id NOT IN (
                SELECT samples.id
                FROM samples
                LEFT JOIN annotations
                    ON samples.id = annotations.sample_id
                WHERE annotations.user_id IS {id_user}
            )
            ORDER BY samples.id ASC
            LIMIT 1;
            '''
        )
        # query = text(
        #     '''
        #     SELECT samples.id
        #     FROM samples
        #     LEFT JOIN annotations
        #         ON samples.id = annotations.sample_id
        #     WHERE annotations.user_id IS NOT 1
        #     '''
        # )
        result = db.session.execute(query)
        print(result.fetchall())

        annotations = Annotations.query.all()
        print(len(annotations))
        # for annotation in annotations:
        #     print(annotation.user_id)

26
annotation_app/initial.py Normal file
View File

@ -0,0 +1,26 @@
from models import db, Samples
from app import app
import json
import os

if __name__ == '__main__':
    with app.app_context():
        # db.init_app(app)
        # creating database
        db.create_all()
        try:
            with open(os.path.join('dataset', 'final_id_v2.jsonl'), encoding='utf-8') as file:
                data = [json.loads(line) for line in file]
            for sample in data:
                db.session.add(Samples(sample['text']))
            db.session.commit()
            print('Data successfully inserted')
        except FileNotFoundError as err:
            print(err)

Binary file not shown.

40
annotation_app/models.py Normal file
View File

@ -0,0 +1,40 @@
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()


class Annotations(db.Model):
    __tablename__ = 'annotations'
    _id = db.Column("id", db.Integer, primary_key=True)
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'), nullable=False)
    sample_id = db.Column(db.Integer, db.ForeignKey('samples.id'), nullable=False)
    label = db.Column(db.String(32), nullable=False)

    def __init__(self, user_id, sample_id, label):
        self.user_id = user_id
        self.sample_id = sample_id
        self.label = label


class Users(db.Model):
    __tablename__ = 'users'
    _id = db.Column("id", db.Integer, primary_key=True)
    name = db.Column(db.String(32), nullable=False)
    surname = db.Column(db.String(32), nullable=False)
    email = db.Column(db.String(64), unique=True, nullable=False)
    # one user can create many annotations (uselist=False would force a one-to-one relationship)
    annotations = db.relationship('Annotations', backref='user', lazy=True)

    def __init__(self, name, surname, email):
        self.name = name
        self.surname = surname
        self.email = email


class Samples(db.Model):
    __tablename__ = 'samples'
    _id = db.Column("id", db.Integer, primary_key=True)
    text = db.Column(db.String(512), nullable=False)
    annotations = db.relationship('Annotations', lazy=True, backref='sample')  # corrected relationship and added backref

    def __init__(self, text):
        self.text = text
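As a quick sanity check of the schema above, a minimal sketch (assuming the app and db objects from app.py and models.py, with purely illustrative row values) could create one row per table and follow the backrefs:

from app import app
from models import db, Users, Samples, Annotations

with app.app_context():
    db.create_all()
    user = Users('jane', 'doe', 'jane.doe@student.tuke.sk')   # illustrative user
    sample = Samples('example sentence to annotate')          # illustrative sample
    db.session.add_all([user, sample])
    db.session.commit()
    db.session.add(Annotations(user_id=user._id, sample_id=sample._id, label='not_offensive'))
    db.session.commit()
    print(sample.annotations[0].user.email)  # backrefs defined above: Annotations.sample / Annotations.user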

View File

@ -0,0 +1,13 @@
blinker==1.7.0
click==8.1.7
Flask==3.0.2
Flask-SQLAlchemy==3.1.1
importlib_metadata==7.0.2
itsdangerous==2.1.2
Jinja2==3.1.3
MarkupSafe==2.1.5
python-dotenv==1.0.1
SQLAlchemy==2.0.28
typing_extensions==4.10.0
Werkzeug==3.0.1
zipp==3.18.1

View File

@ -0,0 +1,76 @@
body, html {
height: 100%;
margin: 0;
display: flex;
justify-content: center;
align-items: center;
background-color: #FEFFFF;
}
table {
border-radius: 8px;
}
.container{
position: absolute;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
text-align: center;
}
.btn {
background-color: #3AAFA9;
border: 1px solid #3AAFA9;
}
.logout-btn{
background-color: #454d55;
margin-top: 7.5%;
}
#top-info{
margin-top: 5%;
position: fixed;
top: 5%;
}
.anot{
border: 1px solid #000000;
border-radius: 10px;
padding: 10px;
}
.anot-text {
padding: 2.5%;
}
.form-control{
margin-bottom: 5px;
}
#login{
position: absolute;
top: -20vh;
left: 50%;
transform: translate(-50%, 50%);
border: 1px solid #000000;
border-radius: 8px;
padding: 4vh;
width: 500px;
}
.top-info {
width: 100%;
border-collapse: collapse; /* Optional: collapse border */
}
.top-info td {
border: 1px solid #000; /* Add border to table cells */
padding: 8px; /* Optional: Add padding */
}
h3{
margin-bottom: 3%;
}

View File

@ -0,0 +1,66 @@
{% extends "base.html" %}
{% block title %} annotation {% endblock %}
{% block content%}
<div class="container" id="top-info">
<table class="table top-info">
<thead class="thead-dark">
<tr>
<th>Email</th>
<th>Počet anotovaných jednotiek</th>
</tr>
</thead>
<tr>
<td>{{ email }}</td>
<td>{{ annotated_count }}</td>
</tr>
</table>
</div>
<div class="container">
<div class="anot">
<p class="anot-text">{{ text }}</p>
<button id="post" class="btn btn-primary" onclick="postBcknd('offensive', {{ sample_id }})">Ofenzívny</button>
<button id="post" class="btn btn-primary" onclick="postBcknd('not_offensive', {{ sample_id }})">Neofenzívny</button>
<button id="post" class="btn btn-primary" onclick="postBcknd('dont_know', {{ sample_id }})">Neviem</button>
</div>
<button id="get" class="btn btn-primary logout-btn" onclick="logout()"> Odhlásiť sa</button>
<script>
function postBcknd(value, sample_id){
var xhr = new XMLHttpRequest();
xhr.open('POST', '/process_anot', true);
xhr.setRequestHeader('Content-Type', 'application/json');
xhr.onload = function () {
if(xhr.status === 200) {
console.log('request sent successfully');
window.location.href = '/anot';
} else {
console.log('request failed');
}
};
xhr.send(JSON.stringify({value: value, sample_id: sample_id}));
}
function logout() {
var xhr = new XMLHttpRequest();
xhr.open('GET', '/', true);
xhr.onload = function () {
if (xhr.status === 200) {
console.log('Logout successful');
window.location.href = '/';
} else {
console.log('Logout request failed');
}
}
xhr.send(); // Send the request
}
</script>
{% endblock %}

View File

@ -0,0 +1,17 @@
<!doctype html>
<html>
<head>
{% block head %}
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
<link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='stylesheets/styles.css') }}" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% block title %}{% endblock %}</title>
{% endblock %}
</head>
<body>
<div id="content">
{% block content %}
{% endblock %}
</div>
</body>
</html>

View File

@ -0,0 +1,19 @@
{% extends "base.html" %}
{% block title %} Welcome {% endblock %}
{% block content %}
<div class="container">
<form action="/login" id="login" method="post" >
<h3>Login anotačnej aplikácie</h3>
<input type="text" name="email" placeholder="meno.priezvisko@student.tuke.sk" class="form-control">
<button id="post" class="btn btn-primary login-btn">Prihlásiť sa</button>
{% with messages = get_flashed_messages() %}
{% if messages %}
<p style="margin-top: 2%;"> {{ messages[0] }} </p>
{% endif %}
{% endwith %}
</form>
</div>
{% endblock %}

View File

@ -0,0 +1,134 @@
import os
import sys
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import selenium.common.exceptions
from Facebook.facebook_parser import FacebookParser
from crawler import Crawler
class FacebookCrawler(Crawler, FacebookParser):
    def __init__(self, base_url: str, file_name: str):
        super().__init__(base_url, file_name)
        try:
            with open(os.path.join('locators.json')) as file:
                self.locators = json.load(file)
            with open(os.path.join('Facebook', 'inputs', self.filename)) as file:
                self.URLS = tuple(file.readlines())
        except FileNotFoundError:
            print(os.path.join('Facebook', 'inputs', self.filename))
            print("Invalid input value")
            sys.exit(1)

    # crawling part of the code
    def crawl(self):
        counter = len(self.URLS)
        for idx, url in enumerate(self.URLS):
            # redirect and wait for the page to load
            self.driver.get(url)
            self.driver.implicitly_wait(4)
            if 'videos' in url:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except selenium.common.exceptions.NoSuchElementException:
                    self.driver.find_element(By.XPATH, "//span[contains(@class, 'x6ikm8r x10wlt62 xlyipyv')]").click()
                    # find_elements returns an empty list when the blocking element is absent
                    if self.driver.find_elements(By.XPATH, "//div[contains(@class, 'x78zum5 xdt5ytf x1iyjqo2 x7ywyr2')]"):
                        print("Can't crawl comments section")
                        continue
                self.close_censorship('Newest')
            else:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except selenium.common.exceptions.NoSuchElementException:
                    pass
                self.close_censorship('All comments')
            self.driver.implicitly_wait(3)
            print('continue scraping')
            # clicking features
            self.view_more_comments()
            self.show_replies()
            self.click_see_more()
            # parsing part of the code
            # Dictionary of classes; if Facebook changes any class, rewrite this dict in locators.json
            if '/videos/' in url:
                self.class_dict = self.locators['facebook_video_locators']
            elif '/posts/' in url:
                self.class_dict = self.locators['facebook_post_locators']
            self.parse(self.driver.page_source, self.class_dict, self.filename)
            print(f'Done: [{idx + 1}/{counter}]')

    # method for expanding collapsed comment pages
    def view_more_comments(self):
        elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
        while elements:
            try:
                self.driver.execute_script("arguments[0].click();", elements[0])
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
                self.driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
            except selenium.common.exceptions.StaleElementReferenceException:
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")

    # method for showing hidden replies
    def show_replies(self):
        repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")
        while repl_elements:
            try:
                for element in repl_elements:
                    self.driver.execute_script("arguments[0].click();", element)
                    time.sleep(0.5)
            except selenium.common.exceptions.StaleElementReferenceException:
                pass
            repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")

    # method for expanding comments
    def click_see_more(self):
        elements = self.driver.find_elements(By.XPATH, "//*[text()='See more']")
        for element in elements:
            self.driver.execute_script("arguments[0].click();", element)

    # method for switching the 'Most relevant' comment filter to Newest / All comments
    def close_censorship(self, classification: str):
        self.driver.implicitly_wait(3)
        try:
            dropdown = self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest']")
            self.driver.execute_script("arguments[0].click();", dropdown)  # open the filter dropdown
            newest_comments = self.driver.find_element(By.XPATH, f"//*[text()='{classification}']")
            self.driver.execute_script("arguments[0].click();", newest_comments)  # pick the requested ordering
        except selenium.common.exceptions.NoSuchElementException:
            self.close_censorship(classification)

    def close(self):
        print('Scraping ended successfully')
        self.driver.quit()
        sys.exit(0)

View File

@ -0,0 +1,129 @@
from bs4 import BeautifulSoup
from print_dict import pd
import json
import sys
import os
class FacebookParser:
    def parse(self, html, clsDict, fname='final_dataset.json'):
        self.soup = BeautifulSoup(html, 'lxml')
        self.outFileName = f"outputs/parts/{fname.split('.')[0]}_data.json"
        # dict for data about the facebook post
        self.post_data = {
            'publisher': None,
            'title': None,
            'comments': [],
            'post_reactions': None
        }
        # dict for comments
        self.comment_data = {
            'publisher': None,
            'text': None,
            'replies': []
        }
        # reply data
        self.reply_data = {
            'publisher': None,
            'text': None
        }

        # post info
        self.name = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})
        if clsDict['TOP_LABEL'] == 'message':
            self.top = self.soup.find('div', {'data-ad-comet-preview': clsDict['TOP_LABEL']})
        else:
            self.top = self.soup.find('div', {'class': clsDict['TOP_LABEL']})
        if self.top is None:
            self.top = self.soup.find('div', {'class': 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'})
        self.title_likes = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})
        try:
            self.tmp_strings = self.top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
            self.title = ''
            for x in self.tmp_strings:
                try:
                    self.title += x.text + '. '
                except AttributeError:
                    pass
        except AttributeError:
            self.title = None
        self.post_data = {
            'publisher': self.name.text if self.name is not None else None,
            'title': self.title,
            'post_reactions': self.title_likes.text if self.title_likes is not None else None,
            'comments': []
        }
        if self.post_data['publisher'] is None:
            return

        # comment info
        self.all_comments = self.soup.find_all("div", {"aria-label": lambda x: x and x.endswith("ago")})  # all comments under the post
        # print(len(self.all_comments))
        for item in self.all_comments:
            self.publisher = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})
            self.txt = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                tmp_type = item.get('aria-label').split(' ')[0]
                tmp_class = item.find('div', {'class': lambda x: x and x.startswith(clsDict['TMP_COMMENTS_CLASS'])}).get('class')[-1]
                if tmp_type == "Comment":
                    # top-level comment
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'].append(self.comment_data)
                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER']:
                    # first-level reply, attached to the last comment
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'][-1]['replies'].append(self.comment_data)
                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER_2']:
                    # second-level reply, attached to the last first-level reply
                    self.reply_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                    }
                    self.post_data['comments'][-1]['replies'][-1]['replies'].append(self.reply_data)
            except (AttributeError, IndexError):
                pass

        # append the parsed post to the output JSON file
        if os.path.exists(self.outFileName):
            with open(self.outFileName, 'r+', encoding="utf-8") as file:
                tmp = json.load(file)
                tmp.append(self.post_data)
                file.seek(0)
                json.dump(tmp, file, indent=4, separators=(',', ': '))
        else:
            with open(self.outFileName, 'w', encoding="utf-8") as file:
                json.dump([self.post_data], file, indent=4, separators=(',', ': '))

        # remove the processed URL (the first line) from the input .txt
        try:
            with open(os.path.join('Facebook', 'inputs', self.filename), 'r+') as file:
                lines = file.readlines()
                # move the file pointer to the beginning of the file
                file.seek(0)
                # truncate the file
                file.truncate()
                # write back every line except the first one
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)
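For orientation, the parse method above accumulates one entry per post in outputs/parts/<input_name>_data.json with roughly the following nested shape (all values here are purely illustrative):

[
    {
        "publisher": "Example Page",
        "title": "Example post text. ",
        "post_reactions": "1.2K",
        "comments": [
            {
                "publisher": "Commenter A",
                "text": "A top-level comment",
                "replies": [
                    {
                        "publisher": "Commenter B",
                        "text": "A first-level reply",
                        "replies": [
                            {"publisher": "Commenter C", "text": "A second-level reply"}
                        ]
                    }
                ]
            }
        ]
    }
]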

View File

@ -0,0 +1,155 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import os
import sys
import time
import argparse

# parse args
parser = argparse.ArgumentParser(description="Facebook scraper")
parser.add_argument("URL", help='URL of the facebook page / profile')  # no type needed, the default is string
args = parser.parse_args()


# method for waiting on pages to load
def wait_for_url(driver, url):
    # waiting for the main page to load
    try:
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
        print('Successful!')
    except Exception:
        print('Connection error')
        driver.quit()
        sys.exit(1)


# web driver init
def webdriver_setup():
    chrome_options = Options()
    chrome_options.add_argument("accept-language=en-US")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=OFF")
    # selenium 4 passes the driver path through a Service object
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.get("https://www.facebook.com/")
    return driver


# login to a facebook account
def login(driver):
    print('Logging in')
    # decline optional cookies if the banner appears
    try:
        driver.find_element(By.XPATH, "//button[contains(string(), 'Decline optional cookies')]").click()
    except Exception:
        pass
    # insert login data
    driver.find_element(By.NAME, "email").send_keys(os.environ['FB_EMAIL'])  # type email
    driver.find_element(By.NAME, "pass").send_keys(os.environ['FB_PASSWD'])  # type password
    # click -> log in
    driver.find_element(By.NAME, "login").click()
    time.sleep(5)


# scrolling to the bottom of the page
def crawl_for_links(driver, url):
    print('Crawling')
    i = 1
    driver.get(url)
    time.sleep(2)
    name = driver.find_elements(By.TAG_NAME, 'h2')[-1].text
    for _ in range(0, 3):
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(50):
            # Scroll down to the bottom
            driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
            # Wait for the page to load
            time.sleep(3)
            # Calculate new scroll height and compare with the last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
            os.system('clear||cls')
            print(f'Iteration num: {i}')
            i += 1
    return driver.page_source, name


# parse HTML
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    timeline = soup.find('div', {'class': 'x9f619 x1n2onr6 x1ja2u2z xeuugli xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj xu9j1y6 x7ep2pv'})
    posts = timeline.find_all('div', {'class': 'x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z'})
    arr = []
    for post in posts:
        try:
            commentsWidget = post.find('span', {'class': 'x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa'})
            if approveComments(commentsWidget.text):
                links = post.find_all('a', {'role': 'link'})
                arr.append(extractPostLink(links))
        except AttributeError:
            pass
    return arr


def extractPostLink(links):
    for link in links:
        if '/videos/' in link['href'] or '/posts/' in link['href']:
            return link['href']


# check if the post has at least 50 comments
def approveComments(text):
    nComments = text.split(' ')[0]
    try:
        num = int(nComments)
        return num > 50
    except ValueError:
        # counts such as '1.2K' or '3M' cannot be parsed as int but are certainly large enough
        return 'K' in nComments or 'M' in nComments


# write all the links to the .txt
def write_out(arr, name):
    with open(f"{os.getcwd()}/inputs/{name.strip().replace(' ', '_').lower()}.txt", 'w') as f:
        for item in arr:
            try:
                f.write(item + '\n')
            except TypeError:
                pass


if __name__ == '__main__':
    # driver init
    driver = webdriver_setup()
    wait_for_url(driver, 'https://www.facebook.com/')
    # login
    login(driver)
    wait_for_url(driver, 'https://www.facebook.com/')
    # crawl
    html, name = crawl_for_links(driver, args.URL)
    driver.close()
    # parsing HTML
    arr = parse_html(html)
    # write out
    write_out(arr, name)
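A typical invocation of this link harvester, assuming the FB_EMAIL and FB_PASSWD environment variables used in login() are set and using an example page URL:

export FB_EMAIL='annotator@example.com'
export FB_PASSWD='********'
python get_data.py https://www.facebook.com/zomri

The collected post and video links end up in ./inputs/<page_name>.txt relative to the working directory; the same file name is later passed to main.py below.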

24
harvester/locators.json Normal file
View File

@ -0,0 +1,24 @@
{
"facebook_post_locators": {
"POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
"TOP_LABEL": "xjkvuk6 xuyqlj2 x1odjw0f",
"TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
"STATUS_STRINGS": "text-align: start;",
"COMMENT_AUTHOR": "x3nfvp2",
"COMMENT_STR": "x1lliihq xjkvuk6 x1iorvi4",
"TMP_COMMENTS_CLASS": "xqcrz7y",
"REPLY_DIVIDER": "x1k70j0n",
"REPLY_DIVIDER_2": "x1n2onr6"
},
"facebok_video_locators": {
"POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
"TOP_LABEL": "message",
"TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
"STATUS_STRINGS": "text-align: start;",
"COMMENT_AUTHOR": "x3nfvp2",
"COMMENT_STR": "xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs",
"TMP_COMMENTS_CLASS": "xqcrz7y",
"REPLY_DIVIDER": "x1k70j0n",
"REPLY_DIVIDER_2": "x1n2onr6"
}
}

30
harvester/main.py Normal file
View File

@ -0,0 +1,30 @@
import argparse
from Facebook.facebook_crawler import FacebookCrawler
from Reddit.reddit_crawler import RedditCrawler
FACEBOOK_URL = 'https://www.facebook.com/'
REDDIT_URL = 'https://www.reddit.com/'
if __name__ == '__main__':
    # parsing arguments
    parser = argparse.ArgumentParser(description="Facebook scraper")
    parser.add_argument("file_name", help='Name of the .txt file with URLS')
    args = parser.parse_args()

    user_input = input('Hello, do you want to scrape Facebook or reddit? [F/r]: ')
    while user_input.upper() not in ['F', 'R']:
        user_input = input('Do you want to scrape Facebook or reddit? [F/r]: ')

    if user_input.upper() == 'F':
        facebook = FacebookCrawler(FACEBOOK_URL, args.file_name)
        facebook.allow_cookies()
        facebook.login()
        facebook.crawl()
    else:
        reddit = RedditCrawler(REDDIT_URL, args.file_name)
        print(reddit)
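A hypothetical run of the entry point above, assuming the URL list produced by get_data.py was placed under Facebook/inputs/ (the file name is only an example):

python main.py zomri.txt
# Hello, do you want to scrape Facebook or reddit? [F/r]: F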

View File

@ -0,0 +1,4 @@
beautifulsoup4==4.12.2
print_dict==0.1.19
selenium==4.10.0
webdriver_manager==3.8.6

View File

@ -0,0 +1,176 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data chunking for effectiveness\n",
"\n",
"In our data, facebook user called Robert Fico has a lot of samples.\n",
"For efficiency, this notebook chunks those data in 4 parts."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### JSONL file loading and creation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def load_jsonl(file_path):\n",
" with open(file_path, 'r', encoding='utf-8') as file:\n",
" return [json.loads(line) for line in file]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def create_jsonl(filename, new_dataset):\n",
" with open(f'{filename}l', 'w') as jsonl_file:\n",
" for item in new_dataset:\n",
" jsonl_file.write(json.dumps(item) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"fico = load_jsonl('jsonl_data/robert_fico_data.jsonl')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Split data into 4 parts equal parts"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"135155"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num_samples = len(fico)\n",
"chunk_size = int(num_samples / 4)\n",
"\n",
"num_samples"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chunk_size * 4 == num_samples # we have lost one sample, because our dataset has odd number of samples"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Actual chunking algorithm"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"chunk_arr = []\n",
"for chunks in range(0, 4):\n",
" chunk_arr.append(\n",
" fico[chunk_size * chunks: chunk_size * (chunks + 1)]\n",
" )"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Write chunked data to disk in a for loop"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"for index, data in enumerate(chunk_arr):\n",
" create_jsonl(f'jsonl_data/fico_chunk_{index}.json', data)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "sentiment",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
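Because the integer chunk_size above drops the last num_samples % 4 samples, a lossless alternative (a sketch, not what the notebook actually does) is numpy.array_split, which spreads the remainder across the chunks:

import json
import numpy as np

def load_jsonl(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file]

fico = load_jsonl('jsonl_data/robert_fico_data.jsonl')
for index, chunk in enumerate(np.array_split(fico, 4)):   # four nearly equal chunks, nothing lost
    with open(f'jsonl_data/fico_chunk_{index}.jsonl', 'w') as jsonl_file:
        for item in chunk:                                 # each item is still the original dict
            jsonl_file.write(json.dumps(item) + '\n')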

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,389 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# This notebook is clustering samples based on their semantic similarity.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# imports \n",
"\n",
"from sentence_transformers import SentenceTransformer, util\n",
"from tqdm import tqdm\n",
"import numpy as np\n",
"import torch\n",
"import numpy as np\n",
"import warnings\n",
"import json\n",
"import os\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model init\n",
"\n",
"In this clustering process will be used TUKE-DeutscheTelekom/slovakbert-skquad-mnlr"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"model = SentenceTransformer('TUKE-DeutscheTelekom/slovakbert-skquad-mnlr')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data manipulation in file system"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def load_jsonl(file_path):\n",
" with open(file_path, 'r', encoding='utf-8') as file:\n",
" return [json.loads(line) for line in file]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pipeline functions"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Embedding creation"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def create_embeddings(jsonl_file):\n",
" sentences = [item['text'] for item in jsonl_file]\n",
" return model.encode(sentences), sentences"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clustering algorithm"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def cluster_data(embeddings, sentences):\n",
" embeddings_np = np.array(embeddings)\n",
"\n",
" similarity_threshold = 0.65\n",
"\n",
" long_enough_mask = np.array([len(sentence) > 20 for sentence in sentences])\n",
"\n",
" cosine_sim_matrix = util.pytorch_cos_sim(torch.tensor(embeddings_np), torch.tensor(embeddings_np)).numpy()\n",
"\n",
" below_threshold_mask = cosine_sim_matrix < similarity_threshold\n",
"\n",
" filtered_mask = np.logical_and(below_threshold_mask, np.outer(long_enough_mask, long_enough_mask))\n",
"\n",
" non_spam_indices = np.where(filtered_mask)\n",
"\n",
" filtered_sentences = list(set([sentences[i] for i in non_spam_indices[0]]))\n",
"\n",
" return filtered_sentences"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare data to write it to JSONL"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def filter_null_text(json_list):\n",
" filtered_list = [obj for obj in json_list if \"text\" in obj and obj[\"text\"] is not None]\n",
" return filtered_list"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def create_jsonl_format(filtered, jsonl_file):\n",
"\n",
" return [\n",
" {\n",
" 'id': item['id'],\n",
" 'author': item['author'],\n",
" 'text': item['text']\n",
" }\n",
" for item in jsonl_file if item['text'] in filtered\n",
" ]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Write out JSONL file"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def write_jsonl(filename, data):\n",
" with open(filename, 'w') as f:\n",
" for item in data:\n",
" json.dump(item, f)\n",
" f.write('\\n')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pipeline execution"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def execute_pipeline(jsonl_file):\n",
" embeddings, sentences = create_embeddings(jsonl_file)\n",
" filtered_data = cluster_data(embeddings, sentences)\n",
" return create_jsonl_format(filtered_data, jsonl_file)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pipeline usecase"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"prepare data for clustering in a loop"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['aktuality_data.jsonl',\n",
" 'denník_n_data.jsonl',\n",
" 'televízia_joj_data.jsonl',\n",
" 'fakty_data.jsonl',\n",
" 'erik_kaliňák_data.jsonl',\n",
" 'zomri_data.jsonl',\n",
" 'igor_matovic_data.jsonl',\n",
" 'peter_marcin_data.jsonl',\n",
" 'ján_koleník_data.jsonl',\n",
" 'eva_-_hriešne_dobrá_data.jsonl',\n",
" 'emefka_data.jsonl',\n",
" 'marek_hamsik_data.jsonl',\n",
" 'hetrik_data.jsonl',\n",
" 'peter_sagan_data.jsonl',\n",
" 'marian_čekovský_data.jsonl',\n",
" 'zuzana_čaputová_data.jsonl',\n",
" 'sajfa_data.jsonl',\n",
" 'marian_kotleba_data.jsonl',\n",
" 'fico_chunk_3.jsonl',\n",
" 'fico_chunk_1.jsonl',\n",
" 'šport_v_rtvs_data.jsonl',\n",
" 'dominika_cibulkova_data.jsonl',\n",
" 'šport24_data.jsonl',\n",
" 'niké_liga_data.jsonl',\n",
" 'fico_chunk_0.jsonl',\n",
" 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
" 'fico_chunk_2.jsonl']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_to_cluster = [x for x in os.listdir('jsonl_data')]\n",
"\n",
"data_to_cluster.remove('robert_fico_data.jsonl')\n",
"\n",
"data_to_cluster"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Executing the actual pipeline"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] \n"
]
}
],
"source": [
"for dataset_name in tqdm(data_to_cluster):\n",
" dataset = load_jsonl(f'jsonl_data/{dataset_name}')\n",
" dataset = filter_null_text(dataset)\n",
" write_jsonl(f'clustered_jsonl/{dataset_name}', execute_pipeline(dataset))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['aktuality_data.jsonl',\n",
" 'denník_n_data.jsonl',\n",
" 'televízia_joj_data.jsonl',\n",
" '.DS_Store',\n",
" 'fakty_data.jsonl',\n",
" 'erik_kaliňák_data.jsonl',\n",
" 'zomri_data.jsonl',\n",
" 'igor_matovic_data.jsonl',\n",
" 'peter_marcin_data.jsonl',\n",
" 'ján_koleník_data.jsonl',\n",
" 'eva_-_hriešne_dobrá_data.jsonl',\n",
" 'emefka_data.jsonl',\n",
" 'marek_hamsik_data.jsonl',\n",
" 'hetrik_data.jsonl',\n",
" 'peter_sagan_data.jsonl',\n",
" 'marian_čekovský_data.jsonl',\n",
" 'zuzana_čaputová_data.jsonl',\n",
" 'sajfa_data.jsonl',\n",
" 'marian_kotleba_data.jsonl',\n",
" 'fico_chunk_3.jsonl',\n",
" 'fico_chunk_1.jsonl',\n",
" 'šport_v_rtvs_data.jsonl',\n",
" 'dominika_cibulkova_data.jsonl',\n",
" 'šport24_data.jsonl',\n",
" 'niké_liga_data.jsonl',\n",
" 'fico_chunk_0.jsonl',\n",
" 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
" 'fico_chunk_2.jsonl']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.listdir('jsonl_data')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "sentiment",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
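Note that cluster_data above keeps a sentence as soon as it falls below the similarity threshold with respect to at least one other sentence; a greedy near-duplicate filter that instead keeps one representative per group of highly similar comments (an alternative sketch, not the notebook's method) could look like this:

import numpy as np
from sentence_transformers import util

def greedy_dedup(sentences, embeddings, threshold=0.65, min_len=20):
    # pairwise cosine similarities between all sentence embeddings
    sims = util.cos_sim(np.array(embeddings), np.array(embeddings)).numpy()
    kept = []
    for i, sentence in enumerate(sentences):
        if len(sentence) <= min_len:
            continue                                  # skip very short comments
        if all(sims[i, j] < threshold for j in kept):
            kept.append(i)                            # not too similar to anything kept so far
    return [sentences[i] for i in kept]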

View File

@ -0,0 +1,181 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# JSON to JSONL file converter\n",
"This notebook turns structured JSON file to a simplier form as a JSONL for easier data manipulation"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# imports \n",
"import json\n",
"import os"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Open JSON data, then write it as JSONL"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def open_json(filename):\n",
" # Read the JSON file\n",
" with open(filename, 'r') as json_file:\n",
" return json.load(json_file)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"def create_jsonl(filename, new_dataset):\n",
" with open(f'{filename}l', 'w') as jsonl_file:\n",
" for item in new_dataset:\n",
" jsonl_file.write(json.dumps(item) + '\\n')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loop through dataset, create new list of dictionaries, drop duplicate data"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def traverse_dataset(dataset):\n",
" new_dataset = []\n",
" for post in dataset:\n",
" new_dataset.append(post)\n",
" for comment in post['comments']:\n",
" new_dataset.append(comment)\n",
" try:\n",
" for reply in comment['replies']:\n",
" new_dataset.append(reply)\n",
"\n",
" for sec_reply in reply['replies']:\n",
" new_dataset.append(sec_reply)\n",
" except KeyError:\n",
" pass\n",
" \n",
" return new_dataset"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def drop_keywords(dataset):\n",
" for item in dataset:\n",
" try:\n",
" del item['comments']\n",
" except KeyError:\n",
" pass\n",
" try:\n",
" del item['replies']\n",
" except KeyError:\n",
" pass\n",
" \n",
" return dataset"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"def clean_dataset(dataset):\n",
" cleaned_dataset = []\n",
" for data in dataset:\n",
"\n",
" cleaned_data = {}\n",
" if 'id' in data:\n",
" cleaned_data['id'] = data.get('id')\n",
" \n",
" if 'publisher' in data:\n",
" cleaned_data['author'] = data.get('publisher')\n",
" \n",
" if 'text' in data:\n",
" cleaned_data['text'] = data.get('text')\n",
" elif 'title' in data:\n",
" cleaned_data['text'] = data.get('title')\n",
"\n",
" cleaned_dataset.append(cleaned_data)\n",
"\n",
" return cleaned_dataset"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Execution of functions defined above"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"for dataset_name in os.listdir('json_data_id/'):\n",
" dataset = open_json(f'json_data_id/{dataset_name}')\n",
"\n",
" new_dataset = traverse_dataset(dataset)\n",
" new_dataset = drop_keywords(new_dataset)\n",
" new_dataset = clean_dataset(new_dataset)\n",
"\n",
" create_jsonl(f'jsonl_data/{dataset_name}', new_dataset)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "sentiment",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
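After this conversion, every line of the resulting .jsonl file is one flat object with only the id, author and text keys; an illustrative line (made-up values) looks like:

{"id": 42, "author": "Example Page", "text": "Example comment text"}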

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,103 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def open_json(filename):\n",
" with open(filename, 'r') as json_file:\n",
" return json.load(json_file)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def add_ids(json_file):\n",
" id_counter = 1\n",
" for post in json_file:\n",
" post[\"id\"] = id_counter\n",
" id_counter += 1\n",
" if 'comments' in post:\n",
" for comment in post['comments']:\n",
" comment[\"id\"] = id_counter\n",
" id_counter += 1\n",
" if 'replies' in comment:\n",
" for reply in comment['replies']:\n",
" reply[\"id\"] = id_counter\n",
" id_counter += 1\n",
" if 'replies' in reply:\n",
" for sec_reply in reply['replies']:\n",
" sec_reply[\"id\"] = id_counter\n",
" id_counter += 1\n",
" return json_file"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def create_json(filename, data):\n",
" with open(filename, 'w', encoding = \"utf-8\", ) as file:\n",
" json.dump(data, file, indent=4, separators=(',',': '))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"for json_file in os.listdir(\"json_data\"):\n",
" data = open_json(f'json_data/{json_file}')\n",
" data = add_ids(data)\n",
" create_json(f'json_data_id/{json_file}', data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "sentiment",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large