annotation app finished
commit d0dc4fa0f4
27
.gitignore
vendored
Normal file
@@ -0,0 +1,27 @@
# preprocessing ignore

/preprocessing/*.jsonl
/preprocessing/*.pickle

/preprocessing/__pycache__
/preprocessing/classified_data
/preprocessing/clustered_jsonl
/preprocessing/json_data
/preprocessing/json_data_id
/preprocessing/jsonl_data
/preprocessing/.DS_Store

# harvesting ignore
/harvester/Facebook/inputs
/harvester/Facebook/outputs
/harvester/Facebook/.*
/harvester/Facebook/__pycache__
/harvester/__pycache__
/harvester/.DS_Store

# annotation_app
/annotation_app/.env
/annotation_app/__pycache__
/annotation_app/.DS_Store
2
annotation_app/.dockerignore
Normal file
@@ -0,0 +1,2 @@
get_data.py
/instance
13
annotation_app/Dockerfile
Normal file
@@ -0,0 +1,13 @@
FROM python:3.9

WORKDIR /app

COPY . /app/

RUN pip install --no-cache-dir -r requirements.txt

RUN python initial.py

EXPOSE 5050

CMD ["python3", "app.py"]
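Usage sketch (assumed commands, not recorded in this commit): the image would typically be built with `docker build -t annotation_app .` and started with `docker run -p 5050:5050 --env-file .env annotation_app`; the image name and the use of `--env-file` to supply DB_URI and SECRET_KEY are assumptions based on app.py reading them from the environment.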
140
annotation_app/app.py
Normal file
@@ -0,0 +1,140 @@
from flask import Flask, render_template, request, redirect, flash, session, url_for
from models import db, Users, Annotations, Samples
from dotenv import load_dotenv
from sqlalchemy.orm import aliased
import sqlalchemy
import os

import logging

load_dotenv()

app = Flask(__name__)

app.config['SQLALCHEMY_DATABASE_URI'] = os.getenv('DB_URI')
app.secret_key = os.getenv('SECRET_KEY')

db.init_app(app)


@app.route('/', methods=['GET'])
def home():
    session.pop('id_user', None)
    return render_template('home.html')

@app.route('/login', methods=['POST'])
def login():
    if request.method == 'POST':
        email = request.form['email']
        if '@' in email:
            try:
                splitted = email.split('@')
                name, surname = splitted[0].split('.')[:2]
                domain = splitted[1]
                if 'tuke' not in domain:
                    raise ValueError
            except ValueError:
                flash('Nie je validný TUKE email')
                return redirect('/')
            try:
                db.session.add(Users(name, surname, email))
                db.session.commit()
            except sqlalchemy.exc.IntegrityError as err:
                db.session.rollback()
                logging.info('Logged existing email')
            user = Users.query.filter_by(email=email).first()
            session['email'] = email
            session['id_user'] = user._id
            return redirect('/anot')
        flash('Nie je validný TUKE email')
        return redirect('/')

@app.route('/anot', methods=['GET'])
def anot():
    if 'id_user' in session:
        try:
            annotated_count = Annotations.query.filter_by(user_id=session['id_user']).count()

            # query = text(
            #     f'''
            #     SELECT samples._id, text
            #     FROM samples
            #     LEFT JOIN annotations
            #     ON samples._id = annotations.sample_id
            #     WHERE samples._id NOT IN (
            #         SELECT sample_id
            #         FROM annotations
            #         GROUP BY sample_id
            #         HAVING COUNT(sample_id) > 5
            #     )
            #     AND samples._id NOT IN (
            #         SELECT samples._id
            #         FROM samples
            #         LEFT JOIN annotations
            #         ON samples._id = annotations.sample_id
            #         WHERE annotations.user_id IS {session['id_user']}
            #     )
            #     ORDER BY samples._id ASC
            #     LIMIT 1;
            #     '''
            # )

            annotations_alias = aliased(Annotations)

            # Construct the query
            query = (
                db.session.query(Samples._id, Samples.text)
                .outerjoin(annotations_alias, Samples._id == annotations_alias.sample_id)
                .filter(
                    ~Samples._id.in_(
                        db.session.query(Annotations.sample_id)
                        .group_by(Annotations.sample_id)
                        .having(db.func.count(Annotations.sample_id) > 5)
                    ),
                    ~Samples._id.in_(
                        db.session.query(Samples._id)
                        .outerjoin(Annotations, Samples._id == Annotations.sample_id)
                        .filter(Annotations.user_id == session['id_user'])
                    )
                )
                .order_by(Samples._id.asc())
                .limit(1)
            )
            sample_id, sample_text = query.one_or_none()

            data = {
                'email': session.get('email'),
                'text': sample_text,
                'sample_id': sample_id,
                'annotated_count': annotated_count
            }

        except (sqlalchemy.exc.OperationalError) as err:
            print(err)
            logging.info('Annotations started')
            data = {
                'email': session.get('email'),
                'text': Samples.query.order_by(Samples._id.asc()).first().text,
                'sample_id': Samples.query.order_by(Samples._id.asc()).first()._id,
                'annotated_count': annotated_count
            }
        return render_template('anot.html', **data)
    return redirect('/')

@app.route('/process_anot', methods=['POST'])
def process():
    if request.method == 'POST':
        data = request.get_json()
        print(data)
        db.session.add(Annotations(
            user_id=session['id_user'],
            sample_id=data['sample_id'],
            label=data['value']
        ))
        db.session.commit()
        return redirect(url_for('anot'))

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5050)
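A minimal sketch (not part of the commit) of exercising the /process_anot endpoint above with Flask's test client; it assumes the .env supplies DB_URI and SECRET_KEY, that initial.py has seeded the samples table, and that a user with id 1 exists:

from app import app

with app.test_client() as client:
    # pre-populate the session the way /login would
    with client.session_transaction() as sess:
        sess['id_user'] = 1      # assumed existing user id
        sess['email'] = 'meno.priezvisko@student.tuke.sk'
    # the label values match the buttons in anot.html
    resp = client.post('/process_anot', json={'sample_id': 1, 'value': 'offensive'})
    print(resp.status_code)      # 302: redirect back to /anot on success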
57140
annotation_app/dataset/final_id_v2.jsonl
Normal file
File diff suppressed because it is too large
54
annotation_app/get_data.py
Normal file
@@ -0,0 +1,54 @@
from models import db, Samples, Users, Annotations
from app import app
from sqlalchemy import text


if __name__ == '__main__':
    with app.app_context():
        # AND annotations.user_id <> '{id_user}'
        id_user = 4

        query = text(
            f'''
            SELECT samples.id, text
            FROM samples
            LEFT JOIN annotations
            ON samples.id = annotations.sample_id
            WHERE samples.id NOT IN (
                SELECT sample_id
                FROM annotations
                GROUP BY sample_id
                HAVING COUNT(sample_id) > 5
            )
            AND samples.id NOT IN (
                SELECT samples.id
                FROM samples
                LEFT JOIN annotations
                ON samples.id = annotations.sample_id
                WHERE annotations.user_id IS {id_user}
            )
            ORDER BY samples.id ASC
            LIMIT 1;
            '''
        )

        # query = text(
        #     '''
        #     SELECT samples.id
        #     FROM samples
        #     LEFT JOIN annotations
        #     ON samples.id = annotations.sample_id
        #     WHERE annotations.user_id IS NOT 1
        #     '''
        # )

        result = db.session.execute(query)

        print(result.fetchall())

        annotations = Annotations.query.all()
        print(len(annotations))
        # for annotation in annotations:
        #     print(annotation.user_id)
26
annotation_app/initial.py
Normal file
@@ -0,0 +1,26 @@
from models import db, Samples
from app import app
import json
import os



if __name__ == '__main__':
    with app.app_context():
        # db.init_app(app)

        # creating database
        db.create_all()

        try:
            with open(os.path.join('dataset', 'final_id_v2.jsonl'), encoding='utf-8') as file:
                data = [json.loads(line) for line in file]

            for sample in data:
                db.session.add(Samples(sample['text']))
            db.session.commit()
            print('Data successfully inserted')
        except FileNotFoundError as err:
            print(err)
BIN
annotation_app/instance/anot_db.db
Normal file
Binary file not shown.
40
annotation_app/models.py
Normal file
@@ -0,0 +1,40 @@
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class Annotations(db.Model):
    __tablename__ = 'annotations'

    _id = db.Column("id", db.Integer, primary_key=True)
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'), nullable=False)
    sample_id = db.Column(db.Integer, db.ForeignKey('samples.id'), nullable=False)
    label = db.Column(db.String(32), nullable=False)

    def __init__(self, user_id, sample_id, label):
        self.user_id = user_id
        self.sample_id = sample_id
        self.label = label

class Users(db.Model):
    __tablename__ = 'users'

    _id = db.Column("id", db.Integer, primary_key=True)
    name = db.Column(db.String(32), nullable=False)
    surname = db.Column(db.String(32), nullable=False)
    email = db.Column(db.String(64), unique=True, nullable=False)
    annotations = db.relationship('Annotations', uselist=False, backref='user', lazy=True)

    def __init__(self, name, surname, email):
        self.name = name
        self.surname = surname
        self.email = email

class Samples(db.Model):
    __tablename__ = 'samples'

    _id = db.Column("id", db.Integer, primary_key=True)
    text = db.Column(db.String(512), nullable=False)
    annotations = db.relationship('Annotations', lazy=True, backref='sample')  # corrected relationship and added backref

    def __init__(self, text):
        self.text = text
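A minimal sketch (not part of the commit) of querying these models through the ORM, counting how many annotations each sample already has; it assumes the app context from app.py and mirrors the aggregation done in the raw SQL of get_data.py:

from app import app
from models import db, Annotations

with app.app_context():
    # number of annotations recorded per sample
    per_sample = (
        db.session.query(Annotations.sample_id, db.func.count(Annotations.sample_id))
        .group_by(Annotations.sample_id)
        .all()
    )
    for sample_id, n in per_sample:
        print(sample_id, n)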
13
annotation_app/requirements.txt
Normal file
@@ -0,0 +1,13 @@
blinker==1.7.0
click==8.1.7
Flask==3.0.2
Flask-SQLAlchemy==3.1.1
importlib_metadata==7.0.2
itsdangerous==2.1.2
Jinja2==3.1.3
MarkupSafe==2.1.5
python-dotenv==1.0.1
SQLAlchemy==2.0.28
typing_extensions==4.10.0
Werkzeug==3.0.1
zipp==3.18.1
76
annotation_app/static/stylesheets/styles.css
Normal file
@@ -0,0 +1,76 @@
body, html {
    height: 100%;
    margin: 0;
    display: flex;
    justify-content: center;
    align-items: center;
    background-color: #FEFFFF;
}

table {
    border-radius: 8px;
}

.container {
    position: absolute;
    top: 50%;
    left: 50%;
    transform: translate(-50%, -50%);
    text-align: center;
}

.btn {
    background-color: #3AAFA9;
    border: 1px solid #3AAFA9;
}

.logout-btn {
    background-color: #454d55;
    margin-top: 7.5%;
}

#top-info {
    margin-top: 5%;
    position: fixed;
    top: 5%;
}

.anot {
    border: 1px solid #000000;
    border-radius: 10px;
    padding: 10px;
}

.anot-text {
    padding: 2.5%;
}

.form-control {
    margin-bottom: 5px;
}

#login {
    position: absolute;
    top: -20vh;
    left: 50%;
    transform: translate(-50%, 50%);
    border: 1px solid #000000;
    border-radius: 8px;
    padding: 4vh;
    width: 500px;
}

.top-info {
    width: 100%;
    border-collapse: collapse; /* Optional: collapse border */
}

.top-info td {
    border: 1px solid #000; /* Add border to table cells */
    padding: 8px; /* Optional: Add padding */
}

h3 {
    margin-bottom: 3%;
}
66
annotation_app/templates/anot.html
Normal file
@@ -0,0 +1,66 @@
{% extends "base.html" %}

{% block title %} annotation {% endblock %}

{% block content %}

<div class="container" id="top-info">
    <table class="table top-info">
        <thead class="thead-dark">
            <tr>
                <th>Email</th>
                <th>Počet anotovaných jednotiek</th>
            </tr>
        </thead>
        <tr>
            <td>{{ email }}</td>
            <td>{{ annotated_count }}</td>
        </tr>
    </table>
</div>

<div class="container">
    <div class="anot">
        <p class="anot-text">{{ text }}</p>
        <button id="post" class="btn btn-primary" onclick="postBcknd('offensive', {{ sample_id }})">Ofenzívny</button>
        <button id="post" class="btn btn-primary" onclick="postBcknd('not_offensive', {{ sample_id }})">Neofenzívny</button>
        <button id="post" class="btn btn-primary" onclick="postBcknd('dont_know', {{ sample_id }})">Neviem</button>
    </div>

    <button id="get" class="btn btn-primary logout-btn" onclick="logout()"> Odhlásiť sa</button>

    <script>
        function postBcknd(value, sample_id){
            var xhr = new XMLHttpRequest();
            xhr.open('POST', '/process_anot', true);
            xhr.setRequestHeader('Content-Type', 'application/json');

            xhr.onload = function () {
                if(xhr.status === 200) {
                    console.log('request sent successfully');
                    window.location.href = '/anot';
                } else {
                    console.log('request failed');
                }
            };
            xhr.send(JSON.stringify({value: value, sample_id: sample_id}));
        }

        function logout() {
            var xhr = new XMLHttpRequest();
            xhr.open('GET', '/', true);

            xhr.onload = function () {
                if (xhr.status === 200) {
                    console.log('Logout successful');
                    window.location.href = '/';
                } else {
                    console.log('Logout request failed');
                }
            }

            xhr.send(); // Send the request
        }
    </script>
</div>

{% endblock %}
17
annotation_app/templates/base.html
Normal file
@@ -0,0 +1,17 @@
<!doctype html>
<html>
<head>
    {% block head %}
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
    <link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='stylesheets/styles.css') }}" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}{% endblock %}</title>
    {% endblock %}
</head>
<body>
    <div id="content">
        {% block content %}
        {% endblock %}
    </div>
</body>
</html>
19
annotation_app/templates/home.html
Normal file
@@ -0,0 +1,19 @@
{% extends "base.html" %}
{% block title %} Welcome {% endblock %}

{% block content %}

<div class="container">
    <form action="/login" id="login" method="post">
        <h3>Login anotačnej aplikácie</h3>
        <input type="text" name="email" placeholder="meno.priezvisko@student.tuke.sk" class="form-control">
        <button id="post" class="btn btn-primary login-btn">Prihlásiť sa</button>
        {% with messages = get_flashed_messages() %}
        {% if messages %}
            <p style="margin-top: 2%;"> {{ messages[0] }} </p>
        {% endif %}
        {% endwith %}
    </form>
</div>

{% endblock %}
134
harvester/Facebook/facebook_crawler.py
Normal file
@@ -0,0 +1,134 @@
import os
import sys
import time
import json

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import selenium.common.exceptions
from Facebook.facebook_parser import FacebookParser

from crawler import Crawler


class FacebookCrawler(Crawler, FacebookParser):

    def __init__(self, base_url: str, file_name: str):
        super().__init__(base_url, file_name)

        try:
            with open(os.path.join('locators.json')) as file:
                self.locators = json.load(file)

            with open(os.path.join('Facebook', 'inputs', self.filename)) as file:
                self.URLS = tuple(file.readlines())
        except FileNotFoundError:
            print(os.path.join('Facebook', 'inputs', self.filename))
            print("Invalid input value")
            sys.exit(1)

    # crawling part of the code
    def crawl(self):
        counter = len(self.URLS)
        for idx, url in enumerate(self.URLS):
            # redirect and wait for page to load
            self.driver.get(url)
            self.driver.implicitly_wait(4)

            if 'videos' in url:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except selenium.common.exceptions.NoSuchElementException:
                    self.driver.find_element(By.XPATH, "//span[contains(@class, 'x6ikm8r x10wlt62 xlyipyv')]").click()
                    if self.driver.find_element(By.XPATH, "//div[contains(@class, 'x78zum5 xdt5ytf x1iyjqo2 x7ywyr2')]") is not None:
                        print("Can't crawl comments section")
                        continue

                self.close_censorship('Newest')
            else:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except:
                    pass

                self.close_censorship('All comments')

            self.driver.implicitly_wait(3)
            print('continue scraping')

            # clicking features
            self.view_more_comments()
            self.show_replies()
            self.click_see_more()

            # parsing part of the code

            # Dictionary of classes, if facebook changes any class, rewrite this DICT
            if '/videos/' in url:
                self.class_dict = self.locators['facebook_video_locators']
            elif '/posts/' in url:
                self.class_dict = self.locators['facebook_post_locators']

            self.parse(self.driver.page_source, self.class_dict, self.filename)
            print(f'Done: [{idx + 1}/{counter}]')

    def view_more_comments(self):
        elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
        while elements:
            try:
                self.driver.execute_script("arguments[0].click();", elements[0])
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
                self.driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
            except selenium.common.exceptions.StaleElementReferenceException:
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")


    # method for showing hidden replies
    def show_replies(self):

        repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")
        i = 1
        while repl_elements:

            try:
                for element in repl_elements:
                    self.driver.execute_script("arguments[0].click();", element)
                    time.sleep(0.5)

            except selenium.common.exceptions.StaleElementReferenceException:
                pass
            repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")

    # method for expanding comments
    def click_see_more(self):

        elements = self.driver.find_elements(By.XPATH, "//*[text()='See more']")

        for element in elements:
            self.driver.execute_script("arguments[0].click();", element)


    # method for switching the 'Most relevant' comment filter to 'Newest'
    def close_censorship(self, classification: str):
        self.driver.implicitly_wait(3)
        try:
            dropdown = self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest']")
            self.driver.execute_script("arguments[0].click();", dropdown)  # clicking on it

            newest_comments = self.driver.find_element(By.XPATH, f"//*[text()='{classification}']")
            self.driver.execute_script("arguments[0].click();", newest_comments)  # clicking on it
        except:
            self.close_censorship(classification)


    def close(self):
        print('Scraping ended successfully')
        self.driver.quit()
        sys.exit(0)
129
harvester/Facebook/facebook_parser.py
Normal file
@@ -0,0 +1,129 @@
from bs4 import BeautifulSoup
from print_dict import pd
import json
import sys
import os

class FacebookParser:

    def parse(self, html, clsDict, fname='final_dataset.json'):

        self.soup = BeautifulSoup(html, 'lxml')
        self.outFileName = fname
        self.outFileName = f"outputs/parts/{self.outFileName.split('.')[0]}_data.json"

        # dict for data about facebook post
        self.post_data = {
            'publisher': None,
            'title': None,
            'comments': [],
            'post_reactions': None
        }

        # dict for comments
        self.comment_data = {
            'publisher': None,
            'text': None,
            'replies': []
        }

        # reply data
        self.reply_data = {
            'publisher': None,
            'text': None
        }

        # post info
        self.name = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})

        if clsDict['TOP_LABEL'] == 'message':
            self.top = self.soup.find('div', {'data-ad-comet-preview': clsDict['TOP_LABEL']})
        else:
            self.top = self.soup.find('div', {'class': clsDict['TOP_LABEL']})
            if self.top is None:
                self.top = self.soup.find('div', {'class': 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'})

        self.title_likes = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})
        try:
            self.tmp_strings = self.top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
            self.title = ''
            for x in self.tmp_strings:
                try:
                    self.title += x.text + '. '
                except:
                    pass
        except:
            self.title = None


        self.post_data = {
            'publisher': self.name.text if self.name is not None else None,
            'title': self.title,
            'post_reactions': self.title_likes.text if self.title_likes is not None else None,
            'comments': []
        }

        if self.post_data['publisher'] is None:
            return

        # comment info
        self.all_comments = self.soup.find_all("div", {"aria-label": lambda x: x and x.endswith("ago")})  # arr with all comments under the post
        # print(len(self.all_comments))
        for item in self.all_comments:
            self.publisher = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})
            self.txt = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                tmp_type = item.get('aria-label').split(' ')[0]
                tmp_class = item.find('div', {'class': lambda x: x and x.startswith(clsDict['TMP_COMMENTS_CLASS'])}).get('class')[-1]
                if tmp_type == "Comment":
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'].append(self.comment_data)

                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER']:
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'][-1]['replies'].append(self.comment_data)

                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER_2']:
                    self.reply_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                    }
                    self.post_data['comments'][-1]['replies'][-1]['replies'].append(self.reply_data)

            except:
                pass


        if os.path.exists(self.outFileName):
            with open(self.outFileName, 'r+', encoding="utf-8") as file:
                tmp = json.load(file)
                tmp.append(self.post_data)
                file.seek(0)
                json.dump(tmp, file, indent=4, separators=(',', ': '))
        else:
            with open(self.outFileName, 'w', encoding="utf-8") as file:
                json.dump([self.post_data], file, indent=4, separators=(',', ': '))

        # read URLs from the input .txt
        try:
            with open(os.path.join('Facebook', 'inputs', self.filename), 'r+') as file:
                lines = file.readlines()
                # move file pointer to the beginning of the file
                file.seek(0)
                # truncate the file
                file.truncate()
                # write the lines back, except the first line
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)
155
harvester/Facebook/linkCollector.py
Normal file
@@ -0,0 +1,155 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

import os
import sys
import time
import argparse


# parse args
parser = argparse.ArgumentParser(description="Facebook scraper")
parser.add_argument("URL", help='URL of the facebook page / profile')  # no need to specify type (default is string)
args = parser.parse_args()


# method for waiting on pages to load
def wait_for_url(driver, url):
    # waiting for the main page to load
    try:
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
        print('Successful!')
    except:
        print('Connection error')
        driver.quit()
        sys.exit(1)

# web driver init
def webdriver_setup():

    driver_path = r'C:\Users\vlferko\Desktop\projects\jarvis_scraper\chromedriver.exe'

    chrome_options = Options()
    chrome_options.add_argument("accept-language=en-US")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=OFF")
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

    driver.get("https://www.facebook.com/")
    return driver

# login to a facebook acc
def login(driver):
    print('Logging in')
    # allow cookies
    try:
        driver.find_element(By.XPATH, "//button[contains(string(), 'Decline optional cookies')]").click()
    except:
        pass


    # insert login data
    driver.find_element(By.NAME, "email").send_keys(os.environ['FB_EMAIL'])  # type email
    driver.find_element(By.NAME, "pass").send_keys(os.environ['FB_PASSWD'])  # type password

    # click -> log in
    driver.find_element(By.NAME, "login").click()
    time.sleep(5)

# scrolling to the bottom of the page
def crawl_for_links(driver, url):
    print('Crawling')
    i = 1
    driver.get(url)
    time.sleep(2)
    name = driver.find_elements(By.TAG_NAME, 'h2')[-1].text


    for _ in range(0, 3):
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(50):
            # Scroll down to bottom
            driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")

            # Wait to load page
            time.sleep(3)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        os.system('clear||cls')
        print(f'Iteration num: {i}')
        i += 1

    return driver.page_source, name

# parse HTML
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')

    timeline = soup.find('div', {'class': 'x9f619 x1n2onr6 x1ja2u2z xeuugli xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj xu9j1y6 x7ep2pv'})
    posts = timeline.find_all('div', {'class': 'x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z'})
    arr = []
    for post in posts:
        try:
            commentsWidget = post.find('span', {'class': 'x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa'})
            if approveComments(commentsWidget.text):
                links = post.find_all('a', {'role': 'link'})
                arr.append(extractPostLink(links))
        except AttributeError:
            pass

    return arr

def extractPostLink(links):
    for link in links:
        if '/videos/' in link['href'] or '/posts/' in link['href']:
            return link['href']

# check if the post has more than 50 comments
def approveComments(text):
    nComments = text.split(' ')[0]
    try:
        num = int(nComments)
        return int(num > 50)
    except ValueError:
        return 'K' in nComments or 'M' in nComments

# write all the links to the .txt
def write_out(arr, name):

    with open(f"{os.getcwd()}/inputs/{name.strip().replace(' ', '_').lower()}.txt", 'w') as f:
        for item in arr:
            try:
                f.write(item + '\n')
            except TypeError:
                pass


if __name__ == '__main__':
    # driver init
    driver = webdriver_setup()
    wait_for_url(driver, 'https://www.facebook.com/')

    # login
    login(driver)
    wait_for_url(driver, 'https://www.facebook.com/')

    # crawl
    html, name = crawl_for_links(driver, args.URL)
    driver.close()

    # parsing HTML
    arr = parse_html(html)

    # write out
    write_out(arr, name)
24
harvester/locators.json
Normal file
@@ -0,0 +1,24 @@
{
    "facebook_post_locators": {
        "POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
        "TOP_LABEL": "xjkvuk6 xuyqlj2 x1odjw0f",
        "TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
        "STATUS_STRINGS": "text-align: start;",
        "COMMENT_AUTHOR": "x3nfvp2",
        "COMMENT_STR": "x1lliihq xjkvuk6 x1iorvi4",
        "TMP_COMMENTS_CLASS": "xqcrz7y",
        "REPLY_DIVIDER": "x1k70j0n",
        "REPLY_DIVIDER_2": "x1n2onr6"
    },
    "facebook_video_locators": {
        "POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
        "TOP_LABEL": "message",
        "TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
        "STATUS_STRINGS": "text-align: start;",
        "COMMENT_AUTHOR": "x3nfvp2",
        "COMMENT_STR": "xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs",
        "TMP_COMMENTS_CLASS": "xqcrz7y",
        "REPLY_DIVIDER": "x1k70j0n",
        "REPLY_DIVIDER_2": "x1n2onr6"
    }
}
30
harvester/main.py
Normal file
@@ -0,0 +1,30 @@
import argparse

from Facebook.facebook_crawler import FacebookCrawler
from Reddit.reddit_crawler import RedditCrawler

FACEBOOK_URL = 'https://www.facebook.com/'
REDDIT_URL = 'https://www.reddit.com/'

if __name__ == '__main__':

    # parsing arguments
    parser = argparse.ArgumentParser(description="Facebook scraper")
    parser.add_argument("file_name", help='Name of the .txt file with URLS')
    args = parser.parse_args()


    user_input = input('Hello, do you want to scrape Facebook or reddit? [F/r]: ')

    while user_input.upper() not in ['F', 'R']:
        user_input = input('Do you want to scrape Facebook or reddit? [F/r]: ')


    if user_input.upper() == 'F':
        facebook = FacebookCrawler(FACEBOOK_URL, args.file_name)
        facebook.allow_cookies()
        facebook.login()
        facebook.crawl()
    else:
        reddit = RedditCrawler(REDDIT_URL, args.file_name)
        print(reddit)
4
harvester/requirements.txt
Normal file
@@ -0,0 +1,4 @@
beautifulsoup4==4.12.2
print_dict==0.1.19
selenium==4.10.0
webdriver_manager==3.8.6
176
preprocessing/chunking.ipynb
Normal file
@@ -0,0 +1,176 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data chunking for effectiveness\n",
    "\n",
    "In our data, the facebook user Robert Fico has a lot of samples.\n",
    "For efficiency, this notebook chunks that data into 4 parts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### JSONL file loading and creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_jsonl(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        return [json.loads(line) for line in file]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    with open(f'{filename}l', 'w') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "fico = load_jsonl('jsonl_data/robert_fico_data.jsonl')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split data into 4 equal parts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135155"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_samples = len(fico)\n",
    "chunk_size = int(num_samples / 4)\n",
    "\n",
    "num_samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_size * 4 == num_samples # the last few samples are dropped because the dataset size is not divisible by 4"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Actual chunking algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunk_arr = []\n",
    "for chunks in range(0, 4):\n",
    "    chunk_arr.append(\n",
    "        fico[chunk_size * chunks: chunk_size * (chunks + 1)]\n",
    "    )"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write chunked data to disk in a for loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "for index, data in enumerate(chunk_arr):\n",
    "    create_jsonl(f'jsonl_data/fico_chunk_{index}.json', data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
1742
preprocessing/clustered_processing.ipynb
Normal file
File diff suppressed because one or more lines are too long
389
preprocessing/clustering.ipynb
Normal file
@@ -0,0 +1,389 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# This notebook clusters samples based on their semantic similarity.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# imports\n",
    "\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "import torch\n",
    "import warnings\n",
    "import json\n",
    "import os\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model init\n",
    "\n",
    "This clustering process uses the TUKE-DeutscheTelekom/slovakbert-skquad-mnlr model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SentenceTransformer('TUKE-DeutscheTelekom/slovakbert-skquad-mnlr')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data manipulation in the file system"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_jsonl(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        return [json.loads(line) for line in file]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline functions"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embedding creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_embeddings(jsonl_file):\n",
    "    sentences = [item['text'] for item in jsonl_file]\n",
    "    return model.encode(sentences), sentences"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clustering algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cluster_data(embeddings, sentences):\n",
    "    embeddings_np = np.array(embeddings)\n",
    "\n",
    "    similarity_threshold = 0.65\n",
    "\n",
    "    long_enough_mask = np.array([len(sentence) > 20 for sentence in sentences])\n",
    "\n",
    "    cosine_sim_matrix = util.pytorch_cos_sim(torch.tensor(embeddings_np), torch.tensor(embeddings_np)).numpy()\n",
    "\n",
    "    below_threshold_mask = cosine_sim_matrix < similarity_threshold\n",
    "\n",
    "    filtered_mask = np.logical_and(below_threshold_mask, np.outer(long_enough_mask, long_enough_mask))\n",
    "\n",
    "    non_spam_indices = np.where(filtered_mask)\n",
    "\n",
    "    filtered_sentences = list(set([sentences[i] for i in non_spam_indices[0]]))\n",
    "\n",
    "    return filtered_sentences"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare data to write it to JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_null_text(json_list):\n",
    "    filtered_list = [obj for obj in json_list if \"text\" in obj and obj[\"text\"] is not None]\n",
    "    return filtered_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl_format(filtered, jsonl_file):\n",
    "\n",
    "    return [\n",
    "        {\n",
    "            'id': item['id'],\n",
    "            'author': item['author'],\n",
    "            'text': item['text']\n",
    "        }\n",
    "        for item in jsonl_file if item['text'] in filtered\n",
    "    ]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write out JSONL file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_jsonl(filename, data):\n",
    "    with open(filename, 'w') as f:\n",
    "        for item in data:\n",
    "            json.dump(item, f)\n",
    "            f.write('\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pipeline execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def execute_pipeline(jsonl_file):\n",
    "    embeddings, sentences = create_embeddings(jsonl_file)\n",
    "    filtered_data = cluster_data(embeddings, sentences)\n",
    "    return create_jsonl_format(filtered_data, jsonl_file)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pipeline use case"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Prepare data for clustering in a loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['aktuality_data.jsonl',\n",
       " 'denník_n_data.jsonl',\n",
       " 'televízia_joj_data.jsonl',\n",
       " 'fakty_data.jsonl',\n",
       " 'erik_kaliňák_data.jsonl',\n",
       " 'zomri_data.jsonl',\n",
       " 'igor_matovic_data.jsonl',\n",
       " 'peter_marcin_data.jsonl',\n",
       " 'ján_koleník_data.jsonl',\n",
       " 'eva_-_hriešne_dobrá_data.jsonl',\n",
       " 'emefka_data.jsonl',\n",
       " 'marek_hamsik_data.jsonl',\n",
       " 'hetrik_data.jsonl',\n",
       " 'peter_sagan_data.jsonl',\n",
       " 'marian_čekovský_data.jsonl',\n",
       " 'zuzana_čaputová_data.jsonl',\n",
       " 'sajfa_data.jsonl',\n",
       " 'marian_kotleba_data.jsonl',\n",
       " 'fico_chunk_3.jsonl',\n",
       " 'fico_chunk_1.jsonl',\n",
       " 'šport_v_rtvs_data.jsonl',\n",
       " 'dominika_cibulkova_data.jsonl',\n",
       " 'šport24_data.jsonl',\n",
       " 'niké_liga_data.jsonl',\n",
       " 'fico_chunk_0.jsonl',\n",
       " 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
       " 'fico_chunk_2.jsonl']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_to_cluster = [x for x in os.listdir('jsonl_data')]\n",
    "\n",
    "data_to_cluster.remove('robert_fico_data.jsonl')\n",
    "\n",
    "data_to_cluster"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Executing the actual pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] \n"
     ]
    }
   ],
   "source": [
    "for dataset_name in tqdm(data_to_cluster):\n",
    "    dataset = load_jsonl(f'jsonl_data/{dataset_name}')\n",
    "    dataset = filter_null_text(dataset)\n",
    "    write_jsonl(f'clustered_jsonl/{dataset_name}', execute_pipeline(dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['aktuality_data.jsonl',\n",
       " 'denník_n_data.jsonl',\n",
       " 'televízia_joj_data.jsonl',\n",
       " '.DS_Store',\n",
       " 'fakty_data.jsonl',\n",
       " 'erik_kaliňák_data.jsonl',\n",
       " 'zomri_data.jsonl',\n",
       " 'igor_matovic_data.jsonl',\n",
       " 'peter_marcin_data.jsonl',\n",
       " 'ján_koleník_data.jsonl',\n",
       " 'eva_-_hriešne_dobrá_data.jsonl',\n",
       " 'emefka_data.jsonl',\n",
       " 'marek_hamsik_data.jsonl',\n",
       " 'hetrik_data.jsonl',\n",
       " 'peter_sagan_data.jsonl',\n",
       " 'marian_čekovský_data.jsonl',\n",
       " 'zuzana_čaputová_data.jsonl',\n",
       " 'sajfa_data.jsonl',\n",
       " 'marian_kotleba_data.jsonl',\n",
       " 'fico_chunk_3.jsonl',\n",
       " 'fico_chunk_1.jsonl',\n",
       " 'šport_v_rtvs_data.jsonl',\n",
       " 'dominika_cibulkova_data.jsonl',\n",
       " 'šport24_data.jsonl',\n",
       " 'niké_liga_data.jsonl',\n",
       " 'fico_chunk_0.jsonl',\n",
       " 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
       " 'fico_chunk_2.jsonl']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.listdir('jsonl_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
181
preprocessing/create_jsonl.ipynb
Normal file
@@ -0,0 +1,181 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# JSON to JSONL file converter\n",
    "This notebook turns a structured JSON file into a simpler JSONL form for easier data manipulation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "import json\n",
    "import os"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Open JSON data, then write it as JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_json(filename):\n",
    "    # Read the JSON file\n",
    "    with open(filename, 'r') as json_file:\n",
    "        return json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    with open(f'{filename}l', 'w') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loop through the dataset, create a new list of dictionaries, drop duplicate data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def traverse_dataset(dataset):\n",
    "    new_dataset = []\n",
    "    for post in dataset:\n",
    "        new_dataset.append(post)\n",
    "        for comment in post['comments']:\n",
    "            new_dataset.append(comment)\n",
    "            try:\n",
    "                for reply in comment['replies']:\n",
    "                    new_dataset.append(reply)\n",
    "\n",
    "                    for sec_reply in reply['replies']:\n",
    "                        new_dataset.append(sec_reply)\n",
    "            except KeyError:\n",
    "                pass\n",
    "\n",
    "    return new_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def drop_keywords(dataset):\n",
    "    for item in dataset:\n",
    "        try:\n",
    "            del item['comments']\n",
    "        except KeyError:\n",
    "            pass\n",
    "        try:\n",
    "            del item['replies']\n",
    "        except KeyError:\n",
    "            pass\n",
    "\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_dataset(dataset):\n",
    "    cleaned_dataset = []\n",
    "    for data in dataset:\n",
    "\n",
    "        cleaned_data = {}\n",
    "        if 'id' in data:\n",
    "            cleaned_data['id'] = data.get('id')\n",
    "\n",
    "        if 'publisher' in data:\n",
    "            cleaned_data['author'] = data.get('publisher')\n",
    "\n",
    "        if 'text' in data:\n",
    "            cleaned_data['text'] = data.get('text')\n",
    "        elif 'title' in data:\n",
    "            cleaned_data['text'] = data.get('title')\n",
    "\n",
    "        cleaned_dataset.append(cleaned_data)\n",
    "\n",
    "    return cleaned_dataset"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Execution of the functions defined above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset_name in os.listdir('json_data_id/'):\n",
    "    dataset = open_json(f'json_data_id/{dataset_name}')\n",
    "\n",
    "    new_dataset = traverse_dataset(dataset)\n",
    "    new_dataset = drop_keywords(new_dataset)\n",
    "    new_dataset = clean_dataset(new_dataset)\n",
    "\n",
    "    create_jsonl(f'jsonl_data/{dataset_name}', new_dataset)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
1058
preprocessing/dataProcessing.ipynb
Normal file
File diff suppressed because one or more lines are too long
103
preprocessing/id_addition.ipynb
Normal file
@@ -0,0 +1,103 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_json(filename):\n",
    "    with open(filename, 'r') as json_file:\n",
    "        return json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_ids(json_file):\n",
    "    id_counter = 1\n",
    "    for post in json_file:\n",
    "        post[\"id\"] = id_counter\n",
    "        id_counter += 1\n",
    "        if 'comments' in post:\n",
    "            for comment in post['comments']:\n",
    "                comment[\"id\"] = id_counter\n",
    "                id_counter += 1\n",
    "                if 'replies' in comment:\n",
    "                    for reply in comment['replies']:\n",
    "                        reply[\"id\"] = id_counter\n",
    "                        id_counter += 1\n",
    "                        if 'replies' in reply:\n",
    "                            for sec_reply in reply['replies']:\n",
    "                                sec_reply[\"id\"] = id_counter\n",
    "                                id_counter += 1\n",
    "    return json_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_json(filename, data):\n",
    "    with open(filename, 'w', encoding=\"utf-8\") as file:\n",
    "        json.dump(data, file, indent=4, separators=(',', ': '))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "for json_file in os.listdir(\"json_data\"):\n",
    "    data = open_json(f'json_data/{json_file}')\n",
    "    data = add_ids(data)\n",
    "    create_json(f'json_data_id/{json_file}', data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
14847
preprocessing/name_extraction.ipynb
Normal file
File diff suppressed because it is too large