annotation app finished
This commit is contained in:
commit d0dc4fa0f4
27  .gitignore  vendored  Normal file
@@ -0,0 +1,27 @@
# preprocessing ignore

/preprocessing/*.jsonl
/preprocessing/*.pickle

/preprocessing/__pycache__
/preprocessing/classified_data
/preprocessing/clustered_jsonl
/preprocessing/json_data
/preprocessing/json_data_id
/preprocessing/jsonl_data
/preprocessing/.DS_Store

# harvesting ignore
/harvester/Facebook/inputs
/harvester/Facebook/outputs
/harvester/Facebook/.*
/harvester/Facebook/__pycache__
/harvester/__pycache__
/harvester/.DS_Store

# annotation_app
/annotation_app/.env
/annotation_app/__pycache__
/annotation_app/.DS_Store
2  annotation_app/.dockerignore  Normal file
@@ -0,0 +1,2 @@
get_data.py
/instance
13  annotation_app/Dockerfile  Normal file
@@ -0,0 +1,13 @@
FROM python:3.9

WORKDIR /app

COPY . /app/

RUN pip install --no-cache-dir -r requirements.txt

RUN python initial.py

EXPOSE 5050

CMD ["python3", "app.py"]
140  annotation_app/app.py  Normal file
@@ -0,0 +1,140 @@
from flask import Flask, render_template, request, redirect, flash, session, url_for
from models import db, Users, Annotations, Samples
from dotenv import load_dotenv
from sqlalchemy.orm import aliased
import sqlalchemy
import os

import logging

load_dotenv()

app = Flask(__name__)

app.config['SQLALCHEMY_DATABASE_URI'] = os.getenv('DB_URI')
app.secret_key = os.getenv('SECRET_KEY')

db.init_app(app)


@app.route('/', methods=['GET'])
def home():
    session.pop('id_user', None)
    return render_template('home.html')


@app.route('/login', methods=['POST'])
def login():
    if request.method == 'POST':
        email = request.form['email']
        if '@' in email:
            try:
                splitted = email.split('@')
                name, surname = splitted[0].split('.')[:2]
                domain = splitted[1]
                if 'tuke' not in domain:
                    raise ValueError
            except ValueError:
                flash('Nie je validný TUKE email')
                return redirect('/')
            try:
                db.session.add(Users(name, surname, email))
                db.session.commit()
            except sqlalchemy.exc.IntegrityError:
                # email is already registered; reuse the existing account
                db.session.rollback()
                logging.info('Logged existing email')
            user = Users.query.filter_by(email=email).first()
            session['email'] = email
            session['id_user'] = user._id
            return redirect('/anot')
        flash('Nie je validný TUKE email')
        return redirect('/')


@app.route('/anot', methods=['GET'])
def anot():
    if 'id_user' in session:
        try:
            annotated_count = Annotations.query.filter_by(user_id=session['id_user']).count()

            # query = text(
            #     f'''
            #     SELECT samples._id, text
            #     FROM samples
            #     LEFT JOIN annotations
            #     ON samples._id = annotations.sample_id
            #     WHERE samples._id NOT IN (
            #         SELECT sample_id
            #         FROM annotations
            #         GROUP BY sample_id
            #         HAVING COUNT(sample_id) > 5
            #     )
            #     AND samples._id NOT IN (
            #         SELECT samples._id
            #         FROM samples
            #         LEFT JOIN annotations
            #         ON samples._id = annotations.sample_id
            #         WHERE annotations.user_id IS {session['id_user']}
            #     )
            #     ORDER BY samples._id ASC
            #     LIMIT 1;
            #     '''
            # )

            annotations_alias = aliased(Annotations)

            # Construct the query: the lowest-id sample with at most 5
            # annotations that this user has not annotated yet
            query = (
                db.session.query(Samples._id, Samples.text)
                .outerjoin(annotations_alias, Samples._id == annotations_alias.sample_id)
                .filter(
                    ~Samples._id.in_(
                        db.session.query(Annotations.sample_id)
                        .group_by(Annotations.sample_id)
                        .having(db.func.count(Annotations.sample_id) > 5)
                    ),
                    ~Samples._id.in_(
                        db.session.query(Samples._id)
                        .outerjoin(Annotations, Samples._id == Annotations.sample_id)
                        .filter(Annotations.user_id == session['id_user'])
                    )
                )
                .order_by(Samples._id.asc())
                .limit(1)
            )
            sample_id, sample_text = query.one_or_none()

            data = {
                'email': session.get('email'),
                'text': sample_text,
                'sample_id': sample_id,
                'annotated_count': annotated_count
            }

        except (sqlalchemy.exc.OperationalError, TypeError) as err:
            # OperationalError: tables not created yet;
            # TypeError: one_or_none() returned no row to unpack
            print(err)
            logging.info('Annotations started')
            data = {
                'email': session.get('email'),
                'text': Samples.query.order_by(Samples._id.asc()).first().text,
                'sample_id': Samples.query.order_by(Samples._id.asc()).first()._id,
                'annotated_count': annotated_count
            }
        return render_template('anot.html', **data)
    return redirect('/')


@app.route('/process_anot', methods=['POST'])
def process():
    if request.method == 'POST':
        data = request.get_json()
        print(data)
        db.session.add(Annotations(
            user_id=session['id_user'],
            sample_id=data['sample_id'],
            label=data['value']
        ))
        db.session.commit()
    return redirect(url_for('anot'))


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5050)
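The /anot query above encodes the sampling rule: serve the lowest-id sample that has at most 5 annotations and that the logged-in user has not labelled yet. A minimal, framework-free sketch of that rule (not part of the commit; the in-memory samples and annotations lists are hypothetical stand-ins for the DB tables):

def next_sample(samples, annotations, user_id, cap=5):
    """Return the first sample with at most `cap` annotations that
    `user_id` has not annotated yet, or None if none is left."""
    counts = {}
    seen_by_user = set()
    for ann_user, ann_sample in annotations:        # one tuple per annotation
        counts[ann_sample] = counts.get(ann_sample, 0) + 1
        if ann_user == user_id:
            seen_by_user.add(ann_sample)
    for sample_id, text in sorted(samples):         # ascending id, like the query
        if counts.get(sample_id, 0) <= cap and sample_id not in seen_by_user:
            return sample_id, text
    return None

# Example: sample 1 already has 6 annotations, sample 2 was done by user 7
samples = [(1, 'a'), (2, 'b'), (3, 'c')]
annotations = [(u, 1) for u in range(6)] + [(7, 2)]
assert next_sample(samples, annotations, user_id=7) == (3, 'c')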
57140  annotation_app/dataset/final_id_v2.jsonl  Normal file
File diff suppressed because it is too large
54  annotation_app/get_data.py  Normal file
@@ -0,0 +1,54 @@
from models import db, Samples, Users, Annotations
from app import app
from sqlalchemy import text


if __name__ == '__main__':
    with app.app_context():
        # AND annotations.user_id <> '{id_user}'
        id_user = 4

        query = text(
            f'''
            SELECT samples.id, text
            FROM samples
            LEFT JOIN annotations
            ON samples.id = annotations.sample_id
            WHERE samples.id NOT IN (
                SELECT sample_id
                FROM annotations
                GROUP BY sample_id
                HAVING COUNT(sample_id) > 5
            )
            AND samples.id NOT IN (
                SELECT samples.id
                FROM samples
                LEFT JOIN annotations
                ON samples.id = annotations.sample_id
                WHERE annotations.user_id IS {id_user}
            )
            ORDER BY samples.id ASC
            LIMIT 1;
            '''
        )

        # query = text(
        #     '''
        #     SELECT samples.id
        #     FROM samples
        #     LEFT JOIN annotations
        #     ON samples.id = annotations.sample_id
        #     WHERE annotations.user_id IS NOT 1
        #     '''
        # )

        result = db.session.execute(query)

        print(result.fetchall())

        annotations = Annotations.query.all()
        print(len(annotations))
        # for annotation in annotations:
        #     print(annotation.user_id)
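The script above interpolates id_user into the SQL with an f-string. A sketch (not part of the commit) of the same query with a bound parameter, which avoids quoting problems and SQL injection; the second subquery is written in a simplified but equivalent form:

from sqlalchemy import text

query = text(
    '''
    SELECT samples.id, text
    FROM samples
    WHERE samples.id NOT IN (
        SELECT sample_id
        FROM annotations
        GROUP BY sample_id
        HAVING COUNT(sample_id) > 5
    )
    AND samples.id NOT IN (
        SELECT sample_id
        FROM annotations
        WHERE annotations.user_id = :id_user
    )
    ORDER BY samples.id ASC
    LIMIT 1;
    '''
).bindparams(id_user=4)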
26  annotation_app/initial.py  Normal file
@@ -0,0 +1,26 @@
from models import db, Samples
from app import app
import json
import os


if __name__ == '__main__':
    with app.app_context():
        # db.init_app(app)

        # creating database
        db.create_all()

        try:
            with open(os.path.join('dataset', 'final_id_v2.jsonl'), encoding='utf-8') as file:
                data = [json.loads(line) for line in file]

            for sample in data:
                db.session.add(Samples(sample['text']))
            db.session.commit()
            print('Data successfully inserted')
        except FileNotFoundError as err:
            print(err)
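initial.py adds the ~57k samples one session.add() at a time. A sketch (not part of the commit) of the same seeding step as a single bulk insert, using the SQLAlchemy 2.0 executemany style supported by the pinned SQLAlchemy 2.0.28 / Flask-SQLAlchemy 3.1.1:

import json
import os

from sqlalchemy import insert

from models import db, Samples
from app import app

with app.app_context():
    db.create_all()
    with open(os.path.join('dataset', 'final_id_v2.jsonl'), encoding='utf-8') as file:
        rows = [{'text': json.loads(line)['text']} for line in file]
    db.session.execute(insert(Samples), rows)  # one bulk INSERT instead of 57k adds
    db.session.commit()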
BIN  annotation_app/instance/anot_db.db  Normal file
Binary file not shown.
40  annotation_app/models.py  Normal file
@@ -0,0 +1,40 @@
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class Annotations(db.Model):
    __tablename__ = 'annotations'

    _id = db.Column("id", db.Integer, primary_key=True)
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'), nullable=False)
    sample_id = db.Column(db.Integer, db.ForeignKey('samples.id'), nullable=False)
    label = db.Column(db.String(32), nullable=False)

    def __init__(self, user_id, sample_id, label):
        self.user_id = user_id
        self.sample_id = sample_id
        self.label = label

class Users(db.Model):
    __tablename__ = 'users'

    _id = db.Column("id", db.Integer, primary_key=True)
    name = db.Column(db.String(32), nullable=False)
    surname = db.Column(db.String(32), nullable=False)
    email = db.Column(db.String(64), unique=True, nullable=False)
    annotations = db.relationship('Annotations', backref='user', lazy=True)  # one user has many annotations (removed uselist=False, which made this one-to-one)

    def __init__(self, name, surname, email):
        self.name = name
        self.surname = surname
        self.email = email

class Samples(db.Model):
    __tablename__ = 'samples'

    _id = db.Column("id", db.Integer, primary_key=True)
    text = db.Column(db.String(512), nullable=False)
    annotations = db.relationship('Annotations', lazy=True, backref='sample')  # corrected relationship and added backref

    def __init__(self, text):
        self.text = text
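A short sketch (not part of the commit) showing how the three models relate through the foreign keys and backrefs declared above, against a throwaway in-memory SQLite database:

from flask import Flask
from models import db, Users, Samples, Annotations

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite://'   # throwaway in-memory DB
db.init_app(app)

with app.app_context():
    db.create_all()

    user = Users('Jane', 'Doe', 'jane.doe@student.tuke.sk')   # hypothetical user
    sample = Samples('some text to annotate')
    db.session.add_all([user, sample])
    db.session.commit()

    db.session.add(Annotations(user._id, sample._id, 'offensive'))
    db.session.commit()

    # backrefs declared in models.py link the rows both ways
    assert sample.annotations[0].user is user
    assert user.annotations[0].sample is sample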
13  annotation_app/requirements.txt  Normal file
@@ -0,0 +1,13 @@
blinker==1.7.0
click==8.1.7
Flask==3.0.2
Flask-SQLAlchemy==3.1.1
importlib_metadata==7.0.2
itsdangerous==2.1.2
Jinja2==3.1.3
MarkupSafe==2.1.5
python-dotenv==1.0.1
SQLAlchemy==2.0.28
typing_extensions==4.10.0
Werkzeug==3.0.1
zipp==3.18.1
76  annotation_app/static/stylesheets/styles.css  Normal file
@@ -0,0 +1,76 @@
body, html {
    height: 100%;
    margin: 0;
    display: flex;
    justify-content: center;
    align-items: center;
    background-color: #FEFFFF;
}

table {
    border-radius: 8px;
}

.container {
    position: absolute;
    top: 50%;
    left: 50%;
    transform: translate(-50%, -50%);
    text-align: center;
}

.btn {
    background-color: #3AAFA9;
    border: 1px solid #3AAFA9;
}

.logout-btn {
    background-color: #454d55;
    margin-top: 7.5%;
}

#top-info {
    margin-top: 5%;
    position: fixed;
    top: 5%;
}

.anot {
    border: 1px solid #000000;
    border-radius: 10px;
    padding: 10px;
}

.anot-text {
    padding: 2.5%;
}

.form-control {
    margin-bottom: 5px;
}

#login {
    position: absolute;
    top: -20vh;
    left: 50%;
    transform: translate(-50%, 50%);
    border: 1px solid #000000;
    border-radius: 8px;
    padding: 4vh;
    width: 500px;
}

.top-info {
    width: 100%;
    border-collapse: collapse; /* Optional: collapse border */
}

.top-info td {
    border: 1px solid #000; /* Add border to table cells */
    padding: 8px; /* Optional: Add padding */
}

h3 {
    margin-bottom: 3%;
}
66  annotation_app/templates/anot.html  Normal file
@@ -0,0 +1,66 @@
{% extends "base.html" %}

{% block title %} annotation {% endblock %}

{% block content %}

<div class="container" id="top-info">
    <table class="table top-info">
        <thead class="thead-dark">
            <tr>
                <th>Email</th>
                <th>Počet anotovaných jednotiek</th>
            </tr>
        </thead>
        <tr>
            <td>{{ email }}</td>
            <td>{{ annotated_count }}</td>
        </tr>
    </table>
</div>

<div class="container">
    <div class="anot">
        <p class="anot-text">{{ text }}</p>
        <button id="post" class="btn btn-primary" onclick="postBcknd('offensive', {{ sample_id }})">Ofenzívny</button>
        <button id="post" class="btn btn-primary" onclick="postBcknd('not_offensive', {{ sample_id }})">Neofenzívny</button>
        <button id="post" class="btn btn-primary" onclick="postBcknd('dont_know', {{ sample_id }})">Neviem</button>
    </div>

    <button id="get" class="btn btn-primary logout-btn" onclick="logout()"> Odhlásiť sa</button>

    <script>
        function postBcknd(value, sample_id){
            var xhr = new XMLHttpRequest();
            xhr.open('POST', '/process_anot', true);
            xhr.setRequestHeader('Content-Type', 'application/json');

            xhr.onload = function () {
                if (xhr.status === 200) {
                    console.log('request sent successfully');
                    window.location.href = '/anot';
                } else {
                    console.log('request failed');
                }
            };
            xhr.send(JSON.stringify({value: value, sample_id: sample_id}));
        }

        function logout() {
            var xhr = new XMLHttpRequest();
            xhr.open('GET', '/', true);

            xhr.onload = function () {
                if (xhr.status === 200) {
                    console.log('Logout successful');
                    window.location.href = '/';
                } else {
                    console.log('Logout request failed');
                }
            }

            xhr.send(); // Send the request
        }
    </script>
</div>

{% endblock %}
17  annotation_app/templates/base.html  Normal file
@@ -0,0 +1,17 @@
<!doctype html>
<html>
<head>
    {% block head %}
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
    <link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='stylesheets/styles.css') }}" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}{% endblock %}</title>
    {% endblock %}
</head>
<body>
    <div id="content">
        {% block content %}
        {% endblock %}
    </div>
</body>
</html>
19  annotation_app/templates/home.html  Normal file
@@ -0,0 +1,19 @@
{% extends "base.html" %}
{% block title %} Welcome {% endblock %}

{% block content %}

<div class="container">
    <form action="/login" id="login" method="post">
        <h3>Login anotačnej aplikácie</h3>
        <input type="text" name="email" placeholder="meno.priezvisko@student.tuke.sk" class="form-control">
        <button id="post" class="btn btn-primary login-btn">Prihlásiť sa</button>
        {% with messages = get_flashed_messages() %}
        {% if messages %}
        <p style="margin-top: 2%;"> {{ messages[0] }} </p>
        {% endif %}
        {% endwith %}
    </form>
</div>

{% endblock %}
134  harvester/Facebook/facebook_crawler.py  Normal file
@@ -0,0 +1,134 @@
import os
import sys
import time
import json

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import selenium.common.exceptions
from Facebook.facebook_parser import FacebookParser

from crawler import Crawler


class FacebookCrawler(Crawler, FacebookParser):

    def __init__(self, base_url: str, file_name: str):
        super().__init__(base_url, file_name)

        try:
            with open(os.path.join('locators.json')) as file:
                self.locators = json.load(file)

            with open(os.path.join('Facebook', 'inputs', self.filename)) as file:
                self.URLS = tuple(file.readlines())
        except FileNotFoundError:
            print(os.path.join('Facebook', 'inputs', self.filename))
            print("Invalid input value")
            sys.exit(1)

    # crawling part of the code
    def crawl(self):
        counter = len(self.URLS)
        for idx, url in enumerate(self.URLS):
            # redirect and wait for the page to load
            self.driver.get(url)
            self.driver.implicitly_wait(4)

            if 'videos' in url:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except selenium.common.exceptions.NoSuchElementException:
                    self.driver.find_element(By.XPATH, "//span[contains(@class, 'x6ikm8r x10wlt62 xlyipyv')]").click()
                    if self.driver.find_element(By.XPATH, "//div[contains(@class, 'x78zum5 xdt5ytf x1iyjqo2 x7ywyr2')]") is not None:
                        print("Can't crawl comments section")
                        continue

                self.close_censorship('Newest')
            else:
                try:
                    self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest' or text()='Oldest']")
                except:
                    pass

                self.close_censorship('All comments')

            self.driver.implicitly_wait(3)
            print('continue scraping')

            # clicking features
            self.view_more_comments()
            self.show_replies()
            self.click_see_more()

            # parsing part of the code

            # Dictionary of classes; if Facebook changes any class, rewrite this dict
            if '/videos/' in url:
                self.class_dict = self.locators['facebook_video_locators']
            elif '/posts/' in url:
                self.class_dict = self.locators['facebook_post_locators']

            self.parse(self.driver.page_source, self.class_dict, self.filename)
            print(f'Done: [{idx + 1}/{counter}]')

    def view_more_comments(self):
        elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
        while elements:
            try:
                self.driver.execute_script("arguments[0].click();", elements[0])
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")
                self.driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
            except selenium.common.exceptions.StaleElementReferenceException:
                elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'comments') and contains(text(), 'View') and contains(text(), 'more')]")

    # method for showing hidden replies
    def show_replies(self):

        repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")
        while repl_elements:

            try:
                for element in repl_elements:
                    self.driver.execute_script("arguments[0].click();", element)
                    time.sleep(0.5)
            except selenium.common.exceptions.StaleElementReferenceException:
                pass
            repl_elements = self.driver.find_elements(By.XPATH, "//span[contains(text(), 'repl') and not(contains(text(), 'Hide')) and not(contains(text(), 'Newest'))]")

    # method for expanding comments
    def click_see_more(self):

        elements = self.driver.find_elements(By.XPATH, "//*[text()='See more']")

        for element in elements:
            self.driver.execute_script("arguments[0].click();", element)


    # method for switching the 'Most relevant' comment filter (e.g. to 'Newest')
    def close_censorship(self, classification: str):
        self.driver.implicitly_wait(3)
        try:
            dropdown = self.driver.find_element(By.XPATH, "//*[text()='Most relevant' or text()='Newest']")
            self.driver.execute_script("arguments[0].click();", dropdown)  # clicking on it

            newest_comments = self.driver.find_element(By.XPATH, f"//*[text()='{classification}']")
            self.driver.execute_script("arguments[0].click();", newest_comments)  # clicking on it
        except:
            self.close_censorship(classification)


    def close(self):
        print('Scraping ended successfully')
        self.driver.quit()
        sys.exit(0)
129  harvester/Facebook/facebook_parser.py  Normal file
@@ -0,0 +1,129 @@
from bs4 import BeautifulSoup
from print_dict import pd
import json
import sys
import os

class FacebookParser:

    def parse(self, html, clsDict, fname='final_dataset.json'):

        self.soup = BeautifulSoup(html, 'lxml')
        self.outFileName = fname
        self.outFileName = f"outputs/parts/{self.outFileName.split('.')[0]}_data.json"

        # dict for data about the facebook post
        self.post_data = {
            'publisher': None,
            'title': None,
            'comments': [],
            'post_reactions': None
        }

        # dict for comments
        self.comment_data = {
            'publisher': None,
            'text': None,
            'replies': []
        }

        # reply data
        self.reply_data = {
            'publisher': None,
            'text': None
        }

        # post info
        self.name = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})

        if clsDict['TOP_LABEL'] == 'message':
            self.top = self.soup.find('div', {'data-ad-comet-preview': clsDict['TOP_LABEL']})
        else:
            self.top = self.soup.find('div', {'class': clsDict['TOP_LABEL']})
            if self.top is None:
                self.top = self.soup.find('div', {'class': 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'})

        self.title_likes = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})
        try:
            self.tmp_strings = self.top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
            self.title = ''
            for x in self.tmp_strings:
                try:
                    self.title += x.text + '. '
                except:
                    pass
        except:
            self.title = None


        self.post_data = {
            'publisher': self.name.text if self.name is not None else None,
            'title': self.title,
            'post_reactions': self.title_likes.text if self.title_likes is not None else None,
            'comments': []
        }

        if self.post_data['publisher'] is None:
            return

        # comment info
        self.all_comments = self.soup.find_all("div", {"aria-label": lambda x: x and x.endswith("ago")})  # arr with all comments under the post
        # print(len(self.all_comments))
        for item in self.all_comments:
            self.publisher = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})  # fixed: was a set literal {'class', ...}
            self.txt = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                tmp_type = item.get('aria-label').split(' ')[0]
                tmp_class = item.find('div', {'class': lambda x: x and x.startswith(clsDict['TMP_COMMENTS_CLASS'])}).get('class')[-1]
                if tmp_type == "Comment":
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'].append(self.comment_data)


                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER']:
                    self.comment_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                        'replies': []
                    }
                    self.post_data['comments'][-1]['replies'].append(self.comment_data)

                elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER_2']:
                    self.reply_data = {
                        'publisher': self.publisher.text,
                        'text': self.txt.text if self.txt is not None else None,
                    }
                    self.post_data['comments'][-1]['replies'][-1]['replies'].append(self.reply_data)

            except:
                pass


        if os.path.exists(self.outFileName):
            with open(self.outFileName, 'r+', encoding="utf-8") as file:
                tmp = json.load(file)
                tmp.append(self.post_data)
                file.seek(0)
                json.dump(tmp, file, indent=4, separators=(',', ': '))
        else:
            with open(self.outFileName, 'w', encoding="utf-8") as file:
                json.dump([self.post_data], file, indent=4, separators=(',', ': '))

        # read URLS from a .txt
        try:
            with open(os.path.join('Facebook', 'inputs', self.filename), 'r+') as file:
                lines = file.readlines()
                # move the file pointer to the beginning of the file
                file.seek(0)
                # truncate the file
                file.truncate()
                # write all lines back except the first one (already processed)
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)
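The parser appends each post by re-reading, extending, and rewriting the whole JSON output file. A sketch (not part of the commit) of a JSON Lines append instead, which writes one object per line and avoids the rewrite; the preprocessing notebooks convert these outputs to JSONL later anyway. The file name is hypothetical:

import json

def append_post(post_data, out_file='outputs/parts/example_data.jsonl'):
    # 'a' mode appends a single line; no need to load the existing file
    with open(out_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(post_data, ensure_ascii=False) + '\n')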
155  harvester/Facebook/linkCollector.py  Normal file
@@ -0,0 +1,155 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

import os
import sys
import time
import argparse


# parse args
parser = argparse.ArgumentParser(description="Facebook scraper")
parser.add_argument("URL", help='URL of the facebook page / profile')  # no need to specify type (default is string)
args = parser.parse_args()


# helper for waiting on pages to load
def wait_for_url(driver, url):
    # waiting for the main page to load
    try:
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
        print('Successful!')
    except:
        print('Connection error')
        driver.quit()
        sys.exit(1)


# web driver init
def webdriver_setup():

    driver_path = r'C:\Users\vlferko\Desktop\projects\jarvis_scraper\chromedriver.exe'  # local chromedriver path (unused)

    chrome_options = Options()
    chrome_options.add_argument("accept-language=en-US")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=OFF")
    # fixed: selenium 4.10 removed the positional executable_path argument,
    # so the downloaded driver path is wrapped in a Service object
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    driver.get("https://www.facebook.com/")
    return driver


# login to a facebook account
def login(driver):
    print('Logging in')
    # dismiss the cookie banner
    try:
        driver.find_element(By.XPATH, "//button[contains(string(), 'Decline optional cookies')]").click()
    except:
        pass

    # insert login data
    driver.find_element(By.NAME, "email").send_keys(os.environ['FB_EMAIL'])  # type email
    driver.find_element(By.NAME, "pass").send_keys(os.environ['FB_PASSWD'])  # type password

    # click -> log in
    driver.find_element(By.NAME, "login").click()
    time.sleep(5)


# scrolling to the bottom of the page
def crawl_for_links(driver, url):
    print('Crawling')
    i = 1
    driver.get(url)
    time.sleep(2)
    name = driver.find_elements(By.TAG_NAME, 'h2')[-1].text

    for _ in range(0, 3):
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(50):
            # Scroll down to the bottom
            driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")

            # Wait for the page to load
            time.sleep(3)

            # Calculate the new scroll height and compare it with the last one
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        os.system('clear||cls')
        print(f'Iteration num: {i}')
        i += 1

    return driver.page_source, name


# parse HTML
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')

    timeline = soup.find('div', {'class': 'x9f619 x1n2onr6 x1ja2u2z xeuugli xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj xu9j1y6 x7ep2pv'})
    posts = timeline.find_all('div', {'class': 'x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z'})
    arr = []
    for post in posts:
        try:
            commentsWidget = post.find('span', {'class': 'x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa'})
            if approveComments(commentsWidget.text):
                links = post.find_all('a', {'role': 'link'})
                arr.append(extractPostLink(links))
        except AttributeError:
            pass

    return arr


def extractPostLink(links):
    for link in links:
        if '/videos/' in link['href'] or '/posts/' in link['href']:
            return link['href']


# check if a post has more than 50 comments
def approveComments(text):
    nComments = text.split(' ')[0]
    try:
        num = int(nComments)
        return num > 50
    except ValueError:
        # fixed: the original "return 'K' or 'M' in nComments" was always truthy;
        # counts like '1.2K' or '3M' should pass, anything else should not
        return 'K' in nComments or 'M' in nComments


# write all the links to the .txt
def write_out(arr, name):

    with open(f"{os.getcwd()}/inputs/{name.strip().replace(' ', '_').lower()}.txt", 'w') as f:
        for item in arr:
            try:
                f.write(item + '\n')
            except TypeError:
                pass


if __name__ == '__main__':
    # driver init
    driver = webdriver_setup()
    wait_for_url(driver, 'https://www.facebook.com/')

    # login
    login(driver)
    wait_for_url(driver, 'https://www.facebook.com/')

    # crawl
    html, name = crawl_for_links(driver, args.URL)
    driver.close()

    # parsing HTML
    arr = parse_html(html)

    # write out
    write_out(arr, name)
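A quick sanity check (not part of the commit) of the corrected approveComments logic:

assert approveComments('120 comments')        # numeric count above 50
assert not approveComments('12 comments')     # numeric count below 50
assert approveComments('1.2K comments')       # abbreviated counts pass via 'K'/'M'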
24  harvester/locators.json  Normal file
@@ -0,0 +1,24 @@
{
    "facebook_post_locators": {
        "POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
        "TOP_LABEL": "xjkvuk6 xuyqlj2 x1odjw0f",
        "TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
        "STATUS_STRINGS": "text-align: start;",
        "COMMENT_AUTHOR": "x3nfvp2",
        "COMMENT_STR": "x1lliihq xjkvuk6 x1iorvi4",
        "TMP_COMMENTS_CLASS": "xqcrz7y",
        "REPLY_DIVIDER": "x1k70j0n",
        "REPLY_DIVIDER_2": "x1n2onr6"
    },
    "facebook_video_locators": {
        "POST_AUTHOR": "x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f",
        "TOP_LABEL": "message",
        "TITLE_LIKES": "xt0b8zv x1jx94hy xrbpyxo xl423tq",
        "STATUS_STRINGS": "text-align: start;",
        "COMMENT_AUTHOR": "x3nfvp2",
        "COMMENT_STR": "xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs",
        "TMP_COMMENTS_CLASS": "xqcrz7y",
        "REPLY_DIVIDER": "x1k70j0n",
        "REPLY_DIVIDER_2": "x1n2onr6"
    }
}
30  harvester/main.py  Normal file
@@ -0,0 +1,30 @@
import argparse

from Facebook.facebook_crawler import FacebookCrawler
from Reddit.reddit_crawler import RedditCrawler

FACEBOOK_URL = 'https://www.facebook.com/'
REDDIT_URL = 'https://www.reddit.com/'

if __name__ == '__main__':

    # parsing arguments
    parser = argparse.ArgumentParser(description="Facebook scraper")
    parser.add_argument("file_name", help='Name of the .txt file with URLS')
    args = parser.parse_args()


    user_input = input('Hello, do you want to scrape Facebook or Reddit? [F/r]: ')

    while user_input.upper() not in ['F', 'R']:
        user_input = input('Do you want to scrape Facebook or Reddit? [F/r]: ')


    if user_input.upper() == 'F':  # fixed: also accept a lowercase 'f'
        facebook = FacebookCrawler(FACEBOOK_URL, args.file_name)
        facebook.allow_cookies()
        facebook.login()
        facebook.crawl()
    else:
        reddit = RedditCrawler(REDDIT_URL, args.file_name)
        print(reddit)
4  harvester/requirements.txt  Normal file
@@ -0,0 +1,4 @@
beautifulsoup4==4.12.2
print_dict==0.1.19
selenium==4.10.0
webdriver_manager==3.8.6
176  preprocessing/chunking.ipynb  Normal file
@@ -0,0 +1,176 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data chunking for effectiveness\n",
    "\n",
    "In our data, the Facebook user Robert Fico accounts for a very large number of samples.\n",
    "For efficiency, this notebook splits that data into 4 chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### JSONL file loading and creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_jsonl(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        return [json.loads(line) for line in file]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    with open(f'{filename}l', 'w') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "fico = load_jsonl('jsonl_data/robert_fico_data.jsonl')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split data into 4 equal parts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "135155"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_samples = len(fico)\n",
    "chunk_size = int(num_samples / 4)\n",
    "\n",
    "num_samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_size * 4 == num_samples  # False: 135155 is not divisible by 4, so the last 3 samples are dropped"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Actual chunking algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunk_arr = []\n",
    "for chunks in range(0, 4):\n",
    "    chunk_arr.append(\n",
    "        fico[chunk_size * chunks: chunk_size * (chunks + 1)]\n",
    "    )"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write chunked data to disk in a for loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "for index, data in enumerate(chunk_arr):\n",
    "    create_jsonl(f'jsonl_data/fico_chunk_{index}.json', data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
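The chunking cell above drops the 3 samples past chunk_size * 4. A sketch (not part of the notebook) of a remainder-preserving split, in case losing samples matters:

def split_evenly(data, n_chunks=4):
    # distribute the remainder over the first r chunks so nothing is dropped
    k, r = divmod(len(data), n_chunks)
    chunks, start = [], 0
    for i in range(n_chunks):
        end = start + k + (1 if i < r else 0)  # first r chunks get one extra item
        chunks.append(data[start:end])
        start = end
    return chunks

chunks = split_evenly(list(range(135155)))
assert sum(len(c) for c in chunks) == 135155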
1742  preprocessing/clustered_processing.ipynb  Normal file
File diff suppressed because one or more lines are too long
389  preprocessing/clustering.ipynb  Normal file
@@ -0,0 +1,389 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# This notebook clusters samples based on their semantic similarity\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/A200119424/anaconda3/envs/sentiment/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# imports\n",
    "\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "import torch\n",
    "import warnings\n",
    "import json\n",
    "import os\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model init\n",
    "\n",
    "This clustering process uses the TUKE-DeutscheTelekom/slovakbert-skquad-mnlr model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SentenceTransformer('TUKE-DeutscheTelekom/slovakbert-skquad-mnlr')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data manipulation in the file system"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_jsonl(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        return [json.loads(line) for line in file]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline functions"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embedding creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_embeddings(jsonl_file):\n",
    "    sentences = [item['text'] for item in jsonl_file]\n",
    "    return model.encode(sentences), sentences"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clustering algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cluster_data(embeddings, sentences):\n",
    "    embeddings_np = np.array(embeddings)\n",
    "\n",
    "    similarity_threshold = 0.65\n",
    "\n",
    "    long_enough_mask = np.array([len(sentence) > 20 for sentence in sentences])\n",
    "\n",
    "    cosine_sim_matrix = util.pytorch_cos_sim(torch.tensor(embeddings_np), torch.tensor(embeddings_np)).numpy()\n",
    "\n",
    "    below_threshold_mask = cosine_sim_matrix < similarity_threshold\n",
    "\n",
    "    filtered_mask = np.logical_and(below_threshold_mask, np.outer(long_enough_mask, long_enough_mask))\n",
    "\n",
    "    non_spam_indices = np.where(filtered_mask)\n",
    "\n",
    "    filtered_sentences = list(set([sentences[i] for i in non_spam_indices[0]]))\n",
    "\n",
    "    return filtered_sentences"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare data to write it to JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_null_text(json_list):\n",
    "    filtered_list = [obj for obj in json_list if \"text\" in obj and obj[\"text\"] is not None]\n",
    "    return filtered_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl_format(filtered, jsonl_file):\n",
    "\n",
    "    return [\n",
    "        {\n",
    "            'id': item['id'],\n",
    "            'author': item['author'],\n",
    "            'text': item['text']\n",
    "        }\n",
    "        for item in jsonl_file if item['text'] in filtered\n",
    "    ]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write out JSONL file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_jsonl(filename, data):\n",
    "    with open(filename, 'w') as f:\n",
    "        for item in data:\n",
    "            json.dump(item, f)\n",
    "            f.write('\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pipeline execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def execute_pipeline(jsonl_file):\n",
    "    embeddings, sentences = create_embeddings(jsonl_file)\n",
    "    filtered_data = cluster_data(embeddings, sentences)\n",
    "    return create_jsonl_format(filtered_data, jsonl_file)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pipeline use case"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Prepare the data for clustering in a loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['aktuality_data.jsonl',\n",
       " 'denník_n_data.jsonl',\n",
       " 'televízia_joj_data.jsonl',\n",
       " 'fakty_data.jsonl',\n",
       " 'erik_kaliňák_data.jsonl',\n",
       " 'zomri_data.jsonl',\n",
       " 'igor_matovic_data.jsonl',\n",
       " 'peter_marcin_data.jsonl',\n",
       " 'ján_koleník_data.jsonl',\n",
       " 'eva_-_hriešne_dobrá_data.jsonl',\n",
       " 'emefka_data.jsonl',\n",
       " 'marek_hamsik_data.jsonl',\n",
       " 'hetrik_data.jsonl',\n",
       " 'peter_sagan_data.jsonl',\n",
       " 'marian_čekovský_data.jsonl',\n",
       " 'zuzana_čaputová_data.jsonl',\n",
       " 'sajfa_data.jsonl',\n",
       " 'marian_kotleba_data.jsonl',\n",
       " 'fico_chunk_3.jsonl',\n",
       " 'fico_chunk_1.jsonl',\n",
       " 'šport_v_rtvs_data.jsonl',\n",
       " 'dominika_cibulkova_data.jsonl',\n",
       " 'šport24_data.jsonl',\n",
       " 'niké_liga_data.jsonl',\n",
       " 'fico_chunk_0.jsonl',\n",
       " 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
       " 'fico_chunk_2.jsonl']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_to_cluster = [x for x in os.listdir('jsonl_data')]\n",
    "\n",
    "data_to_cluster.remove('robert_fico_data.jsonl')\n",
    "\n",
    "data_to_cluster"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Executing the actual pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 27/27 [1:59:52<00:00, 266.38s/it] \n"
     ]
    }
   ],
   "source": [
    "for dataset_name in tqdm(data_to_cluster):\n",
    "    dataset = load_jsonl(f'jsonl_data/{dataset_name}')\n",
    "    dataset = filter_null_text(dataset)\n",
    "    write_jsonl(f'clustered_jsonl/{dataset_name}', execute_pipeline(dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['aktuality_data.jsonl',\n",
       " 'denník_n_data.jsonl',\n",
       " 'televízia_joj_data.jsonl',\n",
       " '.DS_Store',\n",
       " 'fakty_data.jsonl',\n",
       " 'erik_kaliňák_data.jsonl',\n",
       " 'zomri_data.jsonl',\n",
       " 'igor_matovic_data.jsonl',\n",
       " 'peter_marcin_data.jsonl',\n",
       " 'ján_koleník_data.jsonl',\n",
       " 'eva_-_hriešne_dobrá_data.jsonl',\n",
       " 'emefka_data.jsonl',\n",
       " 'marek_hamsik_data.jsonl',\n",
       " 'hetrik_data.jsonl',\n",
       " 'peter_sagan_data.jsonl',\n",
       " 'marian_čekovský_data.jsonl',\n",
       " 'zuzana_čaputová_data.jsonl',\n",
       " 'sajfa_data.jsonl',\n",
       " 'marian_kotleba_data.jsonl',\n",
       " 'fico_chunk_3.jsonl',\n",
       " 'fico_chunk_1.jsonl',\n",
       " 'šport_v_rtvs_data.jsonl',\n",
       " 'dominika_cibulkova_data.jsonl',\n",
       " 'šport24_data.jsonl',\n",
       " 'niké_liga_data.jsonl',\n",
       " 'fico_chunk_0.jsonl',\n",
       " 'ok,ale_ideš_prvý_:d_data.jsonl',\n",
       " 'fico_chunk_2.jsonl']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.listdir('jsonl_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
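A toy illustration (not part of the notebook) of the thresholding step inside cluster_data, using the same util.pytorch_cos_sim call on hand-made 2-D "embeddings" instead of model output:

import torch
from sentence_transformers import util

emb = torch.tensor([[1.0, 0.0],
                    [1.0, 0.1],   # near-duplicate of the first vector
                    [0.0, 1.0]])  # clearly different

sim = util.pytorch_cos_sim(emb, emb)
below = sim < 0.65                # same threshold as the notebook
print(below)
# pair (0,1) stays above the threshold, so it is treated as a near-duplicate;
# pairs (0,2) and (1,2) fall below it and are kept as distinct sentences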
181
preprocessing/create_jsonl.ipynb
Normal file
181
preprocessing/create_jsonl.ipynb
Normal file
@ -0,0 +1,181 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# JSON to JSONL file converter\n",
|
||||||
|
"This notebook turns structured JSON file to a simplier form as a JSONL for easier data manipulation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# imports \n",
|
||||||
|
"import json\n",
|
||||||
|
"import os"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Open JSON data, then write it as JSONL"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def open_json(filename):\n",
|
||||||
|
" # Read the JSON file\n",
|
||||||
|
" with open(filename, 'r') as json_file:\n",
|
||||||
|
" return json.load(json_file)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_jsonl(filename, new_dataset):\n",
    "    # appending 'l' turns the incoming '.json' filename into '.jsonl'\n",
    "    with open(f'{filename}l', 'w') as jsonl_file:\n",
    "        for item in new_dataset:\n",
    "            jsonl_file.write(json.dumps(item) + '\\n')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loop through the dataset, create a flat list of dictionaries, drop the duplicated nested data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def traverse_dataset(dataset):\n",
    "    # Flatten posts, comments, replies and second-level replies into one list\n",
    "    new_dataset = []\n",
    "    for post in dataset:\n",
    "        new_dataset.append(post)\n",
    "        for comment in post['comments']:\n",
    "            new_dataset.append(comment)\n",
    "            try:\n",
    "                for reply in comment['replies']:\n",
    "                    new_dataset.append(reply)\n",
    "\n",
    "                    for sec_reply in reply['replies']:\n",
    "                        new_dataset.append(sec_reply)\n",
    "            except KeyError:\n",
    "                # NOTE: a reply without 'replies' also skips the comment's remaining replies\n",
    "                pass\n",
    "\n",
    "    return new_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def drop_keywords(dataset):\n",
    "    # Remove the nested copies so each flattened record stands alone\n",
    "    for item in dataset:\n",
    "        try:\n",
    "            del item['comments']\n",
    "        except KeyError:\n",
    "            pass\n",
    "        try:\n",
    "            del item['replies']\n",
    "        except KeyError:\n",
    "            pass\n",
    "\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_dataset(dataset):\n",
    "    # Keep only the id, author and text of each record\n",
    "    cleaned_dataset = []\n",
    "    for data in dataset:\n",
    "\n",
    "        cleaned_data = {}\n",
    "        if 'id' in data:\n",
    "            cleaned_data['id'] = data.get('id')\n",
    "\n",
    "        if 'publisher' in data:\n",
    "            cleaned_data['author'] = data.get('publisher')\n",
    "\n",
    "        if 'text' in data:\n",
    "            cleaned_data['text'] = data.get('text')\n",
    "        elif 'title' in data:\n",
    "            cleaned_data['text'] = data.get('title')\n",
    "\n",
    "        cleaned_dataset.append(cleaned_data)\n",
    "\n",
    "    return cleaned_dataset"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Execution of functions defined above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset_name in os.listdir('json_data_id/'):\n",
    "    dataset = open_json(f'json_data_id/{dataset_name}')\n",
    "\n",
    "    new_dataset = traverse_dataset(dataset)\n",
    "    new_dataset = drop_keywords(new_dataset)\n",
    "    new_dataset = clean_dataset(new_dataset)\n",
    "\n",
    "    create_jsonl(f'jsonl_data/{dataset_name}', new_dataset)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
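Taken together, the cells of create_jsonl.ipynb flatten each harvested JSON tree into independent records and write one JSON object per line. As a minimal sketch of consuming that output (assuming the jsonl_data/ layout and the id/author/text fields produced above; read_jsonl is a hypothetical helper, not part of the commit):

import json

def read_jsonl(path):
    # Each line of a JSONL file is a standalone JSON object
    with open(path, 'r') as jsonl_file:
        return [json.loads(line) for line in jsonl_file]

# e.g. one of the files listed by os.listdir('jsonl_data') earlier
records = read_jsonl('jsonl_data/sajfa_data.jsonl')
print(len(records), records[0].get('text'))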
1058
preprocessing/dataProcessing.ipynb
Normal file
File diff suppressed because one or more lines are too long
103
preprocessing/id_addition.ipynb
Normal file
@ -0,0 +1,103 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_json(filename):\n",
    "    with open(filename, 'r') as json_file:\n",
    "        return json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_ids(json_file):\n",
    "    # Assign sequential ids depth-first: each post, then its comments,\n",
    "    # their replies and second-level replies\n",
    "    id_counter = 1\n",
    "    for post in json_file:\n",
    "        post[\"id\"] = id_counter\n",
    "        id_counter += 1\n",
    "        if 'comments' in post:\n",
    "            for comment in post['comments']:\n",
    "                comment[\"id\"] = id_counter\n",
    "                id_counter += 1\n",
    "                if 'replies' in comment:\n",
    "                    for reply in comment['replies']:\n",
    "                        reply[\"id\"] = id_counter\n",
    "                        id_counter += 1\n",
    "                        if 'replies' in reply:\n",
    "                            for sec_reply in reply['replies']:\n",
    "                                sec_reply[\"id\"] = id_counter\n",
    "                                id_counter += 1\n",
    "    return json_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_json(filename, data):\n",
    "    with open(filename, 'w', encoding=\"utf-8\") as file:\n",
    "        json.dump(data, file, indent=4, separators=(',', ': '))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "for json_file in os.listdir(\"json_data\"):\n",
    "    data = open_json(f'json_data/{json_file}')\n",
    "    data = add_ids(data)\n",
    "    create_json(f'json_data_id/{json_file}', data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sentiment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
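The add_ids function above enumerates exactly two levels of replies with nested loops. Should deeper threads ever appear in the harvested data, a recursive variant can number any depth; the following is a sketch under that assumption (add_ids_recursive is hypothetical, not part of the commit), reusing the same 'comments'/'replies' keys:

def add_ids_recursive(items, id_counter=1):
    # Depth-first numbering: an item first, then all of its descendants
    for item in items:
        item["id"] = id_counter
        id_counter += 1
        for key in ('comments', 'replies'):
            if key in item:
                id_counter = add_ids_recursive(item[key], id_counter)
    return id_counter

# usage mirroring the loop above:
# data = open_json('json_data/some_file.json'); add_ids_recursive(data)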
14847
preprocessing/name_extraction.ipynb
Normal file
File diff suppressed because it is too large