# BP2024/harvester/Facebook/facebook_parser.py

from bs4 import BeautifulSoup
from print_dict import pd
import json
import sys
import os

class FacebookParser:
    """Extract one Facebook post and its comment tree from rendered HTML.

    The CSS class names / attribute values used as selectors are supplied by
    the caller in ``clsDict``; this class only knows which keys to look up.
    """

    # Hard-coded class string tried when ``clsDict['TOP_LABEL']`` matches no div.
    _FALLBACK_TOP_CLASS = 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'

    def parse(self, html, clsDict, fname='final_dataset.json'):
        """Parse *html*, append the post to a JSON file and update the input queue.

        Args:
            html: Markup of the post page, handed to BeautifulSoup (``lxml``).
            clsDict: Mapping of selector values. Keys read here: ``POST_AUTHOR``,
                ``TOP_LABEL``, ``TITLE_LIKES``, ``STATUS_STRINGS``,
                ``COMMENT_AUTHOR``, ``COMMENT_STR``, ``TMP_COMMENTS_CLASS``,
                ``REPLY_DIVIDER`` and ``REPLY_DIVIDER_2``.
            fname: Name whose part before the first ``.`` is used to build the
                output path ``outputs/parts/<stem>_data.json``.

        Returns:
            None. If no post author is found the method returns early without
            writing anything. Otherwise the parsed post (also available as
            ``self.post_data``) is appended to the output JSON list.

        Side effects:
            Sets ``self.soup``, ``self.outFileName``, ``self.name``,
            ``self.top``, ``self.title_likes``, ``self.title``,
            ``self.post_data`` and ``self.all_comments``.
            If the caller has set ``self.filename``, the first line of
            ``Facebook/inputs/<self.filename>`` is removed; when that file is
            missing the process exits with status 1.
        """
        self.soup = BeautifulSoup(html, 'lxml')
        self.outFileName = f"outputs/parts/{fname.split('.')[0]}_data.json"

        # post info
        self.name = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})
        self.top = self._find_post_container(self.soup, clsDict)
        self.title_likes = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})
        self.title = self._extract_title(self.top, clsDict)

        self.post_data = {
            'publisher': self.name.text if self.name is not None else None,
            'title': self.title,
            'post_reactions': self.title_likes.text if self.title_likes is not None else None,
            'comments': []
        }

        # Without an author the page is treated as unparseable; nothing is stored.
        if self.post_data['publisher'] is None:
            return

        # comment info: every div whose aria-label ends with "ago"
        self.all_comments = self.soup.find_all(
            "div", {"aria-label": lambda x: x and x.endswith("ago")}
        )
        self.post_data['comments'].extend(
            self._collect_comments(self.all_comments, clsDict)
        )

        self._append_post(self.outFileName, self.post_data)

        # ``filename`` is never assigned inside this class, so the input-queue
        # update can only run when the caller has set it on the instance.
        # Skipping it otherwise avoids the AttributeError the unconditional
        # access used to raise.
        input_name = getattr(self, 'filename', None)
        if input_name is not None:
            self._drop_first_input_line(input_name)

    @classmethod
    def _find_post_container(cls, soup, clsDict):
        """Return the div holding the post text, or None if nothing matches."""
        top_label = clsDict['TOP_LABEL']
        if top_label == 'message':
            return soup.find('div', {'data-ad-comet-preview': top_label})

        container = soup.find('div', {'class': top_label})
        if container is None:
            container = soup.find('div', {'class': cls._FALLBACK_TOP_CLASS})
        return container

    @staticmethod
    def _extract_title(top, clsDict):
        """Join the text of the styled divs under *top*, each followed by '. '.

        Returns None when *top* is None or ``STATUS_STRINGS`` is not configured.
        """
        try:
            fragments = top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
        except (AttributeError, KeyError):
            return None
        return ''.join(f'{fragment.text}. ' for fragment in fragments)

    @staticmethod
    def _collect_comments(nodes, clsDict):
        """Build the nested comment list from the comment/reply divs in *nodes*.

        The first word of each node's ``aria-label`` decides whether it is a
        top-level ``Comment`` or a ``Reply``; for replies, the last class of
        the marker div decides the nesting depth (``REPLY_DIVIDER`` is a reply
        to the latest comment, ``REPLY_DIVIDER_2`` a reply to the latest
        reply). Nodes that cannot be interpreted are skipped.
        """
        comments = []
        for item in nodes:
            author = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})
            body = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                kind = item.get('aria-label').split(' ')[0]
                marker_div = item.find(
                    'div',
                    {'class': lambda x: x and x.startswith(clsDict['TMP_COMMENTS_CLASS'])},
                )
                marker = marker_div.get('class')[-1]

                entry = {
                    'publisher': author.text,
                    'text': body.text if body is not None else None,
                }
                if kind == "Comment":
                    comments.append({**entry, 'replies': []})
                elif kind == "Reply" and marker == clsDict['REPLY_DIVIDER']:
                    comments[-1]['replies'].append({**entry, 'replies': []})
                elif kind == "Reply" and marker == clsDict['REPLY_DIVIDER_2']:
                    comments[-1]['replies'][-1]['replies'].append(entry)
            except (AttributeError, IndexError, KeyError, TypeError):
                # Best effort: a node with a missing author/marker, a reply
                # with no parent to attach to, or an unconfigured selector is
                # skipped rather than aborting the whole post.
                continue
        return comments

    @staticmethod
    def _append_post(path, post):
        """Append *post* to the JSON list stored at *path*, creating it if needed."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        posts = []
        if os.path.exists(path):
            with open(path, 'r', encoding="utf-8") as file:
                raw = file.read()
            # An empty file is treated as "no posts yet" instead of a decode error.
            if raw.strip():
                posts = json.loads(raw)
        posts.append(post)

        # Rewriting in 'w' mode guarantees no stale bytes remain after the dump.
        with open(path, 'w', encoding="utf-8") as file:
            json.dump(posts, file, indent=4, separators=(',', ': '))

    @staticmethod
    def _drop_first_input_line(input_name):
        """Remove the first line of ``Facebook/inputs/<input_name>`` in place.

        Exits the process with status 1 if the file does not exist.
        """
        try:
            with open(os.path.join('Facebook', 'inputs', input_name), 'r+') as file:
                lines = file.readlines()
                # Rewind and empty the file, then write back all but the first line.
                file.seek(0)
                file.truncate()
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)