import json
import os
import sys

from bs4 import BeautifulSoup
from print_dict import pd


class FacebookParser:
    """Extract a Facebook post and its comment tree from rendered HTML.

    The parsed post is appended to a JSON list stored under
    ``outputs/parts/`` and the first line of the input queue file is
    removed afterwards.
    """

    def parse(self, html, clsDict, fname='final_dataset.json'):
        """Parse one post page and persist the result.

        Args:
            html: Markup of the post page, handed to BeautifulSoup with the
                ``lxml`` parser.
            clsDict: Mapping of selector names to attribute values. The keys
                read here are ``POST_AUTHOR``, ``TOP_LABEL``, ``TITLE_LIKES``,
                ``STATUS_STRINGS``, ``COMMENT_AUTHOR``, ``COMMENT_STR``,
                ``TMP_COMMENTS_CLASS``, ``REPLY_DIVIDER`` and
                ``REPLY_DIVIDER_2``.
            fname: Base name of the dataset; everything before the first
                ``.`` is used to build
                ``outputs/parts/<base>_data.json``.

        Returns:
            None. When no post author is found the method returns early
            without writing the output file or touching the input file.
        """
        self.soup = BeautifulSoup(html, 'lxml')
        self.outFileName = fname
        self.outFileName = f"outputs/parts/{self.outFileName.split('.')[0]}_data.json"

        # Templates kept as instance attributes so that code inspecting the
        # parser after a run still finds them, even when nothing was parsed.
        self.post_data = {
            'publisher': None,
            'title': None,
            'comments': [],
            'post_reactions': None
        }
        self.comment_data = {
            'publisher': None,
            'text': None,
            'replies': []
        }
        self.reply_data = {
            'publisher': None,
            'text': None
        }

        self._extract_post(clsDict)
        if self.post_data['publisher'] is None:
            return

        self._collect_comments(clsDict)
        self._append_to_output()
        self._pop_first_input_line()

    def _extract_post(self, clsDict):
        """Fill ``self.post_data`` with the author, title and reactions."""
        self.name = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})

        if clsDict['TOP_LABEL'] == 'message':
            self.top = self.soup.find('div', {'data-ad-comet-preview': clsDict['TOP_LABEL']})
        else:
            self.top = self.soup.find('div', {'class': clsDict['TOP_LABEL']})
        if self.top is None:
            # Fallback container class used when the configured one is absent.
            self.top = self.soup.find(
                'div',
                {'class': 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'}
            )

        self.title_likes = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})

        try:
            self.tmp_strings = self.top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
        except (AttributeError, KeyError):
            # No container was found (``self.top`` is None) or the selector
            # is missing from clsDict: the post has no recoverable title.
            self.title = None
        else:
            self.title = ''.join(x.text + '. \n' for x in self.tmp_strings)

        self.post_data = {
            'publisher': self.name.text if self.name is not None else None,
            'title': self.title,
            'post_reactions': self.title_likes.text if self.title_likes is not None else None,
            'comments': []
        }

    def _collect_comments(self, clsDict):
        """Walk every comment/reply node and attach it to ``self.post_data``."""
        # Comment containers carry an aria-label ending in "ago".
        self.all_comments = self.soup.find_all(
            "div", {"aria-label": lambda x: x and x.endswith("ago")}
        )

        for item in self.all_comments:
            # attrs must be a dict; a set would be read as a list of
            # alternative class names.
            self.publisher = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})
            self.txt = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                self._record_comment(item, clsDict)
            except (AttributeError, IndexError, KeyError, TypeError):
                # Best effort: skip nodes with a missing author/marker or a
                # reply that has no parent to attach to.
                continue

    def _record_comment(self, item, clsDict):
        """Attach one comment node at the nesting level its markup indicates."""
        tmp_type = item.get('aria-label').split(' ')[0]
        prefix = clsDict['TMP_COMMENTS_CLASS']
        marker = item.find('div', {'class': lambda x: x and x.startswith(prefix)})
        # The last class of the marker div tells first-level replies apart
        # from second-level ones.
        tmp_class = marker.get('class')[-1]
        comments = self.post_data['comments']

        if tmp_type == "Comment":
            self.comment_data = {
                'publisher': self.publisher.text,
                'text': self.txt.text if self.txt is not None else None,
                'replies': []
            }
            comments.append(self.comment_data)
        elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER']:
            self.comment_data = {
                'publisher': self.publisher.text,
                'text': self.txt.text if self.txt is not None else None,
                'replies': []
            }
            comments[-1]['replies'].append(self.comment_data)
        elif tmp_type == "Reply" and tmp_class == clsDict['REPLY_DIVIDER_2']:
            self.reply_data = {
                'publisher': self.publisher.text,
                'text': self.txt.text if self.txt is not None else None,
            }
            comments[-1]['replies'][-1]['replies'].append(self.reply_data)

    def _append_to_output(self):
        """Append ``self.post_data`` to the JSON list in ``self.outFileName``."""
        out_dir = os.path.dirname(self.outFileName)
        if out_dir:
            # The first run may start without the output directory.
            os.makedirs(out_dir, exist_ok=True)

        if os.path.exists(self.outFileName):
            with open(self.outFileName, 'r+', encoding="utf-8") as file:
                tmp = json.load(file)
                tmp.append(self.post_data)
                file.seek(0)
                json.dump(tmp, file, indent=4, separators=(',', ': '))
                # Drop any bytes left over from the previous content.
                file.truncate()
        else:
            with open(self.outFileName, 'w', encoding="utf-8") as file:
                json.dump([self.post_data], file, indent=4, separators=(',', ': '))

    def _pop_first_input_line(self):
        """Remove the first line of the URL list under ``Facebook/inputs``.

        Prints a message and exits with status 1 when the file is missing.
        """
        # NOTE(review): ``self.filename`` is not assigned anywhere in this
        # class; presumably the caller sets it before calling parse() -
        # confirm against the call site.
        try:
            with open(os.path.join('Facebook', 'inputs', self.filename), 'r+') as file:
                lines = file.readlines()
                # Rewind and empty the file, then write back all but line 1.
                file.seek(0)
                file.truncate()
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)