BP2024/harvester/Facebook/facebook_parser.py
2024-04-09 15:39:11 +02:00

130 lines
4.8 KiB
Python

from bs4 import BeautifulSoup
from print_dict import pd
import json
import sys
import os
class FacebookParser:
    """Parse a saved Facebook post page into a nested JSON structure.

    ``parse`` extracts the post author, post text, reaction count and the
    comment/reply tree from the supplied HTML using the (volatile) Facebook
    CSS-class strings in ``clsDict``, appends the result to a JSON array in
    ``outputs/parts/``, and then removes the first URL from the input URL
    list so the next run processes the next post.
    """

    def parse(self, html, clsDict, fname='final_dataset.json'):
        """Parse one post's HTML and persist the result.

        Parameters
        ----------
        html : str
            Raw HTML of the saved Facebook post page.
        clsDict : dict
            Maps logical selector names (POST_AUTHOR, TOP_LABEL, TITLE_LIKES,
            STATUS_STRINGS, COMMENT_AUTHOR, COMMENT_STR, TMP_COMMENTS_CLASS,
            REPLY_DIVIDER, REPLY_DIVIDER_2) to the class strings to match.
        fname : str
            Base name for the output file; data is written to
            ``outputs/parts/<stem>_data.json``.
        """
        self.soup = BeautifulSoup(html, 'lxml')
        stem = fname.split('.')[0]
        self.outFileName = f"outputs/parts/{stem}_data.json"

        self.post_data = self._extract_post(clsDict)
        if self.post_data['publisher'] is None:
            # No author found: the page probably did not load correctly.
            # Bail out WITHOUT consuming the input URL so this post can be
            # retried on the next run (matches original behaviour).
            return
        self._extract_comments(clsDict)
        self._write_output()
        self._consume_input_url()

    def _extract_post(self, clsDict):
        """Return the post-level dict: publisher, title, reactions, comments."""
        name = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})
        if clsDict['TOP_LABEL'] == 'message':
            top = self.soup.find('div', {'data-ad-comet-preview': clsDict['TOP_LABEL']})
        else:
            top = self.soup.find('div', {'class': clsDict['TOP_LABEL']})
        if top is None:
            # Fallback container class observed on some post layouts.
            top = self.soup.find('div', {'class': 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'})
        title_likes = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})
        if top is None:
            title = None
        else:
            # Post text is spread over several styled <div> fragments; join
            # them with ". " separators (empty string if none are found).
            fragments = top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
            title = ''.join(f"{frag.text}. " for frag in fragments)
        return {
            'publisher': name.text if name is not None else None,
            'title': title,
            'post_reactions': title_likes.text if title_likes is not None else None,
            'comments': []
        }

    def _extract_comments(self, clsDict):
        """Populate ``self.post_data['comments']`` with the comment/reply tree."""
        # Every comment/reply container carries an aria-label ending in "ago"
        # (e.g. "Comment by X 3 h ago", "Reply by Y 2 d ago").
        all_comments = self.soup.find_all(
            "div", {"aria-label": lambda x: x and x.endswith("ago")})
        for item in all_comments:
            # BUG FIX: the original passed a set `{'class', ...}` (comma
            # instead of colon) as the attrs argument; bs4 raised on it and
            # the bare `except` below silently dropped EVERY comment.
            publisher = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})
            txt = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                label_type = item.get('aria-label').split(' ')[0]
                # The last class on this marker div tells comments, first-level
                # replies and second-level replies apart.
                divider = item.find(
                    'div',
                    {'class': lambda x: x and x.startswith(clsDict['TMP_COMMENTS_CLASS'])}
                ).get('class')[-1]
                entry = {
                    'publisher': publisher.text,
                    'text': txt.text if txt is not None else None,
                }
                if label_type == "Comment":
                    entry['replies'] = []
                    self.post_data['comments'].append(entry)
                elif label_type == "Reply" and divider == clsDict['REPLY_DIVIDER']:
                    # First-level reply: attach to the most recent comment.
                    entry['replies'] = []
                    self.post_data['comments'][-1]['replies'].append(entry)
                elif label_type == "Reply" and divider == clsDict['REPLY_DIVIDER_2']:
                    # Second-level reply: attach to the most recent first-level reply.
                    self.post_data['comments'][-1]['replies'][-1]['replies'].append(entry)
            except (AttributeError, IndexError, KeyError):
                # Malformed or unrecognized node (missing author, orphan
                # reply, absent marker div): skip it, keep the rest.
                pass

    def _write_output(self):
        """Append ``self.post_data`` to the JSON array in ``self.outFileName``."""
        if os.path.exists(self.outFileName):
            with open(self.outFileName, 'r+', encoding="utf-8") as file:
                records = json.load(file)
                records.append(self.post_data)
                file.seek(0)
                json.dump(records, file, indent=4, separators=(',', ': '))
                # Drop stale trailing bytes in case the rewritten JSON is
                # ever shorter than the previous file content.
                file.truncate()
        else:
            with open(self.outFileName, 'w', encoding="utf-8") as file:
                json.dump([self.post_data], file, indent=4, separators=(',', ': '))

    def _consume_input_url(self):
        """Remove the first (just-processed) URL from the input URL list file."""
        # NOTE(review): self.filename is never assigned in this class — it is
        # presumably set on the instance by the caller before parse() runs;
        # confirm upstream, otherwise this raises AttributeError.
        try:
            path = os.path.join('Facebook', 'inputs', self.filename)
            with open(path, 'r+') as file:
                lines = file.readlines()
                # Rewrite the file without its first line.
                file.seek(0)
                file.truncate()
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)