130 lines
4.8 KiB
Python
130 lines
4.8 KiB
Python
from bs4 import BeautifulSoup
|
|
from print_dict import pd
|
|
import json
|
|
import sys
|
|
import os
|
|
|
|
class FacebookParser:
    """Extract a Facebook post and its comment tree from rendered HTML.

    The CSS class names and attribute values used to locate elements are not
    hard-coded; they are supplied per call through ``clsDict``.

    ``parse`` reads ``self.filename`` (the URL list to trim after a
    successful parse). This class never assigns that attribute, so it has to
    be set by the caller; when it is missing the trimming step is skipped.
    """

    # Class list tried as a last resort when the configured TOP_LABEL lookup
    # finds no post body.
    _FALLBACK_TOP_CLASS = 'x78zum5 xdt5ytf x2lah0s xyamay9 x1pi30zi x18d9i69 x1swvt13'

    def parse(self, html, clsDict, fname = 'final_dataset.json'):
        """Parse one post page and append the result to a JSON list on disk.

        Args:
            html: Markup of the post page, handed to BeautifulSoup with the
                ``lxml`` parser.
            clsDict: Mapping of selector values. Keys read here:
                ``POST_AUTHOR``, ``TOP_LABEL``, ``TITLE_LIKES``,
                ``STATUS_STRINGS``, ``COMMENT_AUTHOR``, ``COMMENT_STR``,
                ``TMP_COMMENTS_CLASS``, ``REPLY_DIVIDER``, ``REPLY_DIVIDER_2``.
            fname: Name the output file is derived from; everything before
                the first ``.`` is kept and ``_data.json`` is appended, under
                ``outputs/parts/``.

        Returns:
            None. The parsed post is left in ``self.post_data``. When no post
            author is found the method returns early: nothing is written and
            the input URL list is left untouched.

        Raises:
            KeyError: If ``clsDict`` lacks one of the post-level keys.
            json.JSONDecodeError: If the existing output file is not valid JSON.
            SystemExit: With status 1 if the input URL list does not exist.
        """
        self.soup = BeautifulSoup(html, 'lxml')
        self.outFileName = f"outputs/parts/{fname.split('.')[0]}_data.json"

        # Placeholders; replaced by the most recently parsed comment / reply.
        self.comment_data = {'publisher': None, 'text': None, 'replies': []}
        self.reply_data = {'publisher': None, 'text': None}

        # post info
        self.name = self.soup.find('a', {'class': clsDict['POST_AUTHOR']})
        self.top = self._find_post_body(clsDict)
        self.title_likes = self.soup.find('span', {'class': clsDict['TITLE_LIKES']})
        self.title = self._extract_title(clsDict)

        self.post_data = {
            'publisher': self.name.text if self.name is not None else None,
            'title': self.title,
            'post_reactions': self.title_likes.text if self.title_likes is not None else None,
            'comments': []
        }

        if self.post_data['publisher'] is None:
            return

        # comment info: every comment/reply container carries an aria-label
        # ending in "ago" (e.g. "Comment by ... 2 hours ago").
        self.all_comments = self.soup.find_all("div", {"aria-label": lambda x: x and x.endswith("ago")})
        for item in self.all_comments:
            # attrs must be a dict; the original passed a set literal here.
            self.publisher = item.find('span', {'class': clsDict['COMMENT_AUTHOR']})
            self.txt = item.find('div', {'class': clsDict['COMMENT_STR']})
            try:
                self._attach_comment(item, clsDict)
            except (AttributeError, IndexError, KeyError, TypeError):
                # Best effort: an entry with a missing author/marker element,
                # a reply with no parent to hang on, or an unconfigured
                # selector is skipped instead of aborting the whole post.
                continue

        self._append_record(self.outFileName, self.post_data)

        # NOTE(review): original indentation was lost; this step is assumed
        # to run once per parsed post, after the output is written - confirm.
        self._drop_processed_url()

    def _find_post_body(self, clsDict):
        """Return the element holding the post text, or None if not found."""
        label = clsDict['TOP_LABEL']
        if label == 'message':
            body = self.soup.find('div', {'data-ad-comet-preview': label})
        else:
            body = self.soup.find('div', {'class': label})
        # NOTE(review): original indentation was lost; the fallback is applied
        # after either lookup - confirm it was not meant for the class lookup only.
        if body is None:
            body = self.soup.find('div', {'class': self._FALLBACK_TOP_CLASS})
        return body

    def _extract_title(self, clsDict):
        """Join the post's text fragments, each followed by '. ', or return None."""
        try:
            self.tmp_strings = self.top.find_all('div', {'style': clsDict['STATUS_STRINGS']})
        except (AttributeError, KeyError):
            # No post body element was found, or the selector is not configured.
            return None
        return ''.join(f"{fragment.text}. " for fragment in self.tmp_strings)

    def _attach_comment(self, item, clsDict):
        """Add one comment/reply element to ``self.post_data`` at its nesting level."""
        kind = item.get('aria-label').split(' ')[0]
        marker = item.find(
            'div',
            {'class': lambda x: x and x.startswith(clsDict['TMP_COMMENTS_CLASS'])},
        )
        # The last class of the marker div tells first- and second-level replies apart.
        depth_class = marker.get('class')[-1]

        entry = {
            'publisher': self.publisher.text,
            'text': self.txt.text if self.txt is not None else None,
        }
        comments = self.post_data['comments']

        if kind == "Comment":
            entry['replies'] = []
            self.comment_data = entry
            comments.append(entry)
        elif kind == "Reply" and depth_class == clsDict['REPLY_DIVIDER']:
            # Reply to the most recent top-level comment.
            entry['replies'] = []
            self.comment_data = entry
            comments[-1]['replies'].append(entry)
        elif kind == "Reply" and depth_class == clsDict['REPLY_DIVIDER_2']:
            # Reply to the most recent first-level reply; leaf node, no 'replies' key.
            self.reply_data = entry
            comments[-1]['replies'][-1]['replies'].append(entry)

    @staticmethod
    def _append_record(path, record):
        """Append ``record`` to the JSON list stored at ``path``, creating it if needed."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        if os.path.exists(path):
            with open(path, 'r+', encoding="utf-8") as file:
                records = json.load(file)
                records.append(record)
                file.seek(0)
                json.dump(records, file, indent=4, separators=(',', ': '))
                # Drop any stale bytes left over from the previous contents.
                file.truncate()
        else:
            with open(path, 'w', encoding="utf-8") as file:
                json.dump([record], file, indent=4, separators=(',', ': '))

    def _drop_processed_url(self):
        """Remove the first line of the input URL list under ``Facebook/inputs``."""
        filename = getattr(self, 'filename', None)
        if filename is None:
            # No input list configured on this instance; nothing to trim.
            return
        try:
            with open(os.path.join('Facebook', 'inputs', filename), 'r+') as file:
                lines = file.readlines()
                # Rewind and empty the file, then write back all but the first line.
                file.seek(0)
                file.truncate()
                file.writelines(lines[1:])
        except FileNotFoundError:
            print('Invalid input value')
            sys.exit(1)
|