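"""Facebook post-link scraper.

Logs into Facebook using the FB_EMAIL and FB_PASSWD environment variables,
scrolls through the page or profile given on the command line, keeps links
to posts with roughly 50+ comments, and writes them to inputs/<page_name>.txt.

Example invocation (the script filename here is illustrative):
    python fb_scraper.py https://www.facebook.com/SomePage
"""
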
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

import os
import sys
import time
import argparse


# parse args
parser = argparse.ArgumentParser(description="Facebook scraper")
parser.add_argument("URL", help="URL of the Facebook page / profile")  # no need to specify type (default is str)
args = parser.parse_args()


# wait for the given URL to finish loading
def wait_for_url(driver, url):
    # wait up to 10 s for the browser to land on the expected URL
    try:
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
        print('Successful!')
    except TimeoutException:
        print('Connection error')
        driver.quit()
        sys.exit(1)


# web driver init
def webdriver_setup():
    chrome_options = Options()
    chrome_options.add_argument("accept-language=en-US")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=3")  # only log fatal Chrome errors

    # webdriver-manager downloads a matching chromedriver; Selenium 4 takes it via a Service
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    driver.get("https://www.facebook.com/")
    return driver


# log in to a Facebook account
def login(driver):
    print('Logging in')

    # dismiss the cookie banner if it is shown
    try:
        driver.find_element(By.XPATH, "//button[contains(string(), 'Decline optional cookies')]").click()
    except NoSuchElementException:
        pass

    # insert login data
    driver.find_element(By.NAME, "email").send_keys(os.environ['FB_EMAIL'])   # type email
    driver.find_element(By.NAME, "pass").send_keys(os.environ['FB_PASSWD'])   # type password

    # click -> log in
    driver.find_element(By.NAME, "login").click()
    time.sleep(5)


# scrolling to the bottom of the page
def crawl_for_links(driver, url):
    print('Crawling')
    i = 1
    driver.get(url)
    time.sleep(2)
    name = driver.find_elements(By.TAG_NAME, 'h2')[-1].text

    for _ in range(3):
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(50):
            # Scroll down to bottom
            driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")

            # Wait to load page
            time.sleep(3)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

            os.system('clear||cls')
            print(f'Iteration num: {i}')
            i += 1

    return driver.page_source, name


# parse HTML
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')

    timeline = soup.find('div', {'class': 'x9f619 x1n2onr6 x1ja2u2z xeuugli xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj xu9j1y6 x7ep2pv'})
    posts = timeline.find_all('div', {'class': 'x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z'})
    arr = []
    for post in posts:
        try:
            commentsWidget = post.find('span', {'class': 'x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa'})
            if approveComments(commentsWidget.text):
                links = post.find_all('a', {'role': 'link'})
                arr.append(extractPostLink(links))
        except AttributeError:
            pass

    return arr


# return the first link that points to a post or a video
def extractPostLink(links):
    for link in links:
        if '/videos/' in link['href'] or '/posts/' in link['href']:
            return link['href']


# check if post has at least 50 comments
def approveComments(text):
    nComments = text.split(' ')[0]
    try:
        num = int(nComments)
        return num >= 50
    except ValueError:
        # counts like '1.2K' or '3M' are not plain integers but are well above the threshold
        return 'K' in nComments or 'M' in nComments


# write all the links to a .txt file
def write_out(arr, name):
    os.makedirs(f"{os.getcwd()}/inputs", exist_ok=True)  # make sure the output directory exists
    with open(f"{os.getcwd()}/inputs/{name.strip().replace(' ', '_').lower()}.txt", 'w') as f:
        for item in arr:
            try:
                f.write(item + '\n')
            except TypeError:
                # extractPostLink may return None when no post/video link was found
                pass


if __name__ == '__main__':
    # driver init
    driver = webdriver_setup()
    wait_for_url(driver, 'https://www.facebook.com/')

    # login
    login(driver)
    wait_for_url(driver, 'https://www.facebook.com/')

    # crawl
    html, name = crawl_for_links(driver, args.URL)
    driver.close()

    # parsing HTML
    arr = parse_html(html)

    # write out
    write_out(arr, name)