# BP2024/harvester/Facebook/linkCollector.py

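"""Collect links to posts and videos from a public Facebook page or profile.

The script logs in with credentials taken from the FB_EMAIL and FB_PASSWD
environment variables, scrolls through the page's timeline, keeps posts with
roughly 50+ comments, and writes the collected links to inputs/<page_name>.txt.
"""
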
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import os
import sys
import time
import argparse
# parse args
parser = argparse.ArgumentParser(description="Facebook scraper")
parser.add_argument("URL", help="URL of the Facebook page / profile")  # no type= needed, argparse defaults to str
args = parser.parse_args()
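# Example invocation (hypothetical page URL; credentials are read from the environment):
#   FB_EMAIL=user@example.com FB_PASSWD=... python linkCollector.py "https://www.facebook.com/SomePublicPage"
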
# method for waiting on pages to load
def wait_for_url(driver, url):
    # wait until the browser has landed on the expected URL
    try:
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
        print('Successful!')
    except TimeoutException:
        print('Connection error')
        driver.quit()
        sys.exit(1)

# web driver init
def webdriver_setup():
    chrome_options = Options()
    chrome_options.add_argument("accept-language=en-US")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=OFF")
    # webdriver_manager downloads a matching chromedriver, so no hard-coded local path is needed
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
    driver.get("https://www.facebook.com/")
    return driver

# log in to a Facebook account
def login(driver):
    print('Logging in')
    # dismiss the cookie banner if it is shown
    try:
        driver.find_element(By.XPATH, "//button[contains(string(), 'Decline optional cookies')]").click()
    except NoSuchElementException:
        pass
    # insert login data from the environment
    driver.find_element(By.NAME, "email").send_keys(os.environ['FB_EMAIL'])   # type email
    driver.find_element(By.NAME, "pass").send_keys(os.environ['FB_PASSWD'])   # type password
    # click -> log in
    driver.find_element(By.NAME, "login").click()
    time.sleep(5)

# scroll to the bottom of the page and return the rendered HTML plus the page name
def crawl_for_links(driver, url):
    print('Crawling')
    i = 1
    driver.get(url)
    time.sleep(2)
    name = driver.find_elements(By.TAG_NAME, 'h2')[-1].text
    for _ in range(3):
        # get the current scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(50):
            # scroll down to the bottom
            driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
            # wait for the next batch of posts to load
            time.sleep(3)
            # calculate the new scroll height and compare it with the last one
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
            os.system('clear||cls')
            print(f'Iteration num: {i}')
            i += 1
    return driver.page_source, name

# parse the HTML of the timeline
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    timeline = soup.find('div', {'class': 'x9f619 x1n2onr6 x1ja2u2z xeuugli xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj xu9j1y6 x7ep2pv'})
    posts = timeline.find_all('div', {'class': 'x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z'})
    arr = []
    for post in posts:
        try:
            commentsWidget = post.find('span', {'class': 'x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa'})
            if approveComments(commentsWidget.text):
                links = post.find_all('a', {'role': 'link'})
                arr.append(extractPostLink(links))
        except AttributeError:
            # posts without a comments widget are skipped
            pass
    return arr

# pull the post / video link out of a post's anchor tags
def extractPostLink(links):
    for link in links:
        if '/videos/' in link['href'] or '/posts/' in link['href']:
            return link['href']

# check if a post has more than 50 comments
def approveComments(text):
    nComments = text.split(' ')[0]
    try:
        return int(nComments) > 50
    except ValueError:
        # abbreviated counts such as "1.2K" or "3M" cannot be parsed as int but are clearly large enough
        return 'K' in nComments or 'M' in nComments
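# Example classifications (hypothetical widget texts):
#   approveComments("321 comments") -> True, approveComments("48 comments") -> False,
#   approveComments("1.2K comments") -> True
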
# write all the links to a .txt file
def write_out(arr, name):
    os.makedirs(f"{os.getcwd()}/inputs", exist_ok=True)  # make sure the output folder exists
    with open(f"{os.getcwd()}/inputs/{name.strip().replace(' ', '_').lower()}.txt", 'w') as f:
        for item in arr:
            try:
                f.write(item + '\n')
            except TypeError:
                # extractPostLink returns None when no matching link was found
                pass
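# Example: a page titled "Some Page" (hypothetical) would be written to ./inputs/some_page.txt, one link per line.
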
if __name__ == '__main__':
    # driver init
    driver = webdriver_setup()
    wait_for_url(driver, 'https://www.facebook.com/')
    # login
    login(driver)
    wait_for_url(driver, 'https://www.facebook.com/')
    # crawl
    html, name = crawl_for_links(driver, args.URL)
    driver.quit()  # shut down the browser and the chromedriver process
    # parse the HTML
    arr = parse_html(html)
    # write out
    write_out(arr, name)