from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import os
import sys
import time
import argparse

# parse command-line arguments
parser = argparse.ArgumentParser(description="Facebook scraper")
parser.add_argument("URL", help="URL of the Facebook page / profile")  # no need to specify type (default is str)
args = parser.parse_args()


# wait until the browser has navigated to the expected URL
def wait_for_url(driver, url):
    try:
        WebDriverWait(driver, 10).until(EC.url_to_be(url))
        print('Successful!')
    except TimeoutException:
        print('Connection error')
        driver.quit()
        sys.exit(1)


# web driver init (chromedriver is downloaded and managed by webdriver_manager)
def webdriver_setup():
    chrome_options = Options()
    chrome_options.add_argument("accept-language=en-US")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=OFF")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.get("https://www.facebook.com/")
    return driver


# log in to a Facebook account
def login(driver):
    print('Logging in')
    # dismiss the cookie banner if it appears
    try:
        driver.find_element(By.XPATH, "//button[contains(string(), 'Decline optional cookies')]").click()
    except NoSuchElementException:
        pass
    # insert login data
    driver.find_element(By.NAME, "email").send_keys(os.environ['FB_EMAIL'])   # type email
    driver.find_element(By.NAME, "pass").send_keys(os.environ['FB_PASSWD'])   # type password
    # click -> log in
    driver.find_element(By.NAME, "login").click()
    time.sleep(5)


# scroll to the bottom of the page and return the rendered HTML plus the page name
def crawl_for_links(driver, url):
    print('Crawling')
    i = 1
    driver.get(url)
    time.sleep(2)
    # the last <h2> on the page holds the page / profile name
    name = driver.find_elements(By.TAG_NAME, 'h2')[-1].text

    for _ in range(3):
        # get current scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(50):
            # scroll down to the bottom
            driver.execute_script(
                "var scrollingElement = (document.scrollingElement || document.body);"
                "scrollingElement.scrollTop = scrollingElement.scrollHeight;"
            )
            # wait for the page to load
            time.sleep(3)
            # calculate new scroll height and compare with the last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
            os.system('clear||cls')
            print(f'Iteration num: {i}')
            i += 1
    return driver.page_source, name


# parse the HTML and collect links to posts with enough comments
def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    # note: Facebook's obfuscated class names change over time and may need updating
    timeline = soup.find('div', {'class': 'x9f619 x1n2onr6 x1ja2u2z xeuugli xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj xu9j1y6 x7ep2pv'})
    posts = timeline.find_all('div', {'class': 'x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z'})
    arr = []
    for post in posts:
        try:
            commentsWidget = post.find('span', {'class': 'x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa'})
            if approveComments(commentsWidget.text):
                links = post.find_all('a', {'role': 'link'})
                arr.append(extractPostLink(links))
        except AttributeError:
            pass
    return arr


# return the first link that points to a post or a video
def extractPostLink(links):
    for link in links:
        if '/videos/' in link['href'] or '/posts/' in link['href']:
            return link['href']


# check if the post has more than 50 comments
def approveComments(text):
    nComments = text.split(' ')[0]
    try:
        num = int(nComments)
        return num > 50
    except ValueError:
        # abbreviated counts like '1.2K' or '3M' are large enough
        return 'K' in nComments or 'M' in nComments


# write all the links to a .txt file (the inputs/ directory must already exist)
def write_out(arr, name):
    with open(f"{os.getcwd()}/inputs/{name.strip().replace(' ', '_').lower()}.txt", 'w') as f:
        for item in arr:
            try:
                f.write(item + '\n')
            except TypeError:
                # skip posts for which no link could be extracted (None)
                pass


if __name__ == '__main__':
    # driver init
    driver = webdriver_setup()
    wait_for_url(driver, 'https://www.facebook.com/')

    # login
    login(driver)
    wait_for_url(driver, 'https://www.facebook.com/')

    # crawl
    html, name = crawl_for_links(driver, args.URL)
    driver.quit()

    # parse HTML
    arr = parse_html(html)

    # write out
    write_out(arr, name)
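

# Example usage (a sketch; assumes the script is saved as scraper.py, Chrome is
# installed locally, and an inputs/ directory exists in the working directory):
#
#   export FB_EMAIL='you@example.com'
#   export FB_PASSWD='your-password'
#   python scraper.py "https://www.facebook.com/somepage"
#
# Collected post/video links are written to inputs/<page_name>.txt.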