import pymongo
import trafilatura
import trafilatura.feeds
import trafilatura.sitemaps
import trafilatura.spider
import trafilatura.utils
import sys
import courlan
import urllib.robotparser
from datetime import datetime
import click
import logging as LOGGER
import os

LANGUAGE = os.getenv("SUCKER_LANGUAGE", "sk")
DOMAIN = os.getenv("SUCKER_DOMAIN", "sk")
BATCHSIZE = int(os.getenv("SUCKER_BATCHSIZE", "10"))
CONNECTION = os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/")
DBNAME = os.getenv("SUCKER_DBNAME", "crawler")
MINFILESIZE = 300
MAXFILESIZE = 10000000
MINTEXTSIZE = 200


def calculate_checksums(text):
    """
    Calculate a rolling-hash fingerprint for each paragraph of the text.
    Paragraphs are separated by a newline; only paragraphs with more than
    100 significant characters are fingerprinted.

    @return a tuple (checksums, sizes), one entry per fingerprinted paragraph
    """
    checksums = []
    sizes = []
    hval = 0
    hsz = 0
    sz = 0
    for c in text:
        cv = ord(c)
        sz += 1
        if cv > 64:
            # update the rolling hash with significant (non-punctuation) characters
            hval += (hval << 3) + cv
            zv = hval >> 31
            hval &= 0x7fffffff
            hval += zv
            hsz += 1
        if c == "\n" and hsz > 0:
            if hsz > 100:
                checksums.append(hval)
                sizes.append(sz)
            sz = 0
            hsz = 0
    if hsz > 100:
        checksums.append(hval)
        sizes.append(sz)
    return checksums, sizes


def is_robot_good(link, rules):
    # check robots.txt rules
    if rules is not None and not rules.can_fetch("*", link):
        return False
    return True


def is_link_good(link):
    r = courlan.check_url(link, strict=True, language=LANGUAGE)
    if r is None:
        print(link)
        return None
    llink, ldomain = r
    print(llink, ldomain)
    # domain rules
    if not ldomain.endswith(DOMAIN):
        LOGGER.debug("bad domain")
        return None
    if courlan.is_not_crawlable(llink):
        LOGGER.debug("not crawlable")
        return None
    return llink


def filter_links(links, rules=None):
    out = set()
    for link in links:
        r = is_link_good(link)
        if r is None:
            continue
        # check robots.txt rules
        if rules is not None and not rules.can_fetch("*", r):
            continue
        out.add(r)
    return out


def get_link_doc(link, status="frontlink"):
    r = courlan.check_url(link)
    assert r is not None
    link, host = r
    domain = courlan.extract_domain(link)
    return {"url": link, "host": host, "domain": domain, "status": status, "created_at": datetime.utcnow()}


def generic_visit(domain):
    # get_visited_links is expected to be provided elsewhere
    known_links = set(get_visited_links(domain))
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    if not visit_links:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if not visit_links:
        visit_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)


def fetch_pages(link_batch):
    htmls = []
    for link in link_batch:
        print("fetching:::::")
        print(link)
        final_link = link
        response = trafilatura.fetch_url(link, decode=False)
        html = None
        if response is not None:
            good = True
            if response.status != 200:
                good = False
                LOGGER.error('not a 200 response: %s for URL %s', response.status, link)
            elif response.data is None or len(response.data) < MINFILESIZE:
                LOGGER.error('too small/incorrect for URL %s', link)
                good = False
                # raise error instead?
            elif len(response.data) > MAXFILESIZE:
                good = False
                LOGGER.error('too large: length %s for URL %s', len(response.data), link)
            if good:
                html = trafilatura.utils.decode_response(response)
                final_link = response.url
                if html is not None:
                    # is there a meta-refresh on the page?
                    html, final_link = trafilatura.spider.refresh_detection(html, final_link)
                    if final_link is None:
                        # malformed or malicious content
                        html = None
        htmls.append((final_link, html))
    return htmls


def fetch_robot(base_url):
    rules = urllib.robotparser.RobotFileParser()
    rules.set_url("https://" + base_url + "/robots.txt")
    # reading robots.txt over the network can fail
    try:
        rules.read()
    except Exception as exc:
        LOGGER.error('cannot read robots.txt: %s', exc)
        rules = None
    return rules


def extract_pages(link_batch, responses):
    out = []
    for original_link, (final_link, html) in zip(link_batch, responses):
        doc = None
        assert original_link is not None
        if html is not None:
            doc = trafilatura.bare_extraction(html, url=final_link, with_metadata=True, include_formatting=True, target_language=LANGUAGE)
            if doc is not None:
                if "text" not in doc or len(doc["text"]) < MINTEXTSIZE:
                    # text too small
                    doc = None
        out.append((original_link, final_link, html, doc))
    return out


def index_pages(db, domain, extracted_pages):
    linkcol = db["links"]
    htmlcol = db["html"]
    contentcol = db["content"]
    for original_link, final_link, html, doc in extracted_pages:
        state = "good"
        link = original_link
        if original_link != final_link:
            linkcol.update_one({"url": original_link}, {"$set": {"status": "redirect"}})
            link = final_link
        if html is None:
            state = "html_error"
        elif doc is None:
            state = "content_error"
        if doc is not None:
            checksums, sizes = calculate_checksums(doc["text"])
            doc["text_size"] = len(doc["text"])
            doc["paragraph_checksums"] = checksums
            doc["paragraph_sizes"] = sizes
            if len(checksums) < 1:
                state = "trash"
        if state == "good":
            htdoc = get_link_doc(link, state)
            htdoc["html"] = html
            htdoc["html_size"] = len(html)
            htmlcol.insert_one(htdoc)
            doc.update(get_link_doc(link, "good"))
            # todo extract links
            print(doc)
            contentcol.insert_one(doc)
        linkcol.update_one({"url": original_link}, {"$set": {"status": state}})


def extract_links(link_batch, responses, domain, rules, default_status="frontlink"):
    links = {}
    for original_link, (final_link, html) in zip(link_batch, responses):
        if html is None:
            continue
        extracted_links = courlan.extract_links(html, final_link, False, language=LANGUAGE)
        for link in extracted_links:
            status = default_status
            if courlan.is_external(link, domain):
                status = "frontlink"
            elif courlan.is_navigation_page(link):
                status = "navigation"
            links[link] = status
    outlinks = []
    for link, status in links.items():
        if not is_robot_good(link, rules):
            continue
        link = is_link_good(link)
        if link is None:
            continue
        outlinks.append((link, status))
    return outlinks


def index_links(db, extracted_links):
    linkcol = db["links"]
    for link, status in extracted_links:
        doc = get_link_doc(link, status)
        try:
            linkcol.insert_one(doc)
        except pymongo.errors.DuplicateKeyError:
            pass


def get_links(db, domain, status, batch_size=BATCHSIZE):
    linkcol = db["links"]
    res = linkcol.find({"status": status, "host": domain}, {"url": 1}, limit=batch_size)
    links = []
    for doc in res:
        print(">>>>>" + status)
        print(doc)
        links.append(doc["url"])
    return links


def process_links(db, domain, status, links=[], rules=None, batch_size=BATCHSIZE):
    responses = fetch_pages(links)
    extracted_pages = extract_pages(links, responses)
    extracted_links = extract_links(links, responses, domain, rules, status)
    print(extracted_links)
    index_links(db, extracted_links)
    index_pages(db, domain, extracted_pages)


def link_summary(db, domain):
    linkcol = db["links"]
    # count links per status
    res = linkcol.aggregate([
        {"$match": {"host": domain}},
        {"$group": {"_id": "$status", "count": {"$sum": 1}}},
    ])
    for item in res:
        print(item)
    contentcol = db["content"]
    # sum the text size of the extracted documents
    res = contentcol.aggregate([
        {"$match": {"host": domain}},
        {"$group": {"_id": None, "text_size_sum": {"$sum": "$text_size"}}},
    ])
    for item in res:
        print(item)


@click.group()
def cli():
    pass


@cli.command()
def createdb():
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    linkcol = db["links"]
    linkcol.create_index("url", unique=True)
    linkcol.create_index("host")
    contentcol = db["content"]
    contentcol.create_index("url", unique=True)
    htmlcol = db["html"]
    htmlcol.create_index("url", unique=True)


@cli.command()
@click.argument("start_link")
def visit(start_link):
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    start_link, domain = courlan.check_url(start_link)
    rules = fetch_robot(domain)
    print(rules)
    batch_size = BATCHSIZE
    navigation_links = get_links(db, domain, "navigation", batch_size)
    if start_link is not None:
        navigation_links.append(start_link)
    print(navigation_links)
    process_links(db, domain, "frontlink", navigation_links, rules)
    links = get_links(db, domain, "frontlink", batch_size)
    # top up the batch with backlinks if there are not enough frontlinks
    bl = batch_size - len(links)
    if bl > 0:
        print("Getting backlinks")
        links += get_links(db, domain, "backlink", bl)
    process_links(db, domain, "backlink", links, rules=rules)
    link_summary(db, domain)


if __name__ == "__main__":
    cli()
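
# Example usage (a sketch; the module filename "crawler.py" and the seed URL below are
# placeholders, and the connection string is just the CONNECTION default from above):
#
#   export SUCKER_CONNECTION="mongodb://root:example@localhost:27017/"
#   export SUCKER_LANGUAGE=sk
#   export SUCKER_DOMAIN=sk
#   python crawler.py createdb                    # create collections and unique indexes
#   python crawler.py visit https://example.sk/   # crawl one batch starting from a seed URL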