import sys
import urllib.robotparser

import courlan
import pymongo
import trafilatura
import trafilatura.feeds
import trafilatura.sitemaps
import trafilatura.spider
import trafilatura.utils

LANGUAGE = "sk"
DOMAIN = "sk"
BATCHSIZE = 10
MINFILESIZE = 300
MAXFILESIZE = 1000000


def calculate_checksums(text):
    """
    @return fingerprints of paragraphs in the text. Paragraphs are separated by a blank line.
    """
    checksums = []
    sizes = []
    hval = 0  # rolling hash of the current paragraph
    hsz = 0   # number of hashed characters in the current paragraph
    sz = 0    # length of the current paragraph
    for c in text:
        cv = ord(c)
        sz += 1
        if cv > 64:  # hash only characters with a code point above 64 (roughly letters)
            hval += (hval << 3) + cv
            zv = hval >> 31
            hval &= 0x7fffffff
            hval += zv
            hsz += 1
        if c == "\n" and hsz > 0:
            if hsz > 100:  # keep only sufficiently long paragraphs
                checksums.append(hval)
                sizes.append(sz)
            hval = 0  # reset the hash so each paragraph gets its own fingerprint
            sz = 0
            hsz = 0
    if hsz > 100:
        checksums.append(hval)
        sizes.append(sz)
    return checksums, sizes


def is_robot_good(link, rules):
    # check robots.txt rules
    if rules is not None and not rules.can_fetch("*", link):
        return False
    return True


def is_link_good(link):
    r = courlan.check_url(link, strict=True, language=LANGUAGE)
    if r is None:
        print(link)
        return None
    llink, ldomain = r
    print(llink, ldomain)
    # domain rules
    if not ldomain.endswith(DOMAIN):
        print("bad domain")
        return None
    if courlan.is_not_crawlable(llink):
        print("not crawlable")
        return None
    return llink


def filter_links(links, rules=None):
    out = set()
    for link in links:
        r = is_link_good(link)
        if r is None:
            continue
        # check robots.txt rules
        if rules is not None and not rules.can_fetch("*", r):
            continue
        out.add(r)
    return out


def get_link_doc(link, status="frontlink"):
    r = courlan.check_url(link)
    assert r is not None
    link, host = r
    domain = courlan.extract_domain(link)
    return {"url": link, "host": host, "domain": domain, "status": status}


def generic_visit(domain):
    # get_visited_links is expected to be provided elsewhere and return already crawled URLs
    known_links = set(get_visited_links(domain))
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    if not visit_links:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if not visit_links:
        visit_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)


def fetch_pages(link_batch):
    htmls = []
    for link in link_batch:
        print("fetching:", link)
        final_link = link
        response = trafilatura.fetch_url(link, decode=False)
        html = None
        if response is not None:
            good = True
            if response.status != 200:
                good = False
                #LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
            elif response.data is None or len(response.data) < MINFILESIZE:
                #LOGGER.error('too small/incorrect for URL %s', url)
                good = False
                # raise error instead?
            elif len(response.data) > MAXFILESIZE:
                good = False
                #LOGGER.error('too large: length %s for URL %s', len(response.data), url)
            if good:
                html = trafilatura.utils.decode_response(response)
                final_link = response.url
                if html is not None:
                    # is there a meta-refresh on the page?
                    html, final_link = trafilatura.spider.refresh_detection(html, final_link)
                    if final_link is None:  # malformed or malicious content
                        html = None
        htmls.append((final_link, html))
    return htmls


def fetch_robot(base_url):
    # base_url is a bare hostname (second value returned by courlan.check_url),
    # so a scheme has to be prepended before the robots.txt path
    rules = urllib.robotparser.RobotFileParser()
    rules.set_url("https://" + base_url + "/robots.txt")
    try:
        rules.read()
    except Exception as exc:
        #LOGGER.error('cannot read robots.txt: %s', exc)
        rules = None
    return rules


def extract_pages(link_batch, responses):
    out = []
    for original_link, (final_link, html) in zip(link_batch, responses):
        doc = None
        assert original_link is not None
        if html is not None:
            doc = trafilatura.bare_extraction(html, url=final_link, with_metadata=True,
                                              include_formatting=True, target_language=LANGUAGE)
        out.append((original_link, final_link, html, doc))
    return out


def index_pages(db, domain, extracted_pages):
    linkcol = db["links"]
    htmlcol = db["html"]
    contentcol = db["content"]
    for original_link, final_link, html, doc in extracted_pages:
        state = "good"
        if html is None:
            state = "html_error"
        elif doc is None:
            state = "content_error"
        if original_link != final_link:
            linkcol.insert_one(get_link_doc(final_link, state))
            state = "redirect"
        linkcol.update_one({"url": original_link}, {"$set": {"status": state}})
        if html is not None:
            htmlcol.insert_one({"url": final_link, "html": html})
        if doc is not None:
            checksums, sizes = calculate_checksums(doc["text"])
            doc["paragraph_checksums"] = checksums
            doc["paragraph_sizes"] = sizes
            # todo extract links
            print(doc)
            contentcol.insert_one(doc)


def extract_links(link_batch, responses, domain, rules, default_status="frontlink"):
    links = {}
    for original_link, (final_link, html) in zip(link_batch, responses):
        if html is None:
            continue
        extracted_links = courlan.extract_links(html, final_link, False, language=LANGUAGE)
        for link in extracted_links:
            status = default_status
            if courlan.is_external(link, domain):
                status = "frontlink"
            elif courlan.is_navigation_page(link):
                status = "navigation"
            links[link] = status
    outlinks = []
    for link, status in links.items():
        if not is_robot_good(link, rules):
            continue
        link = is_link_good(link)
        if link is None:
            continue
        outlinks.append((link, status))
    return outlinks


def index_links(db, extracted_links):
    linkcol = db["links"]
    for link, status in extracted_links:
        doc = get_link_doc(link, status)
        linkcol.insert_one(doc)


def get_links(db, domain, status, batch_size=BATCHSIZE):
    linkcol = db["links"]
    res = linkcol.find({"status": status, "host": domain}, {"url": 1}, limit=batch_size)
    links = []
    for doc in res:
        print(">>>>>" + status)
        print(doc)
        links.append(doc["url"])
    return links


def process_links(db, domain, status, links=None, rules=None, batch_size=BATCHSIZE):
    # avoid a mutable default argument, which would accumulate links across calls
    links = list(links) if links is not None else []
    links += get_links(db, domain, status, batch_size)
    responses = fetch_pages(links)
    extracted_pages = extract_pages(links, responses)
    extracted_links = extract_links(links, responses, domain, rules, status)
    print(extracted_links)
    index_links(db, extracted_links)
    index_pages(db, domain, extracted_pages)


def simple_visit(start_link):
    r = courlan.check_url(start_link)
    assert r is not None
    start_link, domain = r
    myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
    db = myclient["crawler"]
    rules = fetch_robot(domain)
    navigation_links = [start_link]
    print(navigation_links)
    process_links(db, domain, "navigation", navigation_links, rules)
    process_links(db, domain, "frontlink", rules=rules)
    process_links(db, domain, "backlink", rules=rules)


def create_indices(db):
    linkcol = db["links"]
    # link documents store the hostname in the "host" field (see get_link_doc)
    linkcol.create_index([("url", 1)], name="url")
    linkcol.create_index([("host", 1), ("status", 1)], name="host_status")
    contentcol = db["content"]
    contentcol.create_index([("url", 1)])
    contentcol.create_index([("paragraph_checksums", 1)])
    contentcol.create_index([("domain", 1)])
    htmlcol = db["html"]
    htmlcol.create_index([("url", 1)])


def link_summary(db, domain):
    linkcol = db["links"]
    # count links per status for the given host
    res = linkcol.aggregate([
        {"$match": {"host": domain}},
        {"$group": {"_id": "$status", "count": {"$sum": 1}}},
    ])
    for item in res:
        print(item)


if __name__ == "__main__":
    simple_visit(sys.argv[1])
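
# Usage sketch (the filename and seed URL below are assumptions, not given in the source):
# the script takes the seed URL as its only command-line argument and needs a MongoDB
# instance reachable at the connection string hard-coded in simple_visit(), e.g.
#
#   $ python crawler.py https://www.example.sk/
#
# create_indices() is defined but never called above; running it once against the same
# "crawler" database before the first crawl is left to the caller.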