import logging
import sys
import urllib.robotparser

import courlan
import pymongo
import trafilatura
import trafilatura.feeds
import trafilatura.sitemaps
import trafilatura.spider
import trafilatura.utils

LANGUAGE = "sk"
BATCHSIZE = 10
LOGGER = logging.getLogger(__name__)


def calculate_checksums(text):
    """
    @return fingerprints of paragraphs in the text. Paragraphs are separated by a blank line.
    """
    checksums = []
    sizes = []
    hval = 0
    hsz = 0
    sz = 0
    for c in text:
        cv = ord(c)
        sz += 1
        if cv > 64:
            # simple rolling hash over non-whitespace characters
            hval += (hval << 3) + cv
            zv = hval >> 31
            hval &= 0x7FFFFFFF
            hval += zv
            hsz += 1
        if c == "\n" and hsz > 0:
            # paragraph boundary: keep fingerprints only for long paragraphs
            if hsz > 100:
                checksums.append(hval)
                sizes.append(sz)
            sz = 0
            hsz = 0
    if hsz > 100:
        checksums.append(hval)
        sizes.append(sz)
    return checksums, sizes


def is_robot_good(link, rules):
    # check robots.txt rules
    if rules is not None and not rules.can_fetch("*", link):
        return False
    return True


def is_link_good(link):
    r = courlan.check_url(link, strict=True, language=LANGUAGE)
    if r is None:
        return None
    llink, ldomain = r
    print(llink, ldomain)
    # domain rules
    if not ldomain.endswith("sk"):
        print("bad domain")
        return None
    if courlan.is_not_crawlable(llink):
        print("not crawlable")
        return None
    return llink


def filter_links(links, rules=None):
    out = set()
    for link in links:
        r = is_link_good(link)
        if r is None:
            continue
        # check robots.txt rules
        if rules is not None and not rules.can_fetch("*", r):
            continue
        out.add(r)
    return out


def sort_links(db, links, domain):
    linkcol = db["links"]
    for link in links:
        doc = get_link_doc(link, "backlink")
        if courlan.is_external(link, domain):
            doc["status"] = "frontlink"
        elif courlan.is_navigation(link):
            doc["status"] = "navigation"
        linkcol.insert_one(doc)


def get_link_doc(link, status="frontlink"):
    r = courlan.check_url(link)
    assert r is not None
    link, host = r
    domain = courlan.extract_domain(link)
    return {"url": link, "host": host, "domain": domain, "status": status}


def generic_visit(domain):
    # get_visited_links is assumed to be provided elsewhere
    known_links = set(get_visited_links(domain))
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    if not visit_links:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if not visit_links:
        visit_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)
    return visit_links


def fetch_pages(link_batch):
    htmls = []
    print(link_batch)
    for link in link_batch:
        print("fetching:", link)
        response = trafilatura.fetch_url(link, decode=False)
        if response is not None:
            # is there a meta-refresh on the page?
            htmlstring, homepage = trafilatura.spider.refresh_detection(response.data, link)
            if homepage is None:
                # malformed or malicious content
                response = None
        htmls.append(response)
    return htmls


def fetch_rules(base_url):
    rules = urllib.robotparser.RobotFileParser()
    rules.set_url(base_url + "/robots.txt")
    # exceptions happening here
    try:
        rules.read()
    except Exception as exc:
        LOGGER.error("cannot read robots.txt: %s", exc)
        rules = None
    return rules


def extract_pages(link_batch, responses):
    out = []
    for link, response in zip(link_batch, responses):
        assert link is not None
        doc = None
        html = None
        response_link = None
        if response is not None:
            # the final URL may differ from the requested one after redirects
            response_link = getattr(response, "url", link)
            html = trafilatura.utils.decode_response(response)
        if html is not None:
            doc = trafilatura.bare_extraction(html, url=link, with_metadata=True,
                                              include_formatting=True, target_language=LANGUAGE)
        out.append((link, response_link, html, doc))
    return out


def index_pages(db, domain, extracted_pages):
    linkcol = db["links"]
    htmlcol = db["html"]
    contentcol = db["content"]
    for original_link, final_link, html, doc in extracted_pages:
        state = "good"
        if html is None:
            state = "html_error"
        elif doc is None:
            state = "content_error"
        if html is not None:
            htmlcol.insert_one({"url": final_link, "html": html})
        if doc is not None:
            checksums, sizes = calculate_checksums(doc["text"])
            doc["paragraph_checksums"] = checksums
            doc["paragraph_sizes"] = sizes
            # todo extract links
            print(doc)
            contentcol.insert_one(doc)
        # record the crawl state of the original link
        linkcol.update_one({"url": original_link}, {"$set": {"status": state}})


def extract_links(link_batch, responses, domain, rules, default_status="frontlink"):
    links = {}
    for original_link, response in zip(link_batch, responses):
        if response is None:
            continue
        final_link = getattr(response, "url", original_link)
        status = default_status
        link = original_link
        if original_link != final_link:
            links[original_link] = "redirect"
            link = final_link
        if courlan.is_external(link, domain):
            status = "frontlink"
        elif courlan.is_navigation(link):
            status = "navigation"
        links[link] = status
        extracted_links = courlan.extract_links(response.data)
        for link in extracted_links:
            status = default_status
            if courlan.is_external(link, domain):
                status = "frontlink"
            elif courlan.is_navigation(link):
                status = "navigation"
            links[link] = status
    outlinks = []
    for link, status in links.items():
        if not is_robot_good(link, rules):
            continue
        link = is_link_good(link)
        if link is None:
            continue
        outlinks.append((link, status))
    return outlinks


def index_links(db, extracted_links):
    linkcol = db["links"]
    for link, status in extracted_links:
        doc = get_link_doc(link, status)
        linkcol.insert_one(doc)


def get_links(db, domain, status, batch_size=BATCHSIZE):
    linkcol = db["links"]
    res = linkcol.find({"status": status, "domain": domain}, {"url": 1}, limit=batch_size)
    print(res, domain, status)
    front_links = []
    for doc in res:
        print(doc)
        front_links.append(doc["url"])
    return front_links


def index_front_links(db, filtered_links):
    linkcol = db["links"]
    for link in filtered_links:
        linkcol.insert_one(get_link_doc(link, "frontlink"))


# visit_links = trafilatura.feeds.find_feed_urls(domain)
# visit_links = trafilatura.sitemaps.sitemap_search(domain)
# print(visit_links)
# for link in visit_links:
#     content = trafilatura.fetch_url(link, decode=True)
#     document = trafilatura.bare_extraction(content)
#     print(content)


def process_links(db, domain, status, links=None, rules=None):
    links = list(links) if links is not None else []
    links += get_links(db, domain, status)
    responses = fetch_pages(links)
    extracted_pages = extract_pages(links, responses)
    extracted_links = extract_links(links, responses, domain, rules, status)
    index_links(db, extracted_links)
    index_pages(db, domain, extracted_pages)


def simple_visit(start_link):
    r = courlan.check_url(start_link)
    assert r is not None
    start_link, domain = r
    myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
db=myclient["crawler"] rules = fetch_robot(domain) navigation_links =[start_link] print(navigation_links) process_links(db,domain,"navigation",navigation_links,rules) process_links(db,domain,front_links,rules) back_links = get_links(db,domain,"backlink") process_links(db,domain,front_links,rules) #new_front_links = fetch_front_links(navigation_links) print("NEW FRONT LINKS") #print(new_front_links) #index_front_links(db,new_front_links) front_links = get_links(db,domain,"frontlink") print("NEW VISIT LINKS") visit_links = front_links print(visit_links) responses = fetch_pages(visit_links) simple_visit(sys.argv[1])