diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 85285ca..4bd3fb0 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -6,9 +6,12 @@ import trafilatura.spider
 import trafilatura.utils
 import sys
 import courlan
+import urllib
 
 LANGUAGE="sk"
 BATCHSIZE=10
+MINFILESIZE=300
+MAXFILESIZE=1000000
 
 def calculate_checksums(text):
     """
@@ -46,8 +49,10 @@ def is_robot_good(link,rules):
     return True
 
 def is_link_good(link):
-    r = courlan.check_url(link,strict=True,language=language)
+    r = courlan.check_url(link,strict=True,language=LANGUAGE)
     if r is None:
+        print("BBBBBBB")
+        print(link)
         return None
     llink,ldomain = r
     print(llink,ldomain)
@@ -58,7 +63,7 @@ def is_link_good(link):
     if courlan.is_not_crawlable(llink):
         print("not crawlable")
         return None
-    return None
+    return llink
 
 def filter_links(links,rules=None):
     out = set()
@@ -72,14 +77,6 @@ def filter_links(links,rules=None):
         out.add(llink)
     return out
 
-def sort_links(links,domain):
-    for llink in filtered_links:
-        doc = get_link_doc(link,"backlink")
-        if courlan.is_external(link,domain):
-            doc["status"]= "frontlink"
-        elif courlan.is_navigation(link):
-            doc["status"] = "navigation"
-        linkcol.insert_one(doc)
 
 def get_link_doc(link,status="frontlink"):
     r = courlan.check_url(link)
@@ -100,44 +97,57 @@ def generic_visit(domain):
 
 def fetch_pages(link_batch):
     htmls = []
-    print(link_batch)
-    print("zzzzzzzzzz")
+    #print(link_batch)
+    #print("zzzzzzzzzz")
     for link in link_batch:
         print("fetching:::::")
         print(link)
+        final_link = link
         response = trafilatura.fetch_url(link,decode=False)
-        htmlstring, homepage = trafilatura.spider.refresh_detection(response.data, link)
-        # is there a meta-refresh on the page?
-        if homepage is None: # malformed or malicious content
-            response = None
-        htmls.append(response)
+        html = None
+        if response is not None :
+            good = True
+            if response.status != 200:
+                good = False
+                #LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+            elif response.data is None or len(response.data) < MINFILESIZE:
+                #LOGGER.error('too small/incorrect for URL %s', url)
+                good = False
+                # raise error instead?
+            elif len(response.data) > MAXFILESIZE:
+                good = False
+                #LOGGER.error('too large: length %s for URL %s', len(response.data), url)
+            if good:
+                html = trafilatura.utils.decode_response(response)
+                final_link = response.url
+                if html is not None:
+                    html, final_link = trafilatura.spider.refresh_detection(html, final_link)
+                    # is there a meta-refresh on the page?
+                    if final_link is None: # malformed or malicious content
+                        html = None
+        htmls.append((final_link,html))
     return htmls
 
-def fetch_rules(base_url):
+def fetch_robot(base_url):
     rules = urllib.robotparser.RobotFileParser()
     rules.set_url(base_url + '/robots.txt')
     # exceptions happening here
     try:
         rules.read()
     except Exception as exc:
-        LOGGER.error('cannot read robots.txt: %s', exc)
+        #LOGGER.error('cannot read robots.txt: %s', exc)
         rules = None
     return rules
 
 def extract_pages(link_batch,responses):
     out = []
-    extracted_links = set()
-    for link,response in zip(link_batch,responses):
+    for original_link,(final_link,html) in zip(link_batch,responses):
         doc = None
-        assert link is not None
-        html = None
-        response_link = None
-        if response is not None:
-            html = trafilatura.utils.decode_response(response)
+        assert original_link is not None
         if html is not None:
-            doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
-        out.append((link,response_link,html,doc))
+            doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
+        out.append((original_link,final_link,html,doc))
     return out
 
@@ -146,12 +156,17 @@ def index_pages(db,domain,extracted_pages):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
+    links = []
     for original_link,final_link,html,doc in extracted_pages:
         state = "good"
         if html is None:
             state = "html_error"
         elif doc is None:
             state = "content_error"
+        if original_link != final_link:
+            linkcol.insert_one(get_link_doc(final_link,state))
+            state = "redirect"
+        linkcol.update_one({"url":original_link},{"$set":{"status":state}})
         if html is not None:
             htmlcol.insert_one({"url":final_link,"html":html})
         if doc is not None:
@@ -165,28 +180,20 @@ def index_pages(db,domain,extracted_pages):
 
 def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
     links = {}
-    for original_link,response in zip(link_batch,resposnes):
-        final_link = response.url
+    for original_link,(final_link,html) in zip(link_batch,responses):
         status = default_status
-        link = original_link
-        if original_link != final_link:
-            links[original_link] = "redirect"
-            link = final_link
-        if courlan.is_external(link,domain):
-            status = "frontlink"
-        elif courlan.is_navigation(link):
-            status = "navigation"
-        links[link] = status
-        extracted_links = courlan.extract_links(response.content)
-        for link in extracted_links
+        extracted_links = courlan.extract_links(html,final_link,False,language=LANGUAGE)
+        #print(extracted_links)
         for link in extracted_links:
             if courlan.is_external(link,domain):
                 status = "frontlink"
             elif courlan.is_navigation(link):
                 status = "navigation"
+            #print(link,status)
             links[link] = status
     outlinks = []
     for link,status in links.items():
-        if not is_robot_good(rules):
+        if not is_robot_good(link,rules):
             continue
         link = is_link_good(link)
         if link is None:
@@ -203,34 +210,25 @@ def index_links(db,extracted_links):
 
 def get_links(db,domain,status,batch_size=BATCHSIZE):
     linkcol = db["links"]
-    res = linkcol.find({"status":status,"domain":domain},{"url":1},limit=batch_size)
-    print(res,domain,status)
-    front_links = []
+    res = linkcol.find({"status":status,"host":domain},{"url":1},limit=batch_size)
+    links = []
     for doc in res:
+        print(">>>>>" + status)
         print(doc)
-        front_links.append(doc["url"])
-    return front_links
+        links.append(doc["url"])
+    return links
 
-def index_front_links(db,filtered_links):
-    linkcol = db["links"]
-    for link in filtered_links:
-        linkcol.insert_one(get_link_doc(link,"frontlink"))
-
-    #visit_links = trafilatura.feeds.find_feed_urls(domain)
-    #visit_links = trafilatura.sitemaps.sitemap_search(domain)
-    #print(visit_links)
-    #for link in visit_links:
-    #    content = trafilatura.fetch_url(link,decode=True)
-    #    document = trafilatura.bare_extraction(content)
-    #    print(content)
-
-def process_links(status,domain,links=[],rules=None):
+def process_links(db,domain,status,links=[],rules=None):
     links += get_links(db,domain,status)
+    #print(links)
     responses = fetch_pages(links)
+    #print(responses)
     extracted_pages = extract_pages(links,responses)
+    #print(extracted_pages)
     extracted_links = extract_links(links,responses,domain,rules,status)
+    print(extracted_links)
     index_links(db,extracted_links)
     index_pages(db,domain,extracted_pages)
@@ -242,17 +240,7 @@ def simple_visit(start_link):
     navigation_links =[start_link]
     print(navigation_links)
     process_links(db,domain,"navigation",navigation_links,rules)
-    process_links(db,domain,front_links,rules)
-    back_links = get_links(db,domain,"backlink")
-    process_links(db,domain,front_links,rules)
-    #new_front_links = fetch_front_links(navigation_links)
-    print("NEW FRONT LINKS")
-    #print(new_front_links)
-    #index_front_links(db,new_front_links)
-    front_links = get_links(db,domain,"frontlink")
-    print("NEW VISIT LINKS")
-    visit_links = front_links
-    print(visit_links)
-    responses = fetch_pages(visit_links)
+    process_links(db,domain,"frontlink",rules=rules)
+    process_links(db,domain,"backlink",rules=rules)
 
 simple_visit(sys.argv[1])
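For reference, below is a minimal standalone sketch of the robots.txt handling that the renamed fetch_robot() and the new is_robot_good(link,rules) call depend on, using only the standard library. It is an illustration, not part of the patch: the crawler's real is_robot_good() lives elsewhere in mongocwarler.py and may behave differently, and the sketch imports urllib.robotparser explicitly, which the bare "import urllib" added at the top of the file does not guarantee to expose.

import urllib.robotparser  # explicit submodule import; "import urllib" alone is not sufficient

def fetch_robot(base_url):
    # Same shape as the patched helper: load <base_url>/robots.txt, or fall back to None.
    rules = urllib.robotparser.RobotFileParser()
    rules.set_url(base_url + '/robots.txt')
    try:
        rules.read()
    except Exception:
        rules = None
    return rules

def is_robot_good(link, rules):
    # Assumed behaviour: if no readable robots.txt is available, allow the link.
    return rules is None or rules.can_fetch("*", link)

if __name__ == "__main__":
    rules = fetch_robot("https://example.com")
    print(is_robot_good("https://example.com/some/page", rules))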