diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 3b83b5f..78a4b9c 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -3,9 +3,12 @@ import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
 import trafilatura.spider
+import trafilatura.utils
 import sys
 import courlan
 
+LANGUAGE="sk"
+BATCHSIZE=10
 
 def calculate_checksums(text):
     """
@@ -36,7 +39,7 @@ def calculate_checksums(text):
         sizes.append(sz)
     return checksums, sizes
 
-def filter_links(links,language="sk"):
+def filter_links(links,language=LANGUAGE,rules=None):
     out = set()
     for link in links:
         r = courlan.check_url(link,strict=True,language=language)
@@ -51,9 +54,21 @@ def filter_links(links,language="sk"):
         if courlan.is_not_crawlable(llink):
             print("not crawlable")
             continue
+        # check robots.txt rules
+        if rules is not None and not rules.can_fetch("*", llink):
+            continue
         out.add(llink)
     return out
 
+def sort_links(links,domain,linkcol):
+    for llink in links:
+        doc = get_link_doc(llink,"backlink")
+        if courlan.is_external(llink,domain):
+            doc["status"] = "frontlink"
+        elif courlan.is_navigation(llink):
+            doc["status"] = "navigation"
+        linkcol.insert_one(doc)
+
 def get_link_doc(link,status="frontlink"):
     r = courlan.check_url(link)
     assert r is not None
@@ -78,14 +93,37 @@ def fetch_pages(link_batch):
     for link in link_batch:
         print("fetching:::::")
         print(link)
-        htmls.append(trafilatura.fetch_url(link))
+        htmls.append(trafilatura.fetch_url(link,decode=False))
     return htmls
 
 def fetch_front_links(navigation_links):
     start_link = navigation_links[0]
-    known_links = navigation_links[1:]
-    visit_links,known_links = trafilatura.spider.focused_crawler(start_link,known_links=known_links)
-    filtered_links = filter_links(visit_links)
+    known_links =navigation_links[1:]
+    rules = urllib.robotparser.RobotFileParser()
+    rules.set_url(courlan.get_base_url(start_link) + '/robots.txt')
+    # robots.txt may be missing or unreadable
+    try:
+        rules.read()
+    except Exception as exc:
+        logging.error('cannot read robots.txt: %s', exc)
+        rules = None
+    response = trafilatura.fetch_url(start_link, decode=False)
+    if response is None or response == '':
+        return None, None, None
+    # follow a redirected start URL
+    if response.url != start_link:
+        logging.info('followed redirect: %s', response.url)
+        start_link = response.url
+    # decode response
+    htmlstring = trafilatura.utils.decode_response(response.data)
+    # is there a meta-refresh on the page?
+    htmlstring, start_link = trafilatura.spider.refresh_detection(htmlstring, start_link)
+    if start_link is None: # malformed or malicious content
+        return None, None, None
+    visit_links = courlan.extract_links(htmlstring)
+    visit_links,known_links = trafilatura.spider.focused_crawler(start_link,lang=LANGUAGE)
+    print(visit_links,known_links)
+    filtered_links = filter_links(visit_links,LANGUAGE,rules=rules)
     return filtered_links
 
 def extract_pages(link_batch,responses):
@@ -93,39 +131,54 @@ def extract_pages(link_batch,responses):
     for link,response in zip(link_batch,responses):
         doc = None
         assert link is not None
-        html = trafilatura.util.decode_response(response.data)
+        html = None
+        response_link = None
+        if response is not None:
+            # is the response link good?
+            # filter and normalize
+            rl = list(filter_links([response.url]))
+            if len(rl) == 1:
+                response_link = rl[0]
+            html = trafilatura.utils.decode_response(response)
         if html is not None:
-            doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
-        out.append((link,response.url,html,doc))
+            doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
+        out.append((link,response_link,html,doc))
     return out
 
-def index_pages(db,domain,extracted_pages):
+def index_pages(db,domain,extracted_pages,rules):
     extracted_links = set()
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
-    for link,html,doc in extracted_pages:
+    for original_link,final_link,html,doc in extracted_pages:
         state = "good"
         if html is None:
             state = "html_error"
         elif doc is None:
             state = "content_error"
         if html is not None:
-            htmlcol.insert_one({"url":link,"html":html})
+            htmlcol.insert_one({"url":final_link,"html":html})
         if doc is not None:
-            print(doc)
             checksums,sizes = calculate_checksums(doc["text"])
             doc["paragraph_checksums"] = checksums
             doc["paragraph_sizes"] = sizes
+            # todo extract links
             if "links" in doc:
                 extracted_links.union(doc["links"])
                 del doc["links"]
+            print(doc)
             contentcol.insert_one(doc)
 
-        doc = get_link_doc(link,state)
-        linkcol.replace_one({"url":link},doc,upsert=True)
-    filtered_links = filter_links(extracted_links)
+        # todo extract links
+        doc = get_link_doc(original_link,state)
+        linkcol.replace_one({"url":final_link},doc,upsert=True)
+        if original_link != final_link:
+            rl = list(filter_links([final_link],rules=rules))
+            if len(rl) == 1:
+                doc = get_link_doc(rl[0],"redirect")
+                linkcol.replace_one({"url":rl[0]},doc,upsert=True)
+    filtered_links = filter_links(extracted_links,rules=rules)
 
     for llink in filtered_links:
         doc = get_link_doc(link,"backlink")
         if courlan.is_external(link,domain):
@@ -134,11 +187,13 @@ def index_pages(db,domain,extracted_pages):
             doc["status"] = "navigation"
         linkcol.insert_one(doc)
 
-def get_links(db,domain,status,batch_size=100):
+def get_links(db,domain,status,batch_size=BATCHSIZE):
     linkcol = db["links"]
-    res = linkcol.find({"status":status,"hostname":domain},limit=batch_size)
+    res = linkcol.find({"status":status,"domain":domain},{"url":1},limit=batch_size)
+    print(res,domain,status)
     front_links = []
     for doc in res:
+        print(doc)
         front_links.append(doc["url"])
     return filter_links(front_links)
 
@@ -162,12 +217,13 @@ def simple_visit(start_link):
     myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
     db=myclient["crawler"]
     navigation_links =[start_link]
-    navigation_links += get_links(db,"navigation",domain)
-    new_front_links = fetch_front_links(navigation_links)
+    navigation_links += get_links(db,domain,"navigation")
+    print(navigation_links)
+    #new_front_links = fetch_front_links(navigation_links)
     print("NEW FRONT LINKS")
-    print(new_front_links)
-    index_front_links(db,new_front_links)
-    front_links = get_links(db,"frontlink",domain)
+    #print(new_front_links)
+    #index_front_links(db,new_front_links)
+    front_links = get_links(db,domain,"frontlink")
     print("NEW VISIT LINKS")
     visit_links = front_links
     print(visit_links)
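
Note on the new robots.txt handling: filter_links() now accepts a rules object and index_pages() threads it through, so disallowed URLs never reach the crawl queue. The sketch below shows how such a rules object is built and queried with the standard-library parser; it is illustrative only (the load_robots_rules helper and the example URLs are not part of the patch) and mirrors the try/except fallback used in fetch_front_links():

    import urllib.robotparser

    import courlan

    def load_robots_rules(base_url):
        # Fetch and parse robots.txt for a site; return None when it cannot be read,
        # which callers treat as "no restrictions".
        rules = urllib.robotparser.RobotFileParser()
        rules.set_url(base_url + "/robots.txt")
        try:
            rules.read()
        except Exception:
            return None
        return rules

    # Hypothetical usage: keep only links that are crawlable and allowed by robots.txt.
    rules = load_robots_rules("https://example.org")
    candidates = ["https://example.org/clanok/1", "https://example.org/login?next=/"]
    allowed = [
        link for link in candidates
        if not courlan.is_not_crawlable(link)
        and (rules is None or rules.can_fetch("*", link))
    ]
    print(allowed)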