diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 78a4b9c..6bff0d3 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -96,9 +96,7 @@ def fetch_pages(link_batch):
         htmls.append(trafilatura.fetch_url(link,decode=False))
     return htmls
 
-def fetch_front_links(navigation_links):
-    start_link = navigation_links[0]
-    known_links =navigation_links[1:]
+def fetch_rules(base_url):
     rules = urllib.robotparser.RobotFileParser()
     rules.set_url(base_url + '/robots.txt')
     # exceptions happening here
@@ -107,13 +105,11 @@ def fetch_front_links(navigation_links):
     except Exception as exc:
         LOGGER.error('cannot read robots.txt: %s', exc)
         rules = None
-    response = trafilatura.fetch_url(homepage, decode=False)
-    if response is None or response == '':
-        return None, None, None
-    # get redirected URL here?
-    if response.url != homepage:
-        logging.info('followed redirect: %s', response.url)
-        homepage = response.url
+    return rules
+
+def fetch_front_links(start_links,rules):
+    responses = fetch_pages(start_links)
+    # decode response
     htmlstring = trafilatura.utils.decode_response(response.data)
     # is there a meta-refresh on the page?
 
@@ -165,9 +161,6 @@ def index_pages(db,domain,extracted_pages,rules):
         doc["paragraph_checksums"] = checksums
         doc["paragraph_sizes"] = sizes
         # todo extract links
-        if "links" in doc:
-            extracted_links.union(doc["links"])
-            del doc["links"]
         print(doc)
         contentcol.insert_one(doc)
     # todo extract links
@@ -178,14 +171,21 @@ def index_pages(db,domain,extracted_pages,rules):
         if len(rl) == 1:
            doc = get_link_doc(rl[0],"redirect")
            linkcol.replace_one({"url":rl[0]},doc,upsert=True)
-    filtered_links = filter_links(extracted_links,rules)
-    for llink in filtered_links:
-        doc = get_link_doc(link,"backlink")
-        if courlan.is_external(link,domain):
-            doc["status"]= "frontlink"
-        elif courlan.is_navigation(link):
-            doc["status"] = "navigation"
-        linkcol.insert_one(doc)
+
+def extract_links(db,extracted_pages,domain,rules,status="frontlink"):
+    linkcol = db["links"]
+    # gather candidate links from the page documents before filtering
+    extracted_links = set()
+    for original_link,final_link,html,doc in extracted_pages:
+        extracted_links.update(doc.get("links",[]))
+    filtered_links = filter_links(extracted_links,rules)
+    for link in filtered_links:
+        doc = get_link_doc(link,status)
+        if courlan.is_external(link,domain):
+            doc["status"] = "frontlink"
+        elif courlan.is_navigation(link):
+            doc["status"] = "navigation"
+        linkcol.insert_one(doc)
 
 def get_links(db,domain,status,batch_size=BATCHSIZE):
     linkcol = db["links"]
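
Review note: `fetch_rules` now returns either a `urllib.robotparser.RobotFileParser` or `None` when robots.txt is unreadable, so every consumer has to handle both cases. A minimal sketch of the check that `filter_links` presumably performs downstream (the helper name `is_link_allowed` and the `'*'` user agent are assumptions for illustration, not code from this patch):

```python
def is_link_allowed(link, rules, agent='*'):
    # Hypothetical helper; `rules` is the object returned by fetch_rules().
    # Mirrors the patch's fallback: an unreadable robots.txt (rules is
    # None) disables rule checking entirely.
    if rules is None:
        return True
    # RobotFileParser.can_fetch() is the stdlib API for rule lookups.
    return rules.can_fetch(agent, link)
```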
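The net effect of the refactor is to split the old monolithic `fetch_front_links` into composable steps: rule fetching, batch page fetching, page indexing, and link extraction. A sketch of how a caller might chain them, assuming the `(original_link, final_link, html, doc)` tuple shape that `extract_links` iterates over; `crawl_domain` and `build_extracted_pages` are hypothetical names, and only the functions touched by the diff are real:

```python
def crawl_domain(db, base_url, start_links, domain):
    # Robots rules are now fetched once per site and threaded through.
    rules = fetch_rules(base_url)
    # Pages are fetched in one batch rather than one at a time.
    responses = fetch_pages(start_links)
    # Hypothetical step: pair each start link with its fetched response
    # to produce the (original_link, final_link, html, doc) tuples.
    extracted_pages = build_extracted_pages(start_links, responses)
    # Page documents go to the content collection...
    index_pages(db, domain, extracted_pages, rules)
    # ...and outgoing links are classified and queued separately.
    extract_links(db, extracted_pages, domain, rules, status="frontlink")
```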