diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py index 06ac526..3b83b5f 100644 --- a/mongo/mongocwarler.py +++ b/mongo/mongocwarler.py @@ -36,10 +36,10 @@ def calculate_checksums(text): sizes.append(sz) return checksums, sizes -def filter_links(links): +def filter_links(links,language="sk"): out = set() for link in links: - r = courlan.check_url(link) + r = courlan.check_url(link,strict=True,language=language) if r is None: continue llink,ldomain = r @@ -58,7 +58,7 @@ def get_link_doc(link,status="frontlink"): r = courlan.check_url(link) assert r is not None link,host = r - domain = extract_domain(link) + domain = courlan.extract_domain(link) return {"url":link,"host":host,"domain":domain,"status":status} def generic_visit(domain): @@ -81,13 +81,14 @@ def fetch_pages(link_batch): htmls.append(trafilatura.fetch_url(link)) return htmls -def fetch_front_links(start_link): - known_links = [] +def fetch_front_links(navigation_links): + start_link = navigation_links[0] + known_links = navigation_links[1:] visit_links,known_links = trafilatura.spider.focused_crawler(start_link,known_links=known_links) filtered_links = filter_links(visit_links) return filtered_links -def extract_pages(link_batch,htmls): +def extract_pages(link_batch,responses): out = [] for link,response in zip(link_batch,responses): doc = None @@ -127,11 +128,15 @@ def index_pages(db,domain,extracted_pages): filtered_links = filter_links(extracted_links) for llink in filtered_links: doc = get_link_doc(link,"backlink") - linkcol.insert_one(doc,upsert=True) + if courlan.is_external(link,domain): + doc["status"]= "frontlink" + elif courlan.is_navigation_page(link): + doc["status"] = "navigation" + linkcol.insert_one(doc) def get_links(db,domain,status,batch_size=100): linkcol = db["links"] - res = linkcol.find({"status":status,"domain":domain},limit=batch_size) + res = linkcol.find({"status":status,"host":domain},limit=batch_size) front_links = [] for doc in res: front_links.append(doc["url"]) @@ 
-156,15 +161,18 @@ def simple_visit(start_link): start_link,domain = courlan.check_url(start_link) myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/") db=myclient["crawler"] - new_front_links = fetch_front_links(start_link) + navigation_links =[start_link] + navigation_links += get_links(db,domain,"navigation") + new_front_links = fetch_front_links(navigation_links) print("NEW FRONT LINKS") print(new_front_links) index_front_links(db,new_front_links) - visit_links = get_links(db,"frontlink",domain) + front_links = get_links(db,domain,"frontlink") print("NEW VISIT LINKS") + visit_links = front_links print(visit_links) responses = fetch_pages(visit_links) - extracted_pages = extract_pages(visit_links,htmls) + extracted_pages = extract_pages(visit_links,responses) index_pages(db,domain,extracted_pages) simple_visit(sys.argv[1])