diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index e35900c..70e1b65 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -32,6 +32,9 @@ MINTEXTSIZE=200
 CHECK_PARAGRAPH_SIZE=150
 TEXT_TRASH_SIZE=200
 TEXT_TRASH_RATIO=0.6
+DISCOVER_LINK_RATIO = 0.3
+SAMPLE_SET_SIZE =10000
+CLASSIFIER_SET_SIZE = 200
 
 def split_train(res):
     trainset = []
@@ -122,12 +125,13 @@ def fetch_page(link:str)->(str,str):
             LOGGER.error('too large: length %s for URL %s', len(response.data), link)
         if good:
             html = trafilatura.utils.decode_response(response)
-            final_link = response.url
             if html is not None:
                 html, final_link = trafilatura.spider.refresh_detection(html, final_link)
                 # is there a meta-refresh on the page?
                 if final_link is None: # malformed or malicious content
                     html = None
+
+    final_link = courlan.normalize_url(final_link)
     return final_link,html
 
 def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
@@ -170,6 +174,7 @@ def index_pages(db,hostname,extracted_pages):
         state = "good"
         link = original_link
         if original_link != final_link:
+            print(original_link,final_link)
             linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
             link = final_link
         if html is None:
@@ -215,7 +220,7 @@ def index_pages(db,hostname,extracted_pages):
             htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
             doc.update(get_link_doc(link,"good"))
             # todo extract links
-            print(doc)
+            print(link,doc)
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
             for chs in doc["paragraph_checksums"]:
@@ -223,11 +228,13 @@ def index_pages(db,hostname,extracted_pages):
                     checkcol.insert_one({"_id":chs})
                 except pymongo.errors.DuplicateKeyError as err:
                     pass
-        linkcol.update_one({"url":link},{"$set":{"status":state}})
+
+        linkdoc = get_link_doc(link,state)
+        del linkdoc["url"]
+        linkcol.update_one({"url":link},{"$set":linkdoc})
 
 from bs4 import BeautifulSoup
 import urllib.parse
-import w3lib.url
 import os.path
 
 def get_bs_links(link,html):
@@ -236,7 +243,8 @@ def get_bs_links(link,html):
     base = link
     if bs.base is not None and "href" in bs.base.attrs:
         base = bs.base["href"]
-    base = urllib.parse.urlparse(w3lib.url.canonicalize_url(base))
+    base = urllib.parse.urlparse(courlan.normalize_url(base))
+
     external_links = set()
     internal_links = set()
     # Normalizacia linkov
@@ -245,12 +253,10 @@
             continue
         href = l["href"]
         try:
-            parsed = urllib.parse.urlparse(w3lib.url.canonicalize_url(href))
+            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
             netloc = parsed.netloc
             path = os.path.normpath(parsed.path)
             scheme = parsed.scheme
-            query = w3lib.url.url_query_cleaner(parsed.query,["id","aid","p","page","pid"])
-            print(parsed)
             if parsed.netloc == "":
                 scheme = base.scheme
             if parsed.path == "/":
@@ -266,8 +272,7 @@
             if parsed.netloc == base.netloc:
                 external = False
             href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
-            href = w3lib.url.canonicalize_url(href)
-            print(href)
+            href = courlan.normalize_url(href)
             if external:
                 external_links.add(href)
             else:
@@ -275,7 +280,6 @@
         except ValueError as err:
             print(err)
             pass
-    print(internal_links,external_links)
     return internal_links,external_links
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
@@ -283,6 +287,8 @@
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
         status = default_status
+        if html is None or len(html) < 256:
+            continue
         internal_links, external_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
         for link in external_links:
@@ -516,19 +522,19 @@ def link_summary(db,hostname):
     print(res)
 
 def sample_links(db,hostname,status,batch_size):
-    print("Getting backlinks")
+    print("Sampling links")
     linkcol = db["links"]
    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
     cl = LinkClassifier()
     crawled_links = list(res)
     crawled_count = len(crawled_links)
     prediction_accuracy = 0
-    if crawled_count > 200:
+    if crawled_count > CLASSIFIER_SET_SIZE:
         # train on crawled links
         trainset,testset = split_train(crawled_links)
         cl.train(trainset)
         prediction_accuracy = cl.test(testset)
-    sample_set_size = 10000
+    sample_set_size = SAMPLE_SET_SIZE
     res = linkcol.find({"host":hostname,"status": status})
     sample_links = []
     predicted_good = 0
@@ -546,7 +552,7 @@ def sample_links(db,hostname,status,batch_size):
             continue
         for feature in features:
             visitcounter[feature] += 1
-    mls = int(min(batch_size/2,len(good_links)))
+    mls = int(min(batch_size*(1- DISCOVER_LINK_RATIO),len(good_links)))
     random.shuffle(good_links)
     links = good_links[0:mls]
     numdiscover = len(discover_links)
@@ -556,11 +562,12 @@ def sample_links(db,hostname,status,batch_size):
         prob = 0
         if features is not None:
             for feature in features:
-                prob += math.log(visitcounter[feature] / numdiscover)
+                c = visitcounter[feature]
+                prob -= math.log(c) / c
         eval_discover_links.append((link,prob))
     eval_discover_links.sort(key=lambda x: x[1],reverse=True)
-    print(eval_discover_links)
-    mls = int(min(batch_size/2,len(discover_links)))
+    #print(eval_discover_links)
+    mls = int(min(batch_size * DISCOVER_LINK_RATIO,len(eval_discover_links)))
     links += [l[0] for l in eval_discover_links[0:mls]]
     return links
 
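
For reference, a minimal standalone sketch of the sampling behaviour this patch introduces in sample_links(): a batch is filled with (1 - DISCOVER_LINK_RATIO) classifier-approved links plus DISCOVER_LINK_RATIO discover links, and discover links are ranked by penalising frequently seen URL features with log(c)/c. The url_features() helper below is a hypothetical stand-in for the classifier's feature extraction and is not part of the patch.

import collections
import math
import random
import urllib.parse

DISCOVER_LINK_RATIO = 0.3  # same constant the patch adds

def url_features(link):
    # hypothetical stand-in for cl.get_features(): path segments act as features
    return [p for p in urllib.parse.urlparse(link).path.split("/") if p]

def mix_batch(good_links, discover_links, batch_size):
    # count how often each feature occurs across the discover set
    visitcounter = collections.Counter()
    for link in discover_links:
        for feature in url_features(link):
            visitcounter[feature] += 1
    # take (1 - DISCOVER_LINK_RATIO) of the batch from shuffled good links
    good_links = list(good_links)
    random.shuffle(good_links)
    mls = int(min(batch_size * (1 - DISCOVER_LINK_RATIO), len(good_links)))
    links = good_links[:mls]
    # score discover links: common features are penalised by log(c)/c,
    # so links built from rare features sort first
    scored = []
    for link in discover_links:
        prob = 0.0
        for feature in url_features(link):
            c = visitcounter[feature]
            prob -= math.log(c) / c
        scored.append((link, prob))
    scored.sort(key=lambda x: x[1], reverse=True)
    mls = int(min(batch_size * DISCOVER_LINK_RATIO, len(scored)))
    links += [l for l, _ in scored[:mls]]
    return links

With this scoring, a feature seen only once contributes log(1)/1 = 0 while frequent features push the score down, so links whose URL patterns are rare in the discover set are crawled first.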