diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index e05bee8..da8ac11 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -33,10 +33,11 @@ DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
 MAXFILESIZE=10000000
 MINTEXTSIZE=200
-CHECK_PARAGRAPH_SIZE=150
+CHECK_PARAGRAPH_SIZE=200
 TEXT_TRASH_SIZE=200
 TEXT_TRASH_RATIO=0.6
 DISCOVER_LINK_RATIO = 0.3
+DISCOVER_DOMAIN_RATIO = 0.5
 SAMPLE_SET_SIZE =10000
 CLASSIFIER_SET_SIZE = 200
 STOP_PATHS=["xml","rss","login","admin"]
@@ -61,6 +62,7 @@ def get_bs_links(link,html):
             netloc = parsed.netloc
             path = os.path.normpath(parsed.path)
             scheme = parsed.scheme
+            query = parsed.query
             # internal link
             if parsed.netloc == "":
                 scheme = base.scheme
@@ -74,7 +76,7 @@ def get_bs_links(link,html):
             if path.endswith(")"):
                 # javascript
                 continue
-            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+            href = urllib.parse.urlunparse((scheme,netloc,path,"",query,""))
             href = courlan.normalize_url(href)
             links.add(href)
     except ValueError as err:
@@ -232,7 +234,6 @@ def index_page(db,original_link,final_link,html,doc):
     state = "good"
     link = original_link
     if original_link != final_link:
-        print(original_link,final_link)
         linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
         link = final_link
     if html is None:
@@ -250,7 +251,6 @@ def index_page(db,original_link,final_link,html,doc):
         origsz = 0
         for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
             # index paragraph checksums
-            print(checkcol)
            nd = checkcol.find_one({"_id":chs})
            if nd is None:
                origsz += paragraph_size
@@ -258,7 +258,6 @@
 
         if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
             state = "copy"
-        print(origsz)
     if state == "good":
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
@@ -273,10 +272,7 @@ def index_page(db,original_link,final_link,html,doc):
         del doc["url"]
         contentcol.update_one({"url":link},{"$set":doc},upsert=True)
         for chs in doc["paragraph_checksums"]:
-            try:
-                checkcol.insert_one({"_id":chs})
-            except pymongo.errors.DuplicateKeyError as err:
-                pass
+            checkcol.update_one({"_id":chs},{"$inc":{"count":1}},upsert=True)
 
     linkdoc = get_link_doc(link,state)
     del linkdoc["url"]
@@ -304,7 +300,6 @@ def save_batch_info(db,host,states,docs):
         "batch_size": batch_size,
     }
     db["batches"].insert_one(batchdoc)
-    print(batchdoc)
 
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
@@ -315,15 +310,11 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
         if html is None or len(html) < 256:
             continue
         page_links = get_bs_links(final_link,html)
-        #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
-        #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
-        #print(extracted_links)
         for link in page_links:
             if not courlan.is_external(link,final_link) and not is_robot_good(link,rules):
                 badrobot += 1
                 continue
             status = str(default_status)
-            #print(link,status)
             links[link] = status
     outlinks = []
     badlink = 0
@@ -449,7 +440,6 @@ class LinkClassifier:
             goodprob += g
             b = math.log(self.badcounter[feature] + self.alpha) - bcc
             badprob += b
-            print(feature,g,b)
         pa = math.exp(goodprob + gp)
         pb = math.exp(badprob + bp)
         return pa - pb #+ random.uniform(-0.001,0.001)
@@ -730,7 +720,7 @@ def crawl_summary():
         {"$sort":{"original_text_size":-1}},
     ])
     print(">>>> Batches")
["_id","document_count","good_document_count","batch_count","text_size","original_text_size"] + headers = ["_id","document_count","good_document_count","batch_size","original_text_size"] print("\t".join(headers)) for item in res: values = [str(item[x]) for x in headers] @@ -761,7 +751,7 @@ def sample_domains(): all_domains = [] for domain in domains: all_domains.append(domain) - sample_size = min(int(DISCOVER_LINK_RATIO* BATCHSIZE), len(all_domains)) + sample_size = min(int(DISCOVER_DOMAIN_RATIO* BATCHSIZE), len(all_domains)) print(">>> Discover domains {}".format(sample_size)) sample_domains = random.sample(all_domains,sample_size) domaincol = db["domains"] @@ -770,7 +760,7 @@ def sample_domains(): all_domains = [] for item in res: all_domains.append(item["host"]) - sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE),len(all_domains)) + sample_size = min(int((1 - DISCOVER_DOMAIN_RATIO) * BATCHSIZE),len(all_domains)) print(">>>> Best domains {}".format(sample_size)) sample_domains += random.sample(all_domains,sample_size) for domain in sample_domains: