From 289fbf7fb2551bbccf7c5cf60eb96749b2b28e84 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Fri, 7 Apr 2023 15:56:43 +0200
Subject: [PATCH] zz

---
 mongo/mongocrawler.py | 126 ++++++++++++++++++++++++++++++------------
 1 file changed, 92 insertions(+), 34 deletions(-)

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 0daf82e..e35900c 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -93,7 +93,7 @@ def is_link_good(link):
         return None
     return llink
 
-def get_link_doc(link,status="frontlink"):
+def get_link_doc(link:str,status="frontlink")->dict:
     r = courlan.check_url(link)
     assert r is not None
     link,host = r
@@ -101,7 +101,7 @@ def get_link_doc(link,status="frontlink"):
     return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
 
 
-def fetch_page(link):
+def fetch_page(link:str)->(str,str):
     print("fetching:::::")
     print(link)
     final_link = link
@@ -130,7 +130,7 @@ def fetch_page(link):
             html = None
     return final_link,html
 
-def fetch_robot(base_url):
+def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
     try:
         rawrules = trafilatura.fetch_url("https://"+ base_url + "/robots.txt")
         #print(rawrules)
@@ -144,7 +144,7 @@ def fetch_robot(base_url):
     return rules
 
 
-def extract_pages(link_batch,responses):
+def extract_pages(link_batch:list,responses:list)->list:
     out = []
     for original_link,(final_link,html) in zip(link_batch,responses):
         doc = None
@@ -225,16 +225,69 @@ def index_pages(db,hostname,extracted_pages):
             pass
         linkcol.update_one({"url":link},{"$set":{"status":state}})
 
+from bs4 import BeautifulSoup
+import urllib.parse
+import w3lib.url
+import os.path
 
-def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"):
+def get_bs_links(link,html):
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    base = urllib.parse.urlparse(w3lib.url.canonicalize_url(base))
+    external_links = set()
+    internal_links = set()
+    # Normalize the links
+    for l in bs.find_all("a", href=True):
+        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            parsed = urllib.parse.urlparse(w3lib.url.canonicalize_url(href))
+            netloc = parsed.netloc
+            path = os.path.normpath(parsed.path)
+            scheme = parsed.scheme
+            query = w3lib.url.url_query_cleaner(parsed.query,["id","aid","p","page","pid"])
+            print(parsed)
+            if parsed.netloc == "":
+                scheme = base.scheme
+                if parsed.path == "/":
+                    netloc = base.netloc
+                else:
+                    netloc = base.netloc
+                    path = os.path.normpath(base.path +"/" + path)
+            if not scheme.startswith("http"):
+                continue
+            if path.startswith("/"):
+                path = path[1:]
+            external = True
+            if parsed.netloc == base.netloc:
+                external = False
+            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+            href = w3lib.url.canonicalize_url(href)
+            print(href)
+            if external:
+                external_links.add(href)
+            else:
+                internal_links.add(href)
+        except ValueError as err:
+            print(err)
+            pass
+    print(internal_links,external_links)
+    return internal_links,external_links
+
+def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
         status = default_status
-        external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
+        internal_links, external_links = get_bs_links(final_link,html)
+        #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
        for link in external_links:
             links[link] = "frontlink"
-        internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
+        #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
         for link in internal_links:
             if not is_robot_good(link,rules):
@@ -283,7 +336,6 @@ def get_link_features(link):
     if len(res) < 2:
         return None
     res = res[:-1]
-    print(res)
     return res
 
 class LinkClassifier:
@@ -477,30 +529,39 @@ def sample_links(db,hostname,status,batch_size):
     cl.train(trainset)
     prediction_accuracy = cl.test(testset)
     sample_set_size = 10000
-    res = linkcol.find({"host":hostname,"status": status},limit = sample_set_size)
+    res = linkcol.find({"host":hostname,"status": status})
     sample_links = []
     predicted_good = 0
+    visitcounter = collections.Counter()
+    good_links = []
+    discover_links = []
     for item in res:
-        for item in res:
-            cll = cl.classify(item["url"])
-            #cll += random.uniform(-0.1,0.1)
-            sample_links.append((item["url"],cll))
-            if cll > 0:
-                predicted_good += 1
-        # TODO frontlinks are not unique!
-        sample_links.sort(key=lambda x: x[1],reverse=True)
-        predicted_good_prob = 0
-        if len(sample_links) > 0:
-            predicted_good_prob = predicted_good / len(sample_links)
-        domaincol = db["domain"]
-        info = {
-            "predicted_good_prob":predicted_good_prob,
-            "prediction_accuracy": prediction_accuracy,
-            "crawled_count": crawled_count,
-        }
-        print(info)
-        domaincol.update_one({"host":hostname},{"$set":info})
-        links = [l[0] for l in sample_links[0:batch_size]]
+        link = item["url"]
+        cll = cl.classify(link)
+        if cll > 0:
+            good_links.append(link)
+        features = get_link_features(link)
+        discover_links.append(link)
+        if features is None:
+            continue
+        for feature in features:
+            visitcounter[feature] += 1
+    mls = int(min(batch_size/2,len(good_links)))
+    random.shuffle(good_links)
+    links = good_links[0:mls]
+    numdiscover = len(discover_links)
+    eval_discover_links = []
+    for link in discover_links:
+        features = get_link_features(link)
+        prob = 0
+        if features is not None:
+            for feature in features:
+                prob += math.log(visitcounter[feature] / numdiscover)
+        eval_discover_links.append((link,prob))
+    eval_discover_links.sort(key=lambda x: x[1],reverse=True)
+    print(eval_discover_links)
+    mls = int(min(batch_size/2,len(discover_links)))
+    links += [l[0] for l in eval_discover_links[0:mls]]
     return links
 
 def domain_summary(db,hostname):
@@ -549,6 +610,7 @@ def parseurl(link):
     print(rules.site_maps())
     print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
+    get_bs_links(link,html)
     doc = trafilatura.bare_extraction(html)
     import pprint
     pprint.pprint(doc)
@@ -597,17 +659,13 @@ def visit(start_link):
     # frontlinks first
     links = sample_links(db,hostname,"frontlink",batch_size)
     links.insert(0,start_link)
-    # then backlinks
-    if len(links) < batch_size:
-        back_links = sample_links(db,hostname,"backlink",batch_size - len(links))
-        links += back_links
     # index results
     print("Processing links")
     responses = []
    for link in links:
         responses.append(fetch_page(link))
     extracted_pages = extract_pages(links,responses)
-    extracted_links = extract_links(links,responses,hostname,rules,"backlink")
+    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
     index_links(db,extracted_links)
     index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
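
As a quick illustration (not part of the patch itself), the sketch below shows how the new get_bs_links helper could be exercised on a static HTML snippet. It is a minimal sketch under assumptions: the patched module is importable as mongocrawler, beautifulsoup4, lxml and w3lib are installed, and the URL and markup are invented for the example.

    # Hypothetical usage sketch for the patched get_bs_links helper.
    # The import path, sample URL and markup are assumptions for illustration.
    from mongocrawler import get_bs_links

    html = """
    <html>
      <head><base href="https://example.com/sub/"></head>
      <body>
        <a href="page?id=1">relative link</a>
        <a href="https://example.com/other">absolute link on the same host</a>
        <a href="https://partner.example.net/y">link to another host</a>
      </body>
    </html>
    """

    # The helper returns two sets of canonicalized URLs; each href is resolved
    # against the <base> element before being classified.
    internal_links, external_links = get_bs_links("https://example.com/sub/index.html", html)
    print(sorted(internal_links))
    print(sorted(external_links))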