zz

2023-04-07 15:56:43 +02:00 · 2023-04-07 15:56:43 +02:00 · 289fbf7fb2
commit 289fbf7fb2
parent 7d09f112df
1 changed files with 92 additions and 34 deletions
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@ -93,7 +93,7 @@ def is_link_good(link):
        return None
    return llink

-def get_link_doc(link,status="frontlink"):
+def get_link_doc(link:str,status="frontlink")->dict:
    r  = courlan.check_url(link)
    assert r is not None
    link,host = r
@ -101,7 +101,7 @@ def get_link_doc(link,status="frontlink"):
    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}


-def fetch_page(link):
+def fetch_page(link:str)->(str,str):
    print("fetching:::::")
    print(link)
    final_link = link
@ -130,7 +130,7 @@ def fetch_page(link):
                html = None
    return final_link,html

-def fetch_robot(base_url):
+def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
    try:
        rawrules = trafilatura.fetch_url("https://"+ base_url + "/robots.txt")
        #print(rawrules)
@ -144,7 +144,7 @@ def fetch_robot(base_url):
    return rules


-def extract_pages(link_batch,responses):
+def extract_pages(link_batch:list,responses:list)->list:
    out = []
    for original_link,(final_link,html) in zip(link_batch,responses):
        doc = None
@ -225,16 +225,69 @@ def index_pages(db,hostname,extracted_pages):
                    pass
        linkcol.update_one({"url":link},{"$set":{"status":state}})

+from bs4 import BeautifulSoup
+import urllib.parse
+import w3lib.url
+import os.path

-def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"):
+def get_bs_links(link,html):
+    """Extract and normalize the hyperlinks found in a fetched page.
+
+    Every ``<a href>`` is resolved against the page URL (or against the
+    document's ``<base href>`` when one is present), canonicalized, and
+    reduced to scheme, host and path. Anchors marked ``nofollow`` and links
+    whose scheme does not start with ``http`` are skipped.
+
+    Args:
+        link: URL the page was fetched from.
+        html: HTML source of the page; may be None or empty.
+
+    Returns:
+        Tuple ``(internal_links, external_links)`` of sets of URL strings.
+        A link is internal when its host equals the host of the base URL.
+    """
+    internal_links = set()
+    external_links = set()
+    # fetch_page() can return html=None; there is nothing to parse then.
+    if not html:
+        return internal_links,external_links
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    if bs.base is not None and "href" in bs.base.attrs:
+        try:
+            # <base href> may itself be relative, so resolve it against the page URL.
+            base = urllib.parse.urljoin(link, bs.base["href"])
+        except ValueError as err:
+            # Malformed <base href>: fall back to the page URL.
+            print(err)
+    base_url = w3lib.url.canonicalize_url(base)
+    base = urllib.parse.urlparse(base_url)
+    # Normalize links
+    for anchor in bs.find_all("a", href=True):
+        # BeautifulSoup exposes rel as a multi-valued attribute (a list of
+        # tokens), so test membership instead of comparing with a string.
+        rel = anchor.attrs.get("rel") or []
+        if isinstance(rel, str):
+            rel = rel.split()
+        if "nofollow" in rel or "nofollow" in anchor.attrs:
+            continue
+        try:
+            # urljoin handles absolute, root-relative and document-relative hrefs.
+            absolute = urllib.parse.urljoin(base_url, anchor["href"])
+            parsed = urllib.parse.urlparse(w3lib.url.canonicalize_url(absolute))
+            if not parsed.scheme.startswith("http"):
+                continue
+            # Collapse "//", "." and trailing slashes so equivalent URLs deduplicate.
+            path = os.path.normpath(parsed.path)
+            if path.startswith("/"):
+                path = path[1:]
+            # Query string and fragment are discarded.
+            # NOTE(review): confirm whether paging/id query parameters should be preserved.
+            href = urllib.parse.urlunparse((parsed.scheme,parsed.netloc,path,"","",""))
+            href = w3lib.url.canonicalize_url(href)
+        except ValueError as err:
+            # Unparseable href: report it and move on to the next anchor.
+            print(err)
+            continue
+        # Compare the resolved host, so relative links count as internal.
+        if parsed.netloc == base.netloc:
+            internal_links.add(href)
+        else:
+            external_links.add(href)
+    return internal_links,external_links
+
+def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
    links = {}
    badrobot = 0
    for original_link,(final_link,html) in zip(link_batch,responses):
        status = default_status
-        external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
+        internal_links, external_links = get_bs_links(final_link,html)
+        #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
        for link in external_links:
            links[link] = "frontlink"
-        internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
+        #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
        #print(extracted_links)
        for link in internal_links:
            if not is_robot_good(link,rules):
@ -283,7 +336,6 @@ def get_link_features(link):
    if len(res) < 2:
        return None
    res = res[:-1]
-    print(res)
    return res

 class LinkClassifier:
@ -477,30 +529,39 @@ def sample_links(db,hostname,status,batch_size):
        cl.train(trainset)
        prediction_accuracy = cl.test(testset)
    sample_set_size = 10000
-    res = linkcol.find({"host":hostname,"status": status},limit = sample_set_size)
+    res = linkcol.find({"host":hostname,"status": status})
    sample_links = []
    predicted_good = 0
+    visitcounter = collections.Counter()
+    good_links = []
+    discover_links = []
    for item in res:
-        for item in res:
-            cll = cl.classify(item["url"])
-            #cll += random.uniform(-0.1,0.1)
-            sample_links.append((item["url"],cll))
-            if cll > 0:
-                predicted_good += 1
-    # TODO frontlinks are not unique!
-    sample_links.sort(key=lambda x: x[1],reverse=True)
-    predicted_good_prob = 0
-    if len(sample_links) > 0:
-        predicted_good_prob = predicted_good / len(sample_links)
-    domaincol = db["domain"]
-    info = {
-        "predicted_good_prob":predicted_good_prob,
-        "prediction_accuracy": prediction_accuracy,
-        "crawled_count": crawled_count,
-    }
-    print(info)
-    domaincol.update_one({"host":hostname},{"$set":info})
-    links = [l[0] for l in sample_links[0:batch_size]]
+        link = item["url"]
+        cll = cl.classify(link)
+        if cll > 0: 
+            good_links.append(link)
+        features = get_link_features(link)
+        discover_links.append(link)
+        if features is None:
+            continue
+        for feature in features:
+            visitcounter[feature] += 1
+    mls = int(min(batch_size/2,len(good_links)))
+    random.shuffle(good_links)
+    links = good_links[0:mls]
+    numdiscover = len(discover_links)
+    eval_discover_links = []
+    for link in discover_links:
+        features = get_link_features(link)
+        prob = 0
+        if features is not None:
+            for feature in features:
+                prob += math.log(visitcounter[feature] / numdiscover)
+        eval_discover_links.append((link,prob))
+    eval_discover_links.sort(key=lambda x: x[1],reverse=True)
+    print(eval_discover_links)
+    mls = int(min(batch_size/2,len(discover_links)))
+    links += [l[0] for l in eval_discover_links[0:mls]]
    return links

 def domain_summary(db,hostname):
@ -549,6 +610,7 @@ def parseurl(link):
    print(rules.site_maps())
    print(rules.crawl_delay("*"))
    html = trafilatura.fetch_url(link,decode=True)
+    get_bs_links(link,html)
    doc = trafilatura.bare_extraction(html)
    import pprint
    pprint.pprint(doc)
@ -597,17 +659,13 @@ def visit(start_link):
    # frontlinks first
    links = sample_links(db,hostname,"frontlink",batch_size)
    links.insert(0,start_link)
-    # then backlinks
-    if len(links) < batch_size:
-        back_links = sample_links(db,hostname,"backlink",batch_size - len(links))
-        links += back_links
    # index results
    print("Processing links")
    responses = []
    for link in links:
        responses.append(fetch_page(link))
    extracted_pages = extract_pages(links,responses)
-    extracted_links = extract_links(links,responses,hostname,rules,"backlink")
+    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
    index_links(db,extracted_links)
    index_pages(db,hostname,extracted_pages)
    link_summary(db,hostname)