zz

2023-04-08 10:33:40 +02:00 · 2023-04-08 10:12:31 +02:00
1 changed file with 34 additions and 18 deletions
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -32,6 +32,10 @@ MINTEXTSIZE=200
 CHECK_PARAGRAPH_SIZE=150
 TEXT_TRASH_SIZE=200
 TEXT_TRASH_RATIO=0.6
+DISCOVER_LINK_RATIO = 0.3
+SAMPLE_SET_SIZE =10000
+CLASSIFIER_SET_SIZE = 200
+STOP_PATHS=["xml","rss","login","admin"]

 def split_train(res):
    trainset = []
@@ -83,6 +87,11 @@ def is_link_good(link):
    if r is None:
        return None
    llink,lhostname = r
+    paths = set(llink.split("/"))
+    for item in STOP_PATHS:
+        if item in paths:
+            return None
+
    #print(llink,lhostname)
    # hostname rules
    if not lhostname.endswith(DOMAIN):
@@ -122,12 +131,13 @@ def fetch_page(link:str)->(str,str):
            LOGGER.error('too large: length %s for URL %s', len(response.data), link)
        if good:
            html = trafilatura.utils.decode_response(response) 
-            final_link = response.url
        if html is not None:
            html, final_link = trafilatura.spider.refresh_detection(html, final_link)
            # is there a meta-refresh on the page?
            if final_link is None:  # malformed or malicious content
                html = None
+
+        final_link = courlan.normalize_url(final_link)
    return final_link,html

 def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
@@ -170,6 +180,7 @@ def index_pages(db,hostname,extracted_pages):
        state = "good"
        link = original_link
        if original_link != final_link:
+            print(original_link,final_link)
            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
            link = final_link
        if html is None:
@@ -215,7 +226,7 @@ def index_pages(db,hostname,extracted_pages):
            htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
            doc.update(get_link_doc(link,"good"))
            # todo extract links
-            print(doc)
+            print(link,doc)
            del doc["url"]
            contentcol.update_one({"url":link},{"$set":doc},upsert=True)
            for chs in doc["paragraph_checksums"]:
@@ -223,11 +234,13 @@ def index_pages(db,hostname,extracted_pages):
                    checkcol.insert_one({"_id":chs})
                except pymongo.errors.DuplicateKeyError as err:
                    pass
-        linkcol.update_one({"url":link},{"$set":{"status":state}})
+
+        linkdoc = get_link_doc(link,state)
+        del linkdoc["url"]
+        linkcol.update_one({"url":link},{"$set":linkdoc})

 from bs4 import BeautifulSoup
 import urllib.parse
-import w3lib.url
 import os.path

 def get_bs_links(link,html):
@@ -236,7 +249,8 @@ def get_bs_links(link,html):
    base = link
    if bs.base is not None and "href" in bs.base.attrs:
        base = bs.base["href"]
-    base = urllib.parse.urlparse(w3lib.url.canonicalize_url(base))
+    base = urllib.parse.urlparse(courlan.normalize_url(base))
+
    external_links = set()
    internal_links = set()
    # Normalizacia linkov
@@ -245,12 +259,10 @@ def get_bs_links(link,html):
            continue
        href = l["href"]
        try:
-            parsed = urllib.parse.urlparse(w3lib.url.canonicalize_url(href))
+            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
            netloc = parsed.netloc
            path = os.path.normpath(parsed.path)
            scheme = parsed.scheme
-            query = w3lib.url.url_query_cleaner(parsed.query,["id","aid","p","page","pid"])
-            print(parsed)
            if parsed.netloc == "":
                scheme = base.scheme
                if parsed.path == "/":
@@ -262,12 +274,14 @@ def get_bs_links(link,html):
                continue
            if path.startswith("/"):
                path = path[1:]
+            if path.endswith(")"):
+                # javascript
+                continue
            external = True
            if parsed.netloc == base.netloc:
                external = False
            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
-            href = w3lib.url.canonicalize_url(href)
-            print(href)
+            href = courlan.normalize_url(href)
            if external:
                external_links.add(href)
            else:
@@ -275,7 +289,6 @@ def get_bs_links(link,html):
        except ValueError as err:
            print(err)
            pass
-    print(internal_links,external_links)
    return internal_links,external_links

 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
@@ -283,6 +296,8 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
    badrobot = 0
    for original_link,(final_link,html) in zip(link_batch,responses):
        status = default_status
+        if html is None or len(html) < 256:
+            continue
        internal_links, external_links = get_bs_links(final_link,html)
        #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
        for link in external_links:
@@ -516,19 +531,19 @@ def link_summary(db,hostname):
    print(res)

 def sample_links(db,hostname,status,batch_size):
-    print("Getting backlinks")
+    print("Sampling links")
    linkcol = db["links"]
    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
    cl = LinkClassifier()
    crawled_links = list(res)
    crawled_count = len(crawled_links)
    prediction_accuracy = 0
-    if crawled_count > 200:
+    if crawled_count > CLASSIFIER_SET_SIZE:
        # train on crawled links
        trainset,testset = split_train(crawled_links)
        cl.train(trainset)
        prediction_accuracy = cl.test(testset)
-    sample_set_size = 10000
+    sample_set_size = SAMPLE_SET_SIZE
    res = linkcol.find({"host":hostname,"status": status})
    sample_links = []
    predicted_good = 0
@@ -546,7 +561,7 @@ def sample_links(db,hostname,status,batch_size):
            continue
        for feature in features:
            visitcounter[feature] += 1
-    mls = int(min(batch_size/2,len(good_links)))
+    mls = int(min(batch_size*(1- DISCOVER_LINK_RATIO),len(good_links)))
    random.shuffle(good_links)
    links = good_links[0:mls]
    numdiscover = len(discover_links)
@@ -556,11 +571,12 @@ def sample_links(db,hostname,status,batch_size):
        prob = 0
        if features is not None:
            for feature in features:
-                prob += math.log(visitcounter[feature] / numdiscover)
+                c = visitcounter[feature]
+                prob -= math.log(c) / c
        eval_discover_links.append((link,prob))
    eval_discover_links.sort(key=lambda x: x[1],reverse=True)
-    print(eval_discover_links)
-    mls = int(min(batch_size/2,len(discover_links)))
+    #print(eval_discover_links)
+    mls = int(min(batch_size * DISCOVER_LINK_RATIO,len(eval_discover_links)))
    links += [l[0] for l in eval_discover_links[0:mls]]
    return links
Author	SHA1	Message	Date
Daniel Hladek	9d06223012	zz	2023-04-08 10:33:40 +02:00
Daniel Hladek	ce8f939980	zz	2023-04-08 10:12:31 +02:00