zz
parent 8e8d4b9625
commit 44dc4be8c3
@@ -21,6 +21,9 @@ import collections
 import math
 import random
 import hashlib
+from bs4 import BeautifulSoup
+import urllib.parse
+import os.path
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -38,6 +41,46 @@ SAMPLE_SET_SIZE =10000
 CLASSIFIER_SET_SIZE = 200
 STOP_PATHS=["xml","rss","login","admin"]
 
+
+def get_bs_links(link,html):
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    base = urllib.parse.urlparse(courlan.normalize_url(base))
+
+    links = set()
+    # Normalize the links
+    for l in bs.find_all("a", href=True):
+        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
+            netloc = parsed.netloc
+            path = os.path.normpath(parsed.path)
+            scheme = parsed.scheme
+            # internal link
+            if parsed.netloc == "":
+                scheme = base.scheme
+                netloc = base.netloc
+                if not parsed.path.startswith("/"):
+                    path = os.path.normpath(base.path +"/" + path)
+            if not scheme.startswith("http"):
+                continue
+            if path.startswith("/"):
+                path = path[1:]
+            if path.endswith(")"):
+                # javascript
+                continue
+            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+            href = courlan.normalize_url(href)
+            links.add(href)
+        except ValueError as err:
+            print(err)
+            pass
+    return links
 def split_train(res):
     trainset = []
     testset = []
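For context: the helper added above takes the page URL (used as the base for relative links) and the raw HTML, and returns a set of normalized absolute http(s) links. A minimal usage sketch, assuming this module's get_bs_links is importable with courlan installed, and using plain urllib.request for the fetch (an assumption for illustration, not how the crawler itself fetches):

# Sketch: fetch one page and list its outgoing links.
# The URL is a placeholder; get_bs_links comes from the module above.
import urllib.request

url = "https://example.com/"
with urllib.request.urlopen(url) as resp:
    html = resp.read().decode("utf-8", errors="replace")

for out_link in sorted(get_bs_links(url, html)):
    print(out_link)

Note that operator precedence parses the nofollow test as ("rel" in l.attrs and l.attrs["rel"] == "nofollow") or ("nofollow" in l.attrs), and BeautifulSoup typically exposes rel as a list such as ["nofollow"], so a check like "nofollow" in l.attrs.get("rel", []) may be closer to the intent.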
@@ -243,7 +286,8 @@ def save_batch_info(db,host,states,docs):
     good_document_count = 0
     original_text_size = 0
     batch_size = 0
-    _,domain = courlan.get_hostinfo(host)
+    d = host.split(".")
+    domain = d[-2] + "." + d[-1]
    for state,doc in zip(states,docs):
        batch_size += 1
        if state == "good":
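The new code derives the domain from the last two host labels instead of courlan.get_hostinfo. A short sketch, with hypothetical hostnames, of where the two approaches agree and where they diverge (multi-label public suffixes such as co.uk):

# Sketch (hypothetical hostnames): the last-two-labels heuristic used above.
for host in ["www.sme.sk", "blog.example.co.uk"]:
    d = host.split(".")
    domain = d[-2] + "." + d[-1]
    print(host, "->", domain)
# www.sme.sk -> sme.sk          (same as a public-suffix-aware lookup)
# blog.example.co.uk -> co.uk   (a public-suffix-aware parser such as
#                                courlan's would typically yield example.co.uk)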
@@ -261,49 +305,6 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)
     print(batchdoc)
 
-from bs4 import BeautifulSoup
-import urllib.parse
-import os.path
-
-def get_bs_links(link,html):
-    # Extract links from the page
-    bs = BeautifulSoup(html, "lxml")
-    base = link
-    if bs.base is not None and "href" in bs.base.attrs:
-        base = bs.base["href"]
-    base = urllib.parse.urlparse(courlan.normalize_url(base))
-
-    links = set()
-    # Normalize the links
-    for l in bs.find_all("a", href=True):
-        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
-            continue
-        href = l["href"]
-        try:
-            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
-            netloc = parsed.netloc
-            path = os.path.normpath(parsed.path)
-            scheme = parsed.scheme
-            # internal link
-            if parsed.netloc == "":
-                scheme = base.scheme
-                netloc = base.netloc
-                if not parsed.path.startswith("/"):
-                    path = os.path.normpath(base.path +"/" + path)
-            if not scheme.startswith("http"):
-                continue
-            if path.startswith("/"):
-                path = path[1:]
-            if path.endswith(")"):
-                # javascript
-                continue
-            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
-            href = courlan.normalize_url(href)
-            links.add(href)
-        except ValueError as err:
-            print(err)
-            pass
-    return links
-
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
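The function body removed here is identical to the one added near the module imports in the earlier hunk, so the commit moves get_bs_links (and its local imports) rather than deleting it.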
@@ -721,8 +722,7 @@ def crawl_summary():
         {"$group":{"_id":"$host",
             "document_count":{"$sum":"$document_count"},
             "good_document_count":{"$sum":"$good_document_count"},
-            "batch_count":{"$sum":"$batch_size"},
-            "text_size":{"$sum":"$text_size"},
+            "batch_size":{"$sum":"$batch_size"},
             "original_text_size":{"$sum":"$original_text_size"},
             }
         },
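For reference, a sketch of running the amended $group stage with pymongo. The collection name batches comes from the code above; the connection URI and the database name "sucker" are assumptions for illustration:

# Sketch: aggregate per-host batch statistics from the "batches" collection.
from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["sucker"]  # db name assumed
pipeline = [
    {"$group": {"_id": "$host",
        "document_count": {"$sum": "$document_count"},
        "good_document_count": {"$sum": "$good_document_count"},
        "batch_size": {"$sum": "$batch_size"},
        "original_text_size": {"$sum": "$original_text_size"},
    }},
]
for row in db["batches"].aggregate(pipeline):
    print(row)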