Daniel Hládek 2023-04-13 16:16:11 +02:00
parent 8e8d4b9625
commit 44dc4be8c3


@@ -21,6 +21,9 @@ import collections
 import math
 import random
 import hashlib
+from bs4 import BeautifulSoup
+import urllib.parse
+import os.path
 LANGUAGE = os.getenv("SUCKER_LANGUAGE", "sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN", "sk")
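The two settings above make the crawl language and top-level domain configurable through the environment. A minimal sketch of overriding them before the module is imported (the values here are illustrative, not project defaults):

import os
os.environ["SUCKER_LANGUAGE"] = "cs"   # hypothetical override
os.environ["SUCKER_DOMAIN"] = "cz"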
@@ -38,6 +41,46 @@ SAMPLE_SET_SIZE =10000
 CLASSIFIER_SET_SIZE = 200
 STOP_PATHS = ["xml", "rss", "login", "admin"]
+
+def get_bs_links(link, html):
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    # Honor an explicit <base href> when the page declares one
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    base = urllib.parse.urlparse(courlan.normalize_url(base))
+    links = set()
+    # Normalize links
+    for l in bs.find_all("a", href=True):
+        # rel is a multi-valued attribute in BeautifulSoup, so test membership
+        if "nofollow" in l.attrs.get("rel", []) or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
+            netloc = parsed.netloc
+            path = os.path.normpath(parsed.path)
+            scheme = parsed.scheme
+            # Internal link: resolve scheme, host and path against the base URL
+            if parsed.netloc == "":
+                scheme = base.scheme
+                netloc = base.netloc
+                if not parsed.path.startswith("/"):
+                    path = os.path.normpath(base.path + "/" + path)
+            if not scheme.startswith("http"):
+                continue
+            if path.startswith("/"):
+                path = path[1:]
+            if path.endswith(")"):
+                # Heuristic: a javascript: pseudo-link, not a real path
+                continue
+            href = urllib.parse.urlunparse((scheme, netloc, path, "", "", ""))
+            href = courlan.normalize_url(href)
+            links.add(href)
+        except ValueError as err:
+            print(err)
+    return links
+
 def split_train(res):
     trainset = []
     testset = []
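A quick usage sketch of the newly added get_bs_links (the URL and HTML snippet are made up for illustration; bs4, lxml and courlan must be installed):

sample_html = (
    '<a href="/about">About</a>'
    '<a href="https://example.com/docs/intro">Intro</a>'
    '<a rel="nofollow" href="/ads">Ads</a>'
)
links = get_bs_links("https://example.com/index.html", sample_html)
print(links)  # expected: normalized absolute URLs; the nofollow link is skipped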
@@ -243,7 +286,8 @@ def save_batch_info(db,host,states,docs):
     good_document_count = 0
     original_text_size = 0
     batch_size = 0
-    _, domain = courlan.get_hostinfo(host)
+    d = host.split(".")
+    domain = d[-2] + "." + d[-1]
     for state, doc in zip(states, docs):
         batch_size += 1
         if state == "good":
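The changed lines swap the call _, domain = courlan.get_hostinfo(host) for a naive dot-split that keeps the last two hostname labels. An illustrative check (the hostname is made up; note the split mishandles multi-label suffixes such as .co.uk):

host = "www.blog.example.sk"
d = host.split(".")
print(d[-2] + "." + d[-1])  # -> "example.sk"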
@@ -261,49 +305,6 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)
     print(batchdoc)
 
-from bs4 import BeautifulSoup
-import urllib.parse
-import os.path
-
-def get_bs_links(link, html):
-    # Extract links from the page
-    bs = BeautifulSoup(html, "lxml")
-    base = link
-    if bs.base is not None and "href" in bs.base.attrs:
-        base = bs.base["href"]
-    base = urllib.parse.urlparse(courlan.normalize_url(base))
-    links = set()
-    # Normalize links
-    for l in bs.find_all("a", href=True):
-        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
-            continue
-        href = l["href"]
-        try:
-            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
-            netloc = parsed.netloc
-            path = os.path.normpath(parsed.path)
-            scheme = parsed.scheme
-            # internal link
-            if parsed.netloc == "":
-                scheme = base.scheme
-                netloc = base.netloc
-                if not parsed.path.startswith("/"):
-                    path = os.path.normpath(base.path + "/" + path)
-            if not scheme.startswith("http"):
-                continue
-            if path.startswith("/"):
-                path = path[1:]
-            if path.endswith(")"):
-                # javascript
-                continue
-            href = urllib.parse.urlunparse((scheme, netloc, path, "", "", ""))
-            href = courlan.normalize_url(href)
-            links.add(href)
-        except ValueError as err:
-            print(err)
-            pass
-    return links
 
 def extract_links(link_batch:list, responses:list, hostname:str, rules, default_status="frontlink") -> list:
     links = {}
@@ -721,8 +722,7 @@ def crawl_summary():
         {"$group": {"_id": "$host",
             "document_count": {"$sum": "$document_count"},
             "good_document_count": {"$sum": "$good_document_count"},
-            "batch_count": {"$sum": "$batch_size"},
-            "text_size": {"$sum": "$text_size"},
+            "batch_size": {"$sum": "$batch_size"},
             "original_text_size": {"$sum": "$original_text_size"},
             }
         },
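For context, a sketch of how the renamed field aggregates once the fix is in (the surrounding pipeline stages are not shown in this hunk, so this is a minimal stand-in using the same batches collection written by save_batch_info):

pipeline = [
    {"$group": {
        "_id": "$host",
        "document_count": {"$sum": "$document_count"},
        "good_document_count": {"$sum": "$good_document_count"},
        "batch_size": {"$sum": "$batch_size"},
        "original_text_size": {"$sum": "$original_text_size"},
    }},
]
for row in db["batches"].aggregate(pipeline):
    print(row)  # one summary document per host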