From 44dc4be8c33f9562de7458f2e2e69cb457312525 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 13 Apr 2023 16:16:11 +0200
Subject: [PATCH] zz

---
 mongo/mongocrawler.py | 92 +++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 6da2061..29f6d20 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -21,6 +21,9 @@ import collections
 import math
 import random
 import hashlib
+from bs4 import BeautifulSoup
+import urllib.parse
+import os.path
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -38,6 +41,46 @@ SAMPLE_SET_SIZE =10000
 CLASSIFIER_SET_SIZE = 200
 STOP_PATHS=["xml","rss","login","admin"]
 
+
+def get_bs_links(link,html):
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    base = urllib.parse.urlparse(courlan.normalize_url(base))
+
+    links = set()
+    # Normalize the links
+    for l in bs.find_all("a", href=True):
+        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
+            netloc = parsed.netloc
+            path = os.path.normpath(parsed.path)
+            scheme = parsed.scheme
+            # internal link
+            if parsed.netloc == "":
+                scheme = base.scheme
+                netloc = base.netloc
+                if not parsed.path.startswith("/"):
+                    path = os.path.normpath(base.path +"/" + path)
+            if not scheme.startswith("http"):
+                continue
+            if path.startswith("/"):
+                path = path[1:]
+            if path.endswith(")"):
+                # javascript
+                continue
+            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+            href = courlan.normalize_url(href)
+            links.add(href)
+        except ValueError as err:
+            print(err)
+            pass
+    return links
 
 def split_train(res):
     trainset = []
     testset = []
@@ -243,7 +286,8 @@ def save_batch_info(db,host,states,docs):
     good_document_count = 0
     original_text_size = 0
     batch_size = 0
-    _,domain = courlan.get_hostinfo(host)
+    d = host.split(".")
+    domain = d[-2] + "." + d[-1]
     for state,doc in zip(states,docs):
         batch_size += 1
         if state == "good":
@@ -261,49 +305,6 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)
     print(batchdoc)
 
-from bs4 import BeautifulSoup
-import urllib.parse
-import os.path
-
-def get_bs_links(link,html):
-    # Extract links from the page
-    bs = BeautifulSoup(html, "lxml")
-    base = link
-    if bs.base is not None and "href" in bs.base.attrs:
-        base = bs.base["href"]
-    base = urllib.parse.urlparse(courlan.normalize_url(base))
-
-    links = set()
-    # Normalize the links
-    for l in bs.find_all("a", href=True):
-        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
-            continue
-        href = l["href"]
-        try:
-            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
-            netloc = parsed.netloc
-            path = os.path.normpath(parsed.path)
-            scheme = parsed.scheme
-            # internal link
-            if parsed.netloc == "":
-                scheme = base.scheme
-                netloc = base.netloc
-                if not parsed.path.startswith("/"):
-                    path = os.path.normpath(base.path +"/" + path)
-            if not scheme.startswith("http"):
-                continue
-            if path.startswith("/"):
-                path = path[1:]
-            if path.endswith(")"):
-                # javascript
-                continue
-            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
-            href = courlan.normalize_url(href)
-            links.add(href)
-        except ValueError as err:
-            print(err)
-            pass
-    return links
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
@@ -721,8 +722,7 @@ def crawl_summary():
         {"$group":{"_id":"$host",
             "document_count":{"$sum":"$document_count"},
            "good_document_count":{"$sum":"$good_document_count"},
-            "batch_count":{"$sum":"$batch_size"},
-            "text_size":{"$sum":"$text_size"},
+            "batch_size":{"$sum":"$batch_size"},
             "original_text_size":{"$sum":"$original_text_size"},
             }
         },
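
For context, a minimal sketch of how the relocated get_bs_links helper could be exercised. The sample URL, HTML, and import path below are illustrative only and not taken from the repository; courlan, bs4, and lxml are assumed to be installed as in mongocrawler.py.

# Illustrative usage sketch; get_bs_links is assumed importable from mongocrawler.
from mongocrawler import get_bs_links

# A made-up page with a <base> element, one relative link, and one site-root link.
sample_html = """<html>
<head><base href="https://example.sk/blog/"></head>
<body>
  <a href="post1.html">relative link</a>
  <a href="/about">site-root link</a>
</body>
</html>"""

# Relative hrefs are resolved against the <base> element and normalized,
# so this is expected to print URLs such as https://example.sk/blog/post1.html
# and https://example.sk/about.
for href in sorted(get_bs_links("https://example.sk/blog/index.html", sample_html)):
    print(href)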