From 8a91f88d739cbd78b5ea222e35bb79d307e00ace Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Sun, 23 Apr 2023 10:02:52 +0200
Subject: [PATCH] zz

---
 mongo/cli.py          |  3 ++-
 mongo/mongocrawler.py | 15 +++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/mongo/cli.py b/mongo/cli.py
index d3ebcaa..66cc1e7 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -32,9 +32,10 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("hostname",help="Hostname to crawl")
+@click.argument("hostname")
 @click.option("--filter_content",default=True,help="Filter content")
 def visit(hostname,filter_content=True):
+    """ Visit and crawl the given hostname """
     mongocrawler.visit(hostname,filter_content=filter_content)
 
 @cli.command()
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index ba14123..2180ad2 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -99,7 +99,9 @@ def split_train(res):
 
 def calculate_checksums(text):
     """
-    @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
+    Paragraph separation must be compatible with the text extraction. Are paragraphs separated by a blank line or just a newline?
+
+    @return fingerprints of the paragraphs in the text. Paragraphs are separated by a newline.
     """
     checksums = []
     sizes = []
@@ -153,11 +155,11 @@ def is_link_good(link):
     return llink
 
 def get_link_doc(link:str,status="frontlink")->dict:
-    r = courlan.check_url(link)
-    assert r is not None
-    link,host = r
-    domain = courlan.extract_domain(link)
-    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
+    parsed = urllib.parse.urlparse(courlan.normalize_url(link))
+    url = urllib.parse.urlunparse(parsed)
+    tokens = parsed.netloc.split(".")
+    domain = tokens[-2] + "." + tokens[-1]
+    return {"url":url,"host":parsed.netloc,"domain":domain,"status":status,"created_at":dat.utcnow()}
 
 def fetch_page(link:str)->(str,str):
 
@@ -738,6 +740,7 @@ def import_html():
     for l in sys.stdin:
         hdoc = json.loads(l)
         url = hdoc["url"]
+        # BeautifulSoup is used to unify the encoding
        html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
         doc = extract_page(url,html)
         if doc is not None:
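
Note on the reworked get_link_doc: below is a minimal sketch of the new URL handling, assuming courlan.normalize_url returns a cleaned absolute URL string. The sample link and the printed values are illustrative only and not taken from the patch.

    import urllib.parse
    import courlan

    link = "https://Blog.Example.com/post?id=1"
    # normalize_url lowercases the host; urlparse/urlunparse round-trips the result
    parsed = urllib.parse.urlparse(courlan.normalize_url(link))
    url = urllib.parse.urlunparse(parsed)

    # naive registrable-domain guess: keep the last two labels of the host
    tokens = parsed.netloc.split(".")
    domain = tokens[-2] + "." + tokens[-1]

    print(url)            # e.g. https://blog.example.com/post?id=1
    print(parsed.netloc)  # e.g. blog.example.com
    print(domain)         # e.g. example.com

Two caveats of the two-label split: a single-label host such as "localhost" raises IndexError, and a multi-label public suffix such as "example.co.uk" yields "co.uk" instead of "example.co.uk". The courlan.extract_domain call removed by this patch handled such suffixes, so this is a trade-off the change makes.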