diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 6da2061..29f6d20 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -21,6 +21,9 @@ import collections
 import math
 import random
 import hashlib
+from bs4 import BeautifulSoup
+import urllib.parse
+import os.path
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -38,6 +41,46 @@ SAMPLE_SET_SIZE =10000
 CLASSIFIER_SET_SIZE = 200
 STOP_PATHS=["xml","rss","login","admin"]
+
+def get_bs_links(link,html):
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    base = urllib.parse.urlparse(courlan.normalize_url(base))
+
+    links = set()
+    # Normalize links
+    for l in bs.find_all("a", href=True):
+        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
+            netloc = parsed.netloc
+            path = os.path.normpath(parsed.path)
+            scheme = parsed.scheme
+            # internal link
+            if parsed.netloc == "":
+                scheme = base.scheme
+                netloc = base.netloc
+                if not parsed.path.startswith("/"):
+                    path = os.path.normpath(base.path + "/" + path)
+            if not scheme.startswith("http"):
+                continue
+            if path.startswith("/"):
+                path = path[1:]
+            if path.endswith(")"):
+                # javascript
+                continue
+            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+            href = courlan.normalize_url(href)
+            links.add(href)
+        except ValueError as err:
+            print(err)
+            pass
+    return links
 
 def split_train(res):
     trainset = []
     testset = []
@@ -243,7 +286,8 @@ def save_batch_info(db,host,states,docs):
     good_document_count = 0
     original_text_size = 0
     batch_size = 0
-    _,domain = courlan.get_hostinfo(host)
+    d = host.split(".")
+    domain = d[-2] + "." + d[-1]
     for state,doc in zip(states,docs):
         batch_size += 1
         if state == "good":
@@ -261,49 +305,6 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)
     print(batchdoc)
 
-from bs4 import BeautifulSoup
-import urllib.parse
-import os.path
-
-def get_bs_links(link,html):
-    # Extract links from the page
-    bs = BeautifulSoup(html, "lxml")
-    base = link
-    if bs.base is not None and "href" in bs.base.attrs:
-        base = bs.base["href"]
-    base = urllib.parse.urlparse(courlan.normalize_url(base))
-
-    links = set()
-    # Normalize links
-    for l in bs.find_all("a", href=True):
-        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
-            continue
-        href = l["href"]
-        try:
-            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
-            netloc = parsed.netloc
-            path = os.path.normpath(parsed.path)
-            scheme = parsed.scheme
-            # internal link
-            if parsed.netloc == "":
-                scheme = base.scheme
-                netloc = base.netloc
-                if not parsed.path.startswith("/"):
-                    path = os.path.normpath(base.path + "/" + path)
-            if not scheme.startswith("http"):
-                continue
-            if path.startswith("/"):
-                path = path[1:]
-            if path.endswith(")"):
-                # javascript
-                continue
-            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
-            href = courlan.normalize_url(href)
-            links.add(href)
-        except ValueError as err:
-            print(err)
-            pass
-    return links
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
@@ -721,8 +722,7 @@ def crawl_summary():
         {"$group":{"_id":"$host",
             "document_count":{"$sum":"$document_count"},
             "good_document_count":{"$sum":"$good_document_count"},
-            "batch_count":{"$sum":"$batch_size"},
-            "text_size":{"$sum":"$text_size"},
+            "batch_size":{"$sum":"$batch_size"},
             "original_text_size":{"$sum":"$original_text_size"},
             }
         },