Daniel Hládek 2023-04-13 16:16:11 +02:00
parent 8e8d4b9625
commit 44dc4be8c3


@@ -21,6 +21,9 @@ import collections
 import math
 import random
 import hashlib
+from bs4 import BeautifulSoup
+import urllib.parse
+import os.path
 LANGUAGE = os.getenv("SUCKER_LANGUAGE", "sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN", "sk")
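The two settings above make the crawl language and top-level domain configurable through the environment. A minimal sketch of overriding them before the module is imported (the values here are illustrative, not project defaults):

import os
os.environ["SUCKER_LANGUAGE"] = "cs"   # hypothetical override
os.environ["SUCKER_DOMAIN"] = "cz"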
@@ -38,6 +41,46 @@ SAMPLE_SET_SIZE =10000
 CLASSIFIER_SET_SIZE = 200
 STOP_PATHS = ["xml", "rss", "login", "admin"]
+
+def get_bs_links(link, html):
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    # Honor an explicit <base href> when the page declares one
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    base = urllib.parse.urlparse(courlan.normalize_url(base))
+    links = set()
+    # Normalize links
+    for l in bs.find_all("a", href=True):
+        # rel is a multi-valued attribute in BeautifulSoup, so test membership
+        if "nofollow" in l.attrs.get("rel", []) or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
+            netloc = parsed.netloc
+            path = os.path.normpath(parsed.path)
+            scheme = parsed.scheme
+            # Internal link: resolve scheme, host and path against the base URL
+            if parsed.netloc == "":
+                scheme = base.scheme
+                netloc = base.netloc
+                if not parsed.path.startswith("/"):
+                    path = os.path.normpath(base.path + "/" + path)
+            if not scheme.startswith("http"):
+                continue
+            if path.startswith("/"):
+                path = path[1:]
+            if path.endswith(")"):
+                # Heuristic: a javascript: pseudo-link, not a real path
+                continue
+            href = urllib.parse.urlunparse((scheme, netloc, path, "", "", ""))
+            href = courlan.normalize_url(href)
+            links.add(href)
+        except ValueError as err:
+            print(err)
+    return links
+
 def split_train(res):
     trainset = []
     testset = []
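A quick usage sketch of the newly added get_bs_links (the URL and HTML snippet are made up for illustration; bs4, lxml and courlan must be installed):

sample_html = (
    '<a href="/about">About</a>'
    '<a href="https://example.com/docs/intro">Intro</a>'
    '<a rel="nofollow" href="/ads">Ads</a>'
)
links = get_bs_links("https://example.com/index.html", sample_html)
print(links)  # expected: normalized absolute URLs; the nofollow link is skipped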
@@ -243,7 +286,8 @@ def save_batch_info(db,host,states,docs):
     good_document_count = 0
     original_text_size = 0
     batch_size = 0
-    _, domain = courlan.get_hostinfo(host)
+    d = host.split(".")
+    domain = d[-2] + "." + d[-1]
     for state, doc in zip(states, docs):
         batch_size += 1
         if state == "good":
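The changed lines swap the call _, domain = courlan.get_hostinfo(host) for a naive dot-split that keeps the last two hostname labels. An illustrative check (the hostname is made up; note the split mishandles multi-label suffixes such as .co.uk):

host = "www.blog.example.sk"
d = host.split(".")
print(d[-2] + "." + d[-1])  # -> "example.sk"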
@@ -261,49 +305,6 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)
     print(batchdoc)
 
-from bs4 import BeautifulSoup
-import urllib.parse
-import os.path
-
-def get_bs_links(link, html):
-    # Extract links from the page
-    bs = BeautifulSoup(html, "lxml")
-    base = link
-    if bs.base is not None and "href" in bs.base.attrs:
-        base = bs.base["href"]
-    base = urllib.parse.urlparse(courlan.normalize_url(base))
-    links = set()
-    # Normalize links
-    for l in bs.find_all("a", href=True):
-        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
-            continue
-        href = l["href"]
-        try:
-            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
-            netloc = parsed.netloc
-            path = os.path.normpath(parsed.path)
-            scheme = parsed.scheme
-            # internal link
-            if parsed.netloc == "":
-                scheme = base.scheme
-                netloc = base.netloc
-                if not parsed.path.startswith("/"):
-                    path = os.path.normpath(base.path + "/" + path)
-            if not scheme.startswith("http"):
-                continue
-            if path.startswith("/"):
-                path = path[1:]
-            if path.endswith(")"):
-                # javascript
-                continue
-            href = urllib.parse.urlunparse((scheme, netloc, path, "", "", ""))
-            href = courlan.normalize_url(href)
-            links.add(href)
-        except ValueError as err:
-            print(err)
-            pass
-    return links
 
 def extract_links(link_batch:list, responses:list, hostname:str, rules, default_status="frontlink") -> list:
     links = {}
@@ -721,8 +722,7 @@ def crawl_summary():
         {"$group": {"_id": "$host",
             "document_count": {"$sum": "$document_count"},
             "good_document_count": {"$sum": "$good_document_count"},
-            "batch_count": {"$sum": "$batch_size"},
-            "text_size": {"$sum": "$text_size"},
+            "batch_size": {"$sum": "$batch_size"},
             "original_text_size": {"$sum": "$original_text_size"},
             }
         },
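For context, a sketch of how the renamed field aggregates once the fix is in (the surrounding pipeline stages are not shown in this hunk, so this is a minimal stand-in using the same batches collection written by save_batch_info):

pipeline = [
    {"$group": {
        "_id": "$host",
        "document_count": {"$sum": "$document_count"},
        "good_document_count": {"$sum": "$good_document_count"},
        "batch_size": {"$sum": "$batch_size"},
        "original_text_size": {"$sum": "$original_text_size"},
    }},
]
for row in db["batches"].aggregate(pipeline):
    print(row)  # one summary document per host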