This commit is contained in:
Daniel Hládek 2023-04-08 10:12:31 +02:00
parent 289fbf7fb2
commit ce8f939980

View File

@ -32,6 +32,9 @@ MINTEXTSIZE=200
CHECK_PARAGRAPH_SIZE=150 CHECK_PARAGRAPH_SIZE=150
TEXT_TRASH_SIZE=200 TEXT_TRASH_SIZE=200
TEXT_TRASH_RATIO=0.6 TEXT_TRASH_RATIO=0.6
DISCOVER_LINK_RATIO = 0.3
SAMPLE_SET_SIZE =10000
CLASSIFIER_SET_SIZE = 200
def split_train(res): def split_train(res):
trainset = [] trainset = []
@ -122,12 +125,13 @@ def fetch_page(link:str)->(str,str):
LOGGER.error('too large: length %s for URL %s', len(response.data), link) LOGGER.error('too large: length %s for URL %s', len(response.data), link)
if good: if good:
html = trafilatura.utils.decode_response(response) html = trafilatura.utils.decode_response(response)
final_link = response.url
if html is not None: if html is not None:
html, final_link = trafilatura.spider.refresh_detection(html, final_link) html, final_link = trafilatura.spider.refresh_detection(html, final_link)
# is there a meta-refresh on the page? # is there a meta-refresh on the page?
if final_link is None: # malformed or malicious content if final_link is None: # malformed or malicious content
html = None html = None
final_link = courlan.normalize_url(final_link)
return final_link,html return final_link,html
def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser: def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
@ -170,6 +174,7 @@ def index_pages(db,hostname,extracted_pages):
state = "good" state = "good"
link = original_link link = original_link
if original_link != final_link: if original_link != final_link:
print(original_link,final_link)
linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}}) linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
link = final_link link = final_link
if html is None: if html is None:
@ -215,7 +220,7 @@ def index_pages(db,hostname,extracted_pages):
htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True) htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
doc.update(get_link_doc(link,"good")) doc.update(get_link_doc(link,"good"))
# todo extract links # todo extract links
print(doc) print(link,doc)
del doc["url"] del doc["url"]
contentcol.update_one({"url":link},{"$set":doc},upsert=True) contentcol.update_one({"url":link},{"$set":doc},upsert=True)
for chs in doc["paragraph_checksums"]: for chs in doc["paragraph_checksums"]:
@ -223,11 +228,13 @@ def index_pages(db,hostname,extracted_pages):
checkcol.insert_one({"_id":chs}) checkcol.insert_one({"_id":chs})
except pymongo.errors.DuplicateKeyError as err: except pymongo.errors.DuplicateKeyError as err:
pass pass
linkcol.update_one({"url":link},{"$set":{"status":state}})
linkdoc = get_link_doc(link,state)
del linkdoc["url"]
linkcol.update_one({"url":link},{"$set":linkdoc})
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import urllib.parse import urllib.parse
import w3lib.url
import os.path import os.path
def get_bs_links(link,html): def get_bs_links(link,html):
@ -236,7 +243,8 @@ def get_bs_links(link,html):
base = link base = link
if bs.base is not None and "href" in bs.base.attrs: if bs.base is not None and "href" in bs.base.attrs:
base = bs.base["href"] base = bs.base["href"]
base = urllib.parse.urlparse(w3lib.url.canonicalize_url(base)) base = urllib.parse.urlparse(courlan.normalize_url(base))
external_links = set() external_links = set()
internal_links = set() internal_links = set()
# Normalizacia linkov # Normalizacia linkov
@ -245,12 +253,10 @@ def get_bs_links(link,html):
continue continue
href = l["href"] href = l["href"]
try: try:
parsed = urllib.parse.urlparse(w3lib.url.canonicalize_url(href)) parsed = urllib.parse.urlparse(courlan.normalize_url(href))
netloc = parsed.netloc netloc = parsed.netloc
path = os.path.normpath(parsed.path) path = os.path.normpath(parsed.path)
scheme = parsed.scheme scheme = parsed.scheme
query = w3lib.url.url_query_cleaner(parsed.query,["id","aid","p","page","pid"])
print(parsed)
if parsed.netloc == "": if parsed.netloc == "":
scheme = base.scheme scheme = base.scheme
if parsed.path == "/": if parsed.path == "/":
@ -266,8 +272,7 @@ def get_bs_links(link,html):
if parsed.netloc == base.netloc: if parsed.netloc == base.netloc:
external = False external = False
href = urllib.parse.urlunparse((scheme,netloc,path,"","","")) href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
href = w3lib.url.canonicalize_url(href) href = courlan.normalize_url(href)
print(href)
if external: if external:
external_links.add(href) external_links.add(href)
else: else:
@ -275,7 +280,6 @@ def get_bs_links(link,html):
except ValueError as err: except ValueError as err:
print(err) print(err)
pass pass
print(internal_links,external_links)
return internal_links,external_links return internal_links,external_links
def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list: def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
@ -283,6 +287,8 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
badrobot = 0 badrobot = 0
for original_link,(final_link,html) in zip(link_batch,responses): for original_link,(final_link,html) in zip(link_batch,responses):
status = default_status status = default_status
if html is None or len(html) < 256:
continue
internal_links, external_links = get_bs_links(final_link,html) internal_links, external_links = get_bs_links(final_link,html)
#external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE) #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
for link in external_links: for link in external_links:
@ -516,19 +522,19 @@ def link_summary(db,hostname):
print(res) print(res)
def sample_links(db,hostname,status,batch_size): def sample_links(db,hostname,status,batch_size):
print("Getting backlinks") print("Sampling links")
linkcol = db["links"] linkcol = db["links"]
res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}}) res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
cl = LinkClassifier() cl = LinkClassifier()
crawled_links = list(res) crawled_links = list(res)
crawled_count = len(crawled_links) crawled_count = len(crawled_links)
prediction_accuracy = 0 prediction_accuracy = 0
if crawled_count > 200: if crawled_count > CLASSIFIER_SET_SIZE:
# train on crawled links # train on crawled links
trainset,testset = split_train(crawled_links) trainset,testset = split_train(crawled_links)
cl.train(trainset) cl.train(trainset)
prediction_accuracy = cl.test(testset) prediction_accuracy = cl.test(testset)
sample_set_size = 10000 sample_set_size = SAMPLE_SET_SIZE
res = linkcol.find({"host":hostname,"status": status}) res = linkcol.find({"host":hostname,"status": status})
sample_links = [] sample_links = []
predicted_good = 0 predicted_good = 0
@ -546,7 +552,7 @@ def sample_links(db,hostname,status,batch_size):
continue continue
for feature in features: for feature in features:
visitcounter[feature] += 1 visitcounter[feature] += 1
mls = int(min(batch_size/2,len(good_links))) mls = int(min(batch_size*(1- DISCOVER_LINK_RATIO),len(good_links)))
random.shuffle(good_links) random.shuffle(good_links)
links = good_links[0:mls] links = good_links[0:mls]
numdiscover = len(discover_links) numdiscover = len(discover_links)
@ -556,11 +562,12 @@ def sample_links(db,hostname,status,batch_size):
prob = 0 prob = 0
if features is not None: if features is not None:
for feature in features: for feature in features:
prob += math.log(visitcounter[feature] / numdiscover) c = visitcounter[feature]
prob -= math.log(c) / c
eval_discover_links.append((link,prob)) eval_discover_links.append((link,prob))
eval_discover_links.sort(key=lambda x: x[1],reverse=True) eval_discover_links.sort(key=lambda x: x[1],reverse=True)
print(eval_discover_links) #print(eval_discover_links)
mls = int(min(batch_size/2,len(discover_links))) mls = int(min(batch_size * DISCOVER_LINK_RATIO,len(eval_discover_links)))
links += [l[0] for l in eval_discover_links[0:mls]] links += [l[0] for l in eval_discover_links[0:mls]]
return links return links