Daniel Hládek 2023-04-07 15:56:43 +02:00
parent 7d09f112df
commit 289fbf7fb2


@@ -93,7 +93,7 @@ def is_link_good(link):
         return None
     return llink
 
-def get_link_doc(link,status="frontlink"):
+def get_link_doc(link:str,status="frontlink")->dict:
     r = courlan.check_url(link)
     assert r is not None
     link,host = r
@@ -101,7 +101,7 @@ def get_link_doc(link,status="frontlink"):
     return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
 
-def fetch_page(link):
+def fetch_page(link:str)->(str,str):
     print("fetching:::::")
     print(link)
     final_link = link
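
Note: the new `-> (str, str)` annotation is legal at runtime but is not the form type checkers understand for a pair. If strict typing were wanted, the conventional spelling would look like the illustrative snippet below; it is not part of the commit, and the Optional reflects that fetch_page returns html = None when fetching fails.

# Illustrative only: conventional typing for the (final_link, html) pair,
# where html may be None on a failed fetch.
from typing import Optional, Tuple

def fetch_page(link: str) -> Tuple[str, Optional[str]]:
    ...
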
@@ -130,7 +130,7 @@ def fetch_page(link):
         html = None
     return final_link,html
 
-def fetch_robot(base_url):
+def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
     try:
         rawrules = trafilatura.fetch_url("https://"+ base_url + "/robots.txt")
         #print(rawrules)
@@ -144,7 +144,7 @@ def fetch_robot(base_url):
     return rules
 
-def extract_pages(link_batch,responses):
+def extract_pages(link_batch:list,responses:list)->list:
     out = []
     for original_link,(final_link,html) in zip(link_batch,responses):
         doc = None
@@ -225,16 +225,69 @@ def index_pages(db,hostname,extracted_pages):
             pass
         linkcol.update_one({"url":link},{"$set":{"status":state}})
 
+from bs4 import BeautifulSoup
+import urllib.parse
+import w3lib.url
+import os.path
+
+def get_bs_links(link,html):
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    base = urllib.parse.urlparse(w3lib.url.canonicalize_url(base))
+    external_links = set()
+    internal_links = set()
+    # Normalize the links
+    for l in bs.find_all("a", href=True):
+        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            parsed = urllib.parse.urlparse(w3lib.url.canonicalize_url(href))
+            netloc = parsed.netloc
+            path = os.path.normpath(parsed.path)
+            scheme = parsed.scheme
+            query = w3lib.url.url_query_cleaner(parsed.query,["id","aid","p","page","pid"])
+            print(parsed)
+            if parsed.netloc == "":
+                scheme = base.scheme
+                if parsed.path == "/":
+                    netloc = base.netloc
+                else:
+                    netloc = base.netloc
+                    path = os.path.normpath(base.path +"/" + path)
+            if not scheme.startswith("http"):
+                continue
+            if path.startswith("/"):
+                path = path[1:]
+            external = True
+            if parsed.netloc == base.netloc:
+                external = False
+            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+            href = w3lib.url.canonicalize_url(href)
+            print(href)
+            if external:
+                external_links.add(href)
+            else:
+                internal_links.add(href)
+        except ValueError as err:
+            print(err)
+            pass
+    print(internal_links,external_links)
+    return internal_links,external_links
+
-def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"):
+def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
         status = default_status
-        external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
+        internal_links, external_links = get_bs_links(final_link,html)
+        #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
         for link in external_links:
             links[link] = "frontlink"
-        internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
+        #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
         for link in internal_links:
             if not is_robot_good(link,rules):
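
Note: the added get_bs_links helper normalizes every href against the page (or <base>) URL before sorting it into the internal or external set. A simplified, standalone sketch of that normalization idea follows; it is not part of the commit, the base URL, hrefs, and the normalize helper are made up for illustration, and the committed code additionally canonicalizes the raw href and cleans tracking query parameters.

# Standalone sketch of the href normalization idea used in get_bs_links.
# The base URL and hrefs below are illustrative only.
import os.path
import urllib.parse

import w3lib.url

base = urllib.parse.urlparse("https://example.org/section/index.html")

def normalize(href):
    parsed = urllib.parse.urlparse(href)
    scheme, netloc = parsed.scheme, parsed.netloc
    path = os.path.normpath(parsed.path)
    if netloc == "":
        # relative link: inherit scheme/netloc and resolve the path against the base
        scheme, netloc = base.scheme, base.netloc
        path = os.path.normpath(base.path + "/" + path)
    if not scheme.startswith("http"):
        return None
    absolute = urllib.parse.urlunparse((scheme, netloc, path.lstrip("/"), "", "", ""))
    return w3lib.url.canonicalize_url(absolute)

print(normalize("../other/page.html"))           # resolved against the base path
print(normalize("https://other.example.com/a"))  # external link kept absolute
print(normalize("ftp://files.example.org/x"))    # non-http scheme -> None
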
@@ -283,7 +336,6 @@ def get_link_features(link):
     if len(res) < 2:
         return None
     res = res[:-1]
-    print(res)
     return res
 
 class LinkClassifier:
@@ -477,30 +529,39 @@ def sample_links(db,hostname,status,batch_size):
     cl.train(trainset)
     prediction_accuracy = cl.test(testset)
     sample_set_size = 10000
-    res = linkcol.find({"host":hostname,"status": status},limit = sample_set_size)
+    res = linkcol.find({"host":hostname,"status": status})
     sample_links = []
     predicted_good = 0
+    visitcounter = collections.Counter()
+    good_links = []
+    discover_links = []
     for item in res:
-        for item in res:
-            cll = cl.classify(item["url"])
-            #cll += random.uniform(-0.1,0.1)
-            sample_links.append((item["url"],cll))
-            if cll > 0:
-                predicted_good += 1
-    # TODO frontlinks are not unique!
-    sample_links.sort(key=lambda x: x[1],reverse=True)
-    predicted_good_prob = 0
-    if len(sample_links) > 0:
-        predicted_good_prob = predicted_good / len(sample_links)
-    domaincol = db["domain"]
-    info = {
-        "predicted_good_prob":predicted_good_prob,
-        "prediction_accuracy": prediction_accuracy,
-        "crawled_count": crawled_count,
-    }
-    print(info)
-    domaincol.update_one({"host":hostname},{"$set":info})
-    links = [l[0] for l in sample_links[0:batch_size]]
+        link = item["url"]
+        cll = cl.classify(link)
+        if cll > 0:
+            good_links.append(link)
+        features = get_link_features(link)
+        discover_links.append(link)
+        if features is None:
+            continue
+        for feature in features:
+            visitcounter[feature] += 1
+    mls = int(min(batch_size/2,len(good_links)))
+    random.shuffle(good_links)
+    links = good_links[0:mls]
+    numdiscover = len(discover_links)
+    eval_discover_links = []
+    for link in discover_links:
+        features = get_link_features(link)
+        prob = 0
+        if features is not None:
+            for feature in features:
+                prob += math.log(visitcounter[feature] / numdiscover)
+        eval_discover_links.append((link,prob))
+    eval_discover_links.sort(key=lambda x: x[1],reverse=True)
+    print(eval_discover_links)
+    mls = int(min(batch_size/2,len(discover_links)))
+    links += [l[0] for l in eval_discover_links[0:mls]]
     return links
 
 def domain_summary(db,hostname):
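
Note: sample_links now fills half the batch with links the classifier scores positively and the other half with "discover" links ranked by a summed log-frequency score over their features. A minimal standalone sketch of that ranking follows; it is not part of the commit, and get_features is a made-up stand-in for the repository's get_link_features.

# Sketch of the discover-link ranking: score each link by the summed log of
# how often its features occur across the sampled links, then take the top ones.
import collections
import math

def get_features(link):
    # hypothetical stand-in for get_link_features: path segments as features
    return [part for part in link.split("/")[3:] if part]

links = [
    "https://example.org/news/politics/article-1",
    "https://example.org/news/politics/article-2",
    "https://example.org/en/about",
]

visitcounter = collections.Counter()
for link in links:
    for feature in get_features(link):
        visitcounter[feature] += 1

scored = []
for link in links:
    prob = sum(math.log(visitcounter[f] / len(links)) for f in get_features(link))
    scored.append((link, prob))
scored.sort(key=lambda x: x[1], reverse=True)
print(scored)  # shared path tokens give a higher (less negative) score
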
@@ -549,6 +610,7 @@ def parseurl(link):
     print(rules.site_maps())
     print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
+    get_bs_links(link,html)
     doc = trafilatura.bare_extraction(html)
     import pprint
     pprint.pprint(doc)
@@ -597,17 +659,13 @@ def visit(start_link):
     # frontlinks first
     links = sample_links(db,hostname,"frontlink",batch_size)
     links.insert(0,start_link)
-    # then backlinks
-    if len(links) < batch_size:
-        back_links = sample_links(db,hostname,"backlink",batch_size - len(links))
-        links += back_links
     # index results
     print("Processing links")
     responses = []
     for link in links:
         responses.append(fetch_page(link))
     extracted_pages = extract_pages(links,responses)
-    extracted_links = extract_links(links,responses,hostname,rules,"backlink")
+    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
     index_links(db,extracted_links)
     index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)