zz

parent 7d09f112df
commit 289fbf7fb2
@@ -93,7 +93,7 @@ def is_link_good(link):
         return None
     return llink

-def get_link_doc(link,status="frontlink"):
+def get_link_doc(link:str,status="frontlink")->dict:
     r = courlan.check_url(link)
     assert r is not None
     link,host = r
@@ -101,7 +101,7 @@ def get_link_doc(link,status="frontlink"):
     return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}


-def fetch_page(link):
+def fetch_page(link:str)->(str,str):
     print("fetching:::::")
     print(link)
     final_link = link
@@ -130,7 +130,7 @@ def fetch_page(link):
         html = None
     return final_link,html

-def fetch_robot(base_url):
+def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
     try:
         rawrules = trafilatura.fetch_url("https://"+ base_url + "/robots.txt")
         #print(rawrules)
@@ -144,7 +144,7 @@ def fetch_robot(base_url):
     return rules


-def extract_pages(link_batch,responses):
+def extract_pages(link_batch:list,responses:list)->list:
     out = []
     for original_link,(final_link,html) in zip(link_batch,responses):
         doc = None
@@ -225,16 +225,69 @@ def index_pages(db,hostname,extracted_pages):
         pass
     linkcol.update_one({"url":link},{"$set":{"status":state}})

+from bs4 import BeautifulSoup
+import urllib.parse
+import w3lib.url
+import os.path

-def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"):
+def get_bs_links(link,html):
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    base = urllib.parse.urlparse(w3lib.url.canonicalize_url(base))
+    external_links = set()
+    internal_links = set()
+    # Normalize links
+    for l in bs.find_all("a", href=True):
+        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            parsed = urllib.parse.urlparse(w3lib.url.canonicalize_url(href))
+            netloc = parsed.netloc
+            path = os.path.normpath(parsed.path)
+            scheme = parsed.scheme
+            query = w3lib.url.url_query_cleaner(parsed.query,["id","aid","p","page","pid"])
+            print(parsed)
+            if parsed.netloc == "":
+                scheme = base.scheme
+                if parsed.path == "/":
+                    netloc = base.netloc
+                else:
+                    netloc = base.netloc
+                    path = os.path.normpath(base.path +"/" + path)
+            if not scheme.startswith("http"):
+                continue
+            if path.startswith("/"):
+                path = path[1:]
+            external = True
+            if parsed.netloc == base.netloc:
+                external = False
+            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+            href = w3lib.url.canonicalize_url(href)
+            print(href)
+            if external:
+                external_links.add(href)
+            else:
+                internal_links.add(href)
+        except ValueError as err:
+            print(err)
+            pass
+    print(internal_links,external_links)
+    return internal_links,external_links
+
+def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
         status = default_status
-        external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
+        internal_links, external_links = get_bs_links(final_link,html)
+        #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
         for link in external_links:
             links[link] = "frontlink"
-        internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
+        #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
         for link in internal_links:
             if not is_robot_good(link,rules):
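Note: the following is a minimal, hypothetical sketch (not part of the commit) of how the new get_bs_links helper might be exercised on its own. It assumes the function is importable from this module and that beautifulsoup4, lxml and w3lib are installed; the example.com URLs are placeholders.

# Sketch only: run get_bs_links() on a small page and inspect the two sets it returns.
sample_html = """
<html><head><base href="https://example.com/blog/"></head>
<body>
  <a href="https://example.com/blog/post.html?id=1">same host</a>
  <a href="https://other.org/page">other host</a>
</body></html>
"""
internal, external = get_bs_links("https://example.com/blog/", sample_html)
# After canonicalization, same-host anchors land in the first set,
# other-host anchors in the second.
print(internal)
print(external)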
@@ -283,7 +336,6 @@ def get_link_features(link):
     if len(res) < 2:
         return None
     res = res[:-1]
-    print(res)
     return res

 class LinkClassifier:
@@ -477,30 +529,39 @@ def sample_links(db,hostname,status,batch_size):
     cl.train(trainset)
     prediction_accuracy = cl.test(testset)
     sample_set_size = 10000
-    res = linkcol.find({"host":hostname,"status": status},limit = sample_set_size)
+    res = linkcol.find({"host":hostname,"status": status})
     sample_links = []
     predicted_good = 0
+    visitcounter = collections.Counter()
+    good_links = []
+    discover_links = []
     for item in res:
-        for item in res:
-            cll = cl.classify(item["url"])
-            #cll += random.uniform(-0.1,0.1)
-            sample_links.append((item["url"],cll))
-            if cll > 0:
-                predicted_good += 1
-    # TODO frontlinks are not unique!
-    sample_links.sort(key=lambda x: x[1],reverse=True)
-    predicted_good_prob = 0
-    if len(sample_links) > 0:
-        predicted_good_prob = predicted_good / len(sample_links)
-    domaincol = db["domain"]
-    info = {
-        "predicted_good_prob":predicted_good_prob,
-        "prediction_accuracy": prediction_accuracy,
-        "crawled_count": crawled_count,
-    }
-    print(info)
-    domaincol.update_one({"host":hostname},{"$set":info})
-    links = [l[0] for l in sample_links[0:batch_size]]
+        link = item["url"]
+        cll = cl.classify(link)
+        if cll > 0:
+            good_links.append(link)
+        features = get_link_features(link)
+        discover_links.append(link)
+        if features is None:
+            continue
+        for feature in features:
+            visitcounter[feature] += 1
+    mls = int(min(batch_size/2,len(good_links)))
+    random.shuffle(good_links)
+    links = good_links[0:mls]
+    numdiscover = len(discover_links)
+    eval_discover_links = []
+    for link in discover_links:
+        features = get_link_features(link)
+        prob = 0
+        if features is not None:
+            for feature in features:
+                prob += math.log(visitcounter[feature] / numdiscover)
+        eval_discover_links.append((link,prob))
+    eval_discover_links.sort(key=lambda x: x[1],reverse=True)
+    print(eval_discover_links)
+    mls = int(min(batch_size/2,len(discover_links)))
+    links += [l[0] for l in eval_discover_links[0:mls]]
     return links

 def domain_summary(db,hostname):
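Note: the rewritten sample_links ranks candidate links by summing the log relative frequencies of their URL features. Below is a standalone, hypothetical sketch of that scoring idea; toy_features is a simplified stand-in for get_link_features and the URLs are placeholders, so this is an illustration of the ranking step, not the committed implementation.

# Sketch only: count feature occurrences over all candidates, then score each
# candidate by the summed log relative frequency of its features.
import collections
import math

def toy_features(url):
    # Hypothetical stand-in for get_link_features: use path segments as features.
    parts = [p for p in url.split("/")[3:] if p]
    return parts or None

candidates = [
    "https://example.com/articles/sport/123",
    "https://example.com/articles/culture/456",
    "https://example.com/tag/foo",
]

visitcounter = collections.Counter()
for link in candidates:
    for feature in toy_features(link) or []:
        visitcounter[feature] += 1

scored = []
for link in candidates:
    prob = sum(math.log(visitcounter[f] / len(candidates)) for f in toy_features(link) or [])
    scored.append((link, prob))
scored.sort(key=lambda x: x[1], reverse=True)
print(scored)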
@@ -549,6 +610,7 @@ def parseurl(link):
     print(rules.site_maps())
     print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
+    get_bs_links(link,html)
     doc = trafilatura.bare_extraction(html)
     import pprint
     pprint.pprint(doc)
@@ -597,17 +659,13 @@ def visit(start_link):
     # frontlinks first
     links = sample_links(db,hostname,"frontlink",batch_size)
     links.insert(0,start_link)
-    # then backlinks
-    if len(links) < batch_size:
-        back_links = sample_links(db,hostname,"backlink",batch_size - len(links))
-        links += back_links
     # index results
     print("Processing links")
     responses = []
     for link in links:
         responses.append(fetch_page(link))
     extracted_pages = extract_pages(links,responses)
-    extracted_links = extract_links(links,responses,hostname,rules,"backlink")
+    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
     index_links(db,extracted_links)
     index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
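Note: for orientation, a condensed, hypothetical sketch of the per-batch flow that visit() follows after this change. The setup of db, hostname, rules, start_link and batch_size done earlier in the full function is assumed and omitted; all function names come from the diff above.

# Condensed sketch of the visit() batch flow after this commit.
links = sample_links(db, hostname, "frontlink", batch_size)
links.insert(0, start_link)
responses = [fetch_page(link) for link in links]
extracted_pages = extract_pages(links, responses)
extracted_links = extract_links(links, responses, hostname, rules, "frontlink")
index_links(db, extracted_links)
index_pages(db, hostname, extracted_pages)
link_summary(db, hostname)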
|