Compare commits
2 Commits
289fbf7fb2 ... 9d06223012

Commits:
- 9d06223012
- ce8f939980
@@ -32,6 +32,10 @@ MINTEXTSIZE=200
 CHECK_PARAGRAPH_SIZE=150
 TEXT_TRASH_SIZE=200
 TEXT_TRASH_RATIO=0.6
+DISCOVER_LINK_RATIO = 0.3
+SAMPLE_SET_SIZE = 10000
+CLASSIFIER_SET_SIZE = 200
+STOP_PATHS=["xml","rss","login","admin"]
 
 def split_train(res):
     trainset = []
@@ -83,6 +87,11 @@ def is_link_good(link):
     if r is None:
         return None
     llink,lhostname = r
+    paths = set(llink.split("/"))
+    for item in STOP_PATHS:
+        if item in paths:
+            return None
+
     #print(llink,lhostname)
     # hostname rules
     if not lhostname.endswith(DOMAIN):
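
The stop-path check added to is_link_good splits the cleaned URL on "/" and rejects any link whose segments contain one of the STOP_PATHS tokens. A minimal standalone sketch of the same idea (the helper name and example URLs are illustrative, not from the repository):

```python
STOP_PATHS = ["xml","rss","login","admin"]

def has_stop_path(link: str) -> bool:
    # Splitting on "/" yields the scheme, host and individual path segments;
    # the link is rejected if any segment is an exact stop token.
    paths = set(link.split("/"))
    return any(item in paths for item in STOP_PATHS)

print(has_stop_path("https://example.com/rss/feed"))       # True  -> filtered out
print(has_stop_path("https://example.com/clanky/rssnik"))  # False -> only exact segments match
```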
@@ -122,12 +131,13 @@ def fetch_page(link:str)->(str,str):
             LOGGER.error('too large: length %s for URL %s', len(response.data), link)
         if good:
             html = trafilatura.utils.decode_response(response)
             final_link = response.url
             if html is not None:
                 html, final_link = trafilatura.spider.refresh_detection(html, final_link)
                 # is there a meta-refresh on the page?
                 if final_link is None: # malformed or malicious content
                     html = None
+    final_link = courlan.normalize_url(final_link)
     return final_link,html
 
 def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
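
fetch_page now passes the final URL through courlan.normalize_url before returning it, the same call that replaces w3lib.url.canonicalize_url further down in get_bs_links. A quick sketch of the intended effect, assuming courlan is installed and that normalize_url accepts a plain URL string (the example URL is made up):

```python
import courlan

# Normalization is expected to make equivalent spellings of the same page
# compare equal, so redirected and hand-written variants map to one key
# in the link collection.
url = "https://Example.com/path/article.html#comments"
print(courlan.normalize_url(url))
```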
@@ -170,6 +180,7 @@ def index_pages(db,hostname,extracted_pages):
         state = "good"
         link = original_link
         if original_link != final_link:
+            print(original_link,final_link)
             linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
             link = final_link
         if html is None:
@@ -215,7 +226,7 @@ def index_pages(db,hostname,extracted_pages):
             htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
             doc.update(get_link_doc(link,"good"))
             # todo extract links
-            print(doc)
+            print(link,doc)
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
             for chs in doc["paragraph_checksums"]:
@@ -223,11 +234,13 @@ def index_pages(db,hostname,extracted_pages):
                     checkcol.insert_one({"_id":chs})
                 except pymongo.errors.DuplicateKeyError as err:
                     pass
-        linkcol.update_one({"url":link},{"$set":{"status":state}})
+        linkdoc = get_link_doc(link,state)
+        del linkdoc["url"]
+        linkcol.update_one({"url":link},{"$set":linkdoc})
 
 from bs4 import BeautifulSoup
 import urllib.parse
 import w3lib.url
 import os.path
 
 def get_bs_links(link,html):
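
Instead of patching only the status field, index_pages now rebuilds the whole link document with get_link_doc(link,state) and writes every field except the url key it matches on. A minimal sketch of that update, assuming a hypothetical get_link_doc shape and a local MongoDB instance (the real helper is defined elsewhere in this file and may carry different fields):

```python
import datetime
import urllib.parse
import pymongo

def get_link_doc(link: str, status: str = "frontlink") -> dict:
    # Hypothetical stand-in for the project's helper: url, host, status, timestamp.
    host = urllib.parse.urlparse(link).netloc
    return {"url": link, "host": host, "status": status,
            "updated_at": datetime.datetime.utcnow()}

client = pymongo.MongoClient()          # assumes a MongoDB server on localhost
linkcol = client["crawler"]["links"]

link, state = "https://example.com/article", "good"
linkdoc = get_link_doc(link, state)
del linkdoc["url"]                      # never overwrite the key used for matching
linkcol.update_one({"url": link}, {"$set": linkdoc})
```

This way any extra fields carried by the link document are refreshed along with the status rather than the status flag alone.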
@@ -236,7 +249,8 @@ def get_bs_links(link,html):
     base = link
     if bs.base is not None and "href" in bs.base.attrs:
         base = bs.base["href"]
-    base = urllib.parse.urlparse(w3lib.url.canonicalize_url(base))
+    base = urllib.parse.urlparse(courlan.normalize_url(base))
+
     external_links = set()
     internal_links = set()
     # Normalizacia linkov
@@ -245,12 +259,10 @@ def get_bs_links(link,html):
             continue
         href = l["href"]
         try:
-            parsed = urllib.parse.urlparse(w3lib.url.canonicalize_url(href))
+            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
             netloc = parsed.netloc
             path = os.path.normpath(parsed.path)
             scheme = parsed.scheme
-            query = w3lib.url.url_query_cleaner(parsed.query,["id","aid","p","page","pid"])
-            print(parsed)
             if parsed.netloc == "":
                 scheme = base.scheme
             if parsed.path == "/":
@@ -262,12 +274,14 @@ def get_bs_links(link,html):
                 continue
             if path.startswith("/"):
                 path = path[1:]
+            if path.endswith(")"):
+                # javascript
+                continue
             external = True
             if parsed.netloc == base.netloc:
                 external = False
             href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
-            href = w3lib.url.canonicalize_url(href)
-            print(href)
+            href = courlan.normalize_url(href)
             if external:
                 external_links.add(href)
             else:
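
The new path.endswith(")") guard is a cheap way to skip javascript: pseudo-links whose "path" is a function call. A small illustration of what urlparse produces for such an href (the href itself is invented):

```python
import os.path
import urllib.parse

href = "javascript:openGallery('photo-12')"
parsed = urllib.parse.urlparse(href)
path = os.path.normpath(parsed.path)
print(parsed.scheme)                   # javascript
print(path, path.endswith(")"))        # openGallery('photo-12') True -> link is skipped
```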
@@ -275,7 +289,6 @@ def get_bs_links(link,html):
         except ValueError as err:
             print(err)
             pass
-    print(internal_links,external_links)
     return internal_links,external_links
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
@@ -283,6 +296,8 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
         status = default_status
+        if html is None or len(html) < 256:
+            continue
         internal_links, external_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
         for link in external_links:
@@ -516,19 +531,19 @@ def link_summary(db,hostname):
     print(res)
 
 def sample_links(db,hostname,status,batch_size):
-    print("Getting backlinks")
+    print("Sampling links")
     linkcol = db["links"]
     res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
     cl = LinkClassifier()
     crawled_links = list(res)
     crawled_count = len(crawled_links)
     prediction_accuracy = 0
-    if crawled_count > 200:
+    if crawled_count > CLASSIFIER_SET_SIZE:
         # train on crawled links
         trainset,testset = split_train(crawled_links)
         cl.train(trainset)
         prediction_accuracy = cl.test(testset)
-    sample_set_size = 10000
+    sample_set_size = SAMPLE_SET_SIZE
     res = linkcol.find({"host":hostname,"status": status})
     sample_links = []
     predicted_good = 0
@@ -546,7 +561,7 @@ def sample_links(db,hostname,status,batch_size):
             continue
         for feature in features:
             visitcounter[feature] += 1
-    mls = int(min(batch_size/2,len(good_links)))
+    mls = int(min(batch_size*(1- DISCOVER_LINK_RATIO),len(good_links)))
     random.shuffle(good_links)
     links = good_links[0:mls]
     numdiscover = len(discover_links)
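
With the hard-coded batch_size/2 replaced by DISCOVER_LINK_RATIO, a sampling batch is now split roughly 70/30 between classifier-approved links and newly discovered ones (when both pools are large enough). A quick check of the arithmetic with the new constants:

```python
DISCOVER_LINK_RATIO = 0.3
batch_size = 100
good_links = ["https://example.com/a"] * 500                    # plenty of predicted-good links
eval_discover_links = [("https://example.com/new", 0.0)] * 500  # plenty of scored discover links

# up to 70% of the batch comes from the classifier-approved pool ...
mls_good = int(min(batch_size * (1 - DISCOVER_LINK_RATIO), len(good_links)))
# ... and up to 30% from the scored discovery pool (see the next hunk)
mls_discover = int(min(batch_size * DISCOVER_LINK_RATIO, len(eval_discover_links)))
print(mls_good, mls_discover)   # 70 30
```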
@@ -556,11 +571,12 @@ def sample_links(db,hostname,status,batch_size):
         prob = 0
         if features is not None:
             for feature in features:
-                prob += math.log(visitcounter[feature] / numdiscover)
+                c = visitcounter[feature]
+                prob -= math.log(c) / c
         eval_discover_links.append((link,prob))
     eval_discover_links.sort(key=lambda x: x[1],reverse=True)
-    print(eval_discover_links)
-    mls = int(min(batch_size/2,len(discover_links)))
+    #print(eval_discover_links)
+    mls = int(min(batch_size * DISCOVER_LINK_RATIO,len(eval_discover_links)))
     links += [l[0] for l in eval_discover_links[0:mls]]
     return links
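
The discovery score changes from adding log(count/numdiscover) per feature to subtracting log(c)/c, where c is how often a path feature was counted in visitcounter. Because log(c)/c is 0 at c = 1 and shrinks again for large c (its maximum sits at c = e ≈ 2.7), the heaviest penalty now falls on features seen only a handful of times, while near-unique and very common features cost little. A small worked example (feature names and counts are invented):

```python
import math
from collections import Counter

visitcounter = Counter({"category": 50, "2023": 5, "novinky": 1})

def discover_score(features):
    prob = 0
    for feature in features:
        c = visitcounter[feature]
        prob -= math.log(c) / c
    return prob

print(round(discover_score(["novinky"]), 3))           # -0.0   (log(1)/1 = 0)
print(round(discover_score(["2023"]), 3))              # -0.322 (mid-frequency, biggest cost)
print(round(discover_score(["category"]), 3))          # -0.078 (very common, small cost)
print(round(discover_score(["category", "2023"]), 3))  # -0.4
```

Since eval_discover_links is then sorted with reverse=True, links whose features are near-unique or already very common rise to the top of the discovery sample.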