commit 01645b8862
    zz
@@ -66,6 +66,7 @@ def get_bs_links(link,html):
         netloc = parsed.netloc
         path = os.path.normpath(parsed.path)
         scheme = parsed.scheme
+        query = parsed.query
         # internal link
         if parsed.netloc == "":
             scheme = base.scheme
@@ -79,7 +80,7 @@ def get_bs_links(link,html):
         if path.endswith(")"):
             # javascript
             continue
-        href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+        href = urllib.parse.urlunparse((scheme,netloc,path,"",query,""))
         href = courlan.normalize_url(href)
         links.add(href)
     except ValueError as err:
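Review note on the two hunks above: urllib.parse.urlunparse takes a 6-tuple (scheme, netloc, path, params, query, fragment), and the old call passed "" in the query slot, so any ?key=value part was silently stripped from every harvested link. A minimal stdlib-only sketch of the before/after behaviour (the URL is illustrative):

    import urllib.parse

    parsed = urllib.parse.urlparse("https://example.com/page?id=42")
    # old call: query slot left empty, parameters dropped
    print(urllib.parse.urlunparse(
        (parsed.scheme, parsed.netloc, parsed.path, "", "", "")))
    # https://example.com/page
    # new call: query carried through, as in the hunk above
    print(urllib.parse.urlunparse(
        (parsed.scheme, parsed.netloc, parsed.path, "", parsed.query, "")))
    # https://example.com/page?id=42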
@@ -238,7 +239,6 @@ def index_page(db,original_link,final_link,html,doc,filter_content=True):
     state = "good"
     link = original_link
     if original_link != final_link:
-        print(original_link,final_link)
         linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
         link = final_link
     if html is None:
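The surviving update_one call is stock pymongo: when the fetch was redirected, the originally queued URL has its status flipped to "redirect" so the document is indexed only under the final address. A minimal sketch of that bookkeeping, assuming a local MongoDB and a hypothetical test.links collection:

    import pymongo

    linkcol = pymongo.MongoClient("mongodb://localhost:27017")["test"]["links"]
    linkcol.insert_one({"url": "http://example.com/old", "status": "good"})
    # mark the pre-redirect URL so it is not re-fetched or indexed twice
    linkcol.update_one({"url": "http://example.com/old"},
                       {"$set": {"status": "redirect"}})
    print(linkcol.find_one({"url": "http://example.com/old"})["status"])  # redirect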
@@ -277,10 +277,7 @@ def index_page(db,original_link,final_link,html,doc,filter_content=True):
     del doc["url"]
     contentcol.update_one({"url":link},{"$set":doc},upsert=True)
     for chs in doc["paragraph_checksums"]:
-        try:
-            checkcol.insert_one({"_id":chs})
-        except pymongo.errors.DuplicateKeyError as err:
-            pass
+        checkcol.update_one({"_id":chs},{"$inc":{"count":1}},upsert=True)

     linkdoc = get_link_doc(link,state)
     del linkdoc["url"]
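This is the substantive change of the hunk: instead of inserting each paragraph checksum and swallowing DuplicateKeyError, a single $inc with upsert=True creates the checksum document with count 1 on first sight and increments it on every repeat, so duplicate paragraphs are now counted rather than merely detected. A minimal sketch against a hypothetical test.check collection:

    import pymongo

    checkcol = pymongo.MongoClient("mongodb://localhost:27017")["test"]["check"]
    for chs in (111, 222, 111):  # one checksum seen twice
        checkcol.update_one({"_id": chs}, {"$inc": {"count": 1}}, upsert=True)
    print(checkcol.find_one({"_id": 111}))  # {'_id': 111, 'count': 2}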
@@ -308,7 +305,6 @@ def save_batch_info(db,host,states,docs):
         "batch_size": batch_size,
     }
     db["batches"].insert_one(batchdoc)
-    print(batchdoc)


 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
@@ -319,15 +315,11 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
         if html is None or len(html) < 256:
             continue
         page_links = get_bs_links(final_link,html)
-        #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
-        #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
-        #print(extracted_links)
         for link in page_links:
             if not courlan.is_external(link,final_link) and not is_robot_good(link,rules):
                 badrobot += 1
                 continue
             status = str(default_status)
-            #print(link,status)
             links[link] = status
     outlinks = []
     badlink = 0
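With the commented-out courlan.extract_links experiments deleted, link harvesting is fully delegated to the project's own get_bs_links, and courlan is kept for URL hygiene only. A minimal sketch of the courlan calls that remain in use, assuming the library is installed (is_external compares a candidate against the page it was found on; is_robot_good above is the project's own robots.txt helper):

    import courlan

    final_link = "https://example.com/section/page"
    for link in ("https://example.com/other", "https://elsewhere.org/x"):
        link = courlan.normalize_url(link)
        # True for the elsewhere.org URL, False for the same-site one
        print(link, courlan.is_external(link, final_link))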
@@ -453,7 +445,6 @@ class LinkClassifier:
             goodprob += g
             b = math.log(self.badcounter[feature] + self.alpha) - bcc
             badprob += b
-            print(feature,g,b)
         pa = math.exp(goodprob + gp)
         pb = math.exp(badprob + bp)
         return pa - pb #+ random.uniform(-0.001,0.001)
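Only a per-feature debug print is dropped here, but the surrounding names (alpha, per-class counters, summed logs, final exp) suggest LinkClassifier scores URLs as a Laplace-smoothed naive Bayes and returns the difference of the two unnormalised posteriors. A self-contained sketch under that assumption, with made-up counts:

    import math
    from collections import Counter

    alpha = 1.0
    goodcounter = Counter({"blog": 8, "en": 5})  # features of known-good links
    badcounter = Counter({"login": 7, "en": 2})  # features of known-bad links

    def score(features):
        gtotal, btotal = sum(goodcounter.values()), sum(badcounter.values())
        gp = math.log(gtotal) - math.log(gtotal + btotal)  # log prior, good
        bp = math.log(btotal) - math.log(gtotal + btotal)  # log prior, bad
        gcc = math.log(gtotal + alpha * len(goodcounter))  # smoothed normalizer
        bcc = math.log(btotal + alpha * len(badcounter))
        goodprob = sum(math.log(goodcounter[f] + alpha) - gcc for f in features)
        badprob = sum(math.log(badcounter[f] + alpha) - bcc for f in features)
        return math.exp(goodprob + gp) - math.exp(badprob + bp)

    print(score(["blog", "en"]))  # positive: leans good
    print(score(["login"]))       # negative: leans bad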