commit 1546a63b75
parent 4a42078bef

    zz
@@ -33,10 +33,11 @@ DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
 MAXFILESIZE=10000000
 MINTEXTSIZE=200
-CHECK_PARAGRAPH_SIZE=150
+CHECK_PARAGRAPH_SIZE=200
 TEXT_TRASH_SIZE=200
 TEXT_TRASH_RATIO=0.6
 DISCOVER_LINK_RATIO = 0.3
+DISCOVER_DOMAIN_RATIO = 0.5
 SAMPLE_SET_SIZE =10000
 CLASSIFIER_SET_SIZE = 200
 STOP_PATHS=["xml","rss","login","admin"]
@@ -61,6 +62,7 @@ def get_bs_links(link,html):
             netloc = parsed.netloc
             path = os.path.normpath(parsed.path)
             scheme = parsed.scheme
+            query = parsed.query
             # internal link
             if parsed.netloc == "":
                 scheme = base.scheme
@@ -74,7 +76,7 @@ def get_bs_links(link,html):
             if path.endswith(")"):
                 # javascript
                 continue
-            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+            href = urllib.parse.urlunparse((scheme,netloc,path,"",query,""))
             href = courlan.normalize_url(href)
             links.add(href)
         except ValueError as err:
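
Note: the only functional change in this hunk is that the query component of a
parsed link is no longer thrown away when the URL is rebuilt. A minimal sketch of
the difference, using only the standard library (the example URL is made up):

    from urllib.parse import urlparse, urlunparse

    parsed = urlparse("https://example.com/articles?page=2")

    # old behaviour: query dropped, so pagination links collapse into one URL
    print(urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", "")))
    # -> https://example.com/articles

    # new behaviour: query kept, so distinct links stay distinct
    print(urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", parsed.query, "")))
    # -> https://example.com/articles?page=2
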
@@ -232,7 +234,6 @@ def index_page(db,original_link,final_link,html,doc):
     state = "good"
     link = original_link
     if original_link != final_link:
-        print(original_link,final_link)
         linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
         link = final_link
     if html is None:
@@ -250,7 +251,6 @@ def index_page(db,original_link,final_link,html,doc):
     origsz = 0
     for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
         # index paragraph checksums
-        print(checkcol)
         nd = checkcol.find_one({"_id":chs})
         if nd is None:
             origsz += paragraph_size
@@ -258,7 +258,6 @@ def index_page(db,original_link,final_link,html,doc):
 
     if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
         state = "copy"
-    print(origsz)
     if state == "good":
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
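
Note: the condition above marks a page as "copy" when the share of paragraph text
that was already seen in the index exceeds TEXT_TRASH_RATIO. A worked example with
assumed sizes (the numbers are illustrative, not taken from the crawler):

    TEXT_TRASH_RATIO = 0.6   # value from the config hunk above

    tsz = 1000     # assumed: total paragraph text size of the page
    origsz = 300   # assumed: size of paragraphs not seen before

    duplicate_share = 1 - (origsz / tsz)   # 0.7
    state = "copy" if duplicate_share > TEXT_TRASH_RATIO else "good"
    print(duplicate_share, state)          # 0.7 copy
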
@@ -273,10 +272,7 @@ def index_page(db,original_link,final_link,html,doc):
         del doc["url"]
         contentcol.update_one({"url":link},{"$set":doc},upsert=True)
         for chs in doc["paragraph_checksums"]:
-            try:
-                checkcol.insert_one({"_id":chs})
-            except pymongo.errors.DuplicateKeyError as err:
-                pass
+            checkcol.update_one({"_id":chs},{"$inc":{"count":1}},upsert=True)
 
     linkdoc = get_link_doc(link,state)
     del linkdoc["url"]
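
Note: the try/insert_one/DuplicateKeyError pattern is replaced by a single upsert
with $inc, which creates the checksum document on first sight and increments its
counter on every later sight. A hedged sketch of that pattern; the connection URI
and collection name are placeholders, not taken from the project:

    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017")  # placeholder URI
    checkcol = client["crawler"]["checksums"]                  # placeholder collection name

    def count_paragraph(chs):
        # first call inserts {"_id": chs, "count": 1}; later calls just increment
        checkcol.update_one({"_id": chs}, {"$inc": {"count": 1}}, upsert=True)
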
@@ -304,7 +300,6 @@ def save_batch_info(db,host,states,docs):
         "batch_size": batch_size,
     }
     db["batches"].insert_one(batchdoc)
-    print(batchdoc)
 
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
@@ -315,15 +310,11 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
         if html is None or len(html) < 256:
             continue
         page_links = get_bs_links(final_link,html)
-        #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
-        #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
-        #print(extracted_links)
         for link in page_links:
             if not courlan.is_external(link,final_link) and not is_robot_good(link,rules):
                 badrobot += 1
                 continue
             status = str(default_status)
-            #print(link,status)
             links[link] = status
     outlinks = []
     badlink = 0
@@ -449,7 +440,6 @@ class LinkClassifier:
             goodprob += g
             b = math.log(self.badcounter[feature] + self.alpha) - bcc
             badprob += b
-            print(feature,g,b)
         pa = math.exp(goodprob + gp)
         pb = math.exp(badprob + bp)
         return pa - pb #+ random.uniform(-0.001,0.001)
@@ -730,7 +720,7 @@ def crawl_summary():
         {"$sort":{"original_text_size":-1}},
     ])
     print(">>>> Batches")
-    headers = ["_id","document_count","good_document_count","batch_count","text_size","original_text_size"]
+    headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
     print("\t".join(headers))
     for item in res:
         values = [str(item[x]) for x in headers]
@@ -761,7 +751,7 @@ def sample_domains():
     all_domains = []
     for domain in domains:
         all_domains.append(domain)
-    sample_size = min(int(DISCOVER_LINK_RATIO* BATCHSIZE), len(all_domains))
+    sample_size = min(int(DISCOVER_DOMAIN_RATIO* BATCHSIZE), len(all_domains))
     print(">>> Discover domains {}".format(sample_size))
     sample_domains = random.sample(all_domains,sample_size)
     domaincol = db["domains"]
@@ -770,7 +760,7 @@ def sample_domains():
     all_domains = []
     for item in res:
         all_domains.append(item["host"])
-    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE),len(all_domains))
+    sample_size = min(int((1 - DISCOVER_DOMAIN_RATIO) * BATCHSIZE),len(all_domains))
     print(">>>> Best domains {}".format(sample_size))
     sample_domains += random.sample(all_domains,sample_size)
     for domain in sample_domains:
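
Note: with the new DISCOVER_DOMAIN_RATIO constant, sample_domains splits a batch
between newly discovered domains and the best-performing known domains, instead of
reusing DISCOVER_LINK_RATIO for both. Illustrative arithmetic only; BATCHSIZE and
the domain lists below are assumptions, not values from the crawler:

    import random

    BATCHSIZE = 100               # assumed batch size
    DISCOVER_DOMAIN_RATIO = 0.5   # value from the config hunk above

    discovered = ["a.example", "b.example", "c.example"]   # assumed new domains
    best = ["d.example", "e.example", "f.example"]         # assumed best domains

    discover_size = min(int(DISCOVER_DOMAIN_RATIO * BATCHSIZE), len(discovered))   # 3
    best_size = min(int((1 - DISCOVER_DOMAIN_RATIO) * BATCHSIZE), len(best))       # 3

    sample = random.sample(discovered, discover_size) + random.sample(best, best_size)
    print(len(sample))   # 6
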