zz
This commit is contained in:
parent
6eca731e42
commit
a05a3372af
@ -59,10 +59,10 @@ def is_robot_good(link,rules):
|
||||
def is_link_good(link):
|
||||
r = courlan.check_url(link,strict=True,language=LANGUAGE)
|
||||
if r is None:
|
||||
print(link)
|
||||
#print(link)
|
||||
return None
|
||||
llink,ldomain = r
|
||||
print(llink,ldomain)
|
||||
#print(llink,ldomain)
|
||||
# domain rules
|
||||
if not ldomain.endswith(DOMAIN):
|
||||
LOGGER.debug("bad domain")
|
||||
@ -187,17 +187,20 @@ def index_pages(db,domain,extracted_pages):
|
||||
doc["paragraph_checksums"] = checksums
|
||||
doc["paragraph_sizes"] = sizes
|
||||
goodsz = sum(sizes)
|
||||
if len(text) < 200 or goodsz/len(text) < 0.3:
|
||||
if len(text) < 200 or goodsz/len(text) < 0.4:
|
||||
state = "trash"
|
||||
if state == "good":
|
||||
htdoc = get_link_doc(link,state)
|
||||
htdoc["html"] = html
|
||||
htdoc["html_size"] = len(html)
|
||||
htmlcol.insert_one(htdoc)
|
||||
# can be revisited - upsert
|
||||
del htdoc["url"]
|
||||
htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
|
||||
doc.update(get_link_doc(link,"good"))
|
||||
# todo extract links
|
||||
print(doc)
|
||||
contentcol.insert_one(doc)
|
||||
del doc["url"]
|
||||
contentcol.update_one({"url":link},{"$set":doc},upsert=True)
|
||||
linkcol.update_one({"url":original_link},{"$set":{"status":state}})
|
||||
|
||||
|
||||
@ -246,8 +249,8 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
|
||||
res = linkcol.find({"status":status,"host":domain},{"url":1},limit=batch_size)
|
||||
links = []
|
||||
for doc in res:
|
||||
print(">>>>>" + status)
|
||||
print(doc)
|
||||
#print(">>>>>" + status)
|
||||
#print(doc)
|
||||
links.append(doc["url"])
|
||||
return links
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user