zz
This commit is contained in:
parent
6eca731e42
commit
a05a3372af
@ -59,10 +59,10 @@ def is_robot_good(link,rules):
|
|||||||
def is_link_good(link):
|
def is_link_good(link):
|
||||||
r = courlan.check_url(link,strict=True,language=LANGUAGE)
|
r = courlan.check_url(link,strict=True,language=LANGUAGE)
|
||||||
if r is None:
|
if r is None:
|
||||||
print(link)
|
#print(link)
|
||||||
return None
|
return None
|
||||||
llink,ldomain = r
|
llink,ldomain = r
|
||||||
print(llink,ldomain)
|
#print(llink,ldomain)
|
||||||
# domain rules
|
# domain rules
|
||||||
if not ldomain.endswith(DOMAIN):
|
if not ldomain.endswith(DOMAIN):
|
||||||
LOGGER.debug("bad domain")
|
LOGGER.debug("bad domain")
|
||||||
@ -187,17 +187,20 @@ def index_pages(db,domain,extracted_pages):
|
|||||||
doc["paragraph_checksums"] = checksums
|
doc["paragraph_checksums"] = checksums
|
||||||
doc["paragraph_sizes"] = sizes
|
doc["paragraph_sizes"] = sizes
|
||||||
goodsz = sum(sizes)
|
goodsz = sum(sizes)
|
||||||
if len(text) < 200 or goodsz/len(text) < 0.3:
|
if len(text) < 200 or goodsz/len(text) < 0.4:
|
||||||
state = "trash"
|
state = "trash"
|
||||||
if state == "good":
|
if state == "good":
|
||||||
htdoc = get_link_doc(link,state)
|
htdoc = get_link_doc(link,state)
|
||||||
htdoc["html"] = html
|
htdoc["html"] = html
|
||||||
htdoc["html_size"] = len(html)
|
htdoc["html_size"] = len(html)
|
||||||
htmlcol.insert_one(htdoc)
|
# can be revisited - upsert
|
||||||
|
del htdoc["url"]
|
||||||
|
htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
|
||||||
doc.update(get_link_doc(link,"good"))
|
doc.update(get_link_doc(link,"good"))
|
||||||
# todo extract links
|
# todo extract links
|
||||||
print(doc)
|
print(doc)
|
||||||
contentcol.insert_one(doc)
|
del doc["url"]
|
||||||
|
contentcol.update_one({"url":link},{"$set":doc},upsert=True)
|
||||||
linkcol.update_one({"url":original_link},{"$set":{"status":state}})
|
linkcol.update_one({"url":original_link},{"$set":{"status":state}})
|
||||||
|
|
||||||
|
|
||||||
@ -246,8 +249,8 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
|
|||||||
res = linkcol.find({"status":status,"host":domain},{"url":1},limit=batch_size)
|
res = linkcol.find({"status":status,"host":domain},{"url":1},limit=batch_size)
|
||||||
links = []
|
links = []
|
||||||
for doc in res:
|
for doc in res:
|
||||||
print(">>>>>" + status)
|
#print(">>>>>" + status)
|
||||||
print(doc)
|
#print(doc)
|
||||||
links.append(doc["url"])
|
links.append(doc["url"])
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user