This commit is contained in:
Daniel Hládek 2023-03-14 13:54:40 +01:00
parent 6eca731e42
commit a05a3372af

View File

@@ -59,10 +59,10 @@ def is_robot_good(link,rules):
 def is_link_good(link):
     r = courlan.check_url(link,strict=True,language=LANGUAGE)
     if r is None:
-        print(link)
+        #print(link)
         return None
     llink,ldomain = r
-    print(llink,ldomain)
+    #print(llink,ldomain)
     # domain rules
     if not ldomain.endswith(DOMAIN):
         LOGGER.debug("bad domain")
@@ -187,17 +187,20 @@ def index_pages(db,domain,extracted_pages):
         doc["paragraph_checksums"] = checksums
         doc["paragraph_sizes"] = sizes
         goodsz = sum(sizes)
-        if len(text) < 200 or goodsz/len(text) < 0.3:
+        if len(text) < 200 or goodsz/len(text) < 0.4:
             state = "trash"
         if state == "good":
            htdoc = get_link_doc(link,state)
            htdoc["html"] = html
            htdoc["html_size"] = len(html)
-           htmlcol.insert_one(htdoc)
+           # can be revisited - upsert
+           del htdoc["url"]
+           htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
            doc.update(get_link_doc(link,"good"))
            # todo extract links
            print(doc)
-           contentcol.insert_one(doc)
+           del doc["url"]
+           contentcol.update_one({"url":link},{"$set":doc},upsert=True)
         linkcol.update_one({"url":original_link},{"$set":{"status":state}})
@@ -246,8 +249,8 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
     res = linkcol.find({"status":status,"host":domain},{"url":1},limit=batch_size)
     links = []
     for doc in res:
-        print(">>>>>" + status)
-        print(doc)
+        #print(">>>>>" + status)
+        #print(doc)
         links.append(doc["url"])
     return links