This commit is contained in:
Daniel Hládek 2023-03-14 13:54:40 +01:00
parent 6eca731e42
commit a05a3372af

View File

@@ -59,10 +59,10 @@ def is_robot_good(link,rules):
def is_link_good(link): def is_link_good(link):
r = courlan.check_url(link,strict=True,language=LANGUAGE) r = courlan.check_url(link,strict=True,language=LANGUAGE)
if r is None: if r is None:
print(link) #print(link)
return None return None
llink,ldomain = r llink,ldomain = r
print(llink,ldomain) #print(llink,ldomain)
# domain rules # domain rules
if not ldomain.endswith(DOMAIN): if not ldomain.endswith(DOMAIN):
LOGGER.debug("bad domain") LOGGER.debug("bad domain")
@@ -187,17 +187,20 @@ def index_pages(db,domain,extracted_pages):
doc["paragraph_checksums"] = checksums doc["paragraph_checksums"] = checksums
doc["paragraph_sizes"] = sizes doc["paragraph_sizes"] = sizes
goodsz = sum(sizes) goodsz = sum(sizes)
if len(text) < 200 or goodsz/len(text) < 0.3: if len(text) < 200 or goodsz/len(text) < 0.4:
state = "trash" state = "trash"
if state == "good": if state == "good":
htdoc = get_link_doc(link,state) htdoc = get_link_doc(link,state)
htdoc["html"] = html htdoc["html"] = html
htdoc["html_size"] = len(html) htdoc["html_size"] = len(html)
htmlcol.insert_one(htdoc) # can be revisited - upsert
del htdoc["url"]
htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
doc.update(get_link_doc(link,"good")) doc.update(get_link_doc(link,"good"))
# todo extract links # todo extract links
print(doc) print(doc)
contentcol.insert_one(doc) del doc["url"]
contentcol.update_one({"url":link},{"$set":doc},upsert=True)
linkcol.update_one({"url":original_link},{"$set":{"status":state}}) linkcol.update_one({"url":original_link},{"$set":{"status":state}})
@@ -246,8 +249,8 @@ def get_links(db,domain,status,batch_size=BATCHSIZE):
res = linkcol.find({"status":status,"host":domain},{"url":1},limit=batch_size) res = linkcol.find({"status":status,"host":domain},{"url":1},limit=batch_size)
links = [] links = []
for doc in res: for doc in res:
print(">>>>>" + status) #print(">>>>>" + status)
print(doc) #print(doc)
links.append(doc["url"]) links.append(doc["url"])
return links return links