This commit is contained in:
Daniel Hládek 2023-03-12 10:08:21 +01:00
parent 44fbf6b755
commit 39d1057ea1

View File

@@ -172,7 +172,7 @@ def index_pages(db,domain,extracted_pages):
state = "good" state = "good"
link = original_link link = original_link
if original_link != final_link: if original_link != final_link:
linkcol.insert_one(get_link_doc(original_link,"redirect")) linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
link = final_link link = final_link
if html is None: if html is None:
state = "html_error" state = "html_error"
@@ -224,7 +224,10 @@ def index_links(db,extracted_links):
linkcol=db["links"] linkcol=db["links"]
for link,status in extracted_links: for link,status in extracted_links:
doc = get_link_doc(link,status) doc = get_link_doc(link,status)
try:
linkcol.insert_one(doc) linkcol.insert_one(doc)
except pymongo.errors.DuplicateKeyError as ex:
pass
def get_links(db,domain,status,batch_size=BATCHSIZE): def get_links(db,domain,status,batch_size=BATCHSIZE):
@@ -277,18 +280,18 @@ def cli():
pass pass
@cli.command() @cli.command()
def dropdb(): def createdb():
myclient = pymongo.MongoClient(CONNECTION) myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME] db=myclient[DBNAME]
linkcol = db["links"] linkcol = db["links"]
linkcol.create_index({"url":1},{"name":"url"}) linkcol.create_index("url",unique=True)
linkcol.create_index({"host":1,"status":1},{"name":"hostname_status"}) linkcol.create_index("host")
contentcol = db["content"] contentcol = db["content"]
contentcol.create_index({"url":1}) contentcol.create_index("url",unique=True)
contentcol.create_index({"paragraph_checksums":1}) #contentcol.create_index({"paragraph_checksums":1})
contentcol.create_index({"domain":1}) #contentcol.create_index({"domain":1})
htmlcol = db["html"] htmlcol = db["html"]
htmlcol.create_index({"url":1}) htmlcol.create_index("url",unique=True)
@cli.command() @cli.command()
@click.argument("start_link") @click.argument("start_link")