This commit is contained in:
Daniel Hládek 2023-03-12 10:08:21 +01:00
parent 44fbf6b755
commit 39d1057ea1

View File

@ -172,7 +172,7 @@ def index_pages(db,domain,extracted_pages):
state = "good"
link = original_link
if original_link != final_link:
linkcol.insert_one(get_link_doc(original_link,"redirect"))
linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
link = final_link
if html is None:
state = "html_error"
@ -224,7 +224,10 @@ def index_links(db,extracted_links):
linkcol=db["links"]
for link,status in extracted_links:
doc = get_link_doc(link,status)
try:
linkcol.insert_one(doc)
except pymongo.errors.DuplicateKeyError as ex:
pass
def get_links(db,domain,status,batch_size=BATCHSIZE):
@ -277,18 +280,18 @@ def cli():
pass
@cli.command()
def createdb():
    """Create the MongoDB indexes used by the crawler collections."""
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]

    # Index options must be passed as keyword arguments: the second
    # positional parameter of create_index() is the session, not an
    # options document.
    linkcol = db["links"]
    # Unique index so inserting an already-known link raises
    # DuplicateKeyError instead of storing a second copy.
    linkcol.create_index("url", unique=True)
    linkcol.create_index("host")

    contentcol = db["content"]
    contentcol.create_index("url", unique=True)
    # NOTE(review): indexes on "paragraph_checksums" and "domain" are
    # deliberately left disabled here; re-enable only if queries need them.
    #contentcol.create_index({"paragraph_checksums":1})
    #contentcol.create_index({"domain":1})

    htmlcol = db["html"]
    htmlcol.create_index("url", unique=True)
@cli.command()
@click.argument("start_link")