diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py index 4466e0f..a7ca97d 100644 --- a/mongo/mongocwarler.py +++ b/mongo/mongocwarler.py @@ -172,7 +172,7 @@ def index_pages(db,domain,extracted_pages): state = "good" link = original_link if original_link != final_link: - linkcol.insert_one(get_link_doc(original_link,"redirect")) + linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}}) link = final_link if html is None: state = "html_error" @@ -224,7 +224,10 @@ def index_links(db,extracted_links): linkcol=db["links"] for link,status in extracted_links: doc = get_link_doc(link,status) - linkcol.insert_one(doc) + try: + linkcol.insert_one(doc) + except pymongo.errors.DuplicateKeyError as ex: + pass def get_links(db,domain,status,batch_size=BATCHSIZE): @@ -277,18 +280,18 @@ def cli(): pass @cli.command() -def dropdb(): +def createdb(): myclient = pymongo.MongoClient(CONNECTION) db=myclient[DBNAME] linkcol = db["links"] - linkcol.create_index({"url":1},{"name":"url"}) - linkcol.create_index({"host":1,"status":1},{"name":"hostname_status"}) + linkcol.create_index("url",unique=True) + linkcol.create_index("host") contentcol = db["content"] - contentcol.create_index({"url":1}) - contentcol.create_index({"paragraph_checksums":1}) - contentcol.create_index({"domain":1}) + contentcol.create_index("url",unique=True) + #contentcol.create_index({"paragraph_checksums":1}) + #contentcol.create_index({"domain":1}) htmlcol = db["html"] - htmlcol.create_index({"url":1}) + htmlcol.create_index("url",unique=True) @cli.command() @click.argument("start_link")