zz
This commit is contained in:
parent
44fbf6b755
commit
39d1057ea1
@ -172,7 +172,7 @@ def index_pages(db,domain,extracted_pages):
|
||||
state = "good"
|
||||
link = original_link
|
||||
if original_link != final_link:
|
||||
linkcol.insert_one(get_link_doc(original_link,"redirect"))
|
||||
linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
|
||||
link = final_link
|
||||
if html is None:
|
||||
state = "html_error"
|
||||
@ -224,7 +224,10 @@ def index_links(db,extracted_links):
|
||||
linkcol=db["links"]
|
||||
for link,status in extracted_links:
|
||||
doc = get_link_doc(link,status)
|
||||
try:
|
||||
linkcol.insert_one(doc)
|
||||
except pymongo.errors.DuplicateKeyError as ex:
|
||||
pass
|
||||
|
||||
|
||||
def get_links(db,domain,status,batch_size=BATCHSIZE):
|
||||
@ -277,18 +280,18 @@ def cli():
|
||||
pass
|
||||
|
||||
@cli.command()
|
||||
def dropdb():
|
||||
def createdb():
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
linkcol = db["links"]
|
||||
linkcol.create_index({"url":1},{"name":"url"})
|
||||
linkcol.create_index({"host":1,"status":1},{"name":"hostname_status"})
|
||||
linkcol.create_index("url",unique=True)
|
||||
linkcol.create_index("host")
|
||||
contentcol = db["content"]
|
||||
contentcol.create_index({"url":1})
|
||||
contentcol.create_index({"paragraph_checksums":1})
|
||||
contentcol.create_index({"domain":1})
|
||||
contentcol.create_index("url",unique=True)
|
||||
#contentcol.create_index({"paragraph_checksums":1})
|
||||
#contentcol.create_index({"domain":1})
|
||||
htmlcol = db["html"]
|
||||
htmlcol.create_index({"url":1})
|
||||
htmlcol.create_index("url",unique=True)
|
||||
|
||||
@cli.command()
|
||||
@click.argument("start_link")
|
||||
|
Loading…
Reference in New Issue
Block a user