zz
This commit is contained in:
parent
44fbf6b755
commit
39d1057ea1
@ -172,7 +172,7 @@ def index_pages(db,domain,extracted_pages):
|
|||||||
state = "good"
|
state = "good"
|
||||||
link = original_link
|
link = original_link
|
||||||
if original_link != final_link:
|
if original_link != final_link:
|
||||||
linkcol.insert_one(get_link_doc(original_link,"redirect"))
|
linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
|
||||||
link = final_link
|
link = final_link
|
||||||
if html is None:
|
if html is None:
|
||||||
state = "html_error"
|
state = "html_error"
|
||||||
@ -224,7 +224,10 @@ def index_links(db,extracted_links):
|
|||||||
linkcol=db["links"]
|
linkcol=db["links"]
|
||||||
for link,status in extracted_links:
|
for link,status in extracted_links:
|
||||||
doc = get_link_doc(link,status)
|
doc = get_link_doc(link,status)
|
||||||
|
try:
|
||||||
linkcol.insert_one(doc)
|
linkcol.insert_one(doc)
|
||||||
|
except pymongo.errors.DuplicateKeyError as ex:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def get_links(db,domain,status,batch_size=BATCHSIZE):
|
def get_links(db,domain,status,batch_size=BATCHSIZE):
|
||||||
@ -277,18 +280,18 @@ def cli():
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
def dropdb():
|
def createdb():
|
||||||
myclient = pymongo.MongoClient(CONNECTION)
|
myclient = pymongo.MongoClient(CONNECTION)
|
||||||
db=myclient[DBNAME]
|
db=myclient[DBNAME]
|
||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
linkcol.create_index({"url":1},{"name":"url"})
|
linkcol.create_index("url",unique=True)
|
||||||
linkcol.create_index({"host":1,"status":1},{"name":"hostname_status"})
|
linkcol.create_index("host")
|
||||||
contentcol = db["content"]
|
contentcol = db["content"]
|
||||||
contentcol.create_index({"url":1})
|
contentcol.create_index("url",unique=True)
|
||||||
contentcol.create_index({"paragraph_checksums":1})
|
#contentcol.create_index({"paragraph_checksums":1})
|
||||||
contentcol.create_index({"domain":1})
|
#contentcol.create_index({"domain":1})
|
||||||
htmlcol = db["html"]
|
htmlcol = db["html"]
|
||||||
htmlcol.create_index({"url":1})
|
htmlcol.create_index("url",unique=True)
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument("start_link")
|
@click.argument("start_link")
|
||||||
|
Loading…
Reference in New Issue
Block a user