zz
This commit is contained in:
parent
4d62cd38fd
commit
37a115fb94
14
mongo/cli.py
14
mongo/cli.py
@ -1,5 +1,10 @@
|
||||
import os
import sys

import click
import redis
import rq

import mongocrawler
|
||||
|
||||
REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
|
||||
QUEUES=os.getenv("QUEUES","high,default,low")
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
@ -37,5 +42,14 @@ def summary():
|
||||
def sampledomains():
|
||||
mongocrawler.sample_domains()
|
||||
|
||||
|
||||
@cli.command(help="Enqueue a list of links into redis queue for crawling")
|
||||
def enqueue():
|
||||
q = rq.Queue(connection=redis.from_url(REDIS_URL))
|
||||
for l in sys.stdin:
|
||||
print(l.strip())
|
||||
r = q.enqueue(mongocrawler.visit, l.strip())
|
||||
print(r)
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
|
@ -628,7 +628,7 @@ def createdb():
|
||||
htmlcol.create_index("html_md5",unique=True)
|
||||
domaincol = db["domains"]
|
||||
domaincol.create_index("host",unique=True)
|
||||
domaincol.create_index("average_fetch_characters")
|
||||
domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING))
|
||||
batchcol = db["batches"]
|
||||
batchcol.create_index("host")
|
||||
batchcol.create_index("created_at")
|
||||
@ -711,7 +711,6 @@ def crawl_summary():
|
||||
batchcol = db["batches"]
|
||||
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
|
||||
print(yesterday)
|
||||
res = batchcol.find({"created_at":{"$lt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters")
|
||||
res = batchcol.aggregate([
|
||||
{"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
|
||||
{"$group":{"_id":"$host",
|
||||
@ -737,11 +736,20 @@ def sample_domains():
|
||||
linkscol = db["links"]
|
||||
# discover domains
|
||||
domains = linkscol.distinct("host",filter={"status":"frontlink"})
|
||||
all_domains = []
|
||||
for domain in domains:
|
||||
print(domain)
|
||||
all_domains.append(domain)
|
||||
sample_size = min(int(DISCOVER_LINK_RATIO* BATCHSIZE), len(all_domains))
|
||||
print(">>> Discover domains {}".format(sample_size))
|
||||
sample_domains = random.sample(all_domains,sample_size)
|
||||
domaincol = db["domains"]
|
||||
# exploit domains
|
||||
print(">>>> Best domains")
|
||||
res = domaincol.find({},limit=100).sort("average_fetch_characters",-1)
|
||||
res = domaincol.find({"average_fetch_characters":{"$gt":1000}}).sort("average_fetch_characters",-1)
|
||||
all_domains = []
|
||||
for item in res:
|
||||
print(item)
|
||||
all_domains.append(item["host"])
|
||||
sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE),len(all_domains))
|
||||
print(">>>> Best domains {}".format(sample_size))
|
||||
sample_domains += random.sample(all_domains,sample_size)
|
||||
for domain in sample_domains:
|
||||
print(domain)
|
||||
|
Loading…
Reference in New Issue
Block a user