zz
This commit is contained in:
parent
4d62cd38fd
commit
37a115fb94
14
mongo/cli.py
14
mongo/cli.py
@ -1,5 +1,10 @@
|
|||||||
import click
|
import click
|
||||||
import mongocrawler
|
import mongocrawler
|
||||||
|
import rq
|
||||||
|
import os
|
||||||
|
|
||||||
|
REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
|
||||||
|
QUEUES=os.getenv("QUEUES","high,default,low")
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
def cli():
|
def cli():
|
||||||
@ -37,5 +42,14 @@ def summary():
|
|||||||
def sampledomains():
|
def sampledomains():
|
||||||
mongocrawler.sample_domains()
|
mongocrawler.sample_domains()
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    """Read links from standard input and enqueue one crawl job per link.

    Each non-empty input line is stripped of surrounding whitespace, echoed
    to standard output, and submitted to an RQ queue as a call to
    ``mongocrawler.visit(link)``. The object returned by the queue's
    ``enqueue`` call is printed after each submission.

    The Redis connection is built from the module-level ``REDIS_URL``.
    """
    # Imported here because neither `sys` nor `redis` appears among the
    # imports visible at the top of this file; without them the body fails
    # with NameError on first use.
    import sys

    import redis

    queue = rq.Queue(connection=redis.from_url(REDIS_URL))
    for line in sys.stdin:
        link = line.strip()
        if not link:
            # A blank line carries no link; do not create an empty job.
            continue
        print(link)
        job = queue.enqueue(mongocrawler.visit, link)
        print(job)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
cli()
|
cli()
|
||||||
|
@ -628,7 +628,7 @@ def createdb():
|
|||||||
htmlcol.create_index("html_md5",unique=True)
|
htmlcol.create_index("html_md5",unique=True)
|
||||||
domaincol = db["domains"]
|
domaincol = db["domains"]
|
||||||
domaincol.create_index("host",unique=True)
|
domaincol.create_index("host",unique=True)
|
||||||
domaincol.create_index("average_fetch_characters")
|
domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING))
|
||||||
batchcol = db["batches"]
|
batchcol = db["batches"]
|
||||||
batchcol.create_index("host")
|
batchcol.create_index("host")
|
||||||
batchcol.create_index("created_at")
|
batchcol.create_index("created_at")
|
||||||
@ -711,7 +711,6 @@ def crawl_summary():
|
|||||||
batchcol = db["batches"]
|
batchcol = db["batches"]
|
||||||
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
|
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
|
||||||
print(yesterday)
|
print(yesterday)
|
||||||
res = batchcol.find({"created_at":{"$lt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters")
|
|
||||||
res = batchcol.aggregate([
|
res = batchcol.aggregate([
|
||||||
{"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
|
{"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
|
||||||
{"$group":{"_id":"$host",
|
{"$group":{"_id":"$host",
|
||||||
@ -737,11 +736,20 @@ def sample_domains():
|
|||||||
linkscol = db["links"]
|
linkscol = db["links"]
|
||||||
# discover domains
|
# discover domains
|
||||||
domains = linkscol.distinct("host",filter={"status":"frontlink"})
|
domains = linkscol.distinct("host",filter={"status":"frontlink"})
|
||||||
|
all_domains = []
|
||||||
for domain in domains:
|
for domain in domains:
|
||||||
print(domain)
|
all_domains.append(domain)
|
||||||
|
sample_size = min(int(DISCOVER_LINK_RATIO* BATCHSIZE), len(all_domains))
|
||||||
|
print(">>> Discover domains {}".format(sample_size))
|
||||||
|
sample_domains = random.sample(all_domains,sample_size)
|
||||||
domaincol = db["domains"]
|
domaincol = db["domains"]
|
||||||
# exploit domains
|
# exploit domains
|
||||||
print(">>>> Best domains")
|
res = domaincol.find({"average_fetch_characters":{"$gt":1000}}).sort("average_fetch_characters",-1)
|
||||||
res = domaincol.find({},limit=100).sort("average_fetch_characters",-1)
|
all_domains = []
|
||||||
for item in res:
|
for item in res:
|
||||||
print(item)
|
all_domains.append(item["host"])
|
||||||
|
sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE),len(all_domains))
|
||||||
|
print(">>>> Best domains {}".format(sample_size))
|
||||||
|
sample_domains += random.sample(all_domains,sample_size)
|
||||||
|
for domain in sample_domains:
|
||||||
|
print(domain)
|
||||||
|
Loading…
Reference in New Issue
Block a user