From 37a115fb94d30959408266025a0299ad04217f8c Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 13 Apr 2023 09:10:03 +0200
Subject: [PATCH] Add enqueue command and refine domain sampling

Add an enqueue CLI command that reads links from stdin and puts them
into a redis queue for crawling. Index domains by descending
average_fetch_characters, drop a dead query in crawl_summary, and make
sample_domains mix a random sample of newly discovered domains with the
best-performing ones, split according to DISCOVER_LINK_RATIO and
BATCHSIZE.
---
 mongo/cli.py          | 17 +++++++++++++++++
 mongo/mongocrawler.py | 20 ++++++++++++++------
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/mongo/cli.py b/mongo/cli.py
index 1e7dcfe..093dcc9 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -1,5 +1,12 @@
 import click
 import mongocrawler
+import rq
+import redis
+import os
+import sys
+
+REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/")
+QUEUES = os.getenv("QUEUES", "high,default,low")
 
 @click.group()
 def cli():
@@ -37,5 +44,15 @@ def summary():
 def sampledomains():
     mongocrawler.sample_domains()
 
+
+@cli.command(help="Enqueue a list of links into redis queue for crawling")
+def enqueue():
+    # read links from stdin, one URL per line, and enqueue each as a crawl job
+    q = rq.Queue(connection=redis.from_url(REDIS_URL))
+    for l in sys.stdin:
+        print(l.strip())
+        r = q.enqueue(mongocrawler.visit, l.strip())
+        print(r)
+
 if __name__ == "__main__":
     cli()
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index a2fc796..7179f82 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -628,7 +628,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index("average_fetch_characters")
+    domaincol.create_index([("average_fetch_characters", pymongo.DESCENDING)])
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
@@ -711,7 +711,6 @@ def crawl_summary():
     batchcol = db["batches"]
     yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
     print(yesterday)
-    res = batchcol.find({"created_at":{"$lt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters")
     res = batchcol.aggregate([
         {"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
         {"$group":{"_id":"$host",
@@ -737,11 +736,20 @@ def sample_domains():
     linkscol = db["links"]
     # discover domains
     domains = linkscol.distinct("host",filter={"status":"frontlink"})
+    all_domains = []
     for domain in domains:
-        print(domain)
+        all_domains.append(domain)
+    sample_size = min(int(DISCOVER_LINK_RATIO * BATCHSIZE), len(all_domains))
+    print(">>> Discover domains {}".format(sample_size))
+    sample_domains = random.sample(all_domains, sample_size)
     domaincol = db["domains"]
     # exploit domains
-    print(">>>> Best domains")
-    res = domaincol.find({},limit=100).sort("average_fetch_characters",-1)
+    res = domaincol.find({"average_fetch_characters":{"$gt":1000}}).sort("average_fetch_characters",-1)
+    all_domains = []
     for item in res:
-        print(item)
+        all_domains.append(item["host"])
+    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE), len(all_domains))
+    print(">>>> Best domains {}".format(sample_size))
+    sample_domains += random.sample(all_domains, sample_size)
+    for domain in sample_domains:
+        print(domain)
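
Usage note (not part of the commit): a minimal sketch of how the new
enqueue path could be exercised, assuming a redis server is reachable at
the default REDIS_URL from the patch and an `rq worker` process consumes
the default queue; the URL https://example.com is only a placeholder.

    import redis
    import rq
    import mongocrawler

    REDIS_URL = "redis://localhost:6379/"

    # push one crawl job the same way `cli.py enqueue` does
    q = rq.Queue(connection=redis.from_url(REDIS_URL))
    job = q.enqueue(mongocrawler.visit, "https://example.com")
    print(job.get_status())  # stays "queued" until a worker picks it up

This is equivalent to what `echo https://example.com | python cli.py enqueue`
would produce after the patch is applied.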