Commit 37a115fb94 (parent 4d62cd38fd)
Author: Daniel Hládek
Date: 2023-04-13 09:10:03 +02:00

2 changed files with 31 additions and 6 deletions


@@ -1,5 +1,12 @@
 import click
 import mongocrawler
+import rq
+import redis
+import sys
+import os
+
+REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/")
+QUEUES = os.getenv("QUEUES", "high,default,low")
 
 @click.group()
 def cli():
@@ -37,5 +44,15 @@ def summary():
 def sampledomains():
     mongocrawler.sample_domains()
+
+@cli.command(help="Enqueue a list of links into the Redis queue for crawling")
+def enqueue():
+    # Read links from standard input and enqueue one crawl job per link.
+    q = rq.Queue(connection=redis.from_url(REDIS_URL))
+    for link in sys.stdin:
+        link = link.strip()
+        print(link)
+        job = q.enqueue(mongocrawler.visit, link)
+        print(job)
 
 if __name__ == "__main__":
     cli()
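
The new enqueue command only fills the queue; nothing in this commit consumes it. A minimal worker sketch, assuming the standard rq Worker API and reusing REDIS_URL and QUEUES as defined above (QUEUES is introduced by this commit but not yet read anywhere):

    # worker.py - a sketch, not part of this commit
    import os
    import redis
    import rq

    REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/")
    QUEUES = os.getenv("QUEUES", "high,default,low").split(",")

    # Listen on the queues in priority order and execute
    # mongocrawler.visit jobs as they arrive.
    rq.Worker(QUEUES, connection=redis.from_url(REDIS_URL)).work()

The same worker can also be started from the shell with rq's own CLI: rq worker -u "$REDIS_URL" high default low.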


@@ -628,7 +628,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index("average_fetch_characters")
+    domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)])
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
@@ -711,7 +711,6 @@ def crawl_summary():
     batchcol = db["batches"]
     yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
     print(yesterday)
-    res = batchcol.find({"created_at":{"$lt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters")
     res = batchcol.aggregate([
         {"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
         {"$group":{"_id":"$host",
@@ -737,11 +736,20 @@ def sample_domains():
     linkscol = db["links"]
     # discover domains
     domains = linkscol.distinct("host",filter={"status":"frontlink"})
+    all_domains = []
     for domain in domains:
-        print(domain)
+        all_domains.append(domain)
+    sample_size = min(int(DISCOVER_LINK_RATIO * BATCHSIZE), len(all_domains))
+    print(">>> Discover domains {}".format(sample_size))
+    sample_domains = random.sample(all_domains,sample_size)
     domaincol = db["domains"]
     # exploit domains
-    print(">>>> Best domains")
-    res = domaincol.find({},limit=100).sort("average_fetch_characters",-1)
+    res = domaincol.find({"average_fetch_characters":{"$gt":1000}}).sort("average_fetch_characters",-1)
+    all_domains = []
     for item in res:
-        print(item)
+        all_domains.append(item["host"])
+    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE),len(all_domains))
+    print(">>>> Best domains {}".format(sample_size))
+    sample_domains += random.sample(all_domains,sample_size)
+    for domain in sample_domains:
+        print(domain)
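
DISCOVER_LINK_RATIO and BATCHSIZE are module-level constants defined outside this hunk. A worked example with assumed values shows how a batch splits between discovering new frontlink hosts and exploiting the best-yielding ones; the min() calls guard against random.sample raising ValueError when fewer domains exist than the quota asks for:

    # Assumed values for illustration; the real constants live elsewhere
    # in mongocrawler and may differ.
    DISCOVER_LINK_RATIO = 0.3
    BATCHSIZE = 100

    discover_quota = int(DISCOVER_LINK_RATIO * BATCHSIZE)        # 30 new hosts
    exploit_quota = int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE)   # 70 proven hosts
    print(discover_quota, exploit_quota)  # -> 30 70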