commit a22fa87537
Author: Daniel Hládek
Date:   2021-10-15 09:22:14 +02:00

4 changed files with 80 additions and 50 deletions

.dockerignore (new file)

@@ -0,0 +1,2 @@
+venv
+websucker.egg-info


@@ -1,15 +1,16 @@
-FROM python:3.8.0-alpine
+FROM python:3.8
-RUN apk add --update --no-cache git curl curl-dev vim py3-lxml gcc make libxml2-dev libxslt-dev libc-dev
-RUN apk add py2-pip
+RUN apt-get update && apt-get install -y git curl libcurl4-openssl-dev build-essential vim libssl-dev python-pip
 RUN pip2 install cqlsh
-RUN addgroup -S appgroup -g 1000 && \
-    adduser -u 1000 -S appuser -G appgroup
+RUN addgroup appgroup && \
+    adduser appuser && adduser appuser appgroup
 RUN mkdir /app /src
-ADD requirements.txt setup.py ./websucker /src/
+ADD requirements.txt /src/
 RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip install -r /src/requirements.txt
+WORKDIR /src
+ADD . /src
+RUN python /src/setup.py install
 WORKDIR /app
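Note on the image change: the build moves from python:3.8.0-alpine to the Debian-based python:3.8 image, installs the build dependencies with apt-get, and now copies the whole source tree (ADD . /src) and installs it via setup.py. The .dockerignore added above keeps venv and websucker.egg-info out of that build context, so local virtualenvs and packaging artifacts do not end up in the image.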


@@ -283,6 +283,9 @@ class Connection:
         return res
 class ParsedDocument:
+    """
+    One document in the database
+    """
     def __init__(self, parser,work_link):
         self.parser = parser
         self.work_link = work_link
@@ -304,6 +307,9 @@ class ParsedDocument:
         self.current_time = datetime.date.today()
     def extract(self,content,bs):
+        """
+        Parse content and fill the object
+        """
         self.content = content
         self.bs = bs
@@ -336,9 +342,15 @@
             pass
     def get_links(self):
+        """
+        @return all links
+        """
         return self.link_set
     def get_follow_links(self):
+        """
+        @return good normalized links
+        """
         follow_links = set()
         for l in self.link_set:
             if self.parser.is_link_good(l):
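A hedged usage sketch of the documented ParsedDocument API (ParsedDocument, extract, get_links and get_follow_links appear in this diff; the BeautifulSoup parsing, the parser object and the example URL are assumptions):

# Sketch only: assumes bs4/lxml are installed and that `parser` is the
# websucker parser object provided by the caller.
import bs4

work_link = "http://example.org/index.html"    # made-up link
content = "<html><body><a href='/next.html'>next</a></body></html>"

doc = ParsedDocument(parser, work_link)
doc.extract(content, bs4.BeautifulSoup(content, "lxml"))
print(doc.get_links())          # all links found in the page
print(doc.get_follow_links())   # only links the parser considers good, normalized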
@@ -355,24 +367,18 @@
         if (len(self.body) < 20):
             r.append(self.body)
         else:
-            r.append(self.body[0:20]) + " ...."
+            r.append(self.body[0:20] + " ....")
         return ">>> ".join(r)
-def get_domains(arg):
-    domains = []
-    if arg == "-":
-        for l in sys.stdin:
-            domain = l.rstrip()
-            assert(domain is not None)
-            if len(domain) == 0:
-                continue
-            domains.append(domain)
-    else:
-        domains = arg.split(",")
-    return domains
 def parse_and_index(work_link,parser,responses,db):
+    """
+    Take all responses from work link, parse and store in db
+    @param work_link - final link from downloader
+    @param parser to use
+    @param responses from the downloader
+    @param db
+    """
     target_link = work_link
     links = []
     if len(responses) > 0:
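A hedged sketch of how the documented parse_and_index parameters fit together (Connection.html_download2 and the function names are from this diff; the URL and the surrounding parser/db objects are assumptions):

# Sketch only: `parser` and `db` are assumed to come from the caller,
# as they do in visit_links() below.
connection = Connection()
work_link = "http://example.org/"               # made-up link
responses = connection.html_download2(work_link)
target_link, links = parse_and_index(work_link, parser, responses, db)
print(target_link, len(links))                  # final link plus outgoing links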
@@ -387,6 +393,9 @@ def parse_and_index(work_link,parser,responses,db):
     return target_link,links
 def visit_sitemap(domain,connection,parser,db):
+    """
+    get links from sitemap of the domain
+    """
     link = "http://" + domain
     print("Sitemap visit: " + link)
     responses = connection.html_download2(link)
@@ -402,8 +411,10 @@ def visit_sitemap(domain,connection,parser,db):
     return True
-def visit_links(links,connection,parser,db,is_online):
-    # is is not online, then just check links
+def visit_links(links,connection,parser,db,is_online=True):
+    """
+    if the site is not online, then just check links
+    """
     outlinks = []
     junklinks = []
     badrobotlinks = []
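Because is_online now defaults to True, existing callers keep the old downloading behaviour, and passing False only re-checks the links; this is also why the start command in cli.py below can keep calling visit_links([link],c,p,db) without an extra argument. A minimal sketch:

# Sketch only: same arguments as in this diff.
visit_links(links, connection, parser, db)                    # downloads and indexes
visit_links(links, connection, parser, db, is_online=False)   # only (re)checks the links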
@@ -423,6 +434,14 @@
     db.index_follow_links(parser,outlinks,connection)
 def visit_domain(domain,parser,db):
+    """
+    One visit of the domain:
+    1. Get links from the frontpage,
+    2. visit links and extract new links,
+    3. get new links to visit,
+    4. repeat visit for parser.crawl_rounds
+    """
     c = Connection()
     p = parser
     # Get links from frontpage
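The new docstring lists the crawl procedure; a rough, hedged sketch of that shape (visit_sitemap, visit_links, parser.crawl_rounds and db.check_domain appear elsewhere in this diff, while get_visit_links is a hypothetical helper used only for illustration):

# Rough shape of one domain visit, not the actual implementation.
def visit_domain_sketch(domain, parser, db):
    c = Connection()
    visit_sitemap(domain, c, parser, db)        # 1. links from the frontpage / sitemap
    for _ in range(parser.crawl_rounds):        # 4. repeat for parser.crawl_rounds
        links = db.get_visit_links(domain)      # hypothetical: links queued for this domain
        visit_links(links, c, parser, db)       # 2. + 3. visit links, index new outlinks
    db.check_domain(domain)
    return True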
@@ -442,6 +461,10 @@ def visit_domain(domain,parser,db):
     return True
 def process_domains(domains,visit,parser,db,queue):
+    """
+    Visit all domains in the list.
+    If queue is true, then queue the domain instead of visiting it immediately.
+    """
     print("Websucker Agenda>>")
     random.shuffle(domains)
     for domain in domains:
@@ -460,6 +483,9 @@ def process_domains(domains,visit,parser,db,queue):
         visit_domain(domain[0],parser,db)
 def work_domains(parser,db,queue):
+    """
+    Poll the queue and visit
+    """
     while True:
         print("Waiting for a new job:")
         job = queue.reserve()
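process_domains (queueing) and work_domains (polling) form a small producer/consumer pair around beanstalkd; a hedged sketch of that round trip (greenstalk's put/reserve/delete are real calls, the tube name and address are assumptions):

# Sketch only: the producer side is what process_domains does when queue is set,
# the consumer side is the loop inside work_domains.
import greenstalk

queue = greenstalk.Client(("127.0.0.1", 11300), use="websucker", watch="websucker", encoding="utf8")
queue.put("example.org")               # enqueue a domain instead of visiting it immediately

job = queue.reserve()                  # work_domains blocks here waiting for a job
visit_domain(job.body, parser, db)     # then visits the domain
queue.delete(job)                      # and acknowledges the job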


@@ -15,7 +15,7 @@ def create_database_from_context(ctx):
     return Data(ctx.obj["cassandra_keyspace"],ctx.obj["cassandra_host"],ctx.obj["cassandra_port"])
 def create_queue_from_context(ctx):
-    return greenstalk.Client(ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"],use=ctx.obj["beanstalkd_tube"],watch=ctx.obj["beanstalkd_tube"],encoding="utf8")
+    return greenstalk.Client((ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]),use=ctx.obj["beanstalkd_tube"],watch=ctx.obj["beanstalkd_tube"],encoding="utf8")
 @click.group()
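For context on this fix: current greenstalk releases take the beanstalkd address as a single (host, port) tuple in the first parameter, so the old call, which passed host and port as two separate positional arguments, shifted the port into the encoding slot and could not produce a working client.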
@@ -52,7 +52,7 @@ def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,bea
     ctx.obj["queue"] = queue
-@cli.command(help="All domains")
+@cli.command(help="Get visited domains from db")
 @click.pass_context
 @click.argument("count",type=int,default=20)
 def all(ctx,count):
@@ -63,28 +63,7 @@ def all(ctx,count):
     q = create_queue_from_context(ctx)
     process_domains(res,ctx.obj["visit"],ctx.obj["parser"],db,q)
-@cli.command(help="Work queue")
-@click.pass_context
-def work(ctx):
-    db = create_database_from_context(ctx)
-    q = create_queue_from_context(ctx)
-    work_domains(ctx.obj["parser"],db,q)
-@cli.command(help="find best domains")
-@click.pass_context
-@click.argument("count",type=int,default=20)
-#@click.option("visit",is_flag=True)
-def best(ctx, count):
-    db = create_database_from_context(ctx)
-    p = ctx.obj["parser"]
-    domains = db.get_best_domains(count,p)
-    q = None
-    if ctx.obj["queue"]:
-        q = create_queue_from_context(ctx)
-    process_domains(domains,ctx.obj["visit"],p ,db,q)
-@cli.command(help="select random domains")
+@cli.command(help="Get random domains")
 @click.pass_context
 @click.argument("count",type=int,default=20)
 #@click.option("visit",is_flag=True)
@@ -97,7 +76,29 @@ def blind(ctx, count):
     q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p ,db,q)
-@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
+@cli.command(help="Visit domains from queue")
+@click.pass_context
+def work(ctx):
+    db = create_database_from_context(ctx)
+    q = create_queue_from_context(ctx)
+    work_domains(ctx.obj["parser"],db,q)
+@cli.command(help="Get best domains from db")
+@click.pass_context
+@click.argument("count",type=int,default=20)
+#@click.option("visit",is_flag=True)
+def best(ctx, count):
+    db = create_database_from_context(ctx)
+    p = ctx.obj["parser"]
+    domains = db.get_best_domains(count,p)
+    q = None
+    if ctx.obj["queue"]:
+        q = create_queue_from_context(ctx)
+    process_domains(domains,ctx.obj["visit"],p ,db,q)
+@cli.command(help="Get unvisited domains")
 @click.pass_context
 @click.argument("count",type=int,default=20)
 def unvisited(ctx, count):
@@ -124,7 +125,7 @@ def file(ctx, name):
     q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p,db,q)
-@cli.command(help="Visit url and get links. Start here")
+@cli.command(help="Visit one url and get links. Start here")
 @click.pass_context
 @click.argument("link")
 def start(ctx, link):
@@ -132,7 +133,7 @@ def start(ctx, link):
     p = ctx.obj["parser"]
     c = Connection()
     visit_links([link],c,p,db)
-    db.check_domain(domain)
+    #db.check_domain(domain)
 @cli.command(help="Continue crawling of seen links from a domain")
 @click.pass_context
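Taken together, the renamed commands now read as a small workflow: start seeds one URL, all/best/blind/unvisited pull domains from the database, and work consumes the queue. A hedged way to exercise them from Python with click's test runner (the websucker.cli import path and the example URL are assumptions):

# Sketch only: assumes the click group above is importable as `cli`.
from click.testing import CliRunner
from websucker.cli import cli        # hypothetical import path

runner = CliRunner()
print(runner.invoke(cli, ["start", "http://example.org"]).output)   # "Visit one url and get links. Start here"
print(runner.invoke(cli, ["best", "10"]).output)                    # "Get best domains from db"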