commit a22fa87537
.dockerignore (new file)
@@ -0,0 +1,2 @@
+venv
+websucker.egg-info
Dockerfile
@@ -1,15 +1,16 @@
-FROM python:3.8.0-alpine
+FROM python:3.8

-RUN apk add --update --no-cache git curl curl-dev vim py3-lxml gcc make libxml2-dev libxslt-dev libc-dev
+RUN apt-get update && apt-get install -y git curl libcurl4-openssl-dev build-essential vim libssl-dev python-pip

-
-RUN apk add py2-pip
 RUN pip2 install cqlsh

-RUN addgroup -S appgroup -g 1000 && \
-    adduser -u 1000 -S appuser -G appgroup
+RUN addgroup appgroup && \
+    adduser appuser && adduser appuser appgroup

 RUN mkdir /app /src
-ADD requirements.txt setup.py ./websucker /src/
+ADD requirements.txt /src/

 RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip install -r /src/requirements.txt
+WORKDIR /src
+ADD . /src
+RUN python /src/setup.py install
 WORKDIR /app
@@ -283,6 +283,9 @@ class Connection:
         return res

 class ParsedDocument:
+    """
+    One document in the database
+    """
     def __init__(self, parser,work_link):
         self.parser = parser
         self.work_link = work_link
@@ -304,6 +307,9 @@ class ParsedDocument:
         self.current_time = datetime.date.today()

     def extract(self,content,bs):
+        """
+        Parse content and fill the object
+        """
         self.content = content
         self.bs = bs

@@ -336,9 +342,15 @@ class ParsedDocument:
             pass

     def get_links(self):
+        """
+        @return all links
+        """
         return self.link_set

     def get_follow_links(self):
+        """
+        @return good normalized links
+        """
         follow_links = set()
         for l in self.link_set:
             if self.parser.is_link_good(l):
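The docstrings added above sketch the ParsedDocument lifecycle: construct it, call extract(), then read the links back. A minimal usage sketch (illustrative only; parser, work_link, content and bs are assumed to come from the downloader):

    doc = ParsedDocument(parser, work_link)
    doc.extract(content, bs)                 # parse content and fill the object
    all_links = doc.get_links()              # every link found in the document
    good_links = doc.get_follow_links()      # only good, normalized links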
@@ -355,24 +367,18 @@ class ParsedDocument:
         if (len(self.body) < 20):
             r.append(self.body)
         else:
-            r.append(self.body[0:20]) + " ...."
+            r.append(self.body[0:20] + " ....")
         return ">>> ".join(r)

-
-def get_domains(arg):
-    domains = []
-    if arg == "-":
-        for l in sys.stdin:
-            domain = l.rstrip()
-            assert(domain is not None)
-            if len(domain) == 0:
-                continue
-            domains.append(domain)
-    else:
-        domains = arg.split(",")
-    return domains

 def parse_and_index(work_link,parser,responses,db):
+    """
+    Take all responses from work link, parse and store in db
+    @param work_link - final link from downloader
+    @param parser to use
+    @param responses from the downloader
+    @param db
+    """
     target_link = work_link
     links = []
     if len(responses) > 0:
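The one functional change in this hunk is the body preview: list.append() returns None, so the old r.append(self.body[0:20]) + " ...." raised a TypeError and the marker was lost. Moving the concatenation inside the append call fixes it, as this small sketch shows:

    body = "Lorem ipsum dolor sit amet, consectetur adipiscing elit"
    r = []
    r.append(body[0:20] + " ....")   # truncate first, then append the marker
    print(">>> ".join(r))            # -> Lorem ipsum dolor si ....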
@@ -387,6 +393,9 @@ def parse_and_index(work_link,parser,responses,db):
     return target_link,links

 def visit_sitemap(domain,connection,parser,db):
+    """
+    get links from sitemap of the domain
+    """
     link = "http://" + domain
     print("Sitemap visit: " + link)
     responses = connection.html_download2(link)
@@ -402,8 +411,10 @@ def visit_sitemap(domain,connection,parser,db):
     return True


-def visit_links(links,connection,parser,db,is_online):
-    # is is not online, then just check links
+def visit_links(links,connection,parser,db,is_online=True):
+    """
+    if the site is not online, then just check links
+    """
     outlinks = []
     junklinks = []
     badrobotlinks = []
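With is_online defaulting to True, crawl-only callers (such as the start command further down, which calls visit_links([link],c,p,db)) can omit the flag; passing is_online=False keeps the check-links-only behaviour. Illustrative calls under that assumption:

    visit_links(links, connection, parser, db)                   # normal online crawl
    visit_links(links, connection, parser, db, is_online=False)  # only check the links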
@@ -423,6 +434,14 @@ def visit_links(links,connection,parser,db,is_online):
     db.index_follow_links(parser,outlinks,connection)

 def visit_domain(domain,parser,db):
+    """
+    One visit of the domain
+
+    1.Get links from the frontpage,
+    2. visit links and extract new links
+    3. get new links to visit
+    4. repeat visit for parser.crawl_rounds
+    """
     c = Connection()
     p = parser
     # Get links from frontpage
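The new docstring spells out the crawl loop. A hedged sketch of that shape, not the actual body of visit_domain; collect_new_links is a hypothetical helper standing in for step 3:

    def visit_domain_sketch(domain, parser, db):
        connection = Connection()
        visit_sitemap(domain, connection, parser, db)    # 1. links from the frontpage
        for _ in range(parser.crawl_rounds):             # 4. repeat for parser.crawl_rounds
            links = collect_new_links(domain, db)        # 3. hypothetical helper: new links to visit
            visit_links(links, connection, parser, db)   # 2. visit links and extract new links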
@@ -442,6 +461,10 @@ def visit_domain(domain,parser,db):
     return True

 def process_domains(domains,visit,parser,db,queue):
+    """
+    Visit all domains in list.
+    if queue is true, then queue domain instead immediate visit
+    """
     print("Websucker Agenda>>")
     random.shuffle(domains)
     for domain in domains:
@@ -460,6 +483,9 @@ def process_domains(domains,visit,parser,db,queue):
             visit_domain(domain[0],parser,db)

 def work_domains(parser,db,queue):
+    """
+    Poll the queue and visit
+    """
     while True:
         print("Waiting for a new job:")
         job = queue.reserve()
@@ -15,7 +15,7 @@ def create_database_from_context(ctx):
     return Data(ctx.obj["cassandra_keyspace"],ctx.obj["cassandra_host"],ctx.obj["cassandra_port"])

 def create_queue_from_context(ctx):
-    return greenstalk.Client(ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"],use=ctx.obj["beanstalkd_tube"],watch=ctx.obj["beanstalkd_tube"],encoding="utf8")
+    return greenstalk.Client((ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]),use=ctx.obj["beanstalkd_tube"],watch=ctx.obj["beanstalkd_tube"],encoding="utf8")


 @click.group()
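The only change here is the greenstalk call: Client expects the beanstalkd address as a single (host, port) tuple, not as two positional arguments. A small sketch of the corrected construction, assuming a local beanstalkd and a tube named "websucker":

    import greenstalk

    queue = greenstalk.Client(("127.0.0.1", 11300), use="websucker", watch="websucker", encoding="utf8")
    queue.put("example.com")    # producer side: queue a domain
    job = queue.reserve()       # worker side: block until a job arrives
    queue.delete(job)           # acknowledge once the domain has been visited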
@@ -52,7 +52,7 @@ def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,bea
     ctx.obj["queue"] = queue


-@cli.command(help="All domains")
+@cli.command(help="Get visited domains from db")
 @click.pass_context
 @click.argument("count",type=int,default=20)
 def all(ctx,count):
@@ -63,28 +63,7 @@ def all(ctx,count):
     q = create_queue_from_context(ctx)
     process_domains(res,ctx.obj["visit"],ctx.obj["parser"],db,q)

-@cli.command(help="Work queue")
-@click.pass_context
-def work(ctx):
-    db = create_database_from_context(ctx)
-    q = create_queue_from_context(ctx)
-    work_domains(ctx.obj["parser"],db,q)
-
-
-@cli.command(help="find best domains")
-@click.pass_context
-@click.argument("count",type=int,default=20)
-#@click.option("visit",is_flag=True)
-def best(ctx, count):
-    db = create_database_from_context(ctx)
-    p = ctx.obj["parser"]
-    domains = db.get_best_domains(count,p)
-    q = None
-    if ctx.obj["queue"]:
-        q = create_queue_from_context(ctx)
-    process_domains(domains,ctx.obj["visit"],p ,db,q)
-
-@cli.command(help="select random domains")
+@cli.command(help="Get random domains")
 @click.pass_context
 @click.argument("count",type=int,default=20)
 #@click.option("visit",is_flag=True)
@@ -97,7 +76,29 @@ def blind(ctx, count):
     q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p ,db,q)

-@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
+@cli.command(help="Visit domains from queue")
+@click.pass_context
+def work(ctx):
+    db = create_database_from_context(ctx)
+    q = create_queue_from_context(ctx)
+    work_domains(ctx.obj["parser"],db,q)
+
+
+@cli.command(help="Get best domains from db")
+@click.pass_context
+@click.argument("count",type=int,default=20)
+#@click.option("visit",is_flag=True)
+def best(ctx, count):
+    db = create_database_from_context(ctx)
+    p = ctx.obj["parser"]
+    domains = db.get_best_domains(count,p)
+    q = None
+    if ctx.obj["queue"]:
+        q = create_queue_from_context(ctx)
+    process_domains(domains,ctx.obj["visit"],p ,db,q)
+
+
+@cli.command(help="Get unvisited domains")
 @click.pass_context
 @click.argument("count",type=int,default=20)
 def unvisited(ctx, count):
@@ -124,7 +125,7 @@ def file(ctx, name):
     q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p,db,q)

-@cli.command(help="Visit url and get links. Start here")
+@cli.command(help="Visit one url and get links. Start here")
 @click.pass_context
 @click.argument("link")
 def start(ctx, link):
@@ -132,7 +133,7 @@ def start(ctx, link):
     p = ctx.obj["parser"]
     c = Connection()
     visit_links([link],c,p,db)
-    db.check_domain(domain)
+    #db.check_domain(domain)

 @cli.command(help="Continue crawling of seen links from a domain")
 @click.pass_context
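For a quick check of the renamed commands, click's test runner can drive the CLI in-process; a sketch assuming the group object is exported as cli from websucker.cli:

    from click.testing import CliRunner
    from websucker.cli import cli   # assumed import path

    runner = CliRunner()
    result = runner.invoke(cli, ["best", "10"])   # "Get best domains from db"
    print(result.output)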