commit a22fa87537 ("zz")

.dockerignore (new file, 2 lines added)

@@ -0,0 +1,2 @@
+venv
+websucker.egg-info

Dockerfile (15 lines changed)

@@ -1,15 +1,16 @@
-FROM python:3.8.0-alpine
+FROM python:3.8
 
-RUN apk add --update --no-cache git curl curl-dev vim py3-lxml gcc make libxml2-dev libxslt-dev libc-dev
+RUN apt-get update && apt-get install -y git curl  libcurl4-openssl-dev build-essential vim libssl-dev python-pip
 
-RUN apk add py2-pip
 RUN pip2 install cqlsh
 
-RUN addgroup -S appgroup -g 1000 && \
-    adduser -u 1000 -S appuser -G appgroup
+RUN addgroup appgroup && \
+    adduser  appuser  && adduser appuser appgroup
 
 RUN mkdir /app /src
-ADD requirements.txt setup.py ./websucker /src/
-
+ADD requirements.txt  /src/
 RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip install -r /src/requirements.txt
+WORKDIR /src
+ADD . /src
+RUN python /src/setup.py install
 WORKDIR /app

@@ -283,6 +283,9 @@ class Connection:
         return res
 
 class ParsedDocument:
+    """
+    One document in the database
+    """
     def __init__(self, parser,work_link):
         self.parser = parser
         self.work_link = work_link

@@ -304,6 +307,9 @@ class ParsedDocument:
         self.current_time = datetime.date.today()
 
     def extract(self,content,bs):
+        """
+        Parse content and fill the object
+        """
         self.content = content
         self.bs = bs
 

@@ -336,9 +342,15 @@ class ParsedDocument:
                 pass
 
     def get_links(self):
+        """
+        @return all links
+        """
         return self.link_set
 
     def get_follow_links(self):
+        """
+        @return good normalized links
+        """
         follow_links = set()
         for l in self.link_set:
             if self.parser.is_link_good(l):
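
With the docstrings in place, the intended lifecycle of a ParsedDocument reads like this. A sketch assembled from this commit's docstrings only; parser, work_link, content and bs come from the surrounding crawler and are not defined here:

    pd = ParsedDocument(parser, work_link)   # "One document in the database"
    pd.extract(content, bs)                  # "Parse content and fill the object"
    all_links = pd.get_links()               # "@return all links"
    good_links = pd.get_follow_links()       # "@return good normalized links"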

@@ -355,24 +367,18 @@ class ParsedDocument:
             if (len(self.body) < 20):
                 r.append(self.body)
             else:
-                r.append(self.body[0:20]) + " ...."
+                r.append(self.body[0:20] + " ....")
         return ">>> ".join(r)
 
 
-def get_domains(arg):
-    domains = []
-    if arg == "-":
-        for l in sys.stdin:
-            domain = l.rstrip()
-            assert(domain is not None)
-            if len(domain) == 0:
-                continue
-            domains.append(domain)
-    else:
-        domains = arg.split(",")
-    return domains
-
 def parse_and_index(work_link,parser,responses,db):
+    """
+    Take all responses from work link, parse and store in db
+    @param work_link - final link from downloader
+    @param parser to use
+    @param responses from the downloader
+    @param db
+    """
     target_link = work_link
     links = []
     if len(responses) > 0:
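
One real bug rides along with the docstring work in the hunk above: list.append returns None, so the old r.append(self.body[0:20]) + " ...." tried to add a string to None and raised a TypeError. The fix concatenates first, then appends. A standalone illustration:

    r = []
    body = "a body long enough to be truncated for display"

    # Old form: r.append(body[0:20]) + " ...."
    # list.append returns None, so None + " ...." raises TypeError.

    r.append(body[0:20] + " ....")   # fixed: build the string first, then append
    print(">>> ".join(r))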

@@ -387,6 +393,9 @@ def parse_and_index(work_link,parser,responses,db):
     return target_link,links
 
 def visit_sitemap(domain,connection,parser,db):
+    """
+    get links from sitemap of the domain
+    """
     link = "http://" + domain
     print("Sitemap visit: " + link)
     responses = connection.html_download2(link)

@@ -402,8 +411,10 @@ def visit_sitemap(domain,connection,parser,db):
     return True
 
 
-def visit_links(links,connection,parser,db,is_online):
-    # is is not online, then just check links
+def visit_links(links,connection,parser,db,is_online=True):
+    """
+    if the site is not online, then just check links
+    """
     outlinks = []
     junklinks = []
     badrobotlinks = []

@@ -423,6 +434,14 @@ def visit_links(links,connection,parser,db,is_online):
         db.index_follow_links(parser,outlinks,connection)
 
 def visit_domain(domain,parser,db):
+    """
+    One visit of the domain
+
+    1.Get links from the frontpage,
+    2. visit links and extract new links
+    3. get new links to visit
+    4. repeat visit for parser.crawl_rounds
+    """
     c = Connection()
     p = parser
     # Get links from frontpage
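
visit_domain's new docstring lays out the crawl as four numbered steps. Read as code, the shape is roughly this. A sketch only: get_frontpage_links and get_new_links are hypothetical stand-ins for the real frontpage and database calls, not functions from this repo:

    def visit_domain_sketch(domain, parser, db):
        connection = Connection()
        links = get_frontpage_links(domain, connection)   # step 1: links from the frontpage (hypothetical helper)
        for _ in range(parser.crawl_rounds):              # step 4: repeat for parser.crawl_rounds
            visit_links(links, connection, parser, db)    # step 2: visit links, extract and store new ones
            links = get_new_links(domain, db)             # step 3: pick new links to visit (hypothetical helper)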

@@ -442,6 +461,10 @@ def visit_domain(domain,parser,db):
     return True
 
 def process_domains(domains,visit,parser,db,queue):
+    """
+    Visit all domains in list.
+    if queue is true, then queue domain instead immediate visit
+    """
     print("Websucker Agenda>>")
     random.shuffle(domains)
     for domain in domains:

@@ -460,6 +483,9 @@ def process_domains(domains,visit,parser,db,queue):
             visit_domain(domain[0],parser,db)
 
 def work_domains(parser,db,queue):
+    """
+    Poll the queue and visit
+    """
     while True:
         print("Waiting for a new job:")
         job = queue.reserve()
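
work_domains is now documented as a queue poller. Filled out with greenstalk's job lifecycle, one complete cycle might look like this. A sketch: the bury-on-failure branch is an assumption, not part of this diff, and job.body is assumed to carry the queued domain name:

    while True:
        print("Waiting for a new job:")
        job = queue.reserve()                   # blocks until beanstalkd hands over a job
        try:
            visit_domain(job.body, parser, db)  # assumption: job body is a domain name
            queue.delete(job)                   # acknowledge the finished job
        except Exception:
            queue.bury(job)                     # assumption: park failed jobs for inspection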

@@ -15,7 +15,7 @@ def create_database_from_context(ctx):
     return Data(ctx.obj["cassandra_keyspace"],ctx.obj["cassandra_host"],ctx.obj["cassandra_port"])
 
 def create_queue_from_context(ctx):
-    return greenstalk.Client(ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"],use=ctx.obj["beanstalkd_tube"],watch=ctx.obj["beanstalkd_tube"],encoding="utf8")
+    return greenstalk.Client((ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]),use=ctx.obj["beanstalkd_tube"],watch=ctx.obj["beanstalkd_tube"],encoding="utf8")
 
 
 @click.group()
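
This one-character-looking change is a real API fix: greenstalk.Client takes the server address as a single (host, port) tuple, not as two positional arguments, so the old call passed the port where the encoding-era positional parameters were expected. A standalone usage sketch; host, port and tube name are illustrative, and a reachable beanstalkd server is assumed:

    import greenstalk

    # Address goes in as one (host, port) tuple; use/watch select the tube.
    queue = greenstalk.Client(("127.0.0.1", 11300), use="websucker", watch="websucker", encoding="utf8")
    queue.put("example.com")   # enqueue a domain
    job = queue.reserve()      # take it back off the queue
    print(job.body)            # example.com
    queue.delete(job)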

@@ -52,7 +52,7 @@ def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,beanstalkd_tube,bea
     ctx.obj["queue"] = queue
 
 
-@cli.command(help="All domains")
+@cli.command(help="Get visited domains from db")
 @click.pass_context
 @click.argument("count",type=int,default=20)
 def all(ctx,count):

@@ -63,28 +63,7 @@ def all(ctx,count):
         q = create_queue_from_context(ctx)
     process_domains(res,ctx.obj["visit"],ctx.obj["parser"],db,q)
 
-@cli.command(help="Work queue")
-@click.pass_context
-def work(ctx):
-    db = create_database_from_context(ctx)
-    q = create_queue_from_context(ctx)
-    work_domains(ctx.obj["parser"],db,q)
-
-
-@cli.command(help="find best domains")
-@click.pass_context
-@click.argument("count",type=int,default=20)
-#@click.option("visit",is_flag=True)
-def best(ctx, count):
-    db = create_database_from_context(ctx)
-    p = ctx.obj["parser"]
-    domains = db.get_best_domains(count,p)
-    q = None
-    if ctx.obj["queue"]:
-        q = create_queue_from_context(ctx)
-    process_domains(domains,ctx.obj["visit"],p ,db,q)
-
-@cli.command(help="select random domains")
+@cli.command(help="Get random domains")
 @click.pass_context
 @click.argument("count",type=int,default=20)
 #@click.option("visit",is_flag=True)

@@ -97,7 +76,29 @@ def blind(ctx, count):
         q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p ,db,q)
 
-@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
+@cli.command(help="Visit domains from queue")
+@click.pass_context
+def work(ctx):
+    db = create_database_from_context(ctx)
+    q = create_queue_from_context(ctx)
+    work_domains(ctx.obj["parser"],db,q)
+
+
+@cli.command(help="Get best domains from db")
+@click.pass_context
+@click.argument("count",type=int,default=20)
+#@click.option("visit",is_flag=True)
+def best(ctx, count):
+    db = create_database_from_context(ctx)
+    p = ctx.obj["parser"]
+    domains = db.get_best_domains(count,p)
+    q = None
+    if ctx.obj["queue"]:
+        q = create_queue_from_context(ctx)
+    process_domains(domains,ctx.obj["visit"],p ,db,q)
+
+
+@cli.command(help="Get unvisited domains")
 @click.pass_context
 @click.argument("count",type=int,default=20)
 def unvisited(ctx, count):

@@ -124,7 +125,7 @@ def file(ctx, name):
         q = create_queue_from_context(ctx)
     process_domains(domains,ctx.obj["visit"],p,db,q)
 
-@cli.command(help="Visit url and get links. Start here")
+@cli.command(help="Visit one url and get links. Start here")
 @click.pass_context
 @click.argument("link")
 def start(ctx, link):

@@ -132,7 +133,7 @@ def start(ctx, link):
     p = ctx.obj["parser"]
     c = Connection()
     visit_links([link],c,p,db)
-    db.check_domain(domain)
+    #db.check_domain(domain)
 
 @cli.command(help="Continue crawling of seen links from a domain")
 @click.pass_context