Daniel Hládek 2021-01-20 12:06:03 +01:00
parent e287236033
commit fb0c1d6eb7

@@ -283,6 +283,9 @@ class Connection:
        return res

class ParsedDocument:
    """
    One document in the database.
    """
    def __init__(self, parser,work_link):
        self.parser = parser
        self.work_link = work_link
@@ -304,6 +307,9 @@ class ParsedDocument:
        self.current_time = datetime.date.today()

    def extract(self,content,bs):
        """
        Parse content and fill the object
        """
        self.content = content
        self.bs = bs
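For context, a minimal usage sketch of the documented flow; `parser` is an already constructed websucker parser, and the snippet assumes `extract` populates the link set from the parsed page (both assumptions, not shown in this hunk):

```python
import bs4

html = "<html><body><a href='http://example.com/next'>next</a></body></html>"
doc = ParsedDocument(parser, "http://example.com/")        # parser: existing websucker parser (assumed in scope)
doc.extract(html, bs4.BeautifulSoup(html, "html.parser"))  # parse the content and fill the object
print(doc.get_links())                                     # all links found in the document
```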
@@ -336,9 +342,15 @@ class ParsedDocument:
            pass

    def get_links(self):
        """
        @return all links
        """
        return self.link_set

    def get_follow_links(self):
        """
        @return good normalized links
        """
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
@@ -360,6 +372,12 @@ class ParsedDocument:
def get_domains(arg):
    """
    Get domains from the argument or from stdin.
    If arg is "-", read domains from stdin (one per line), otherwise split arg on commas.
    @param arg a dash, or domains separated by commas
    @return list of domains
    """
    domains = []
    if arg == "-":
        for l in sys.stdin:
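The docstring covers both input modes; a quick usage sketch with made-up domains:

```python
# a comma-separated argument becomes a list of domain strings
domains = get_domains("example.com,example.org")
print(domains)   # expected: ['example.com', 'example.org'] given the comma splitting described above

# with "-" the domains are read from sys.stdin instead, one per line
```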
@@ -373,6 +391,13 @@ def get_domains(arg):
    return domains

def parse_and_index(work_link,parser,responses,db):
    """
    Parse all responses for the work link and store the result in the database.
    @param work_link - final link from the downloader
    @param parser - parser to use
    @param responses - responses from the downloader
    @param db - database to store the result in
    """
    target_link = work_link
    links = []
    if len(responses) > 0:
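A sketch of how the function is typically driven; `parser` and `db` are assumed to be in scope, and only `Connection.html_download2` and `parse_and_index` come from this file:

```python
connection = Connection()
work_link = "http://example.com/index.html"                 # made-up example URL
responses = connection.html_download2(work_link)            # downloader from this module
target_link, links = parse_and_index(work_link, parser, responses, db)
# target_link is the link the content was indexed under, links are new links to crawl
```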
@@ -387,6 +412,9 @@ def parse_and_index(work_link,parser,responses,db):
    return target_link,links

def visit_sitemap(domain,connection,parser,db):
    """
    Get links from the sitemap of the domain.
    """
    link = "http://" + domain
    print("Sitemap visit: " + link)
    responses = connection.html_download2(link)
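For reference, a generic, standard-library-only sketch of pulling URLs out of a /sitemap.xml; this illustrates the general technique, not this module's downloader:

```python
import urllib.request
import xml.etree.ElementTree as ET

def sitemap_urls(domain):
    # fetch http://<domain>/sitemap.xml and return the <loc> entries
    with urllib.request.urlopen("http://" + domain + "/sitemap.xml", timeout=10) as resp:
        root = ET.fromstring(resp.read())
    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    return [loc.text for loc in root.iter(ns + "loc")]
```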
@@ -403,7 +431,9 @@ def visit_sitemap(domain,connection,parser,db):
def visit_links(links,connection,parser,db,is_online):
    """
    If the site is not online, then just check the links.
    """
    outlinks = []
    junklinks = []
    badrobotlinks = []
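A sketch of the link triage suggested by the docstring and the surrounding lists; `is_robot_good` is a hypothetical name for a robots.txt check, and the real function body is not fully shown here:

```python
def triage_links(links, connection, parser, is_online):
    # split candidate links into crawlable links, junk, and links blocked by robots.txt
    outlinks, junklinks, badrobotlinks = [], [], []
    for link in links:
        if not parser.is_link_good(link):                       # is_link_good is used above in get_follow_links
            junklinks.append(link)
        elif is_online and not connection.is_robot_good(link):  # hypothetical robots.txt check
            badrobotlinks.append(link)
        else:
            outlinks.append(link)
    return outlinks, junklinks, badrobotlinks
```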
@@ -423,6 +453,14 @@ def visit_links(links,connection,parser,db,is_online):
    db.index_follow_links(parser,outlinks,connection)

def visit_domain(domain,parser,db):
    """
    One visit of the domain:
    1. Get links from the front page.
    2. Visit the links and extract new links.
    3. Get new links to visit.
    4. Repeat the visit for parser.crawl_rounds rounds.
    """
    c = Connection()
    p = parser
    # Get links from frontpage
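The four numbered steps map onto a loop roughly like this sketch; `get_visit_links` is a hypothetical stand-in for however the database hands back links to visit:

```python
def crawl_domain(domain, parser, db):
    # sketch of the four steps listed in the docstring above
    connection = Connection()
    visit_sitemap(domain, connection, parser, db)           # 1. seed links from the front page / sitemap
    for _ in range(parser.crawl_rounds):                    # 4. repeat for parser.crawl_rounds
        links = db.get_visit_links(domain, parser)          # 3. hypothetical accessor for new links to visit
        visit_links(links, connection, parser, db, True)    # 2. visit links, extract and index new ones
    return True
```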
@@ -442,6 +480,10 @@ def visit_domain(domain,parser,db):
    return True

def process_domains(domains,visit,parser,db,queue):
    """
    Visit all domains in the list.
    If queue is set, queue each domain instead of visiting it immediately.
    """
    print("Websucker Agenda>>")
    random.shuffle(domains)
    for domain in domains:
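The queue-or-visit decision can be pictured as below; `queue.put` is an assumption modeled on a beanstalk-style work queue, not taken from this diff:

```python
import random

def dispatch_domains(domains, visit, parser, db, queue):
    # either queue each domain for a separate worker, or visit it right away
    random.shuffle(domains)
    for domain in domains:
        if queue is not None:
            queue.put(domain[0])                 # assumed queue API, paired with work_domains below
        elif visit:
            visit_domain(domain[0], parser, db)
```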
@@ -460,6 +502,9 @@ def process_domains(domains,visit,parser,db,queue):
            visit_domain(domain[0],parser,db)

def work_domains(parser,db,queue):
    """
    Poll the queue and visit queued domains.
    """
    while True:
        print("Waiting for a new job:")
        job = queue.reserve()
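And the worker side of that queue; only `queue.reserve()` appears in the hunk above, while `job.body` and `queue.delete` follow a beanstalk-style client and are assumptions here:

```python
def run_worker(parser, db, queue):
    # block on the queue, visit the reserved domain, then acknowledge the job
    while True:
        print("Waiting for a new job:")
        job = queue.reserve()                    # blocks until a domain name is queued
        domain = job.body                        # assumed job layout (beanstalk-style client)
        try:
            visit_domain(domain, parser, db)
        finally:
            queue.delete(job)                    # assumed acknowledgement call
```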