fix
This commit is contained in:
parent e287236033
commit fb0c1d6eb7

@@ -283,6 +283,9 @@ class Connection:
        return res

class ParsedDocument:
    """
    One document in the database
    """
    def __init__(self, parser,work_link):
        self.parser = parser
        self.work_link = work_link
@@ -304,6 +307,9 @@ class ParsedDocument:
        self.current_time = datetime.date.today()

    def extract(self,content,bs):
        """
        Parse content and fill the object
        """
        self.content = content
        self.bs = bs

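For context, a hypothetical usage sketch (not part of this commit): it assumes bs is a BeautifulSoup tree built from the same HTML, which is what extract() stores alongside the raw content.

    import bs4

    html = "<html><body><a href='/next'>next</a></body></html>"
    doc = ParsedDocument(parser, "http://example.com/page")    # parser comes from this project
    doc.extract(html, bs4.BeautifulSoup(html, "html.parser"))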
@@ -336,9 +342,15 @@ class ParsedDocument:
            pass

    def get_links(self):
        """
        @return all links
        """
        return self.link_set

    def get_follow_links(self):
        """
        @return good normalized links
        """
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
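The hunk cuts off inside the loop. A hedged sketch of how such a filter typically completes; only is_link_good appears in this diff, the normalize call is an assumption:

    follow_links = set()
    for l in self.link_set:
        if self.parser.is_link_good(l):
            follow_links.add(self.parser.normalize(l))  # hypothetical normalizer
    return follow_links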
@@ -360,6 +372,12 @@ class ParsedDocument:


def get_domains(arg):
    """
    Get domains from the argument or from stdin:
    if arg is "-", read from stdin, else split arg on commas.
    @param arg - dash, or domains separated by commas
    @return list of domains
    """
    domains = []
    if arg == "-":
        for l in sys.stdin:
@@ -373,6 +391,13 @@ def get_domains(arg):
    return domains
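A quick illustration of the contract described in the docstring:

    get_domains("example.com,example.org")  # -> ["example.com", "example.org"]
    get_domains("-")                        # reads one domain per line from sys.stdin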

def parse_and_index(work_link,parser,responses,db):
    """
    Take all responses from the work link, parse them and store in db
    @param work_link - final link from the downloader
    @param parser - parser to use
    @param responses - responses from the downloader
    @param db - database to store the parsed document in
    """
    target_link = work_link
    links = []
    if len(responses) > 0:
@@ -387,6 +412,9 @@ def parse_and_index(work_link,parser,responses,db):
    return target_link,links

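A minimal usage sketch, based only on the signature and the return statement above; responses is whatever the downloader produced for work_link:

    target_link, links = parse_and_index(work_link, parser, responses, db)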
def visit_sitemap(domain,connection,parser,db):
    """
    Get links from the sitemap of the domain
    """
    link = "http://" + domain
    print("Sitemap visit: " + link)
    responses = connection.html_download2(link)
@@ -403,7 +431,9 @@ def visit_sitemap(domain,connection,parser,db):


def visit_links(links,connection,parser,db,is_online):
    # if it is not online, then just check the links
    """
    If the site is not online, then just check the links
    """
    outlinks = []
    junklinks = []
    badrobotlinks = []
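The body of the loop is outside this hunk. A hedged sketch of one plausible way the three buckets declared above get filled; check_robot is a hypothetical helper, only is_link_good is visible in this diff:

    for link in links:
        if not parser.is_link_good(link):
            junklinks.append(link)
        elif not check_robot(link):          # hypothetical robots.txt check
            badrobotlinks.append(link)
        else:
            outlinks.append(link)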
@@ -423,6 +453,14 @@ def visit_links(links,connection,parser,db,is_online):
    db.index_follow_links(parser,outlinks,connection)

def visit_domain(domain,parser,db):
    """
    One visit of the domain

    1. Get links from the frontpage
    2. Visit links and extract new links
    3. Get new links to visit
    4. Repeat the visit for parser.crawl_rounds rounds (sketched below)
    """
    c = Connection()
    p = parser
    # Get links from the frontpage
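A hedged sketch of how steps 1-4 of the docstring could drive the crawl; get_visit_links stands in for whatever step 3 actually calls, and the return value of visit_sitemap is an assumption:

    links = visit_sitemap(domain, c, p, db)     # 1. seed links from the frontpage
    for _ in range(p.crawl_rounds):             # 4. repeat for parser.crawl_rounds
        visit_links(links, c, p, db, True)      # 2. visit links, extract new ones
        links = get_visit_links(domain, db)     # 3. hypothetical: new links to visit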
@@ -442,6 +480,10 @@ def visit_domain(domain,parser,db):
    return True

def process_domains(domains,visit,parser,db,queue):
    """
    Visit all domains in the list.
    If queue is true, queue each domain instead of visiting it immediately.
    """
    print("Websucker Agenda>>")
    random.shuffle(domains)
    for domain in domains:
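A hedged sketch of the per-domain branch the docstring describes; queue.put is an assumption (consistent with the beanstalk-style queue.reserve() later in this diff), and domain[0] mirrors the call visible in the next hunk:

    for domain in domains:
        if queue:
            queue.put(domain[0])                 # hypothetical enqueue of the domain name
        else:
            visit_domain(domain[0], parser, db)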
@@ -460,6 +502,9 @@ def process_domains(domains,visit,parser,db,queue):
        visit_domain(domain[0],parser,db)

def work_domains(parser,db,queue):
    """
    Poll the queue and visit the reserved domains
    """
    while True:
        print("Waiting for a new job:")
        job = queue.reserve()
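The hunk ends at the reserve call. A hedged sketch of how a beanstalk-style worker loop typically finishes; job.body and job.delete() are assumptions based on that API, not shown in this commit:

    while True:
        print("Waiting for a new job:")
        job = queue.reserve()
        visit_domain(job.body, parser, db)   # assumed: the job body holds the domain
        job.delete()                         # acknowledge so the job is not redelivered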