zz
parent 8d4a873005
commit 3687403184
@@ -399,9 +399,15 @@ def visit_sitemap(domain,connection,parser,db):

def visit_links(links,connection,parser,db):
    outlinks = []
    junklinks = []
    badrobotlinks = []
    for work_link in links:
        responses = []
        if parser.is_link_good(work_link) and connection.is_robot_good(work_link):
        if not parser.is_link_good(work_link):
            db.update_link_status(parser,work_link,"junk")
        elif connection.is_robot_good(work_link):
            db.update_link_status(parser,work_link,"bad_robot")
        else:
            responses = connection.html_download2(work_link)
            target_link,links = parse_and_index(work_link,parser,responses,db)
            nl = normalize_link(target_link)
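The hunk above replaces the single combined check with explicit junk / bad_robot / download branches. A self-contained sketch of that flow, using hypothetical stand-ins for the project's parser, connection and db objects (only the branching mirrors the committed code):

    # Hypothetical stand-ins; real objects come from the websucker package.
    class FakeParser:
        def is_link_good(self, link):
            return link.startswith("http")

    class FakeConnection:
        def is_robot_good(self, link):
            return "/private/" in link

    class FakeDb:
        def update_link_status(self, parser, link, status):
            print(link, "->", status)

    def classify_link(link, parser, connection, db):
        if not parser.is_link_good(link):
            db.update_link_status(parser, link, "junk")        # filtered out by the parser
        elif connection.is_robot_good(link):
            db.update_link_status(parser, link, "bad_robot")   # robots branch, as committed
        else:
            return "download"                                  # eligible for html_download2
        return "skipped"

    print(classify_link("https://example.com/private/x", FakeParser(), FakeConnection(), FakeDb()))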
@@ -428,12 +434,14 @@ def visit_domain(domain,parser,db):
def process_domains(domains,visit,parser,db,queue):
    print("Websucker Agenda>>")
    for domain in domains:
        assert len(domain[0]) > 1
        print(domain)
    if queue is not None:
        print("Queuing:")
        for domain in domains:
            print(domain)
            queue.put(domain[0])
        queue.close()
    if visit:
        print("Visiting:")
        for domain in domains:
@@ -138,7 +138,7 @@ def report(ctx):
        ready = stats["current-jobs-ready"]
        print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
        print("{} ready jobs, {} burried jobs".format(ready,buried))
    except Error as err:
    except Exception as err:
        print(err)

@cli.command(help="Database summary")
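The stats read here ("current-jobs-ready" and the buried count) are fields of beanstalkd's stats-tube response. A rough sketch with the greenstalk client, which is an assumption, as is the tube name; the project's actual beanstalk library and connection handling may differ:

    import greenstalk

    try:
        client = greenstalk.Client(("127.0.0.1", 11300), watch="websucker")
        stats = client.stats_tube("websucker")       # dict of tube statistics
        print(stats["current-jobs-ready"], "ready,", stats["current-jobs-buried"], "buried")
    except Exception as err:                         # broad catch, as in the commit
        print(err)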
@@ -19,6 +19,7 @@ class Data:
    Database of text documents
    """
    def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
        print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port))
        # execution profile
        ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
        profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
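The execution profile above raises the driver's request timeout to 240 s. A sketch of how such a profile is typically attached to the cluster with the DataStax cassandra-driver; the connect call is assumed from the keyspace and host defaults shown in __init__, and it needs a reachable Cassandra node to actually run:

    from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT

    ep = ExecutionProfile(request_timeout=240.0)            # allow long-running statements
    cluster = Cluster(["127.0.0.1"], port=9042,
                      execution_profiles={EXEC_PROFILE_DEFAULT: ep})
    session = cluster.connect("websucker")                  # keyspace default from __init__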
@@ -29,9 +30,9 @@ class Data:


        self.index_response_link_update = self.session.prepare("""
        self.update_links = self.session.prepare("""
        UPDATE links SET
            link_status ='redirect',
            link_status = ?,
            redirect_target = ?,
            update_time = toTimestamp(now())
        WHERE
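The change turns the hard-coded 'redirect' status into a bound parameter, so one prepared statement serves every status. A sketch of preparing and executing it with the driver session from the sketch above; the WHERE columns are cut off in this hunk, so the key columns below are placeholders:

    # Placeholder key columns; the real WHERE clause is not visible in this hunk.
    update_links = session.prepare("""
        UPDATE links SET
            link_status = ?,
            redirect_target = ?,
            update_time = toTimestamp(now())
        WHERE domain_name = ? AND url_path = ? AND url_query = ?
    """)
    session.execute(update_links, ("redirect", "http://example.com/new", "example.com", "/old", ""))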
@@ -141,14 +142,8 @@ INSERT INTO content(
        pl = normalize_link(source_link)
        for response in responses:
            tl = response.get_canonical()
            r = (
                tl,
                pl[1],
                pl[2],
                pl[3],
            )
            if pl != tl:
                res = self.session.execute(self.index_response_link_update,r)
                self.update_link_status(source_link,"redirect",tl)
            d = (
                pl[1],
                source_link,
@@ -227,6 +222,17 @@ INSERT INTO content(

        print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))

    def update_link_status(self,parser,links,status,redirect_target=None):
        pl = normalize_link(source_link)
        r = (
            status,
            redirect_target,
            pl[1],
            pl[2],
            pl[3],
        )
        res = self.session.execute(self.update_links ,r)

    def index_follow_links(self,parser,links,connection):
        # Index seen links
        follow_links = set()
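The new update_link_status generalises the old redirect-only update: status and redirect_target are bound at execution time, and the remaining values come from normalize_link, whose elements 1-3 are used as the row key throughout the class. Hypothetical call sites matching the new signature (db is assumed to be a connected Data instance; work_link and target_link are URLs):

    # Hypothetical usage, mirroring the branches introduced in this commit.
    db.update_link_status(parser, work_link, "junk")                    # rejected by the parser
    db.update_link_status(parser, work_link, "bad_robot")               # robots branch
    db.update_link_status(parser, work_link, "redirect", target_link)   # canonical URL differs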
@@ -477,7 +483,7 @@ INSERT INTO content(
            gain_ratio = row[3]
            afg = row[4]
            if seen_count and not fetched_count and parser.is_domain_good(domain):
                domains.append(domain)
                domains.append((domain,0))
        ss = min(len(domains),count)
        return random.sample(domains,ss)

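The appended (domain, 0) tuples are then sampled; min() caps the sample size so random.sample never raises when fewer candidate domains exist than requested. A tiny illustration with made-up domains:

    import random

    domains = [("example.com", 0), ("example.org", 0), ("example.net", 0)]
    count = 5
    ss = min(len(domains), count)        # never ask for more than we have
    print(random.sample(domains, ss))    # e.g. all three tuples, in random order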