zz
This commit is contained in:
parent 8d4a873005
commit 3687403184
@@ -399,9 +399,15 @@ def visit_sitemap(domain,connection,parser,db):
 
 def visit_links(links,connection,parser,db):
     outlinks = []
+    junklinks = []
+    badrobotlinks = []
     for work_link in links:
         responses = []
-        if parser.is_link_good(work_link) and connection.is_robot_good(work_link):
+        if not parser.is_link_good(work_link):
+            db.update_link_status(parser,work_link,"junk")
+        elif connection.is_robot_good(work_link):
+            db.update_link_status(parser,work_link,"bad_robot")
+        else:
             responses = connection.html_download2(work_link)
             target_link,links = parse_and_index(work_link,parser,responses,db)
             nl = normalize_link(target_link)
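
Review note on the new triage: the first branch correctly files links that fail the parser's filters as "junk", but the second looks inverted. If is_robot_good() returns True when robots.txt permits the link, as its name suggests, this version records permitted links as "bad_robot" and downloads only the rest. The junklinks and badrobotlinks lists added above are also never appended to in this hunk. A minimal sketch of the presumed intent, with the check flipped (the is_robot_good semantics are an assumption here):

    # Presumed intent: classify and record each link, download only clean ones.
    for work_link in links:
        if not parser.is_link_good(work_link):
            db.update_link_status(parser, work_link, "junk")       # fails URL filters
        elif not connection.is_robot_good(work_link):
            db.update_link_status(parser, work_link, "bad_robot")  # robots.txt forbids it
        else:
            responses = connection.html_download2(work_link)       # safe to fetch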
@@ -428,12 +434,14 @@ def visit_domain(domain,parser,db):
 def process_domains(domains,visit,parser,db,queue):
     print("Websucker Agenda>>")
     for domain in domains:
+        assert len(domain[0]) > 1
         print(domain)
     if queue is not None:
         print("Queuing:")
         for domain in domains:
             print(domain)
             queue.put(domain[0])
+        queue.close()
     if visit:
         print("Visiting:")
         for domain in domains:
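
The new assert works together with the (domain,0) change in the last hunk below: each agenda item is now a (domain_name, count) tuple, so domain[0] is the name string rather than its first character. A small illustration of the expected shape (queue is whatever object the caller passes in, assumed to expose put/close as used above):

    domains = [("example.com", 0), ("example.org", 0)]  # assumed item shape
    for domain in domains:
        assert len(domain[0]) > 1   # a bare string would fail: "e" has length 1
        queue.put(domain[0])        # enqueue only the domain name
    queue.close()                   # signal that no more work will be queued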
@@ -138,7 +138,7 @@ def report(ctx):
         ready = stats["current-jobs-ready"]
         print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
         print("{} ready jobs, {} burried jobs".format(ready,buried))
-    except Error as err:
+    except Exception as err:
         print(err)
 
 @cli.command(help="Database summary")
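
This fix is more than style: Error is not a Python builtin, so unless the module imported one from its beanstalk client library, the old handler would raise NameError the moment the try body failed, masking the real error. Catching the broad Exception is the working fallback (the call below is hypothetical, for illustration only):

    try:
        stats = conn.stats_tube(tube)   # hypothetical client call
    except Exception as err:            # 'except Error' would itself NameError here
        print(err)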
@@ -19,6 +19,7 @@ class Data:
     Database of text documents
     """
     def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
+        print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port))
         # execution profile
         ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
         profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
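
For the 240-second request_timeout to take effect, the profile map has to be handed to the Cluster constructor. A minimal sketch, assuming the standard DataStax cassandra-driver API that the names ExecutionProfile and EXEC_PROFILE_DEFAULT come from:

    import cassandra.cluster

    ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
    profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT: ep}
    cluster = cassandra.cluster.Cluster(
        [cassandra_host], port=cassandra_port, execution_profiles=profiles)
    session = cluster.connect(keyspace)   # "websucker" by default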
@@ -29,9 +30,9 @@ class Data:
 
 
 
-        self.index_response_link_update = self.session.prepare("""
+        self.update_links = self.session.prepare("""
         UPDATE links SET
-            link_status ='redirect',
+            link_status = ?,
             redirect_target = ?,
             update_time = toTimestamp(now())
         WHERE
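
Replacing the literal 'redirect' with a ? bind marker turns this into a general status updater: one prepared statement now serves "junk", "bad_robot", and "redirect" alike, with values bound positionally in marker order at execute time. The WHERE clause is cut off by the diff; judging from the bind tuple in the new update_link_status below, it takes the three normalize_link components as keys, so a call would look roughly like:

    # Bind order: link_status, redirect_target, then the WHERE key columns.
    pl = normalize_link(link)
    self.session.execute(self.update_links, ("junk", None, pl[1], pl[2], pl[3]))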
@@ -141,14 +142,8 @@ INSERT INTO content(
         pl = normalize_link(source_link)
         for response in responses:
             tl = response.get_canonical()
-            r = (
-                tl,
-                pl[1],
-                pl[2],
-                pl[3],
-            )
             if pl != tl:
-                res = self.session.execute(self.index_response_link_update,r)
+                self.update_link_status(source_link,"redirect",tl)
             d = (
                 pl[1],
                 source_link,
@@ -227,6 +222,17 @@ INSERT INTO content(
 
         print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))
 
+    def update_link_status(self,parser,links,status,redirect_target=None):
+        pl = normalize_link(source_link)
+        r = (
+            status,
+            redirect_target,
+            pl[1],
+            pl[2],
+            pl[3],
+        )
+        res = self.session.execute(self.update_links ,r)
+
     def index_follow_links(self,parser,links,connection):
         # Index seen links
         follow_links = set()
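
The new helper and its call sites do not agree yet, which is worth flagging in review. The body reads source_link, but the parameter is named links, so the first line raises NameError; visit_links calls db.update_link_status(parser, work_link, "junk") with parser first, while the redirect path above calls self.update_link_status(source_link, "redirect", tl) without parser at all. A reconciled sketch, assuming parser can be dropped (the body never uses it) and a single link is passed:

    def update_link_status(self, link, status, redirect_target=None):
        pl = normalize_link(link)   # link components, as used elsewhere in the diff
        r = (status, redirect_target, pl[1], pl[2], pl[3])
        self.session.execute(self.update_links, r)

Under this signature both call sites become db.update_link_status(work_link, "junk") and self.update_link_status(source_link, "redirect", tl).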
@@ -477,7 +483,7 @@ INSERT INTO content(
             gain_ratio = row[3]
             afg = row[4]
             if seen_count and not fetched_count and parser.is_domain_good(domain):
-                domains.append(domain)
+                domains.append((domain,0))
         ss = min(len(domains),count)
         return random.sample(domains,ss)
 
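Appending (domain, 0) instead of the bare string is what the new assert in process_domains relies on:

    domains.append(domain)          # old: domain[0] is just the first character
    domains.append((domain, 0))     # new: domain[0] is the full domain name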