diff --git a/websucker/agent.py b/websucker/agent.py
index 6d3b8c1..767fabe 100755
--- a/websucker/agent.py
+++ b/websucker/agent.py
@@ -399,9 +399,15 @@ def visit_sitemap(domain,connection,parser,db):
 def visit_links(links,connection,parser,db):
     outlinks = []
+    junklinks = []
+    badrobotlinks = []
     for work_link in links:
         responses = []
-        if parser.is_link_good(work_link) and connection.is_robot_good(work_link):
+        if not parser.is_link_good(work_link):
+            db.update_link_status(work_link,"junk")
+        elif not connection.is_robot_good(work_link):
+            db.update_link_status(work_link,"bad_robot")
+        else:
             responses = connection.html_download2(work_link)
         target_link,links = parse_and_index(work_link,parser,responses,db)
         nl = normalize_link(target_link)
@@ -428,12 +434,14 @@ def visit_domain(domain,parser,db):
 def process_domains(domains,visit,parser,db,queue):
     print("Websucker Agenda>>")
     for domain in domains:
+        assert len(domain[0]) > 1
         print(domain)
     if queue is not None:
         print("Queuing:")
         for domain in domains:
             print(domain)
             queue.put(domain[0])
+        queue.close()
     if visit:
         print("Visiting:")
         for domain in domains:
diff --git a/websucker/cli.py b/websucker/cli.py
index 34bdbd8..99ea05a 100644
--- a/websucker/cli.py
+++ b/websucker/cli.py
@@ -138,7 +138,7 @@ def report(ctx):
         ready = stats["current-jobs-ready"]
         print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
         print("{} ready jobs, {} burried jobs".format(ready,buried))
-    except Error as err:
+    except Exception as err:
         print(err)
 
 @cli.command(help="Database summary")
diff --git a/websucker/db.py b/websucker/db.py
index 37fa9ce..a1432c7 100644
--- a/websucker/db.py
+++ b/websucker/db.py
@@ -19,6 +19,7 @@ class Data:
     Database of text documents
     """
     def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
+        print("Database {}@{}:{}".format(keyspace,cassandra_host,cassandra_port))
         # execution profile
         ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
         profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
@@ -29,9 +30,9 @@ class Data:
 
 
 
-        self.index_response_link_update = self.session.prepare("""
+        self.update_links = self.session.prepare("""
 UPDATE links SET
-    link_status ='redirect',
+    link_status = ?,
     redirect_target = ?,
     update_time = toTimestamp(now())
 WHERE
@@ -141,14 +142,8 @@ INSERT INTO content(
         pl = normalize_link(source_link)
         for response in responses:
             tl = response.get_canonical()
-            r = (
-                tl,
-                pl[1],
-                pl[2],
-                pl[3],
-            )
             if pl != tl:
-                res = self.session.execute(self.index_response_link_update,r)
+                self.update_link_status(source_link,"redirect",tl)
             d = (
                 pl[1],
                 source_link,
@@ -227,6 +222,17 @@ INSERT INTO content(
         print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))
 
 
+    def update_link_status(self,link,status,redirect_target=None):
+        pl = normalize_link(link)
+        r = (
+            status,
+            redirect_target,
+            pl[1],
+            pl[2],
+            pl[3],
+        )
+        self.session.execute(self.update_links,r)
+
     def index_follow_links(self,parser,links,connection):
         # Index seen links
         follow_links = set()
@@ -477,7 +483,7 @@ INSERT INTO content(
             gain_ratio = row[3]
             afg = row[4]
             if seen_count and not fetched_count and parser.is_domain_good(domain):
-                domains.append(domain)
+                domains.append((domain,0))
         ss = min(len(domains),count)
         return random.sample(domains,ss)
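
The new `update_links` prepared statement replaces the single-purpose `index_response_link_update` and is shared by the "junk", "bad_robot", and "redirect" call sites. Below is a minimal standalone sketch of the intended binding order (status, redirect_target, then the three link-key components), not the code in this patch: the WHERE-clause columns are outside the diff context, so `domain_name`, `url_path`, and `url_query` are assumed names, the session is taken as a constructor argument for brevity, and `normalize_link` is stubbed with an assumed (scheme, netloc, path, query) return shape.

    from urllib.parse import urlparse

    def normalize_link(link):
        # Assumption: a 4-tuple whose elements [1], [2], [3] line up with
        # the three key columns bound at the end of update_links.
        p = urlparse(link)
        return (p.scheme, p.netloc, p.path, p.query)

    class Data:
        def __init__(self, session):
            self.session = session
            # One parameterized statement covers every status change; the
            # old statement hard-coded link_status = 'redirect'.
            self.update_links = session.prepare("""
                UPDATE links SET
                    link_status = ?,
                    redirect_target = ?,
                    update_time = toTimestamp(now())
                WHERE
                    domain_name = ? AND url_path = ? AND url_query = ?
            """)

        def update_link_status(self, link, status, redirect_target=None):
            # redirect_target stays NULL for "junk" and "bad_robot".
            pl = normalize_link(link)
            self.session.execute(
                self.update_links,
                (status, redirect_target, pl[1], pl[2], pl[3]))

With this shape the three call sites reduce to update_link_status(work_link,"junk"), update_link_status(work_link,"bad_robot"), and update_link_status(source_link,"redirect",tl), so none of them needs the unused parser argument.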