This commit is contained in:
Daniel Hladek 2020-06-04 13:44:22 +02:00
parent 8d4a873005
commit 3687403184
3 changed files with 26 additions and 12 deletions

View File

@ -399,9 +399,15 @@ def visit_sitemap(domain,connection,parser,db):
def visit_links(links,connection,parser,db):
outlinks = []
junklinks = []
badrobotlinks = []
for work_link in links:
responses = []
if parser.is_link_good(work_link) and connection.is_robot_good(work_link):
if not parser.is_link_good(work_link):
db.update_link_status(parser,work_link,"junk")
elif connection.is_robot_good(work_link):
db.update_link_status(parser,work_link,"bad_robot")
else:
responses = connection.html_download2(work_link)
target_link,links = parse_and_index(work_link,parser,responses,db)
nl = normalize_link(target_link)
@ -428,12 +434,14 @@ def visit_domain(domain,parser,db):
def process_domains(domains,visit,parser,db,queue):
print("Websucker Agenda>>")
for domain in domains:
assert len(domain[0]) > 1
print(domain)
if queue is not None:
print("Queuing:")
for domain in domains:
print(domain)
queue.put(domain[0])
queue.close()
if visit:
print("Visiting:")
for domain in domains:

View File

@ -138,7 +138,7 @@ def report(ctx):
ready = stats["current-jobs-ready"]
print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
print("{} ready jobs, {} burried jobs".format(ready,buried))
except Error as err:
except Exception as err:
print(err)
@cli.command(help="Database summary")

View File

@ -19,6 +19,7 @@ class Data:
Database of text documents
"""
def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port))
# execution profile
ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
@ -29,9 +30,9 @@ class Data:
self.index_response_link_update = self.session.prepare("""
self.update_links = self.session.prepare("""
UPDATE links SET
link_status ='redirect',
link_status = ?,
redirect_target = ?,
update_time = toTimestamp(now())
WHERE
@ -141,14 +142,8 @@ INSERT INTO content(
pl = normalize_link(source_link)
for response in responses:
tl = response.get_canonical()
r = (
tl,
pl[1],
pl[2],
pl[3],
)
if pl != tl:
res = self.session.execute(self.index_response_link_update,r)
self.update_link_status(source_link,"redirect",tl)
d = (
pl[1],
source_link,
@ -227,6 +222,17 @@ INSERT INTO content(
print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))
def update_link_status(self,parser,links,status,redirect_target=None):
    """Set the status (and optional redirect target) of one stored link.

    Executes the prepared ``update_links`` statement, which sets
    ``link_status``, ``redirect_target`` and ``update_time`` for the row
    identified by the normalized link parts.

    Args:
        parser: unused here; kept for backward compatibility with existing
            call sites that pass the parser positionally.
        links: the link (URL) whose status row should be updated.
        status: new value for the ``link_status`` column
            (e.g. "junk", "bad_robot", "redirect").
        redirect_target: target URL when ``status`` is "redirect",
            otherwise ``None``.
    """
    # BUG FIX: the original referenced an undefined name ``source_link``
    # (NameError at runtime); the link to normalize is the ``links``
    # parameter.
    pl = normalize_link(links)
    # Tuple order must match the prepared statement's placeholders:
    # SET link_status = ?, redirect_target = ? WHERE <pl[1], pl[2], pl[3]>
    r = (
        status,
        redirect_target,
        pl[1],
        pl[2],
        pl[3],
    )
    self.session.execute(self.update_links, r)
def index_follow_links(self,parser,links,connection):
# Index seen links
follow_links = set()
@ -477,7 +483,7 @@ INSERT INTO content(
gain_ratio = row[3]
afg = row[4]
if seen_count and not fetched_count and parser.is_domain_good(domain):
domains.append(domain)
domains.append((domain,0))
ss = min(len(domains),count)
return random.sample(domains,ss)