websucker-pip/websucker/agent.py
2020-05-10 11:48:17 +02:00

448 lines
14 KiB
Python
Executable File

#!/usr/bin/env python
#! -*- coding: utf-8 -*-
import urllib.parse
import urllib.error
import os
import os.path
import re
import datetime
import time
import sys
import tempfile
import pprint
import bs4
import pycurl
import urllib.robotparser
import collections
from websucker.parser import normalize_link,urlunparse
# Parses an HTTP refresh value found in a response header or in an HTML meta tag
def get_refresh(ref, target_link):
    """Extract the redirect target from an HTTP refresh value.

    The value has the form ``"<seconds>; url=<target>"`` and comes either
    from a ``Refresh`` response header or from the ``content`` attribute of
    an HTML ``<meta http-equiv="refresh">`` tag.

    Args:
        ref: Raw refresh value.
        target_link: Link the value was found on; passed to
            ``normalize_link`` as the base for the target.

    Returns:
        The normalized target link as a string, or ``None`` when the value
        carries no usable ``url=`` part.
    """
    # Split only once so that a ";" inside the target URL is preserved.
    _, separator, rest = ref.partition(";")
    if not separator:
        return None
    # The separator is usually followed by a space ("0; url=...").
    rest = rest.strip()
    if not rest.lower().startswith("url="):
        return None
    # The target may be wrapped in single or double quotes.
    target = rest[4:].strip().strip("'\"")
    if not target:
        return None
    return urlunparse(normalize_link(target, target_link))
class Response:
    """Result of downloading one URL.

    Attributes:
        url: The requested URL (non-empty, must not start with "/").
        status: HTTP status code reported for the download.
        content: Binary file-like object with the body, or ``None``.
        headers: Raw response header text.
        redirects: Links recorded while the URL was downloaded.
        visited_time: Date on which the object was created.
        bs: ``BeautifulSoup`` tree of the content, only built when content is
            present and ``link_status`` is ``"good"``.
        link_status: ``"good"`` or a ``"bad_*"`` marker.
    """

    def __init__(self, url, headers, status, content, redirects, link_status):
        assert len(url) > 0
        assert url[0] != "/"
        self.url = url
        self.status = status
        self.content = content
        self.headers = headers
        self.redirects = redirects
        self.visited_time = datetime.date.today()
        self.link_status = link_status
        self.bs = None
        if content is not None and link_status == "good":
            self.bs = bs4.BeautifulSoup(content, "lxml")

    def __str__(self):
        return "{} {} {}".format(self.url, self.get_canonical(), self.link_status)

    def get_content(self):
        """Return the body decoded as UTF-8 text, or ``None`` without content.

        Undecodable bytes are replaced instead of raising.
        """
        if self.content is None:
            print("NO CONTENT")
            print(self.url, self.redirects)
            return None
        self.content.seek(0)
        raw = self.content.read()
        return str(raw, encoding="utf8", errors="replace")

    def get_metarefresh(self):
        """Return the target of an HTML meta refresh redirect, or ``None``.

        Only available when the document was parsed (``self.bs`` is set).
        """
        if self.bs is None:
            return None
        canonical = self.get_canonical()
        metarefresh = None
        # The http-equiv value is matched case-insensitively ("Refresh").
        refresh_tags = self.bs.find_all(
            "meta",
            attrs={"http-equiv": lambda v: v is not None and v.lower() == "refresh"},
        )
        for tag in refresh_tags:
            # `"content" in tag` would test the tag's children, not its
            # attributes, so the attribute has to be checked explicitly.
            if not tag.has_attr("content"):
                continue
            target = get_refresh(tag["content"], canonical)
            if target is not None:
                metarefresh = target
        if metarefresh is not None:
            nl = normalize_link(metarefresh, canonical)
            print("Metarefresh")
            print(nl)
            metarefresh = urlunparse(nl)
        return metarefresh

    def get_canonical(self):
        """Return the canonical link of the response.

        Uses ``<link rel="canonical" href=...>`` when the parsed document has
        one, otherwise the last recorded redirect, otherwise the requested
        URL. The result is normalized against the last visited link.
        """
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        r = None
        if self.bs is not None:
            tag = self.bs.find("link", rel="canonical", href=True)
            if tag is not None:
                r = urlunparse(normalize_link(tag["href"], last_link))
        if r is None:
            r = last_link
        r = urlunparse(normalize_link(r, last_link))
        assert len(r) > 0
        assert r[0] != "/"
        return r

    def get_redirects(self):
        """Return all recorded redirects except the last one.

        With fewer than two recorded redirects the result is an empty list.
        """
        if len(self.redirects) < 2:
            return []
        return self.redirects[0:-1]
class Connection:
    """pycurl-based HTTP client used by the crawler.

    One curl handle is reused for all downloads. Response headers are
    collected through the ``header`` callback, and robots.txt parsers are
    cached per domain in ``self.robots`` (``None`` marks a domain without a
    usable robots.txt).
    """

    # pycurl error numbers reported as link status "bad_connection":
    # 6 unable to resolve DNS, 7 connection refused, 16 HTTP2 error,
    # 28 connection timeout, 56 connection reset by peer,
    # 60 bad SSL certificate.
    _CONNECTION_ERRNOS = frozenset((6, 7, 16, 28, 56, 60))

    def __init__(self):
        self.useragent = "Googlebot-News"
        self.robots = {}
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        self.max_redirect = 4
        self.url = None
        self.c = pycurl.Curl()
        self.c.setopt(self.c.FOLLOWLOCATION, True)
        self.c.setopt(self.c.CONNECTTIMEOUT, 20)
        self.c.setopt(self.c.TIMEOUT, 20)
        self.c.setopt(self.c.FAILONERROR, True)
        self.c.setopt(self.c.HTTPHEADER, [
            'Accept: text/html', 'Accept-Charset: UTF-8'])
        self.c.setopt(self.c.HEADERFUNCTION, self.header)
        self.c.setopt(self.c.USERAGENT, self.useragent)

    def header(self, data):
        """Header callback for pycurl; called once per received header line.

        Records the line, stores the key/value pair in ``self.headers`` and
        records ``Location`` / ``Refresh`` targets as redirects.

        Returns:
            ``0`` to stop processing when the content type is not text
            (pycurl then fails with error 23), otherwise ``None``.
        """
        if len(data) == 0:
            return None
        line = str(data, encoding="utf8")
        self.header_lines.append(line)
        name, colon, raw_value = line.partition(":")
        key = name.strip()
        if not colon or not key or " " in key:
            # Status lines ("HTTP/1.1 200 OK") and the blank terminator line
            # carry no key/value pair.
            return None
        value = raw_value.strip()
        self.headers[key] = value
        kl = key.lower()
        if kl == "refresh":
            # The value looks like "5; url=<target>"; only the target is a link.
            target = get_refresh(value, self._last_link())
            if target is not None:
                self.add_redirect(target)
        elif kl == "location":
            self.add_redirect(value)
        elif kl == "content-type" and "text" not in value:
            # pycurl then raises error 23, failed writing header
            return 0
        return None

    def crawl_delay(self, domain):
        """Sleep for the crawl delay of *domain*.

        Uses the delay robots.txt declares for this user agent, or 4 seconds
        when there is none.
        """
        self.cache_robot(domain)
        delay = 4
        robot = self.robots.get(domain)
        if robot is not None:
            declared = robot.crawl_delay(self.useragent)
            if declared is not None:
                delay = declared
        print("Waiting for {} s".format(delay))
        time.sleep(delay)

    def __del__(self):
        self.close()

    def close(self):
        """Close the curl handle; safe when the handle was never created."""
        handle = getattr(self, "c", None)
        if handle is not None:
            handle.close()

    def _last_link(self):
        """Return the most recently recorded redirect, or the requested URL."""
        if len(self.redirects) > 0:
            return self.redirects[-1]
        return self.url

    def _get_header(self, name):
        """Return the value of response header *name* (case-insensitive)."""
        wanted = name.lower()
        for key, value in self.headers.items():
            if key.lower() == wanted:
                return value
        return None

    def add_redirect(self, link):
        """Record *link* as a redirect unless it is already known."""
        last_link = self._last_link()
        v = urlunparse(normalize_link(link, last_link))
        if v != last_link and v not in self.redirects:
            self.redirects.append(v)

    def _download(self, url):
        """Download *url* with the shared curl handle.

        Returns:
            A ``(content, link_status)`` tuple. ``content`` is a temporary
            file positioned at the start, or ``None`` on failure.
            ``link_status`` is ``"good"`` or one of ``"bad_connection"``,
            ``"bad_httpcode"``, ``"bad_type"``, ``"bad_unicode"``.

        Raises:
            pycurl.error: For curl errors that are not mapped to a status.
        """
        print("Downloading " + url)
        self.url = url
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        link_status = "bad_connection"
        content = tempfile.SpooledTemporaryFile()
        try:
            self.c.setopt(self.c.WRITEDATA, content)
            self.c.setopt(self.c.URL, url)
            self.c.perform()
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            # HTTP/2 sends lower-case header names, so look up without case.
            content_type = self._get_header("content-type")
            if self.status != 200:
                link_status = "bad_httpcode"
            elif content_type is not None and not content_type.startswith("text"):
                link_status = "bad_type"
            else:
                link_status = "good"
            content.seek(0)
        except pycurl.error as e:
            errno = e.args[0]
            # Do not leak the temporary file of a failed download.
            content.close()
            content = None
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if errno == 23:
                # 23 is raised when header() rejected the content type
                link_status = "bad_type"
            elif errno == 22:
                link_status = "bad_httpcode"
            elif errno in self._CONNECTION_ERRNOS:
                link_status = "bad_connection"
            else:
                raise
        except (UnicodeDecodeError, UnicodeEncodeError):
            content.close()
            content = None
            link_status = "bad_unicode"
        sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
        tt = self.c.getinfo(self.c.TOTAL_TIME)
        print("{} Received {} bytes in {} s".format(self.status, sz, tt))
        return content, link_status

    def html_download2(self, url):
        """Download *url* and follow HTML meta refresh redirects.

        At most ``self.max_redirect + 1`` downloads are made, and a refresh
        that points to an already downloaded URL ends the chain.

        Returns:
            List of ``Response`` objects, one per download, in order.

        Raises:
            pycurl.error: Propagated from ``_download``.
        """
        dlink = url
        responses = []
        visited = set()
        while len(responses) <= self.max_redirect:
            url = urlunparse(normalize_link(dlink))
            assert url.startswith("http")
            if url in visited:
                # A page refreshing to itself must not be fetched again.
                break
            visited.add(url)
            content, link_status = self._download(url)
            response = Response(url, "\r\n".join(self.header_lines), self.status,
                                content, self.redirects, link_status)
            responses.append(response)
            dlink = response.get_metarefresh()
            if dlink is None:
                break
        return responses

    def cache_robot(self, domain):
        """Download and cache the robots.txt parser of *domain* once.

        Domains without a usable robots.txt are cached as ``None`` so that
        they are not requested again.
        """
        if domain in self.robots:
            return
        roboturl = urlunparse(("https", domain, "robots.txt", ""))
        try:
            content, link_status = self._download(roboturl)
        except pycurl.error as err:
            print(err)
            self.robots[domain] = None
            return
        if link_status != "good":
            self.robots[domain] = None
            return
        try:
            raw = content.read()
        finally:
            content.close()
        lines = str(raw, errors="ignore", encoding="utf8").split("\n")
        robot = urllib.robotparser.RobotFileParser()
        robot.parse(lines)
        self.robots[domain] = robot

    def is_robot_good(self, url):
        """Return whether robots.txt allows this user agent to fetch *url*.

        A domain without a usable robots.txt allows everything.
        """
        schema, domain, path, query = normalize_link(url)
        self.cache_robot(domain)
        robot = self.robots.get(domain)
        if robot is None:
            return True
        # Check with the user agent that is actually sent, as crawl_delay does.
        return robot.can_fetch(self.useragent, url)
class ParsedDocument:
    """Text, metadata and outgoing links extracted from one downloaded page.

    The extraction itself is delegated to *parser*; this class stores the
    results and collects the normalized links of the page.
    """

    def __init__(self, parser, work_link):
        self.parser = parser
        self.work_link = work_link
        self.content = None
        self.bs = None
        self.paragraph_checksums = None
        self.paragraph_sizes = None
        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    @staticmethod
    def _is_nofollow(anchor):
        """Return whether an ``<a>`` tag is marked as nofollow."""
        # bs4 returns multi-valued attributes such as rel as a list.
        rel = anchor.attrs.get("rel") or []
        if isinstance(rel, str):
            rel = rel.split()
        if "nofollow" in anchor.attrs:
            return True
        return any(value.lower() == "nofollow" for value in rel)

    def extract(self, content, bs):
        """Fill the document from the raw *content* and its parsed tree *bs*.

        Text and paragraph checksums are always computed; metadata and links
        are only extracted when *bs* is not ``None``.
        """
        self.content = content
        self.bs = bs
        # Extract text and metatext
        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
        # Paragraph checksums
        pch, pszs = self.parser.calculate_checksums(self.body)
        self.paragraph_checksums = pch
        self.paragraph_sizes = pszs
        if bs is None:
            return
        (self.tags, self.authors, self.title, self.article_published_time,
         self.description, self.section) = self.parser.extract_meta(bs)
        # Misspelled attribute name that was assigned before; kept as an alias.
        self.article_publilshed_time = self.article_published_time
        # Links are resolved against <base href> when the page declares one.
        base = self.work_link
        if bs.base is not None and "href" in bs.base.attrs:
            base = bs.base["href"]
        # Link normalization
        for anchor in bs.find_all("a", href=True):
            if self._is_nofollow(anchor):
                continue
            try:
                link = urlunparse(normalize_link(anchor["href"], base))
            except ValueError:
                # Links that cannot be normalized are skipped on purpose.
                continue
            if link == base:
                continue
            self.link_set.add(link)

    def get_links(self):
        """Return the set of all links collected by ``extract``."""
        return self.link_set

    def get_follow_links(self):
        """Return the collected links accepted by ``parser.is_link_good``.

        Each accepted link is normalized again with the parser's
        ``strip_query`` setting.
        """
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
                link = normalize_link(l, strip_query=self.parser.strip_query)
                follow_links.add(urlunparse(link))
        return follow_links

    def __str__(self):
        parts = []
        if self.title is not None:
            parts.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                parts.append(self.body)
            else:
                parts.append(self.body[0:20] + " ....")
        return ">>> ".join(parts)
def get_domains(arg):
    """Return the list of domain names given on the command line.

    Args:
        arg: ``"-"`` to read one domain per line from standard input,
            otherwise a comma-separated list of domains.

    Returns:
        List of domain names with surrounding whitespace removed; empty
        entries are skipped in both input forms.
    """
    if arg == "-":
        candidates = sys.stdin
    else:
        candidates = arg.split(",")
    stripped = (candidate.strip() for candidate in candidates)
    return [domain for domain in stripped if domain]
def parse_and_index(work_link, parser, responses, db):
    """Index the downloaded responses and the document parsed from the last one.

    Args:
        work_link: Link that was requested.
        parser: Parser handed to ``ParsedDocument``.
        responses: Responses obtained for the link, in download order.
        db: Storage object; ``index_responses`` and ``index_content`` are
            called on it.

    Returns:
        ``(target_link, links)``: the canonical link of the last response and
        the links found in it. Without responses, or when the last response
        has no content, this is ``(work_link, [])``.
    """
    if not len(responses) > 0:
        return work_link, []
    db.index_responses(work_link, responses)
    final = responses[-1]
    if final.content is None:
        return work_link, []
    canonical = final.get_canonical()
    document = ParsedDocument(parser, canonical)
    document.extract(final.content, final.bs)
    db.index_content(canonical, document)
    return canonical, document.get_links()
def visit_sitemap(domain, connection, parser, db):
    """Download and index the front page of *domain*.

    The page at ``http://<domain>`` is fetched; links found on it are passed
    to ``db.index_follow_links``.

    Returns:
        ``False`` when nothing was downloaded or the last response has a
        ``bad_*`` status, ``True`` otherwise.
    """
    front_page = "http://" + domain
    downloaded = connection.html_download2(front_page)
    if len(downloaded) == 0 or downloaded[-1].link_status.startswith("bad_"):
        return False
    _, found = parse_and_index(front_page, parser, downloaded, db)
    if len(found) > 0:
        db.index_follow_links(parser, found, connection)
    return True
def visit_links(links, connection, parser, db):
    """Download, parse and index every acceptable link in *links*.

    A link is visited only when ``parser.is_link_good`` and
    ``connection.is_robot_good`` both accept it. After each visited link the
    crawl delay of the target domain is honoured. Links found on the visited
    pages are passed to ``db.index_follow_links`` at the end.
    """
    outlinks = []
    for work_link in links:
        if not (parser.is_link_good(work_link) and connection.is_robot_good(work_link)):
            # Nothing is requested for a rejected link, so there is nothing
            # to index and no reason to wait for the crawl delay.
            continue
        responses = connection.html_download2(work_link)
        target_link, found_links = parse_and_index(work_link, parser, responses, db)
        # Element 1 of the normalized link is the domain.
        connection.crawl_delay(normalize_link(target_link)[1])
        outlinks += found_links
    if len(outlinks) > 0:
        db.index_follow_links(parser, outlinks, connection)
def visit_domain(domain, parser, db):
    """Crawl one domain: the front page first, then several rounds of links.

    Args:
        domain: Domain name to visit.
        parser: Parser providing ``crawl_rounds``, ``recent_links``,
            ``old_links`` and ``random_links``.
        db: Storage object providing ``get_visit_links`` and ``check_domain``.

    Returns:
        ``False`` when the front page could not be visited, ``True`` otherwise.
    """
    connection = Connection()
    try:
        # Get links from frontpage
        # TODO Sitemap
        if not visit_sitemap(domain, connection, parser, db):
            return False
        for _ in range(parser.crawl_rounds):
            # Visit links from frontpage
            links = db.get_visit_links(domain, parser.recent_links,
                                       parser.old_links, parser.random_links)
            visit_links(links, connection, parser, db)
            # NOTE(review): check_domain runs after every round; confirm it
            # is not meant to run only once after the loop.
            db.check_domain(domain)
        return True
    finally:
        # Release the curl handle deterministically instead of relying on
        # garbage collection.
        connection.close()
def process_domains(domains, visit, parser, db, queue):
    """Print the agenda, then queue and/or visit the given domains.

    Args:
        domains: Iterable of records whose first element is the domain name.
        visit: When true, every domain is visited with ``visit_domain``.
        parser: Parser passed on to ``visit_domain``.
        db: Storage object passed on to ``visit_domain``.
        queue: Object with a ``put`` method that receives every domain name,
            or ``None`` to skip queuing.
    """
    print("Websucker Agenda>>")
    for record in domains:
        print(record)

    if queue is not None:
        print("Queuing:")
        for record in domains:
            print(record)
            name = record[0]
            queue.put(name)

    if visit:
        print("Visiting:")
        for record in domains:
            print(record)
            name = record[0]
            visit_domain(name, parser, db)
def work_domains(parser, db, queue):
    """Serve crawl jobs from *queue* forever.

    Each reserved job carries a domain name in ``job.body``. The job is
    buried before the domain is visited and deleted once the visit returns.
    """
    while True:
        print("Waiting for a new job:")
        reserved = queue.reserve()
        target_domain = reserved.body
        queue.bury(reserved)
        print("Visiting:")
        visit_domain(target_domain, parser, db)
        queue.delete(reserved)