websucker-pip/websucker/agent.py

516 lines
16 KiB
Python
Raw Normal View History

2020-05-07 14:09:45 +00:00
#!/usr/bin/env python
#! -*- coding: utf-8 -*-
import urllib.parse
import urllib.error
import os
import os.path
import re
import datetime
import time
import sys
import tempfile
import pprint
import bs4
import pycurl
import urllib.robotparser
2020-05-09 09:50:50 +00:00
import collections
2020-06-08 14:09:47 +00:00
import random
2020-05-07 14:09:45 +00:00
from websucker.parser import normalize_link,urlunparse
def get_refresh(ref, target_link):
    """
    Extract the redirect target from an HTTP refresh value.

    The value comes from a ``Refresh`` response header or from the
    ``content`` attribute of an HTML ``<meta http-equiv="refresh">`` tag,
    e.g. ``"5; url=http://example.com/"``.

    @param ref raw refresh value
    @param target_link link the value was found on, passed to normalize_link as the base
    @return normalized target link, or None when the value carries no usable ``url=`` part
    """
    # Split only once: the target URL itself may contain ";".
    _, separator, target = ref.strip().partition(";")
    if not separator:
        return None
    # A space after the semicolon ("5; url=...") is the usual form.
    target = target.strip()
    if not target.lower().startswith("url="):
        return None
    # Drop the "url=" prefix and any quotes or blanks wrapped around the link.
    target = target[4:].strip().strip("'\"").strip()
    if not target:
        return None
    return urlunparse(normalize_link(target, target_link))
class Response:
    """
    Result of one download attempt.

    Keeps the requested url, the raw header text, the HTTP status, the body
    (a file-like object or None), the redirect chain and the link status.
    When the status is "good" the body is also parsed with BeautifulSoup
    and kept in ``bs``.
    """

    def __init__(self, url, headers, status, content, redirects, link_status):
        """
        @param url absolute link that was requested
        @param headers raw header text
        @param status HTTP status code
        @param content file-like object with the body, or None
        @param redirects list of links recorded while downloading
        @param link_status "good" or one of the "bad_..." values
        """
        assert len(url) > 0
        assert url[0] != "/"
        self.url = url
        self.status = status
        self.content = content
        self.headers = headers
        self.redirects = redirects
        self.visited_time = datetime.date.today()
        self.bs = None
        if content is not None and link_status == "good":
            try:
                self.bs = bs4.BeautifulSoup(content, "lxml")
            except ValueError:
                # The body could not be parsed; keep the response, mark the link.
                link_status = "bad_parse"
        self.link_status = link_status

    def __str__(self):
        return "{} {} {}".format(self.url, self.get_canonical(), self.link_status)

    def get_content(self):
        """
        @return body decoded as utf8 (undecodable bytes replaced), or None when there is no body
        """
        if self.content is None:
            print("NO CONTENT")
            print(self.url, self.redirects)
            return None
        self.content.seek(0)
        raw = self.content.read()
        return str(raw, encoding="utf8", errors="replace")

    def get_metarefresh(self):
        """
        HTML meta refresh redirect.

        @return normalized target of the last <meta http-equiv="refresh"> tag, or None
        """
        if self.bs is None:
            return None
        metarefresh = None
        refresh_tags = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})
        canonical = self.get_canonical()
        for tag in refresh_tags:
            # "x in tag" looks at the children of a tag; attributes live in tag.attrs.
            if "content" in tag.attrs:
                metarefresh = get_refresh(tag["content"], canonical)
        if metarefresh is None:
            return None
        nl = normalize_link(metarefresh, canonical)
        print("Metarefresh")
        print(nl)
        return urlunparse(nl)

    def get_canonical(self):
        """
        @return link from <link rel="canonical"> when the page has one,
                otherwise the last redirect, otherwise the requested url
        """
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        canonical = None
        if self.bs is not None:
            tag = self.bs.find("link", rel="canonical", href=True)
            if tag is not None:
                canonical = urlunparse(normalize_link(tag["href"], last_link))
        if canonical is None:
            canonical = last_link
        canonical = urlunparse(normalize_link(canonical, last_link))
        assert len(canonical) > 0
        assert canonical[0] != "/"
        return canonical

    def get_redirects(self):
        """
        @return recorded redirects without the last one; empty list when fewer than two were recorded
        """
        if len(self.redirects) < 2:
            return []
        return self.redirects[0:-1]
class Connection:
    """
    Downloader built on one pycurl handle, with a per-domain robots.txt cache.

    The state of the transfer in progress (url, headers, header_lines,
    redirects, status) is kept on the instance and reset by every download.
    """

    def __init__(self):
        self.useragent = "Googlebot-News"
        self.c = pycurl.Curl()
        self.c.setopt(self.c.FOLLOWLOCATION, True)
        self.c.setopt(self.c.CONNECTTIMEOUT, 20)
        self.c.setopt(self.c.TIMEOUT, 20)
        self.c.setopt(self.c.FAILONERROR, True)
        self.c.setopt(self.c.HTTPHEADER, [
            'Accept: text/html', 'Accept-Charset: UTF-8'])
        self.c.setopt(self.c.HEADERFUNCTION, self.header)
        self.c.setopt(self.c.USERAGENT, self.useragent)
        # domain -> RobotFileParser, or None when robots.txt could not be fetched
        self.robots = {}
        self.url = None
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        # Number of meta-refresh hops html_download2 follows after the first download
        self.max_redirect = 4

    def _last_link(self):
        """
        @return the last recorded redirect, or the requested url when there is none
        """
        if len(self.redirects) > 0:
            return self.redirects[-1]
        return self.url

    def header(self, data):
        """
        pycurl header callback, called with one raw header line at a time.

        Records Location and Refresh targets as redirects and stops the
        transfer when the content type is not text.

        @param data raw header line as bytes
        @return 0 to abort the transfer, None to continue
        """
        if len(data) == 0:
            return None
        line = str(data, encoding="utf8")
        self.header_lines.append(line)
        name, colon, value = line.partition(":")
        if not colon or line.startswith("HTTP/"):
            # Status line or the blank line closing the header section.
            return None
        key = name.strip()
        value = value.strip()
        self.headers[key] = value
        kl = key.lower()
        if kl == "refresh":
            # The value looks like "5; url=..."; only the url part is a link.
            target = get_refresh(value, self._last_link())
            if target is not None:
                self.add_redirect(target)
        elif kl == "location":
            self.add_redirect(value)
        elif kl == "content-type" and "text" not in value:
            # pycurl then raises error 23, failed writing header
            return 0
        return None

    def crawl_delay(self, domain):
        """
        Sleep for the crawl delay robots.txt gives for our user agent, 4 s by default.

        @param domain domain whose robots.txt is consulted
        """
        self.cache_robot(domain)
        delay = 4
        rules = self.robots.get(domain)
        if rules is not None:
            requested = rules.crawl_delay(self.useragent)
            if requested is not None:
                delay = requested
        print("Waiting for {} s".format(delay))
        time.sleep(delay)

    def __del__(self):
        # __init__ may have failed before the handle was created.
        handle = getattr(self, "c", None)
        if handle is not None:
            handle.close()

    def close(self):
        """
        Close the curl handle.
        """
        self.c.close()

    def add_redirect(self, link):
        """
        Normalize link against the last known link and record it,
        unless it repeats the last link or is already recorded.

        @param link redirect target as received
        """
        last_link = self._last_link()
        v = urlunparse(normalize_link(link, last_link))
        if v != last_link and v not in self.redirects:
            self.redirects.append(v)

    def _download(self, url):
        """
        Download one url with the curl handle.

        @param url absolute link to download
        @return content, link_status - content is a file-like object
                positioned at the start, or None when the transfer failed
        """
        print("Downloading " + url)
        self.url = url
        self.headers = {}
        # A new list on purpose: earlier Response objects keep the old one.
        self.redirects = []
        self.header_lines = []
        self.status = 0
        content = None
        link_status = "bad_connection"
        try:
            content = tempfile.SpooledTemporaryFile()
            self.c.setopt(self.c.WRITEDATA, content)
            self.c.setopt(self.c.URL, url)
            self.c.perform()
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            # Header names may arrive in lower case.
            content_type = self.headers.get(
                "Content-Type", self.headers.get("content-type"))
            if self.status != 200:
                link_status = "bad_httpcode"
            elif content_type is not None and not content_type.startswith("text"):
                link_status = "bad_type"
            else:
                link_status = "good"
            content.seek(0)
        except pycurl.error as e:
            errno = e.args[0]
            if content is not None:
                content.close()
            content = None
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            # 23: header() aborted the transfer, content is not text
            # 22: HTTP error status
            # anything else (6 dns, 7 refused, 16 and 92 http2, 28 timeout,
            # 56 reset by peer, 60 ssl certificate, ...) is a connection failure
            link_status = {23: "bad_type", 22: "bad_httpcode"}.get(
                errno, "bad_connection")
        except (UnicodeDecodeError, UnicodeEncodeError):
            if content is not None:
                content.close()
            content = None
            link_status = "bad_unicode"
        sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
        tt = self.c.getinfo(self.c.TOTAL_TIME)
        print("{} Received {} bytes in {} s".format(self.status, sz, tt))
        return content, link_status

    def html_download2(self, url):
        """
        Download url and follow HTML meta refresh redirects.

        @param url link to start from
        @return list of Response objects, one per download, at most max_redirect + 1
        """
        dlink = url
        responses = []
        while len(responses) < self.max_redirect + 1:
            nl = normalize_link(dlink)
            url = urlunparse(nl)
            assert url.startswith("http")
            content, link_status = self._download(url)
            response = Response(url, "\r\n".join(self.header_lines), self.status, content, self.redirects, link_status)
            dlink = response.get_metarefresh()
            responses.append(response)
            if dlink is None:
                break
        return responses

    def cache_robot(self, domain):
        """
        Download and parse robots.txt of the domain unless it is cached already.
        A robots.txt that cannot be downloaded is cached as None.

        @param domain domain to fetch robots.txt for
        """
        if domain in self.robots:
            return
        roboturl = urlunparse(("https", domain, "robots.txt", ""))
        try:
            content, link_status = self._download(roboturl)
        except pycurl.error as err:
            # Nothing is cached, the next call tries again.
            print(err)
            return
        if link_status != "good":
            if content is not None:
                content.close()
            self.robots[domain] = None
            return
        try:
            raw = content.read()
        finally:
            content.close()
        lines = str(raw, errors="ignore", encoding="utf8").split("\n")
        rules = urllib.robotparser.RobotFileParser()
        rules.parse(lines)
        self.robots[domain] = rules

    def is_robot_good(self, url):
        """
        @param url link to check
        @return True when robots.txt of the domain lets our user agent fetch url,
                or when no robots.txt is available
        """
        schema, domain, path, query = normalize_link(url)
        self.cache_robot(domain)
        rules = self.robots.get(domain)
        if rules is None:
            return True
        # Same agent name as sent on the wire and used for the crawl delay.
        return rules.can_fetch(self.useragent, url)
class ParsedDocument:
    """
    One document in the database
    """

    def __init__(self, parser, work_link):
        """
        @param parser parser used for text, metadata and link extraction
        @param work_link link of the document, base for relative links
        """
        self.parser = parser
        self.work_link = work_link
        self.content = None
        self.bs = None
        self.paragraph_checksums = None
        self.paragraph_sizes = None
        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def extract(self, content, bs):
        """
        Parse content and fill the object

        @param content raw content handed to parser.extract_raw_text
        @param bs parsed page, or None; metadata and links are filled only when given
        """
        self.content = content
        self.bs = bs
        # Extract text and metatext
        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
        # Paragraph Checksums
        self.paragraph_checksums, self.paragraph_sizes = self.parser.calculate_checksums(self.body)
        if bs is None:
            return
        (self.tags, self.authors, self.title, self.article_published_time,
         self.description, self.section) = self.parser.extract_meta(bs)
        # Older code stored the value under this misspelled name; keep it readable.
        self.article_publilshed_time = self.article_published_time
        # Extract links from the page
        base = self.work_link
        if bs.base is not None and "href" in bs.base.attrs:
            base = bs.base["href"]
        # Normalize links
        for anchor in bs.find_all("a", href=True):
            # rel is usually a list of values, so compare by membership.
            rel = anchor.attrs.get("rel") or []
            if "nofollow" in rel or "nofollow" in anchor.attrs:
                continue
            try:
                link = urlunparse(normalize_link(anchor["href"], base))
            except ValueError:
                # Links that cannot be normalized are skipped on purpose.
                continue
            if link == base:
                continue
            self.link_set.add(link)

    def get_links(self):
        """
        @return all links
        """
        return self.link_set

    def get_follow_links(self):
        """
        @return good normalized links
        """
        follow_links = set()
        for candidate in self.link_set:
            if self.parser.is_link_good(candidate):
                link = normalize_link(candidate, strip_query=self.parser.strip_query)
                follow_links.add(urlunparse(link))
        return follow_links

    def __str__(self):
        parts = []
        if self.title is not None:
            parts.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                parts.append(self.body)
            else:
                parts.append(self.body[0:20] + " ....")
        return ">>> ".join(parts)
def get_domains(arg):
    """
    Get domains from argument or stdin
    if arg is -, get one domain per line from stdin, else split arg
    Surrounding whitespace is removed and empty entries are skipped in both cases.
    @param arg dash or domains separated by comma
    @return list of domains
    """
    if arg == "-":
        candidates = sys.stdin
    else:
        candidates = arg.split(",")
    stripped = (candidate.strip() for candidate in candidates)
    return [domain for domain in stripped if domain]
2020-05-09 09:50:50 +00:00
def parse_and_index(work_link, parser, responses, db):
    """
    Store all responses of a work link and, when the last one was parsed,
    extract the document from it and store that too.
    @param work_link - final link from downloader
    @param parser to use
    @param responses from the downloader
    @param db
    @return target link and links found in the document; the work link and
            an empty list when there is nothing to parse
    """
    if len(responses) == 0:
        return work_link, []
    db.index_responses(work_link, responses)
    final = responses[-1]
    if final.bs is None:
        return work_link, []
    # The document is stored under the canonical link of the last response.
    canonical = final.get_canonical()
    document = ParsedDocument(parser, canonical)
    document.extract(final.content, final.bs)
    db.index_content(canonical, document)
    return canonical, document.get_links()
def visit_sitemap(domain, connection, parser, db):
    """
    Download the front page of the domain, index it and queue its links.
    @param domain to visit
    @param connection used for the download
    @param parser to use
    @param db
    @return True when the front page was downloaded and parsed, else False
    """
    front_page = "http://" + domain
    print("Sitemap visit: " + front_page)
    responses = connection.html_download2(front_page)
    # Nothing downloaded, or the last response could not be parsed.
    if len(responses) == 0 or responses[-1].bs is None:
        return False
    _, found = parse_and_index(front_page, parser, responses, db)
    if len(found) > 0:
        db.index_follow_links(parser, found, connection)
    return True
2020-06-06 09:29:36 +00:00
def visit_links(links, connection, parser, db, is_online):
    """
    Visit the links, index what was downloaded and queue the links found.
    if the site is not online, then just check links
    @param links to visit
    @param connection used for downloads and robots.txt checks
    @param parser to use
    @param db
    @param is_online download only when true
    """
    outlinks = []
    for work_link in links:
        if not parser.is_link_good(work_link):
            db.update_link_status(work_link, "bad_link")
            continue
        if not is_online:
            continue
        if not connection.is_robot_good(work_link):
            db.update_link_status(work_link, "bad_robot")
            continue
        responses = connection.html_download2(work_link)
        # A separate name: "links" is the sequence being iterated.
        target_link, found = parse_and_index(work_link, parser, responses, db)
        # Second item of the normalized link is the domain.
        connection.crawl_delay(normalize_link(target_link)[1])
        outlinks += found
    if len(outlinks) > 0:
        db.index_follow_links(parser, outlinks, connection)
def visit_domain(domain, parser, db):
    """
    One visit of the domain
    1.Get links from the frontpage,
    2. visit links and extract new links
    3. get new links to visit
    4. repeat visit for parser.crawl_rounds
    @param domain to visit
    @param parser to use
    @param db
    @return True
    """
    connection = Connection()
    try:
        # Get links from frontpage
        # TODO Sitemap
        is_online = False
        if parser.is_domain_good(domain):
            # Is domain online?
            is_online = visit_sitemap(domain, connection, parser, db)
        if not is_online:
            db.check_domain(domain)
            return True
        for _ in range(parser.crawl_rounds):
            # Visit links from frontpage
            links = db.get_visit_links(
                domain, parser.recent_links, parser.old_links, parser.random_links)
            visit_links(links, connection, parser, db, is_online)
            db.check_domain(domain)
        return True
    finally:
        # Release the curl handle now instead of waiting for __del__.
        connection.close()
def process_domains(domains, visit, parser, db, queue):
    """
    Process all domains in random order.
    if queue is given, put every domain into it and close it
    if visit is true, visit every domain
    @param domains sequence of items whose first element is the domain name
    @param visit visit domains immediately when true
    @param parser to use
    @param db
    @param queue queue to fill, or None
    """
    print("Websucker Agenda>>")
    # Shuffle a copy so that the list of the caller keeps its order.
    agenda = list(domains)
    random.shuffle(agenda)
    for domain in agenda:
        assert len(domain[0]) > 1
        print(domain)
    if queue is not None:
        print("Queuing:")
        for domain in agenda:
            print(domain)
            queue.put(domain[0])
        queue.close()
    if visit:
        print("Visiting:")
        for domain in agenda:
            print(domain)
            visit_domain(domain[0], parser, db)
def work_domains(parser, db, queue):
    """
    Poll the queue forever and visit the domain of every job.
    The job is buried before the visit and deleted after it.
    @param parser to use
    @param db
    @param queue to poll
    """
    while True:
        print("Waiting for a new job:")
        reserved = queue.reserve()
        target = reserved.body
        queue.bury(reserved)
        print("Visiting:")
        visit_domain(target, parser, db)
        queue.delete(reserved)