2020-05-07 14:09:45 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
#! -*- coding: utf-8 -*-
|
|
|
|
import urllib.parse
|
|
|
|
import urllib.error
|
|
|
|
import os
|
|
|
|
import os.path
|
|
|
|
import re
|
|
|
|
import datetime
|
|
|
|
import time
|
|
|
|
import sys
|
|
|
|
import tempfile
|
|
|
|
import pprint
|
|
|
|
import bs4
|
|
|
|
|
|
|
|
import pycurl
|
|
|
|
import urllib.robotparser
|
2020-05-09 09:50:50 +00:00
|
|
|
import collections
|
2020-06-08 14:09:47 +00:00
|
|
|
import random
|
2020-05-07 14:09:45 +00:00
|
|
|
|
|
|
|
|
|
|
|
from websucker.parser import normalize_link,urlunparse
|
|
|
|
|
|
|
|
|
|
|
|
# Parses http refresh in header or on html meta
|
|
|
|
def get_refresh(ref, target_link):
    """Extract the redirect target from an HTTP/HTML "refresh" value.

    ``ref`` has the form ``"<seconds>; url=<target>"``, as found in a
    ``Refresh`` header or in the ``content`` attribute of a
    ``<meta http-equiv="refresh">`` tag.

    :param ref: raw refresh value
    :param target_link: link used as the base when normalizing the target
    :returns: the normalized target as a string, or ``None`` when ``ref``
        carries no ``url=`` part
    """
    # Split on the first ";" only: the target URL may itself contain ";".
    _, _, rest = ref.strip().partition(";")
    # The usual spelling is "0; url=..." with a space after the semicolon,
    # so the remainder must be stripped before testing the prefix.
    rest = rest.strip()
    if not rest.lower().startswith("url="):
        return None
    # The target may be wrapped in single or double quotes.
    target = rest[4:].strip().strip("'\"")
    if not target:
        return None
    return urlunparse(normalize_link(target, target_link))
|
|
|
|
|
|
|
|
class Response:
    """One downloaded document together with its download metadata.

    Attributes set by the constructor:
      url           -- link that was requested
      status        -- HTTP status code reported by the connection
      content       -- binary file-like object with the body, or None
      headers       -- raw header text
      redirects     -- list of links recorded while following redirects
      visited_time  -- date on which the object was created
      bs            -- parsed BeautifulSoup tree, or None when not parsed
      link_status   -- status label ("good", "bad_parse", ...)
    """

    def __init__(self, url, headers, status, content, redirects, link_status):
        assert len(url) > 0
        assert url[0] != "/"
        self.url = url
        self.status = status
        self.content = content
        self.headers = headers
        self.redirects = redirects
        self.visited_time = datetime.date.today()
        self.bs = None
        # Only documents that downloaded cleanly are parsed.
        if content is not None and link_status == "good":
            try:
                self.bs = bs4.BeautifulSoup(content, "lxml")
            except ValueError:
                link_status = "bad_parse"
        self.link_status = link_status

    def __str__(self):
        return "{} {} {}".format(self.url, self.get_canonical(), self.link_status)

    def get_content(self):
        """Return the body decoded as UTF-8 text, or None if there is none.

        Undecodable bytes are replaced rather than raising.
        """
        if self.content is None:
            print("NO CONTENT")
            print(self.url, self.redirects)
            return None
        self.content.seek(0)
        raw = self.content.read()
        return str(raw, encoding="utf8", errors="replace")

    # HTML meta refresh redirect
    def get_metarefresh(self):
        """Return the normalized target of an HTML meta refresh, or None.

        The first ``<meta http-equiv="refresh">`` tag whose ``content``
        attribute yields a target wins.
        """
        if self.bs is None:
            return None
        canonical = self.get_canonical()
        metarefresh = None
        # The http-equiv value is matched case-insensitively ("Refresh" is
        # a common spelling).
        refresh_tags = self.bs.find_all(
            "meta", attrs={"http-equiv": re.compile(r"^\s*refresh\s*$", re.I)})
        for tag in refresh_tags:
            # ``"content" in tag`` would test the tag's children, not its
            # attributes, so the attribute has to be checked explicitly.
            if not tag.has_attr("content"):
                continue
            metarefresh = get_refresh(tag["content"], canonical)
            if metarefresh is not None:
                break
        if metarefresh is not None:
            nl = normalize_link(metarefresh, canonical)
            print("Metarefresh")
            print(nl)
            metarefresh = urlunparse(nl)
        return metarefresh

    def get_canonical(self):
        """Return the canonical link of this document.

        Uses ``<link rel="canonical" href=...>`` when the document was
        parsed and has one; otherwise the last recorded redirect, or the
        requested url when there were no redirects.
        """
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        r = None
        if self.bs is not None:
            tag = self.bs.find("link", rel="canonical", href=True)
            if tag is not None:
                r = urlunparse(normalize_link(tag["href"], last_link))
        if r is None:
            r = last_link
        r = urlunparse(normalize_link(r, last_link))
        assert len(r) > 0
        assert r[0] != "/"
        return r

    def get_redirects(self):
        """Return all recorded redirects except the last one."""
        if len(self.redirects) < 2:
            return []
        return self.redirects[0:-1]
|
|
|
|
|
|
|
|
|
|
|
|
class Connection:
    """Reusable pycurl handle with robots.txt caching and redirect tracking.

    The per-download state (``url``, ``headers``, ``header_lines``,
    ``redirects``, ``status``) is reset by every call to ``_download``.
    """

    # curl error numbers that do not mean "bad_connection".
    # 22: HTTP error returned because FAILONERROR is set.
    # 23: write error; raised when the header callback aborts the transfer
    #     because the content type is not text.
    # Everything else (6 DNS failure, 7 connection refused, 16/92 HTTP2
    # problems, 28 timeout, 56 connection reset, 60 bad SSL certificate, ...)
    # is reported as "bad_connection".
    _CURL_ERROR_STATUS = {22: "bad_httpcode", 23: "bad_type"}

    def __init__(self):
        self.useragent = "Googlebot-News"
        self.c = pycurl.Curl()
        self.c.setopt(self.c.FOLLOWLOCATION, True)
        # self.c.setopt(self.c.VERBOSE, True)
        self.c.setopt(self.c.CONNECTTIMEOUT, 20)
        self.c.setopt(self.c.TIMEOUT, 20)
        self.c.setopt(self.c.FAILONERROR, True)
        self.c.setopt(self.c.HTTPHEADER, [
            'Accept: text/html', 'Accept-Charset: UTF-8'])
        self.c.setopt(self.c.HEADERFUNCTION, self.header)
        self.c.setopt(self.c.USERAGENT, self.useragent)
        # #self.c.setopt(pycurl.COOKIEJAR, 'cookie.txt')
        # #self.c.setopt(pycurl.COOKIEFILE, 'cookie.txt')
        # domain -> RobotFileParser, or None when robots.txt is unavailable
        self.robots = {}
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        # Number of meta-refresh hops html_download2 follows after the
        # first download.
        self.max_redirect = 4

    def _last_link(self):
        """Return the most recent redirect, or the requested url."""
        if len(self.redirects) > 0:
            return self.redirects[-1]
        return self.url

    # Stops processing if the content is not text;
    # records location and refresh redirects.
    def header(self, data):
        """pycurl HEADERFUNCTION callback, called once per header line.

        :param data: one raw header line as bytes
        :returns: ``None`` to continue, ``0`` to abort the transfer
        """
        if len(data) == 0:
            return None
        line = str(data, encoding="utf8")
        self.header_lines.append(line)
        s = line.find(" ")
        if s >= 1 and s < len(line):
            # "Name: value" -> the character before the first space is ":".
            key = line[0:s - 1]
            value = line[s + 1:].rstrip()
            self.headers[key] = value
            kl = key.lower()
            if kl == "refresh":
                # The value looks like "0; url=...": only the url part is a
                # link, so extract it instead of recording the whole value.
                target = get_refresh(value, self._last_link())
                if target is not None:
                    self.add_redirect(target)
            elif kl == "location":
                self.add_redirect(value)
            elif kl == "content-type" and "text" not in value:
                # pycurl then raises error 23 (failed writing header)
                return 0

    def crawl_delay(self, domain):
        """Sleep for the crawl delay of ``domain``.

        Uses the delay from the cached robots.txt for this user agent when
        it defines one, otherwise 4 seconds.
        """
        self.cache_robot(domain)
        delay = 4
        robot = self.robots.get(domain)
        if robot is not None:
            d = robot.crawl_delay(self.useragent)
            if d is not None:
                delay = d
        print("Waiting for {} s".format(delay))
        time.sleep(delay)

    def __del__(self):
        # The handle may be missing if __init__ failed before creating it.
        handle = getattr(self, "c", None)
        if handle is not None:
            handle.close()

    def close(self):
        """Close the underlying curl handle."""
        self.c.close()

    def add_redirect(self, link):
        """Normalize ``link`` against the last known link and record it.

        Links equal to the last link or already recorded are skipped.
        """
        last_link = self._last_link()
        v = urlunparse(normalize_link(link, last_link))
        if v != last_link and v not in self.redirects:
            self.redirects.append(v)

    def _download(self, url):
        """Download ``url`` and classify the outcome.

        curl errors are caught and turned into a status label; they are not
        re-raised.

        :returns: tuple ``(content, link_status)``. ``content`` is a
            temporary file positioned at the start, or ``None`` when the
            transfer raised. ``link_status`` is one of "good",
            "bad_httpcode", "bad_type", "bad_connection", "bad_unicode".
        """
        print("Downloading " + url)
        self.url = url
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        content = None
        link_status = "bad_connection"
        buffer = tempfile.SpooledTemporaryFile()
        try:
            self.c.setopt(self.c.WRITEDATA, buffer)
            self.c.setopt(self.c.URL, url)
            self.c.perform()
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if self.status != 200:
                link_status = "bad_httpcode"
            elif "Content-Type" in self.headers and not self.headers["Content-Type"].startswith("text"):
                link_status = "bad_type"
            else:
                link_status = "good"
            buffer.seek(0)
            content = buffer
        except pycurl.error as e:
            errno = e.args[0]
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            link_status = self._CURL_ERROR_STATUS.get(errno, "bad_connection")
        except (UnicodeDecodeError, UnicodeEncodeError):
            link_status = "bad_unicode"
        finally:
            # Nothing is handed to the caller on failure, so release the
            # temporary file right away.
            if content is None:
                buffer.close()
        sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
        tt = self.c.getinfo(self.c.TOTAL_TIME)
        print("{} Received {} bytes in {} s".format(self.status, sz, tt))
        return content, link_status

    def html_download2(self, url):
        """Download ``url``, following HTML meta-refresh redirects.

        :returns: list of ``Response`` objects, one per download, in order.
            At most ``max_redirect + 1`` downloads are made.
        """
        dlink = url
        responses = []
        while len(responses) < self.max_redirect + 1:
            nl = normalize_link(dlink)
            url = urlunparse(nl)
            assert url.startswith("http")
            content, link_status = self._download(url)
            response = Response(url, "\r\n".join(self.header_lines), self.status, content, self.redirects, link_status)
            dlink = response.get_metarefresh()
            responses.append(response)
            if dlink is None:
                break
        return responses

    def cache_robot(self, domain):
        """Download and cache robots.txt of ``domain`` if not cached yet.

        ``self.robots[domain]`` becomes a parsed ``RobotFileParser``, or
        ``None`` when robots.txt could not be downloaded as good text.
        """
        if domain in self.robots:
            return
        roboturl = urlunparse(("https", domain, "robots.txt", ""))
        try:
            content, link_status = self._download(roboturl)
            try:
                if link_status == "good":
                    raw = content.read()
                    lines = str(raw, errors="ignore", encoding="utf8").split("\n")
                    self.robots[domain] = urllib.robotparser.RobotFileParser()
                    self.robots[domain].parse(lines)
                else:
                    self.robots[domain] = None
            finally:
                # The body is fully consumed here; do not keep the
                # temporary file around.
                if content is not None:
                    content.close()
        except pycurl.error as err:
            print(err)

    def is_robot_good(self, url):
        """Return True when robots.txt allows this user agent to fetch ``url``.

        Links are allowed when no robots.txt is cached for the domain.
        """
        schema, domain, path, query = normalize_link(url)
        self.cache_robot(domain)
        res = True
        if domain in self.robots and self.robots[domain] is not None:
            # Ask for the agent name that is actually sent with requests,
            # consistently with crawl_delay.
            res = self.robots[domain].can_fetch(self.useragent, url)
        return res
|
|
|
|
|
|
|
|
class ParsedDocument:
    """Text, metadata and outgoing links extracted from one document.

    ``parser`` supplies the extraction routines; ``work_link`` is the link
    of the document and the default base for relative links.
    """

    def __init__(self, parser, work_link):
        self.parser = parser
        self.work_link = work_link

        self.content = None
        self.bs = None
        self.paragraph_checksums = None
        self.paragraph_sizes = None

        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def extract(self, content, bs):
        """Fill this object from raw ``content`` and its parsed tree ``bs``.

        Text and paragraph checksums are always computed. Metadata and
        links are only extracted when ``bs`` is not None.
        """
        self.content = content
        self.bs = bs

        # Extract text and metatext
        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
        # Paragraph checksums
        pch, pszs = self.parser.calculate_checksums(self.body)
        self.paragraph_checksums = pch
        self.paragraph_sizes = pszs
        if bs is None:
            return
        (self.tags, self.authors, self.title, self.article_published_time,
         self.description, self.section) = self.parser.extract_meta(bs)

        # Extract links from the page
        base = self.work_link
        if bs.base is not None and "href" in bs.base.attrs:
            base = bs.base["href"]
        # Normalize links
        for anchor in bs.find_all("a", href=True):
            # BeautifulSoup returns "rel" as a list of values; a plain
            # string is accepted as well.
            rel = anchor.attrs.get("rel") or []
            if isinstance(rel, str):
                rel = rel.split()
            if "nofollow" in (v.lower() for v in rel) or "nofollow" in anchor.attrs:
                continue
            href = anchor["href"]
            try:
                link = urlunparse(normalize_link(href, base))
            except ValueError:
                # Links that cannot be normalized are skipped.
                continue
            if link == base:
                continue
            self.link_set.add(link)

    def get_links(self):
        """Return the set of normalized links found by ``extract``."""
        return self.link_set

    def get_follow_links(self):
        """Return the links the parser accepts, normalized for following."""
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
                link = normalize_link(l, strip_query=self.parser.strip_query)
                follow_links.add(urlunparse(link))
        return follow_links

    def __str__(self):
        """Return the title and the first 20 characters of the body."""
        r = []
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)
|
|
|
|
|
|
|
|
|
|
|
|
def get_domains(arg):
    """Return the list of domains named by ``arg``.

    :param arg: ``"-"`` to read one domain per line from standard input,
        otherwise a comma-separated list of domains
    :returns: list of domain strings; surrounding whitespace is removed
        and empty entries are dropped in both modes
    """
    if arg == "-":
        candidates = sys.stdin
    else:
        candidates = arg.split(",")
    stripped = (candidate.strip() for candidate in candidates)
    return [domain for domain in stripped if domain]
|
|
|
|
|
2020-05-09 09:50:50 +00:00
|
|
|
def parse_and_index(work_link,parser,responses,db):
    """Index the downloaded responses and the content of the last one.

    :returns: tuple ``(target_link, links)``. When the last response was
        parsed, ``target_link`` is its canonical link and ``links`` the
        links extracted from it; otherwise ``work_link`` and an empty list.
    """
    if len(responses) == 0:
        return work_link, []
    db.index_responses(work_link, responses)
    final = responses[-1]
    if final.bs is None:
        return work_link, []
    canonical = final.get_canonical()
    document = ParsedDocument(parser, canonical)
    document.extract(final.content, final.bs)
    db.index_content(canonical, document)
    return canonical, document.get_links()
|
|
|
|
|
|
|
|
def visit_sitemap(domain,connection,parser,db):
    """Download the front page of ``domain`` and index it.

    :returns: ``False`` when nothing was downloaded or the last response
        could not be parsed, ``True`` otherwise.
    """
    front_page = "http://" + domain
    print("Sitemap visit: " + front_page)
    responses = connection.html_download2(front_page)
    # No response at all, or an unparsed final response.
    if len(responses) == 0 or responses[-1].bs is None:
        return False

    _, found = parse_and_index(front_page, parser, responses, db)
    if len(found) > 0:
        db.index_follow_links(parser, found, connection)
    return True
|
|
|
|
|
|
|
|
|
2020-06-06 09:29:36 +00:00
|
|
|
def visit_links(links,connection,parser,db,is_online):
    """Check every link and, when online, download and index it.

    Links rejected by the parser are marked "bad_link"; when online, links
    forbidden by robots.txt are marked "bad_robot" and the remaining ones
    are downloaded and indexed. Links collected from the downloaded pages
    are indexed as follow links at the end.
    """
    # NOTE(review): block nesting was reconstructed from a flattened
    # listing; download, indexing and crawl delay are taken to belong to
    # the final "online" branch -- confirm against the repository.
    collected = []
    for work_link in links:
        if not parser.is_link_good(work_link):
            db.update_link_status(work_link, "bad_link")
            continue
        if not is_online:
            # If it is not online, then just check links.
            continue
        if not connection.is_robot_good(work_link):
            db.update_link_status(work_link, "bad_robot")
            continue
        responses = connection.html_download2(work_link)
        target_link, page_links = parse_and_index(work_link, parser, responses, db)
        domain = normalize_link(target_link)[1]
        connection.crawl_delay(domain)
        collected += page_links
    if len(collected) > 0:
        db.index_follow_links(parser, collected, connection)
|
|
|
|
|
|
|
|
def visit_domain(domain,parser,db):
    """Crawl one domain: front page first, then several rounds of links.

    The front page is visited only when the parser accepts the domain.
    When that visit succeeds, ``parser.crawl_rounds`` rounds of links
    selected by the database are visited. ``db.check_domain`` is called
    after each round, or once when the domain is not online.

    :returns: always ``True``
    """
    # NOTE(review): block nesting was reconstructed from a flattened
    # listing; check_domain is taken to run once per crawl round -- confirm
    # against the repository.
    c = Connection()
    p = parser
    try:
        # Get links from frontpage
        # TODO Sitemap
        is_online = False
        if parser.is_domain_good(domain):
            # Is domain online?
            is_online = visit_sitemap(domain, c, parser, db)
        if is_online:
            for _ in range(p.crawl_rounds):
                # Visit links from frontpage
                links = db.get_visit_links(domain, p.recent_links, p.old_links, p.random_links)
                visit_links(links, c, p, db, is_online)
                db.check_domain(domain)
        else:
            db.check_domain(domain)
    finally:
        # Release the curl handle even when a visit raises.
        c.close()
    return True
|
|
|
|
|
|
|
|
def process_domains(domains,visit,parser,db,queue):
    """Print the agenda, then queue and/or visit the given domains.

    ``domains`` is shuffled in place. Each entry is indexable and its
    first item is the domain name. When ``queue`` is not None every domain
    name is put on it and the queue is closed; when ``visit`` is true every
    domain is visited.
    """
    print("Websucker Agenda>>")
    random.shuffle(domains)
    for entry in domains:
        assert len(entry[0]) > 1
        print(entry)

    if queue is not None:
        print("Queuing:")
        for entry in domains:
            print(entry)
            queue.put(entry[0])
        queue.close()

    if visit:
        print("Visiting:")
        for entry in domains:
            print(entry)
            visit_domain(entry[0], parser, db)
|
|
|
|
|
|
|
|
def work_domains(parser,db,queue):
    """Worker loop: reserve jobs from ``queue`` and visit their domains.

    Each job is buried while its domain (the job body) is being visited
    and deleted afterwards. The loop never returns.
    """
    while True:
        print("Waiting for a new job:")
        task = queue.reserve()
        target = task.body
        queue.bury(task)
        print("Visiting:")
        visit_domain(target, parser, db)
        queue.delete(task)
|