websucker-pip/websucker/agent.py

365 lines
12 KiB
Python
Raw Normal View History

2020-05-07 14:09:45 +00:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.parse
import urllib.error
import os
import os.path
import re
import datetime
import time
import sys
import tempfile
import pprint
import bs4
import pycurl
import urllib.robotparser
from websucker.parser import normalize_link,urlunparse
def get_refresh(ref,target_link):
    """Parse an HTTP refresh value taken from a header or an HTML meta tag.

    The expected shape is ``<delay>; url=<target>``, for example
    ``5; url=http://example.com/next``.

    :param ref: raw refresh value.
    :param target_link: link passed to ``normalize_link`` as the base for
        the extracted target.
    :returns: ``urlunparse(normalize_link(target, target_link))`` or None
        when ``ref`` carries no ``url=`` part.
    """
    # Split on the first ";" only, so a target that itself contains ";"
    # is not truncated.
    _delay, separator, rest = ref.strip().partition(";")
    # Whitespace after the ";" is common ("5; url=..."), so strip it before
    # looking for the "url=" prefix.
    rest = rest.strip()
    if not separator or not rest.lower().startswith("url="):
        return None
    # The target may be wrapped in single or double quotes.
    target = rest[4:].strip().strip("'\"")
    return urlunparse(normalize_link(target, target_link))
class Response:
    """One downloaded document together with its download metadata.

    When ``content`` is present and ``link_status`` is ``"good"`` the content
    is parsed with BeautifulSoup (lxml) and kept in ``self.bs``; otherwise
    ``self.bs`` stays None.
    """

    def __init__(self,url,headers,status,content,redirects,link_status):
        """Store the download result.

        :param url: requested URL; must be non-empty and must not start
            with "/".
        :param headers: raw header text.
        :param status: HTTP status code.
        :param content: binary file-like object with the body, or None.
        :param redirects: list of links recorded while downloading.
        :param link_status: status string; only "good" triggers parsing.
        """
        assert len(url) > 0
        assert url[0] != "/"
        self.url = url
        self.status = status
        self.content = content
        self.headers = headers
        self.redirects = redirects
        self.visited_time = datetime.date.today()
        self.bs = None
        self.link_status = link_status
        if content is not None and link_status == "good":
            self.bs = bs4.BeautifulSoup(content, "lxml")

    def __str__(self):
        return "{} {} {}".format(self.url,self.get_canonical(),self.link_status)

    def get_content(self):
        """Return the body decoded as UTF-8 (invalid bytes replaced), or None."""
        if self.content is None:
            print("NO CONTENT")
            print(self.url,self.redirects)
            return None
        self.content.seek(0)
        text = self.content.read()
        return str(text,encoding="utf8",errors="replace")

    # HTML metarefresh redirect
    def get_metarefresh(self):
        """Return the normalized target of an HTML meta refresh, or None.

        None is also returned when there is no content or when the content
        was not parsed (``self.bs`` is None).
        """
        if self.content is None or self.bs is None:
            return None
        metarefresh = None
        # http-equiv values are matched case-insensitively ("Refresh" is
        # as common as "refresh").
        refresh_tags = self.bs.find_all(
            "meta", attrs={"http-equiv": re.compile(r"^refresh$", re.IGNORECASE)})
        canonical = self.get_canonical()
        for tag in refresh_tags:
            # `"content" in tag` would test the tag's children, not its
            # attributes, so look into the attribute dict explicitly.
            if "content" in tag.attrs:
                metarefresh = get_refresh(tag["content"],canonical)
        if metarefresh is not None:
            nl = normalize_link(metarefresh, canonical)
            print("Metarefresh")
            print(nl)
            metarefresh = urlunparse(nl)
        return metarefresh

    def get_canonical(self):
        """Return the canonical link of this response.

        Uses ``<link rel="canonical" href=...>`` from the parsed document when
        available; otherwise the last recorded redirect, or the requested
        URL when there were no redirects.
        """
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        r = None
        if self.bs is not None:
            l = self.bs.find("link", rel="canonical", href=True)
            if l is not None:
                r = urlunparse(normalize_link(l["href"], last_link))
        if r is None:
            r = urlunparse(normalize_link(last_link, last_link))
        assert len(r) > 0
        assert r[0] != "/"
        return r

    def get_redirects(self):
        """Return all recorded redirects except the last one."""
        if len(self.redirects) < 2:
            return []
        return self.redirects[0:-1]
class Connection:
    """Reusable pycurl handle that downloads pages and checks robots.txt.

    Per-download state (``headers``, ``header_lines``, ``redirects``,
    ``status``, ``url``) is reset at the start of every ``_download`` call.
    """

    def __init__(self):
        self.c = pycurl.Curl()
        self.c.setopt(self.c.FOLLOWLOCATION, True)
        # self.c.setopt(self.c.VERBOSE, True)
        self.c.setopt(self.c.CONNECTTIMEOUT, 20)
        self.c.setopt(self.c.TIMEOUT, 20)
        self.c.setopt(self.c.FAILONERROR, True)
        self.c.setopt(self.c.HTTPHEADER, [
            'Accept: text/html', 'Accept-Charset: UTF-8'])
        self.c.setopt(self.c.HEADERFUNCTION, self.header)
        self.c.setopt(self.c.USERAGENT, "Googlebot-News")
        # #self.c.setopt(pycurl.COOKIEJAR, 'cookie.txt')
        # #self.c.setopt(pycurl.COOKIEFILE, 'cookie.txt')
        # Cache of robots.txt parsers per domain (None = no usable robots.txt).
        self.robots = {}
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        # URL of the download in progress; set by _download.
        self.url = None
        # NOTE(review): this value is not passed to curl anywhere in this
        # class (no MAXREDIRS option is set) - confirm whether it should be.
        self.max_redirect = 4

    def _get_header(self, name):
        """Return the value of header ``name`` (case-insensitive) or None."""
        wanted = name.lower()
        for key, value in self.headers.items():
            if key.lower() == wanted:
                return value
        return None

    # Stops processing when the content is not text,
    # records Location and Refresh targets.
    def header(self, data):
        """pycurl header callback, invoked with one raw header line.

        Returns 0 to abort the transfer when the Content-Type is not
        textual, otherwise None.
        """
        if len(data) == 0:
            return None
        line = str(data, encoding="utf8")
        self.header_lines.append(line)
        key, separator, value = line.partition(":")
        key = key.strip()
        if not separator or not key or " " in key:
            # Status lines ("HTTP/1.1 200 OK") and the blank terminator line
            # carry no header field.
            return None
        value = value.strip()
        self.headers[key] = value
        # Header names are case-insensitive (HTTP/2 sends them lower-case).
        name = key.lower()
        if name == "refresh":
            # A Refresh value looks like "<delay>; url=<target>"; only the
            # target is a link.
            base = self.url
            if len(self.redirects) > 0:
                base = self.redirects[-1]
            target = get_refresh(value, base)
            if target is not None:
                self.add_redirect(target)
        elif name == "location":
            self.add_redirect(value)
        elif name == "content-type" and "text" not in value:
            # pycurl then raises error 23, failed writing header
            return 0
        return None

    def __del__(self):
        # The handle may be missing when __init__ failed early.
        handle = getattr(self, "c", None)
        if handle is not None:
            handle.close()

    def close(self):
        """Close the underlying curl handle."""
        self.c.close()

    def add_redirect(self,link):
        """Record ``link`` (normalized against the previous link) as a redirect.

        Links equal to the previous link or already recorded are skipped.
        """
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        v = urlunparse(normalize_link(link, last_link))
        if v != last_link and v not in self.redirects:
            self.redirects.append(v)

    def _download(self, url):
        """Download ``url`` into a temporary file.

        @returns content, link_status - content is a file object positioned
            at the start, or None on failure; link_status is one of "good",
            "bad_httpcode", "bad_type", "bad_unicode".
        @throws pycurl.error for curl errors other than 22 and 23
        """
        print("Downloading " + url)
        self.url = url
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        link_status = "bad_connection"
        content = tempfile.SpooledTemporaryFile()
        try:
            self.c.setopt(self.c.WRITEDATA, content)
            self.c.setopt(self.c.URL, url)
            self.c.perform()
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            content_type = self._get_header("content-type")
            if self.status != 200:
                link_status = "bad_httpcode"
            elif content_type is not None and not content_type.startswith("text"):
                link_status = "bad_type"
            else:
                link_status = "good"
            content.seek(0)
        except pycurl.error as e:
            # Release the temporary file instead of leaving it to the GC.
            content.close()
            content = None
            errno, message = e.args
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if errno == 23:
                # 23 means the header callback rejected the content type
                link_status = "bad_type"
            elif errno == 22:
                link_status = "bad_httpcode"
            else:
                raise
        except (UnicodeDecodeError, UnicodeEncodeError):
            content.close()
            content = None
            link_status = "bad_unicode"
        sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
        tt = self.c.getinfo(self.c.TOTAL_TIME)
        print("{} Received {} bytes in {} s".format(self.status,sz,tt))
        return content, link_status

    # Throws pycurl.error
    def html_download2(self, url):
        """Download ``url`` and follow HTML meta refreshes.

        At most 5 downloads are made. Returns the list of Response objects
        in download order; the last one is the final document.
        """
        dlink = url
        responses = []
        while len(responses) < 5:
            nl = normalize_link(dlink)
            url = urlunparse(nl)
            assert url.startswith("http")
            content, link_status = self._download(url)
            response = Response(url,"\r\n".join(self.header_lines),self.status,content,self.redirects,link_status)
            dlink = response.get_metarefresh()
            responses.append(response)
            if dlink is None:
                break
        return responses

    def is_robot_good(self, url):
        """Return True when robots.txt of the URL's domain allows ``url``.

        The parsed robots.txt is cached per domain. When robots.txt cannot
        be downloaded or is not usable, the URL is allowed.
        """
        schema, domain, path, query = normalize_link(url)
        if domain not in self.robots:
            roboturl = urlunparse((schema, domain, "robots.txt", ""))
            try:
                content, link_status = self._download(roboturl)
            except pycurl.error as err:
                # Nothing is cached, so the download is retried next time.
                print(err)
            else:
                rules = None
                try:
                    if link_status == "good":
                        text = str(content.read(), errors="ignore", encoding="utf8")
                        rules = urllib.robotparser.RobotFileParser()
                        rules.parse(text.split("\n"))
                finally:
                    if content is not None:
                        content.close()
                self.robots[domain] = rules
        rules = self.robots.get(domain)
        if rules is None:
            return True
        # NOTE(review): rules are checked for agent "Agent" while requests
        # are sent with user agent "Googlebot-News" - confirm this is intended.
        return rules.can_fetch("Agent", url)
class ParsedDocument:
    """Text, metadata and outgoing links extracted from one document."""

    def __init__(self, parser,work_link):
        """
        :param parser: object providing extract_raw_text, calculate_checksums,
            extract_og, is_link_good and strip_query.
        :param work_link: link of the document; default base for its links.
        """
        self.parser = parser
        self.work_link = work_link
        self.content = None
        self.bs = None
        self.paragraph_checksums = None
        self.paragraph_sizes = None
        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    @staticmethod
    def _is_nofollow(anchor):
        """Return True when the anchor is marked as nofollow."""
        if "nofollow" in anchor.attrs:
            return True
        rel = anchor.attrs.get("rel")
        if rel is None:
            return False
        # BeautifulSoup returns multi-valued attributes such as rel as a
        # list; accept a plain string as well.
        if isinstance(rel, str):
            rel = rel.split()
        return any(value.lower() == "nofollow" for value in rel)

    def extract(self,content,bs):
        """Fill this document from raw ``content`` and its parsed tree ``bs``.

        When ``bs`` is None only the text, date and paragraph checksums are
        extracted.
        """
        self.content = content
        self.bs = bs

        # Extract text and metatext
        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
        # Paragraph Checksums
        pch,pszs = self.parser.calculate_checksums(self.body)
        self.paragraph_checksums = pch
        self.paragraph_sizes = pszs
        if bs is None:
            return
        (self.tags, self.authors, self.title, self.article_published_time,
         self.description, self.section) = self.parser.extract_og(bs)

        # Extraction of links from the page
        base = self.work_link
        if bs.base is not None and "href" in bs.base.attrs:
            base = bs.base["href"]
        # Link normalization
        for anchor in bs.find_all("a", href=True):
            if self._is_nofollow(anchor):
                continue
            href = anchor["href"]
            try:
                nl = normalize_link(href, base)
                link = urlunparse(nl)
                if link == base:
                    continue
                self.link_set.add(link)
            except ValueError:
                # Links that cannot be normalized are skipped.
                pass

    def get_links(self):
        """Return the set of normalized links found by ``extract``."""
        return self.link_set

    def get_follow_links(self):
        """Return links accepted by ``parser.is_link_good``, re-normalized
        with the parser's ``strip_query`` setting."""
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
                link = normalize_link(l,strip_query=self.parser.strip_query)
                follow_links.add(urlunparse(link))
        return follow_links

    def __str__(self):
        r = []
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                r.append(self.body)
            else:
                # Show only the beginning of a long body.
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)
def get_domains(arg):
    """Return the list of domains named by ``arg``.

    :param arg: "-" to read one domain per line from standard input,
        otherwise a comma-separated list of domains.
    :returns: list of domains with surrounding whitespace removed and empty
        entries dropped.
    """
    if arg == "-":
        candidates = sys.stdin
    else:
        candidates = arg.split(",")
    # Apply the same clean-up to both sources so "a.com,,b.com" or
    # "a.com, b.com" cannot yield empty or padded domain names.
    stripped = (candidate.strip() for candidate in candidates)
    return [domain for domain in stripped if domain]
def visit_links(links,connection,parser,db):
    """Download every link, index the results and queue the found links.

    For each link accepted by both ``parser.is_link_good`` and
    ``connection.is_robot_good`` the page is downloaded (followed by a
    4 second pause). The responses, possibly an empty list, are always
    passed to ``db.index_responses``. When the final response has content
    it is parsed and indexed under its canonical link. All collected
    outgoing links are finally passed to ``db.index_follow_links``.
    """
    collected = []
    for work_link in links:
        fetched = []
        allowed = parser.is_link_good(work_link) and connection.is_robot_good(work_link)
        if allowed:
            fetched = connection.html_download2(work_link)
            # Pause between downloads.
            time.sleep(4)
        db.index_responses(work_link, fetched)
        if not fetched:
            continue
        final = fetched[-1]
        if final.content is None:
            continue
        target_link = final.get_canonical()
        document = ParsedDocument(parser, target_link)
        document.extract(final.content, final.bs)
        db.index_content(target_link, document)
        collected.extend(document.get_links())
    if collected:
        db.index_follow_links(parser, collected, connection)
def visit_domain(domain,parser,db):
    """Crawl one domain.

    The front page (``http://<domain>``) is visited first, then
    ``parser.crawl_rounds`` rounds of links selected by
    ``db.get_visit_links``. ``db.check_domain`` is called after the front
    page and after every round.
    """
    connection = Connection()
    # Get links from frontpage
    # TODO Sitemap
    front_page = "http://" + domain
    visit_links([front_page], connection, parser, db)
    db.check_domain(domain)
    for _ in range(parser.crawl_rounds):
        # Visit links from frontpage
        batch = db.get_visit_links(
            domain, parser.recent_links, parser.old_links, parser.random_links)
        visit_links(batch, connection, parser, db)
        db.check_domain(domain)