#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.parse
import urllib.error
import os
import os.path
import re
import datetime
import time
import sys
import tempfile
import pprint

import bs4
import pycurl
import urllib.robotparser

from websucker.parser import normalize_link, urlunparse


# Parses an HTTP refresh value from a header or an HTML meta tag
def get_refresh(ref, target_link):
    refresh = None
    tokens = ref.strip().split(";")
    if len(tokens) > 1 and tokens[1].lower().startswith("url="):
        refresh = urlunparse(normalize_link(
            tokens[1][4:].strip("'"), target_link))
    return refresh


class Response:
    def __init__(self, url, headers, status, content, redirects, link_status):
        assert len(url) > 0
        assert url[0] != "/"
        self.url = url
        self.status = status
        self.content = content
        self.headers = headers
        self.redirects = redirects
        self.visited_time = datetime.date.today()
        self.bs = None
        self.link_status = link_status
        if content is not None and link_status == "good":
            self.bs = bs4.BeautifulSoup(content, "lxml")

    def __str__(self):
        return "{} {} {}".format(self.url, self.get_canonical(), self.link_status)

    def get_content(self):
        if self.content is None:
            print("NO CONTENT")
            print(self.url, self.redirects)
            return None
        self.content.seek(0)
        text = self.content.read()
        out = str(text, encoding="utf8", errors="replace")
        return out

    # HTML meta refresh redirect
    def get_metarefresh(self):
        if self.content is None or self.bs is None:
            return None
        metarefresh = None
        t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})
        canonical = self.get_canonical()
        for tag in t:
            # The refresh target is in the "content" attribute of the meta tag
            if "content" in tag.attrs:
                metarefresh = get_refresh(tag["content"], canonical)
        if metarefresh is not None:
            nl = normalize_link(metarefresh, canonical)
            print("Metarefresh")
            print(nl)
            metarefresh = urlunparse(nl)
        return metarefresh

    def get_canonical(self):
        r = None
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        if self.bs is not None:
            l = self.bs.find("link", rel="canonical", href=True)
            if l is not None:
                r = urlunparse(normalize_link(l["href"], last_link))
        if r is None:
            r = last_link
        r = urlunparse(normalize_link(r, last_link))
        assert len(r) > 0
        assert r[0] != "/"
        return r

    def get_redirects(self):
        if len(self.redirects) < 2:
            return []
        return self.redirects[0:-1]
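
# A minimal usage sketch for get_refresh, assuming that normalize_link and
# urlunparse resolve a relative target against the base link in the usual way.
# The URLs are made up for illustration; this helper is not called anywhere
# in the module.
def _example_get_refresh():
    # A typical refresh value has the form "<seconds>; url=<target>"
    link = get_refresh("5; url=/archiv/2020", "http://example.com/index.html")
    print(link)  # expected to be an absolute link under example.com
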
class Connection:
    def __init__(self):
        self.c = pycurl.Curl()
        self.c.setopt(self.c.FOLLOWLOCATION, True)
        # self.c.setopt(self.c.VERBOSE, True)
        self.c.setopt(self.c.CONNECTTIMEOUT, 20)
        self.c.setopt(self.c.TIMEOUT, 20)
        self.c.setopt(self.c.FAILONERROR, True)
        self.c.setopt(self.c.HTTPHEADER, [
            'Accept: text/html', 'Accept-Charset: UTF-8'])
        self.c.setopt(self.c.HEADERFUNCTION, self.header)
        self.c.setopt(self.c.USERAGENT, "Googlebot-News")
        # self.c.setopt(pycurl.COOKIEJAR, 'cookie.txt')
        # self.c.setopt(pycurl.COOKIEFILE, 'cookie.txt')
        self.robots = {}
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        self.max_redirect = 4

    # Stops processing if the content is not text.
    # Records Location and Refresh headers as redirects.
    def header(self, data):
        if len(data) == 0:
            return None
        l = str(data, encoding="utf8")
        self.header_lines.append(l)
        s = l.find(" ")
        if s >= 1 and s < len(l):
            key = l[0:s - 1]
            value = l[s + 1:].rstrip()
            self.headers[key] = value
            if key.lower() == "refresh":
                self.add_redirect(value)
            elif key.lower() == "location":
                self.add_redirect(value)
            elif key == "Content-Type" and "text" not in value:
                # pycurl then raises error 23, "failed writing header"
                return 0

    def __del__(self):
        self.c.close()

    def close(self):
        self.c.close()

    def add_redirect(self, link):
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        v = urlunparse(normalize_link(link, last_link))
        if v != last_link and v not in set(self.redirects):
            self.redirects.append(v)

    def _download(self, url):
        """
        @returns content, link_status
        @throws pycurl.error
        """
        print("Downloading " + url)
        self.url = url
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        content = None
        link_status = "bad_connection"
        try:
            self.headers = {}
            del self.header_lines[:]
            content = tempfile.SpooledTemporaryFile()
            self.c.setopt(self.c.WRITEDATA, content)
            self.c.setopt(self.c.URL, url)
            self.c.perform()
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if self.status != 200:
                link_status = "bad_httpcode"
            elif "Content-Type" in self.headers and not self.headers["Content-Type"].startswith("text"):
                link_status = "bad_type"
            else:
                link_status = "good"
                content.seek(0)
        except pycurl.error as e:
            errno, message = e.args
            content = None
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if errno == 23:
                # 23 means the header callback rejected a non-text content type
                link_status = "bad_type"
            elif errno == 22:
                link_status = "bad_httpcode"
            else:
                raise e
        except UnicodeDecodeError:
            content = None
            link_status = "bad_unicode"
        except UnicodeEncodeError:
            content = None
            link_status = "bad_unicode"
        sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
        tt = self.c.getinfo(self.c.TOTAL_TIME)
        print("{} Received {} bytes in {} s".format(self.status, sz, tt))
        return content, link_status

    # Follows HTML meta refresh redirects, up to max_redirect + 1 responses.
    # Throws pycurl.error
    def html_download2(self, url):
        dlink = url
        responses = []
        while len(responses) <= self.max_redirect:
            nl = normalize_link(dlink)
            url = urlunparse(nl)
            assert url.startswith("http")
            content, link_status = self._download(url)
            response = Response(url, "\r\n".join(self.header_lines), self.status, content, self.redirects, link_status)
            dlink = response.get_metarefresh()
            responses.append(response)
            if dlink is None:
                break
        return responses

    def is_robot_good(self, url):
        schema, domain, path, query = normalize_link(url)
        res = True
        if domain not in self.robots:
            roboturl = urlunparse((schema, domain, "robots.txt", ""))
            try:
                r = self._download(roboturl)
                if r[1] == "good":
                    c = r[0].read()
                    lines = str(c, errors="ignore", encoding="utf8").split("\n")
                    self.robots[domain] = urllib.robotparser.RobotFileParser()
                    self.robots[domain].parse(lines)
                else:
                    self.robots[domain] = None
            except pycurl.error as err:
                print(err)
        if domain in self.robots and self.robots[domain] is not None:
            res = self.robots[domain].can_fetch("Agent", url)
        return res
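
# A minimal usage sketch for Connection, assuming network access and that the
# example URL is reachable; it is not called anywhere in the module. Each hop
# (HTTP redirect chain plus any meta refresh) yields one Response object.
def _example_connection():
    conn = Connection()
    if conn.is_robot_good("http://example.com/"):
        responses = conn.html_download2("http://example.com/")
        for r in responses:
            print(r.status, r.link_status, r.get_canonical())
    conn.close()
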
and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs: continue href = l["href"] try: nl = normalize_link(href, base) link = urlunparse(nl) if link == base: continue self.link_set.add(link) except ValueError: pass def get_links(self): return self.link_set def get_follow_links(self): follow_links = set() for l in self.link_set: if self.parser.is_link_good(l): link = normalize_link(l,strip_query=self.parser.strip_query) follow_links.add(urlunparse(link)) return follow_links def __str__(self): r = [] if self.title is not None: r.append(self.title) if self.body is not None: if (len(self.body) < 20): r.append(self.body) else: r.append(self.body[0:20]) + " ...." return ">>> ".join(r) def get_domains(arg): domains = [] if arg == "-": for l in sys.stdin: domain = l.rstrip() assert(domain is not None) if len(domain) == 0: continue domains.append(domain) else: domains = arg.split(",") return domains def visit_links(links,connection,parser,db): outlinks = [] for work_link in links: responses = [] if parser.is_link_good(work_link) and connection.is_robot_good(work_link): responses = connection.html_download2(work_link) time.sleep(4) db.index_responses(work_link,responses) if len(responses) > 0: lr = responses[-1] if lr.content is not None: target_link = lr.get_canonical() parsed = ParsedDocument(parser,target_link) parsed.extract(lr.content, lr.bs) db.index_content(target_link,parsed) outlinks += parsed.get_links() if len(outlinks) > 0: db.index_follow_links(parser,outlinks,connection) def visit_domain(domain,parser,db): c = Connection() p = parser # Get links from frontpage # TODO Sitemap sitemap = "http://" + domain visit_links([sitemap],c,p,db) db.check_domain(domain) for i in range(p.crawl_rounds): # Visit links from frontpage links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links) visit_links(links,c,p,db) db.check_domain(domain)