#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.parse
import urllib.error
import os
import os.path
import re
import datetime
import time
import sys
import tempfile
import pprint
import bs4
import pycurl
import urllib.robotparser
import collections
import random

from websucker.parser import normalize_link, urlunparse


# Parses an HTTP refresh value from a header or an HTML meta tag
def get_refresh(ref, target_link):
    refresh = None
    tokens = ref.strip().split(";")
    if len(tokens) > 1 and tokens[1].lower().startswith("url="):
        refresh = urlunparse(normalize_link(
            tokens[1][4:].strip("'"), target_link))
    return refresh
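
# Illustrative calls (the values are placeholders; the exact output string
# depends on normalize_link/urlunparse from websucker.parser):
#
#   get_refresh("3; url=/moved", "http://example.com/a")
#       -> "/moved" resolved to an absolute URL against the second argument
#   get_refresh("3", "http://example.com/a")
#       -> None, because there is no "url=" part
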

class Response:
    def __init__(self, url, headers, status, content, redirects, link_status):
        assert len(url) > 0
        assert url[0] != "/"
        self.url = url
        self.status = status
        self.content = content
        self.headers = headers
        self.redirects = redirects
        self.visited_time = datetime.date.today()
        self.bs = None
        if content is not None and link_status == "good":
            try:
                self.bs = bs4.BeautifulSoup(content, "lxml")
            except ValueError:
                link_status = "bad_parse"
        self.link_status = link_status

    def __str__(self):
        return "{} {} {}".format(self.url, self.get_canonical(), self.link_status)

    def get_content(self):
        if self.content is None:
            print("NO CONTENT")
            print(self.url, self.redirects)
            return None
        self.content.seek(0)
        text = self.content.read()
        out = str(text, encoding="utf8", errors="replace")
        return out

    # HTML meta refresh redirect
    def get_metarefresh(self):
        if self.bs is None:
            return None
        metarefresh = None
        t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})
        canonical = self.get_canonical()
        for tags in t:
            if "content" in tags.attrs:
                metarefresh = get_refresh(tags["content"], canonical)
        if metarefresh is not None:
            nl = normalize_link(metarefresh, canonical)
            print("Metarefresh")
            print(nl)
            metarefresh = urlunparse(nl)
        return metarefresh

    def get_canonical(self):
        r = None
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        if self.bs is not None:
            l = self.bs.find("link", rel="canonical", href=True)
            if l is not None:
                r = urlunparse(normalize_link(l["href"], last_link))
        if r is None:
            r = last_link
        r = urlunparse(normalize_link(r, last_link))
        assert len(r) > 0
        assert r[0] != "/"
        return r

    def get_redirects(self):
        if len(self.redirects) < 2:
            return []
        return self.redirects[0:-1]


class Connection:
    def __init__(self):
        self.useragent = "Googlebot-News"
        self.c = pycurl.Curl()
        self.c.setopt(self.c.FOLLOWLOCATION, True)
        # self.c.setopt(self.c.VERBOSE, True)
        self.c.setopt(self.c.CONNECTTIMEOUT, 20)
        self.c.setopt(self.c.TIMEOUT, 20)
        self.c.setopt(self.c.FAILONERROR, True)
        self.c.setopt(self.c.HTTPHEADER, [
            'Accept: text/html', 'Accept-Charset: UTF-8'])
        self.c.setopt(self.c.HEADERFUNCTION, self.header)
        self.c.setopt(self.c.USERAGENT, self.useragent)
        # self.c.setopt(pycurl.COOKIEJAR, 'cookie.txt')
        # self.c.setopt(pycurl.COOKIEFILE, 'cookie.txt')
        self.robots = {}
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        self.max_redirect = 4

    # Stops processing if the content is not text;
    # records Location and Refresh headers.
    def header(self, data):
        if len(data) == 0:
            return None
        l = str(data, encoding="utf8")
        self.header_lines.append(l)
        s = l.find(" ")
        if s >= 1 and s < len(l):
            key = l[0:s - 1]
            value = l[s + 1:].rstrip()
            self.headers[key] = value
            kl = key.lower()
            if kl == "refresh":
                self.add_redirect(value)
            elif kl == "location":
                self.add_redirect(value)
            elif kl == "content-type" and "text" not in value:
                # pycurl then raises error 23, "failed writing header"
                return 0

    def crawl_delay(self, domain):
        self.cache_robot(domain)
        delay = 4
        if domain in self.robots:
            r = self.robots[domain]
            if r is not None:
                d = r.crawl_delay(self.useragent)
                if d is not None:
                    delay = d
        print("Waiting for {} s".format(delay))
        time.sleep(delay)

    def __del__(self):
        self.c.close()

    def close(self):
        self.c.close()

    def add_redirect(self, link):
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        v = urlunparse(normalize_link(link, last_link))
        if v != last_link and v not in set(self.redirects):
            self.redirects.append(v)

    def _download(self, url):
        """
        @returns content, link_status
        @throws pycurl.error
        """
        print("Downloading " + url)
        self.url = url
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        content = None
        link_status = "bad_connection"
        try:
            self.headers = {}
            del self.header_lines[:]
            content = tempfile.SpooledTemporaryFile()
            self.c.setopt(self.c.WRITEDATA, content)
            self.c.setopt(self.c.URL, url)
            self.c.perform()
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if self.status != 200:
                link_status = "bad_httpcode"
            elif "Content-Type" in self.headers and not self.headers["Content-Type"].startswith("text"):
                link_status = "bad_type"
            else:
                link_status = "good"
            content.seek(0)
        except pycurl.error as e:
            errno, message = e.args
            content = None
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if errno == 23:
                # 23: aborted by the header callback (non-text content type)
                link_status = "bad_type"
            elif errno == 22:
                # 22: HTTP error status (FAILONERROR)
                link_status = "bad_httpcode"
            elif errno == 28:
                # 28: connection timeout
                link_status = "bad_connection"
            elif errno == 60:
                # 60: bad SSL certificate
                link_status = "bad_connection"
            elif errno == 56:
                # 56: connection reset by peer
                link_status = "bad_connection"
            elif errno == 16:
                # 16: HTTP/2 error
                link_status = "bad_connection"
            elif errno == 92:
                # 92: HTTP/2 stream not closed
                link_status = "bad_connection"
            elif errno == 6:
                # 6: unable to resolve DNS
                link_status = "bad_connection"
            elif errno == 7:
                # 7: connection refused
                link_status = "bad_connection"
            else:
                link_status = "bad_connection"
                # raise e
        except UnicodeDecodeError as e:
            content = None
            link_status = "bad_unicode"
        except UnicodeEncodeError as e:
            content = None
            link_status = "bad_unicode"
        sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
        tt = self.c.getinfo(self.c.TOTAL_TIME)
        print("{} Received {} bytes in {} s".format(self.status, sz, tt))
        return content, link_status

    # Throws pycurl.error
    def html_download2(self, url):
        dlink = url
        responses = []
        while len(responses) < 5:
            nl = normalize_link(dlink)
            url = urlunparse(nl)
            assert url.startswith("http")
            content, link_status = self._download(url)
            response = Response(url, "\r\n".join(self.header_lines),
                                self.status, content, self.redirects, link_status)
            dlink = response.get_metarefresh()
            responses.append(response)
            if dlink is None:
                break
        return responses

    def cache_robot(self, domain):
        if domain not in self.robots:
            roboturl = urlunparse(("https", domain, "robots.txt", ""))
            try:
                r = self._download(roboturl)
                if r[1] == "good":
                    c = r[0].read()
                    lines = str(c, errors="ignore", encoding="utf8").split("\n")
                    self.robots[domain] = urllib.robotparser.RobotFileParser()
                    self.robots[domain].parse(lines)
                else:
                    self.robots[domain] = None
            except pycurl.error as err:
                print(err)

    def is_robot_good(self, url):
        schema, domain, path, query = normalize_link(url)
        self.cache_robot(domain)
        res = True
        if domain in self.robots and self.robots[domain] is not None:
            res = self.robots[domain].can_fetch("Agent", url)
        return res
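
# Sketch of a polite fetch sequence built from the Connection above
# (illustrative only; the real call sites are visit_sitemap and visit_links below):
#
#   conn = Connection()
#   if conn.is_robot_good(url):                    # honour robots.txt
#       responses = conn.html_download2(url)       # follows redirects and meta refresh
#   conn.crawl_delay(normalize_link(url)[1])       # sleep per Crawl-delay, 4 s by default
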

class ParsedDocument:
    """
    One document in the database
    """
    def __init__(self, parser, work_link):
        self.parser = parser
        self.work_link = work_link
        self.content = None
        self.bs = None
        self.paragraph_checksums = None
        self.paragraph_sizes = None
        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def extract(self, content, bs):
        """
        Parse content and fill the object
        """
        self.content = content
        self.bs = bs
        # Extract text and metatext
        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
        # Paragraph checksums
        pch, pszs = self.parser.calculate_checksums(self.body)
        self.paragraph_checksums = pch
        self.paragraph_sizes = pszs
        if bs is None:
            return
        self.tags, self.authors, self.title, self.article_published_time, self.description, self.section = self.parser.extract_meta(bs)
        # Extract links from the page
        base = self.work_link
        if bs.base is not None and "href" in bs.base.attrs:
            base = bs.base["href"]
        # Normalize the links
        for l in bs.find_all("a", href=True):
            if "nofollow" in l.attrs.get("rel", []) or "nofollow" in l.attrs:
                continue
            href = l["href"]
            try:
                nl = normalize_link(href, base)
                link = urlunparse(nl)
                if link == base:
                    continue
                self.link_set.add(link)
            except ValueError:
                pass

    def get_links(self):
        """
        @return all links
        """
        return self.link_set

    def get_follow_links(self):
        """
        @return good normalized links
        """
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
                link = normalize_link(l, strip_query=self.parser.strip_query)
                follow_links.add(urlunparse(link))
        return follow_links

    def __str__(self):
        r = []
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)


def get_domains(arg):
    """
    Get domains from the argument or from stdin.
    If arg is -, read one domain per line from stdin, otherwise split arg on commas.
    @param arg dash or domains separated by commas
    @return domains
    """
    domains = []
    if arg == "-":
        for l in sys.stdin:
            domain = l.rstrip()
            assert domain is not None
            if len(domain) == 0:
                continue
            domains.append(domain)
    else:
        domains = arg.split(",")
    return domains
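
# Illustrative calls (the domain names are placeholders):
#
#   get_domains("example.com,example.org")  ->  ["example.com", "example.org"]
#   get_domains("-")                        ->  one domain per non-empty line of stdin
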

def parse_and_index(work_link, parser, responses, db):
    """
    Take all responses from the work link, parse them and store them in the db.
    @param work_link - final link from the downloader
    @param parser to use
    @param responses from the downloader
    @param db
    """
    target_link = work_link
    links = []
    if len(responses) > 0:
        db.index_responses(work_link, responses)
        lr = responses[-1]
        if lr.bs is not None:
            target_link = lr.get_canonical()
            parsed = ParsedDocument(parser, target_link)
            parsed.extract(lr.content, lr.bs)
            db.index_content(target_link, parsed)
            links = parsed.get_links()
    return target_link, links


def visit_sitemap(domain, connection, parser, db):
    """
    Get links from the front page (sitemap) of the domain.
    """
    link = "http://" + domain
    print("Sitemap visit: " + link)
    responses = connection.html_download2(link)
    if len(responses) == 0:
        return False
    lr = responses[-1]
    if lr.bs is None:
        return False
    target_link, outlinks = parse_and_index(link, parser, responses, db)
    if len(outlinks) > 0:
        db.index_follow_links(parser, outlinks, connection)
    return True


def visit_links(links, connection, parser, db, is_online):
    """
    Visit the given links; if the site is not online, just check the links.
    """
    outlinks = []
    junklinks = []
    badrobotlinks = []
    for work_link in links:
        responses = []
        if not parser.is_link_good(work_link):
            db.update_link_status(work_link, "bad_link")
        elif is_online and not connection.is_robot_good(work_link):
            db.update_link_status(work_link, "bad_robot")
        elif is_online:
            responses = connection.html_download2(work_link)
        target_link, new_links = parse_and_index(work_link, parser, responses, db)
        nl = normalize_link(target_link)
        connection.crawl_delay(nl[1])
        outlinks += new_links
    if len(outlinks) > 0:
        db.index_follow_links(parser, outlinks, connection)


def visit_domain(domain, parser, db):
    """
    One visit of the domain:
    1. get links from the front page,
    2. visit the links and extract new links,
    3. get new links to visit,
    4. repeat the visit for parser.crawl_rounds rounds.
    """
    c = Connection()
    p = parser
    # Get links from the front page
    # TODO Sitemap
    is_online = False
    if parser.is_domain_good(domain):
        # Is the domain online?
        is_online = visit_sitemap(domain, c, parser, db)
    if is_online:
        for i in range(p.crawl_rounds):
            # Visit links from the front page
            links = db.get_visit_links(domain, p.recent_links, p.old_links, p.random_links)
            visit_links(links, c, p, db, is_online)
            db.check_domain(domain)
    else:
        db.check_domain(domain)
    return True


def process_domains(domains, visit, parser, db, queue):
    """
    Visit all domains in the list.
    If queue is set, queue the domains instead of visiting them immediately.
    """
    print("Websucker Agenda>>")
    random.shuffle(domains)
    for domain in domains:
        assert len(domain[0]) > 1
        print(domain)
    if queue is not None:
        print("Queuing:")
        for domain in domains:
            print(domain)
            queue.put(domain[0])
        queue.close()
    if visit:
        print("Visiting:")
        for domain in domains:
            print(domain)
            visit_domain(domain[0], parser, db)


def work_domains(parser, db, queue):
    """
    Poll the queue and visit the domains it yields.
    """
    while True:
        print("Waiting for a new job:")
        job = queue.reserve()
        domain = job.body
        queue.bury(job)
        print("Visiting:")
        visit_domain(domain, parser, db)
        queue.delete(job)
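

if __name__ == "__main__":
    # Minimal usage sketch: download a single page with the Connection defined
    # above and print each Response (URL, canonical link, link status).
    # The URL below is a placeholder for illustration only.
    test_url = "https://example.com/"
    conn = Connection()
    try:
        for response in conn.html_download2(test_url):
            print(response)
    finally:
        conn.close()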