commit 0c9ea2b4e3
initial

.gitignore (new file, vendored, 4 lines)
@@ -0,0 +1,4 @@
build
dist
*.egg-info
venv

LICENSE.txt (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Technical University of Kosice

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

MANIFEST.in (new file, 1 line)
@@ -0,0 +1 @@
recursive-include websucker *.sql

requirements.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
BeautifulSoup4
justext
cassandra-driver
python-dateutil
click
pycurl

setup.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="websucker", # Replace with your own username
    version="1.0.0",
    author="Daniel Hládek",
    author_email="dhladek@gmail.com",
    description="Web Crawler",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/hladek/websucker",
    packages=setuptools.find_packages(),
    # specified in MANIFEST
    include_package_data=True,
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "Topic :: Internet :: WWW/HTTP :: Indexing/Search"
    ],
    python_requires='>=3.6',
    entry_points={  # Optional
        'console_scripts': [
            'websuck=websucker.cli:cli',
        ],
    },
    install_requires=[
        "BeautifulSoup4",
        "justext",
        "cassandra-driver",
        "python-dateutil",
        "click",
        "pycurl",
        "greenstalk"
    ],

)

websucker/__init__.py (new file, empty)

websucker/agent.py (new executable file, 364 lines)
@@ -0,0 +1,364 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.parse
import urllib.error
import os
import os.path
import re
import datetime
import time
import sys
import tempfile
import pprint
import bs4

import pycurl
import urllib.robotparser


from websucker.parser import normalize_link,urlunparse


# Parses an http refresh directive from a header or an html meta tag
def get_refresh(ref,target_link):
    refresh = None
    tokens = [t.strip() for t in ref.strip().split(";")]
    if len(tokens) > 1 and tokens[1].lower().startswith("url="):
        refresh = urlunparse(normalize_link(
            tokens[1][4:].strip("\'"), target_link))
    return refresh

class Response:
    def __init__(self,url,headers,status,content,redirects,link_status):
        assert len(url) > 0
        assert url[0] != "/"
        self.url = url
        self.status = status
        self.content = content
        self.headers = headers
        self.redirects = redirects
        self.visited_time = datetime.date.today()
        self.bs = None
        self.link_status = link_status
        if content is not None and link_status == "good":
            self.bs = bs4.BeautifulSoup(content, "lxml")

    def __str__(self):
        return "{} {} {}".format(self.url,self.get_canonical(),self.link_status)

    def get_content(self):
        if self.content is None:
            print("NO CONTENT")
            print(self.url,self.redirects)
            return None
        self.content.seek(0)
        text = self.content.read()
        out = str(text,encoding="utf8",errors="replace")
        return out


    # HTML metarefresh redirect
    def get_metarefresh(self):
        if self.content is None:
            return None
        metarefresh = None
        t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})
        canonical = self.get_canonical()
        for tags in t:
            if tags.has_attr("content"):
                metarefresh = get_refresh(tags["content"],canonical)
        if metarefresh is not None:
            nl = normalize_link(metarefresh, canonical)
            print("Metarefresh")
            print(nl)
            metarefresh = urlunparse(nl)

        return metarefresh

    def get_canonical(self):
        r = None
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        if self.bs is not None:
            l = self.bs.find("link", rel="canonical", href=True)
            if l is not None:
                r = urlunparse(normalize_link(l["href"], last_link))
        if r is None:
            r = last_link
        r = urlunparse(normalize_link(r, last_link))
        assert len(r) > 0
        assert r[0] != "/"
        return r

    def get_redirects(self):
        if len(self.redirects) < 2:
            return []
        return self.redirects[0:-1]


class Connection:
    def __init__(self):
        self.c = pycurl.Curl()
        self.c.setopt(self.c.FOLLOWLOCATION, True)
#       self.c.setopt(self.c.VERBOSE, True)
        self.c.setopt(self.c.CONNECTTIMEOUT, 20)
        self.c.setopt(self.c.TIMEOUT, 20)
        self.c.setopt(self.c.FAILONERROR, True)
        self.c.setopt(self.c.HTTPHEADER, [
                      'Accept: text/html', 'Accept-Charset: UTF-8'])
        self.c.setopt(self.c.HEADERFUNCTION, self.header)
        self.c.setopt(self.c.USERAGENT, "Googlebot-News")
#        #self.c.setopt(pycurl.COOKIEJAR, 'cookie.txt')
#        #self.c.setopt(pycurl.COOKIEFILE, 'cookie.txt')
        self.robots = {}
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        self.max_redirect = 4

    # Stops processing when the content is not text,
    # records location and refresh headers
    def header(self, data):
        if len(data) == 0:
            return None
        l = str(data, encoding="utf8")
        self.header_lines.append(l)
        s = l.find(" ")
        if s >= 1 and s < len(l):
            key = l[0:s - 1]
            value = l[s + 1:].rstrip()
            self.headers[key] = value
            if key.lower() == "refresh":
                self.add_redirect(value)
            elif key.lower() == "location":
                self.add_redirect(value)
            elif key == "Content-Type" and "text" not in value:
                # pycurl then raises error 23, failed writing header
                return 0

    def __del__(self):
        self.c.close()

    def close(self):
        self.c.close()

    def add_redirect(self,link):
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        v = urlunparse(normalize_link(link, last_link))
        if v != last_link and v not in set(self.redirects):
            self.redirects.append(v)

    """
    @returns content, link_status
    @throws pycurl.error
    """
    def _download(self, url):
        print("Downloading " + url)
        self.url = url
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        content = None
        link_status = "bad_connection"
        try:
            self.headers = {}
            del self.header_lines[:]
            content = tempfile.SpooledTemporaryFile()
            self.c.setopt(self.c.WRITEDATA, content)
            self.c.setopt(self.c.URL, url)
            self.c.perform()
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if self.status != 200:
                link_status = "bad_httpcode"
            elif "Content-Type" in self.headers and not self.headers["Content-Type"].startswith("text"):
                link_status = "bad_type"
            else:
                link_status = "good"
                content.seek(0)
        except pycurl.error as e:
            errno, message = e.args
            content = None
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if errno == 23:
                # 23 means bad content type reported in the header
                link_status = "bad_type"
            elif errno == 22:
                link_status = "bad_httpcode"
            else:
                raise e
        except UnicodeDecodeError as e:
            content = None
            link_status = "bad_unicode"
        except UnicodeEncodeError as e:
            content = None
            link_status = "bad_unicode"
        sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
        tt = self.c.getinfo(self.c.TOTAL_TIME)
        print("{} Received {} bytes in {} s".format(self.status,sz,tt))
        return content, link_status

    # Throws pycurl.error
    def html_download2(self, url):
        dlink = url
        responses = []
        while len(responses) < 5:
            nl = normalize_link(dlink)
            url = urlunparse(nl)
            assert url.startswith("http")
            content, link_status = self._download(url)
            response = Response(url,"\r\n".join(self.header_lines),self.status,content,self.redirects,link_status)
            dlink = response.get_metarefresh()
            responses.append(response)
            if dlink is None:
                break
        return responses

    def is_robot_good(self, url):
        schema, domain, path, query = normalize_link(url)
        res = True
        if domain not in self.robots:
            roboturl = urlunparse((schema, domain, "robots.txt", ""))
            try:
                r = self._download(roboturl)
                if r[1] == "good":
                    c = r[0].read()
                    lines = str(c, errors="ignore", encoding="utf8").split("\n")
                    self.robots[domain] = urllib.robotparser.RobotFileParser()
                    self.robots[domain].parse(lines)
                else:
                    self.robots[domain] = None
            except pycurl.error as err:
                print(err)
        if domain in self.robots and self.robots[domain] is not None:
            res = self.robots[domain].can_fetch("Agent", url)
        return res

class ParsedDocument:
    def __init__(self, parser,work_link):
        self.parser = parser
        self.work_link = work_link

        self.content = None
        self.bs = None
        self.paragraph_checksums = None
        self.paragraph_sizes = None

        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def extract(self,content,bs):
        self.content = content
        self.bs = bs

        # Extract text and metatext
        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
        # Paragraph checksums
        pch,pszs = self.parser.calculate_checksums(self.body)
        self.paragraph_checksums = pch
        self.paragraph_sizes = pszs
        if bs is None:
            return
        self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_og(bs)

        # Extract links from the page
        base = self.work_link
        if bs.base is not None and "href" in bs.base.attrs:
            base = bs.base["href"]
        # Normalize links
        for l in bs.find_all("a", href=True):
            if "nofollow" in l.attrs.get("rel", []) or "nofollow" in l.attrs:
                continue
            href = l["href"]
            try:
                nl = normalize_link(href, base)
                link = urlunparse(nl)
                if link == base:
                    continue
                self.link_set.add(link)
            except ValueError:
                pass

    def get_links(self):
        return self.link_set

    def get_follow_links(self):
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
                link = normalize_link(l,strip_query=self.parser.strip_query)
                follow_links.add(urlunparse(link))
        return follow_links


    def __str__(self):
        r = []
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if (len(self.body) < 20):
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)


def get_domains(arg):
    domains = []
    if arg == "-":
        for l in sys.stdin:
            domain = l.rstrip()
            assert(domain is not None)
            if len(domain) == 0:
                continue
            domains.append(domain)
    else:
        domains = arg.split(",")
    return domains

def visit_links(links,connection,parser,db):
    outlinks = []
    for work_link in links:
        responses = []
        if parser.is_link_good(work_link) and connection.is_robot_good(work_link):
            responses = connection.html_download2(work_link)
            time.sleep(4)
            db.index_responses(work_link,responses)
        if len(responses) > 0:
            lr = responses[-1]
            if lr.content is not None:
                target_link = lr.get_canonical()
                parsed = ParsedDocument(parser,target_link)
                parsed.extract(lr.content, lr.bs)
                db.index_content(target_link,parsed)
                outlinks += parsed.get_links()
    if len(outlinks) > 0:
        db.index_follow_links(parser,outlinks,connection)

def visit_domain(domain,parser,db):
    c = Connection()
    p = parser
    # Get links from the frontpage
    # TODO Sitemap
    sitemap = "http://" + domain
    visit_links([sitemap],c,p,db)
    db.check_domain(domain)
    for i in range(p.crawl_rounds):
        # Visit links found on the frontpage
        links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
        visit_links(links,c,p,db)
        db.check_domain(domain)
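
Note: visit_domain above is the entry point that ties Connection, the parser and the database together. A minimal usage sketch, assuming a reachable Cassandra instance that already holds the websucker keyspace (created from the schema, see websucker/db.py below); "example.com" is only a placeholder domain:

from websucker.agent import visit_domain
from websucker.parser import BaseParser
from websucker.db import Data

parser = BaseParser(verbose=True)
parser.justext_language = "English"
db = Data()                      # defaults: keyspace "websucker" on 127.0.0.1:9042
visit_domain("example.com", parser, db)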

websucker/cli.py (new file, 130 lines)
@@ -0,0 +1,130 @@
from websucker.agent import Connection,visit_links,visit_domain
from websucker.agent import ParsedDocument
from websucker.parser import BaseParser
from websucker.parser import normalize_link,urlunparse
from websucker.db import Data
from websucker.db import get_schema
import click
import pprint



def create_database_from_context(ctx):
    return Data(ctx.obj["cassandra_keyspace"],ctx.obj["cassandra_host"],ctx.obj["cassandra_port"])

@click.group()
@click.pass_context
@click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
@click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
@click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)

@click.option("--justext-language",metavar="JUSTEXT_LANGUAGE",help="Target language (if defined, value read from JUSTEXT_LANGUAGE env variable)",envvar="JUSTEXT_LANGUAGE",default="English",show_default=True)
@click.option("--visit",is_flag=True)
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,justext_language,visit):
    ctx.ensure_object(dict)
    p = BaseParser()
    p.justext_language = justext_language
    ctx.obj["parser"] = p
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["visit"] = visit


@cli.command(help="Print domains")
@click.pass_context
@click.argument("count",type=int,default=20)
def all(ctx,count):
    p = ctx.obj["parser"]
    c = Connection()
    db = create_database_from_context(ctx)
    res = db.all_domains(count)
    for row in res:
        print(",".join(map(str,row)))
        if ctx.obj["visit"]:
            visit_domain(row[0],p,db)

@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
    visit_links(links,c,p,db)
    db.check_domain(domain)

@cli.command(help="find best domains")
@click.pass_context
@click.argument("count",type=int,default=20)
#@click.option("visit",is_flag=True)
def best(ctx, count):
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    domains = db.get_best_domains(count)
    for domain,gr in domains:
        print(domain,gr)
        if ctx.obj["visit"]:
           visit_domain(domain,p,db)


@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@click.pass_context
@click.argument("count",type=int,default=20)
def unvisited(ctx, count):
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    domains = db.get_unvisited_domains(count)
    for domain in domains:
        print(domain)
        if ctx.obj["visit"]:
          visit_domain(domain,p,db)

@cli.command(help="Visit url, get links and crawl. Start here")
@click.pass_context
@click.argument("link")
def visit(ctx, link):
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    nl = normalize_link(link)
    domain=nl[1]
    visit_domain(domain,p,db)

@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx,domain):
    db = create_database_from_context(ctx)
    res = db.check_domain(domain)
    print(res)

@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    db = create_database_from_context(ctx)
    db.daily_report()

@cli.command(help="Print keyspace schema")
def schema():
    schema = get_schema()
    print(schema)

@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx,urls):
    parser = ctx.obj["parser"]
    # Visit first page
    connection = Connection()
    responses = connection.html_download2(urls)
    for res in responses:
        target_link = res.get_canonical()
        pd = ParsedDocument(parser,target_link)
        pd.extract(res.content, res.bs)
        print(pd)

if __name__ == "__main__":
    cli()
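
Note: the cli group only stores configuration in ctx.obj, so commands that do not touch Cassandra (such as schema) can be exercised directly with click's test runner. A small sketch, assuming the package is installed so that schema.sql ships as package data:

from click.testing import CliRunner
from websucker.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["schema"])   # prints the CQL keyspace schema
print(result.exit_code, result.output)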

websucker/db.py (new file, 444 lines)
@@ -0,0 +1,444 @@
import cassandra
import cassandra.cluster
import random
import os
import pkg_resources
import datetime
from websucker.parser import normalize_link,urlunparse

VERSION = "sucker6"


def get_schema():
    with pkg_resources.resource_stream(__name__,"schema.sql") as f:
        schema = f.read()
        return str(schema,encoding="utf8")

class Data:
    """
    Database of text documents
    """
    def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
        # execution profile
        ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
        profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
        self.cluster = cassandra.cluster.Cluster([cassandra_host],port=cassandra_port,execution_profiles=profiles)
        self.session = self.cluster.connect(keyspace)

        self.check_document_select_query = self.session.prepare("SELECT count(url_hash) FROM paragraph_checksums WHERE checksum=?")

        self.index_response_link_update = self.session.prepare("""
        UPDATE links SET
        link_status ='redirect',
        redirect_target = ?,
        update_time = toTimestamp(now())
        WHERE
        domain_name=? AND
        url_path=? AND
        url_query=?
        """)

        self.domain_quality_update = self.session.prepare("""
        UPDATE domain_quality SET
            seen_count=?,
            good_size=?,
            good_count=?,
            good_probability=?,
            good_originality=?,
            average_good_characters=?,
            content_size=?,
            content_count=?,
            content_probability=?,
            content_originality=?,
            average_content_characters=?,
            fetched_count=?,
            average_fetched_good_characters=?,
            gain_ratio=?,
            update_time = toTimestamp(now())
        WHERE
            domain_name=? AND
            day=toDate(now())
        """)

        self.index_response_insert_html = self.session.prepare("""
INSERT INTO html(
  day,
  domain_name,
  source_link,
  target_link,
  redirect_links,
  status,
  headers,
  content,
  agent_version,
  update_time
  ) VALUES (toDate(now()),?,?,?,?,?,?,?,?,toTimestamp(now()));
""")

        self.index_content_link_insert = self.session.prepare("""
INSERT INTO links (
url_schema,
domain_name,
url_path,
url_query,
link_status,
update_time
) VALUES (?,?,?,?,'seen',?) IF NOT EXISTS
""")

        self.daily_links_insert = self.session.prepare("""
INSERT INTO daily_links (
day,
domain_name,
url_path,
url_query,
link_status,
body_size,
link_originality,
update_time
) VALUES (toDate(now()),?,?,?,?,?,?,toTimestamp(now()))
""")
        self.daily_links_select = self.session.prepare("""
SELECT
domain_name,
link_status,
count(link_status)
FROM daily_links WHERE day=toDate(now()) GROUP BY domain_name,link_status
""")
        # Parsed content
        self.index_content_content_insert = self.session.prepare("""
INSERT INTO content(
  domain_name,
  target_link,
  links,
  title,
  description,
  section,
  authors,
  tags,
  article_published_time,
  text_date,
  body,
  body_size,
  agent_version,
  update_time
  ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?);
""")

        self.paragraph_checksums_insert = self.session.prepare("INSERT INTO paragraph_checksums (checksum,url_hash) VALUES(?,?)")
        self.index_content_links_update = self.session.prepare("UPDATE links SET link_status=?, link_originality=?,body_size=?,url_schema=? WHERE domain_name=? AND url_path = ? AND url_query=? ")
        self.check_domain_count = self.session.prepare("select count(url_path) from links where domain_name=? and link_status = ?")

        self.check_domain_size = self.session.prepare("select sum(body_size),sum(link_originality) from links where domain_name=? and link_status =?")

        self.domains_select = self.session.prepare("SELECT domain_name,seen_count,fetched_count,gain_ratio,average_fetched_good_characters FROM domain_quality PER PARTITION LIMIT 1")


    def index_responses(self,source_link,responses):
        # Redirect links
        pl = normalize_link(source_link)
        for response in responses:
            tl = response.get_canonical()
            r = (
                tl,
                pl[1],
                pl[2],
                pl[3],
            )
            if urlunparse(pl) != tl:
                res = self.session.execute(self.index_response_link_update,r)
            d = (
                pl[1],
                source_link,
                response.get_canonical(),
                response.redirects,
                response.status,
                response.headers,
                response.get_content(),
                VERSION,
            )
            self.session.execute(self.index_response_insert_html,d)

    def daily_report(self):
        rows = self.session.execute(self.daily_links_select)
        for row in rows:
            print(row[0],row[1],row[2])

    def index_follow_links(self,parser,links,connection):
        # Index seen links

        follow_links = set()
        for l in links:
            if parser.is_link_good(l):
                #if connection is not None and parser.listen_robot and not connection.is_robot_good(l):
                #    continue
                link = normalize_link(l,strip_query=parser.strip_query)
                follow_links.add(urlunparse(link))

        newlinkdomains = set()
        for link in follow_links:
            value = []
            nl = normalize_link(link)
            value += nl
            value.append(datetime.date.today())
            rows = self.session.execute(self.index_content_link_insert,value)
            row = rows.one()
            if row.applied:
                newlinkdomains.add(nl[1])
        for domain in newlinkdomains:
            self.check_domain(domain)


    def index_content(self,target_link,parsed_document):
        nl = normalize_link(target_link)
        domain_name = nl[1]
        assert len(domain_name) > 1

        pd = parsed_document
        body_length = 0
        if pd.body is not None:
            body_length = len(pd.body)
        value = (
          domain_name,
          target_link,
          pd.get_links(),
          pd.title,
          pd.description,
          pd.section,
          pd.authors,
          pd.tags,
          pd.article_published_time,
          pd.text_date,
          pd.body,
          body_length,
          VERSION,
          pd.current_time
        )
        content_future = self.session.execute_async(self.index_content_content_insert,value)
        # result later

        link_status = "good"
        originality = 0
        tsz = 0
        if pd.body is None:
            link_status = "bad_parse"
        else:
            tsz = len(pd.body)
            if tsz < 300:
                link_status = "bad_small"

        if link_status == "good":

            futures = []
            for pc,psz in zip(pd.paragraph_checksums,pd.paragraph_sizes):
                fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
                futures.append(fut)
            for fut in futures:
                fut.result()
            originality = self.check_document(pd.paragraph_checksums,pd.paragraph_sizes)
            if originality < 0.8:
                link_status = "bad_copy"
        print(nl)
        self.session.execute(self.index_content_links_update,(link_status,originality,tsz,nl[0],nl[1],nl[2],nl[3]))
        content_future.result()
        print("<<<< " + link_status + " " + str(originality))
        dl = (
            nl[1],
            nl[2],
            nl[3],
            link_status,
            tsz,
            originality
        )
        self.session.execute(self.daily_links_insert,dl)

    def check_document(self,paragraph_checksums,paragraph_sizes):
       tsz = sum(paragraph_sizes)
       if tsz == 0:
           return 0
       copies = 0
       futures = []
       for pc,psz in zip(paragraph_checksums,paragraph_sizes):
           futures.append(self.session.execute_async(self.check_document_select_query,(pc,)))

       for fut,psz in zip(futures,paragraph_sizes):
           rows = fut.result()
           res = rows.one()[0]
           if res > 1:
               copies += psz
       return (tsz-copies)/tsz


    def check_domain(self, domain):
        assert len(domain) > 0
        seen_count = None
        good_size = None
        good_count = None
        good_probability = None
        good_originality = None
        average_good_characters = None
        content_size = None
        content_count = None
        content_probability = None
        content_originality = None
        average_content_characters = None
        fetched_count = None
        average_fetched_good_characters = None
        gain_ratio = None

        counts = {
            "good":0,
            "bad_copy":0,
            "bad_small":0,
            "bad_httpcode":0,
            "bad_type":0,
            "bad_content":0,
            "bad_parse":0,
            "seen":0
        }
        for k in counts.keys():
            res = self.session.execute(self.check_domain_count,(domain,k))
            co = res.one()[0]
            counts[k]= co

        seen_count = counts["seen"]
        good_count = counts["good"]
        content_count = counts["good"] + counts["bad_copy"] + counts["bad_small"]

        fetched_count = sum(counts.values()) - counts["seen"]

        if fetched_count > 0:
            content_probability = content_count / fetched_count
            good_probability = good_count / fetched_count
            sizes = {
                "good":0,
                "bad_copy":0,
                "bad_small":0
            }

            originalities = {}

            for k in sizes.keys():
                res = self.session.execute(self.check_domain_size,(domain,k))
                row = res.one()
                co = row[0]
                originalities[k] = row[1]
                sizes[k]= co
            good_size = sizes["good"]
            content_size = sum(sizes.values())
            if good_count > 0:
                good_originality = originalities["good"] / good_count
            if content_count > 0:
                content_originality = sum(originalities.values()) / content_count

        if good_count > 0:
            average_good_characters = good_size / good_count * good_originality
            average_fetched_good_characters = good_size * good_originality / fetched_count

            gain_ratio = average_fetched_good_characters / fetched_count

        if content_count > 0:
            average_content_characters = content_size / content_count

        #print(sizes)
        #print(originalities)
        uv = (
        seen_count,
        good_size,
        good_count,
        good_probability,
        good_originality,
        average_good_characters,
        content_size,
        content_count,
        content_probability,
        content_originality,
        average_content_characters,
        fetched_count,
        average_fetched_good_characters,
        gain_ratio,
        domain)
        if fetched_count > 0 or seen_count > 0:
            self.session.execute(self.domain_quality_update,uv)
        return average_fetched_good_characters

    def all_domains(self,count):
        rows = self.session.execute(self.domains_select)
        domains = []
        for row in rows:
            domain = row[0]
            seen_count = row[1]
            fetched_count = row[2]
            gain_ratio = row[3]
            afg = row[4]
            if fetched_count and afg and seen_count:
                domains.append(tuple(row))
        l = len(domains)
        ss = min(l,count)
        res = []
        if ss > 0:
            # sort according to ratio
            res = list(sorted(domains,key=lambda x:x[4],reverse=True))[0:ss]
        # returns sorted list of tuples domain,gain_ratio
        return res

    def get_best_domains(self,count):
        # get all domains
        rows = self.session.execute(self.domains_select)
        domains = []
        for row in rows:
            domain = row[0]
            seen_count = row[1]
            fetched_count = row[2]
            gain_ratio = row[3]
            afg = row[4]
            if seen_count and fetched_count and gain_ratio:
                domains.append((domain,gain_ratio))
        l = len(domains)
        ss = min(l,count)
        res = []
        if ss > 0:
            # sort according to ratio
            res = list(sorted(domains,key=lambda x:x[1],reverse=True))[0:ss]
        # returns sorted list of tuples domain,gain_ratio
        return res

    def get_unvisited_domains(self,count):
        # get all domains
        rows = self.session.execute(self.domains_select)
        domains = []
        for row in rows:
            domain = row[0]
            seen_count = row[1]
            fetched_count = row[2]
            gain_ratio = row[3]
            afg = row[4]
            if seen_count and not fetched_count:
                domains.append(domain)
        ss = min(len(domains),count)
        return random.sample(domains,ss)

    def get_visit_links(self,domain,recent_count,old_count,random_count):
        dblinks = []
        rows = self.session.execute("SELECT url_schema,url_path,url_query,update_time FROM links Where domain_name=%s AND link_status='seen'",(domain,))
        for row in rows:
            link = urlunparse((row[0],domain,row[1],row[2]))
            dblinks.append((link,row[3]))

        visitlinks = []
        dblinks.sort(key=lambda x:x[1])
        random_links = []
        for i,(link,time) in enumerate(dblinks):
            #print(link,time)
            if i < recent_count:
                visitlinks.append(link)
            elif i >= len(dblinks) - old_count:
                visitlinks.append(link)
            else:
                random_links.append(link)
        sc = min(random_count,len(random_links))
        if sc > 0:
            visitlinks += random.sample(random_links,sc)
        return visitlinks
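
Note: check_document above scores a page as the share of paragraph characters not already present in the paragraph_checksums table, and index_content marks the link "bad_copy" when that share drops below 0.8. A self-contained sketch of the same arithmetic with made-up paragraph sizes:

# Three paragraphs; the 400-character one is already indexed elsewhere.
paragraph_sizes = [300, 400, 500]
already_indexed = [False, True, False]

tsz = sum(paragraph_sizes)                                                  # 1200
copies = sum(s for s, dup in zip(paragraph_sizes, already_indexed) if dup)  # 400
originality = (tsz - copies) / tsz                                          # ~0.67, below 0.8 -> "bad_copy"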

websucker/parser.py (new file, 335 lines)
@@ -0,0 +1,335 @@
import dateutil.parser
import justext
import re
import sys
import datetime

import lxml.etree
import urllib.parse
import os.path


datere = re.compile(r"\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}")
yearre = re.compile(r"\s\d{4}\s")

def urlunparse(parsed_url):
    schema, netloc, path, query = parsed_url
    return urllib.parse.urlunparse((schema, netloc, path, "", query, ""))

def normalize_link(link, base=None,strip_query=False):

    link = link.strip().replace(
                "\n", "").replace("\t", "").replace("\r", "")
    parsed_link = urllib.parse.urlparse(link)
    schema = parsed_link[0]
    netloc = parsed_link[1].strip().lower()
    path = parsed_link[2].strip()
    query = parsed_link[4]
    if strip_query:
        query = ""
    if path is None or len(path) == 0:
        path = "/"
    dirname, filename = os.path.split(path)
    if base is not None:
        parsed_base = urllib.parse.urlparse(base)
        if schema == "":
            schema = parsed_base[0]
        # If the link is relative
        if netloc == "":
            netloc = parsed_base[1]
            schema = parsed_base[0]
            bdir, bfile = os.path.split(parsed_base[2])
            if len(bdir) > 0 and bdir[0] != "." and len(dirname) > 0 and dirname[0] != "/":
                dirname = bdir + "/" + dirname
    # if len(dirname) == 0 or dirname[0] != '/':
    #    path = '/' + path
    dirname = os.path.normpath(dirname)
    dirname = dirname.lstrip("/").lstrip(".")
    path = dirname + "/" + filename
    return schema, netloc, path, query
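
# Illustrative note (not part of the committed file): normalize_link resolves a
# relative href against the page it was found on and urlunparse re-assembles it:
#   normalize_link("../about.html", base="http://example.com/news/index.html")
#     -> ("http", "example.com", "/about.html", "")
#   urlunparse(("http", "example.com", "/about.html", ""))
#     -> "http://example.com/about.html"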
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_date(te):
 | 
			
		||||
    dates = []
 | 
			
		||||
    words = []
 | 
			
		||||
    if te is None:
 | 
			
		||||
        te = ""
 | 
			
		||||
    for t in te.split():
 | 
			
		||||
        t = t.strip().lower().lstrip("0").replace("\r", "\n").replace("\n", "")
 | 
			
		||||
        if len(t) == 0:
 | 
			
		||||
            continue
 | 
			
		||||
        for i, m in enumerate(["jan", "feb", "mar", "apr", "máj", "jún", "júl", "aug", "sept", "okt", "nov", "dec"]):
 | 
			
		||||
            if t.startswith(m):
 | 
			
		||||
                t = str(i + 1) + "."
 | 
			
		||||
                break
 | 
			
		||||
        if t[0].isdigit():
 | 
			
		||||
            words.append(t)
 | 
			
		||||
    txt = " ".join(words)
 | 
			
		||||
    for st in re.findall(datere, txt):
 | 
			
		||||
        tokens = st.replace(" ", "").split(".")
 | 
			
		||||
        try:
 | 
			
		||||
            y = int(tokens[-1])
 | 
			
		||||
            if y < 2000 or y > 2020:
 | 
			
		||||
                continue
 | 
			
		||||
            m = 2
 | 
			
		||||
            d = 2
 | 
			
		||||
            if len(tokens) > 2:
 | 
			
		||||
                m = int(tokens[-2])
 | 
			
		||||
                d = int(tokens[-3])
 | 
			
		||||
            dates.append(datetime.date(y, m, d))
 | 
			
		||||
        except ValueError:
 | 
			
		||||
            pass
 | 
			
		||||
    return dates
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BaseParser:
    def __init__(self, verbose=False):
        self.strip_query = True
        self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
                          ".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
        self.skipchars = re.compile(r"[();:@& ]")
        self.store = True
        self.verbose = verbose
        self.domain_re = re.compile("^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\\.)+[A-Za-z]{2,6}$")
        self.listen_robot = True
        self.recent_links = 5
        self.old_links = 3
        self.random_links = 10
        self.crawl_rounds = 3
        self.skipdomains = set()
        self.allowdomains = set()
        self.skippaths = set(["sort", "search", "sortby", "gallery", "images", "pictures", "order", "add", "insert", "cart", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery", "form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "flog"])
        self.justext_language = "Slovak"

    def is_domain_good(self, domain):
        r = None
        # Netloc
        if ":" in domain:
            r = "Port in domain"
        elif len(domain) < 4:
            r = "Too short domain"
        elif len(domain) > 50:
            r = "Too long domain"
        elif domain.startswith(".") or domain.endswith("."):
            r = "Malformed domain"
        elif not self.domain_re.match(domain):
            r = "Bad domain"
        else:
            da = False
            for d in self.allowdomains:
                if domain.endswith(d):
                    da = True
                    break
            if not da and len(self.allowdomains) > 0:
                r = "Domain not in allowlist"
            for d in self.skipdomains:
                if domain.endswith(d):
                    r = "In domain skiplist"
            for d in domain.split("."):
                if d in self.skippaths:
                    r = "Domain in skippath"
        if r is not None and self.verbose:
            print(domain + " " + r)
        return r is None

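    # Illustrative sketch only (the sample domains are assumptions, not project data):
    #
    #   p = BaseParser()
    #   p.is_domain_good("example.sk")      -> True
    #   p.is_domain_good("localhost:8080")  -> False  ("Port in domain")
    #   p.is_domain_good(".example.sk")     -> False  ("Malformed domain")
    #   p.is_domain_good("rss.example.sk")  -> False  ("Domain in skippath")
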
    # Argument: the URL to be parsed
    def is_link_good(self, link):
        assert link is not None
        r = None
        if sys.getsizeof(link) > 1023:
            r = "Too long"
        try:
            schema, domain, path, query = normalize_link(link)
            if not schema.startswith("http"):
                r = "Bad schema"
            dg = self.is_domain_good(domain)
            if not dg:
                return False
            for c in link:
                if ord(c) >= 128:
                    r = "Bad domain character"
                    break
            for p in self.skipdomains:
                if domain.endswith(p):
                    r = "Bad domain"
                    break
            if ".b-" in domain:
                r = "Bad domain"
            if len(domain) > 127:
                r = "Too long domain"
            # Path
            for t in self.skiptypes:
                if path.lower().endswith(t):
                    r = "Bad type"
                    break
            if re.search(self.skipchars, path):
                r = "Bad path"
            for p in path.split("/"):
                if p in self.skippaths or "jpg" in p or "galeria" in p:
                    r = "Bad path"
                    break
        except ValueError:
            r = "Bad urlparse"
        return r is None

    def filter_links(self, links):
        # Filter links, keeping only unique, acceptable ones in normalized form
        linkset = set()
        for link in links:
            if not self.is_link_good(link):
                continue
            link = urlunparse(normalize_link(link, strip_query=self.strip_query))
            linkset.add(link)

        return list(linkset)

    def extract_raw_text(self, content, current_time):
        result = []
        rd = None
        paragraphs = []
        content.seek(0)
        try:
            c = content.read()
            paragraphs = justext.justext(c, justext.get_stoplist(self.justext_language), length_low=50, length_high=150)
            content.seek(0)
        except lxml.etree.XMLSyntaxError:
            print("XML Syntax parse error")
        except lxml.etree.ParserError:
            print("XML parse error")
        except justext.core.JustextError:
            print("Justext error")
        except IndexError:
            print("XML error")
        except UnicodeDecodeError:
            print("Unicode Error")
        except TypeError:
            # Null in string
            print("String Error")
        except RuntimeError:
            # Maximum recursion depth
            print("Recursion Error")
        dates = []
        for p in paragraphs:
            # TODO - match URL for date
            if p is not None and p.text is not None and len(p.text) > 0:
                dat = get_date(p.text)
                for d in dat:
                    dates.append(d)
                if self.verbose:
                    print(p.class_type, p.links_density(), p.stopwords_density(
                        justext.get_stoplist(self.justext_language)), p.text)
                if not p.is_boilerplate:
                    result.append(p.text.strip())
        if len(dates) == 0:
            dates.append(current_time)
        rd = max(dates)
        rd = rd.isoformat()

        return "\n\n".join(result), rd

    # Extracts meta information from HTML.
    # First it looks for name/content pairs in meta tags,
    # then it looks for OpenGraph properties.
    def extract_og(self, bs):
        tags = set()
        authors = set()
        title = ""
        description = ""
        section = ""
        article_published_time = ""

        for m in bs.find_all("meta", attrs={"name": True, "content": True}):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            name = m["name"].strip()
            if name == "keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "news_keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "author":
                authors.add(content)
            if name == "description":
                description = content

        for m in bs.find_all("meta", property=True, content=True):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            prop = m["property"].strip()
            if prop == "og:title":
                title = content
            if prop == "article:published_time":
                try:
                    # Is it in ISO format?
                    d = dateutil.parser.parse(content)
                    article_published_time = d.isoformat()
                except ValueError:
                    pass
                except OverflowError:
                    pass
            if prop == "article:author" and "admin" not in content.lower():
                authors.add(content)
            if prop == "section":
                section = content
            if prop == "tag":
                tags.add(content)
            if prop == "og:description":
                description = content

        if len(title) < 2 and bs.h1 is not None:
            title = bs.h1.get_text(strip=True)
        if len(title) < 2 and bs.title is not None:
            title = bs.title.get_text(strip=True)
        if len(authors) == 0:
            for m in bs.find_all(property="author"):
                authors.add(m.get_text(strip=True))
        if len(authors) == 0:
            for m in bs.find_all(itemprop="author"):
                authors.add(m.get_text(strip=True))
        authors = set(filter(lambda x: len(x) > 2, authors))

        return tags, authors, title.strip(), article_published_time.strip(), description, section.strip()

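    # Illustrative sketch only (the HTML snippet is an assumption, not project data):
    #
    #   bs = BeautifulSoup('<head><meta property="og:title" content="Hello"/>'
    #                      '<meta name="author" content="Jan Novak"/></head>', "html.parser")
    #   BaseParser().extract_og(bs)
    #   -> (set(), {"Jan Novak"}, "Hello", "", "", "")
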
    def calculate_checksums(self, text):
        """
        @return fingerprints of the paragraphs in text. Paragraphs are separated by a blank line.
        """
        checksums = []
        sizes = []
        hval = 0
        hsz = 0
        sz = 0
        for c in text:
            cv = ord(c)
            sz += 1
            if cv > 64:
                hval += (hval << 3) + cv
                zv = hval >> 31
                hval &= 0x7fffffff
                hval += zv
                hsz += 1
            if c == "\n" and hsz > 0:
                if hsz > 100:
                    checksums.append(hval)
                    sizes.append(sz)
                sz = 0
                hsz = 0
        if hsz > 100:
            checksums.append(hval)
            sizes.append(sz)
        return checksums, sizes

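# A rough sketch of what calculate_checksums() produces (the sample text is an
# assumption): characters with a code point above 64 feed a shift-and-add hash
# folded into 31 bits, and a (checksum, size) pair is recorded at every newline
# once more than 100 such characters have been seen since the last reset:
#
#   text = "a" * 150 + "\n" + "b" * 150 + "\n"
#   checksums, sizes = BaseParser().calculate_checksums(text)
#   # -> len(checksums) == 2, sizes == [151, 151]
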
class EnglishParser(BaseParser):
    def __init__(self):
        super(EnglishParser, self).__init__()
        self.justext_language = "English"
        self.allowdomains = set(["com", "org", "io"])

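# A minimal end-to-end sketch (the URLs are assumptions, not project data):
#
#   parser = EnglishParser()
#   links = parser.filter_links(["http://example.com/post/1", "http://example.com/a.jpg"])
#   # the .jpg link is rejected by is_link_good(); the first link is kept in
#   # normalized form, exactly how depends on the urlunparse helper imported earlier
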
							
								
								
									
35  websucker/queue.py  Normal file
@ -0,0 +1,35 @@
import greenstalk
import random


MAX_PRIORITY = 0
MIN_PRIORITY = 4000000000

MAX_FLOAT_PRIORITY = 10000.0

# Map a score between 0 and max_priority to a beanstalkd priority
# (a lower number means the job is reserved sooner).
def map_priority(p, max_priority):
    p = p / max_priority
    return MIN_PRIORITY - (p * MIN_PRIORITY)

class BeanstalkdQueue:
    def __init__(self, host, port, tube):
        self.c = greenstalk.Client(host, port, use=tube, encoding="utf8")

    def queue_priority_domains(self, priority_domains):
        for domain, priority in priority_domains:
            p = priority / MAX_FLOAT_PRIORITY
            p = MIN_PRIORITY - (p * MIN_PRIORITY)
            self.c.put(domain, p)

    def queue_random_domains(self, domains):
        for domain in domains:
            p = random.randint(MAX_PRIORITY, MIN_PRIORITY)
            self.c.put(domain, p)

    def consume_domains(self, callback):
        # Block until a job is available, hand its domain to the callback, repeat
        while True:
            job = self.c.reserve()
            domain = job.body
            self.c.delete(job)
            callback(domain)

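# A minimal usage sketch (host, port, tube name and the priority values are
# assumptions, not taken from the project):
#
#   q = BeanstalkdQueue("127.0.0.1", 11300, "websucker")
#   q.queue_priority_domains([("example.com", 9000.0), ("example.org", 10.0)])
#   # example.com gets a numerically lower beanstalkd priority, so it is reserved first
#   q.consume_domains(print)   # blocks forever, printing one domain per job
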
							
								
								
									
94  websucker/schema.sql  Normal file
@ -0,0 +1,94 @@
DROP KEYSPACE IF EXISTS websucker;

CREATE KEYSPACE websucker
WITH replication = {'class':'SimpleStrategy', 'replication_factor' : 1};

USE websucker;

CREATE TABLE links (
    domain_name TEXT,
    url_path TEXT,
    url_query TEXT,
    url_schema TEXT,
    redirect_target TEXT,
    link_status TEXT,
    link_originality FLOAT,
    body_size INT,
    update_time TIMESTAMP,
    PRIMARY KEY(domain_name, url_path, url_query)
);

CREATE INDEX link_status_index ON links(link_status);

CREATE TABLE daily_links (
    day DATE,
    domain_name TEXT,
    url_path TEXT,
    url_query TEXT,
    link_status TEXT,
    body_size INT,
    link_originality FLOAT,
    update_time TIMESTAMP,
    PRIMARY KEY(day, domain_name, link_status, url_path, url_query)
);

CREATE TABLE domain_quality (
    domain_name TEXT,
    day DATE,
    seen_count INT,
    good_size INT,
    good_count INT,
    good_probability FLOAT,
    good_originality FLOAT,
    average_good_characters FLOAT,
    content_size INT,
    content_count INT,
    content_probability FLOAT,
    content_originality FLOAT,
    average_content_characters FLOAT,
    fetched_count INT,
    average_fetched_good_characters FLOAT,
    gain_ratio FLOAT,
    update_time TIMESTAMP STATIC,
    PRIMARY KEY(domain_name, day)
) WITH CLUSTERING ORDER BY (day DESC);


CREATE TABLE content (
    domain_name TEXT,
    target_link TEXT,
    agent_version TEXT,
    title TEXT,
    links SET<TEXT>,
    authors SET<TEXT>,
    tags SET<TEXT>,
    description TEXT,
    section TEXT,
    article_published_time TEXT,
    text_date TEXT,
    body TEXT,
    body_size INT,
    update_time TIMESTAMP,
    PRIMARY KEY(domain_name, target_link)
);

CREATE TABLE paragraph_checksums (
    checksum BIGINT,
    url_hash BIGINT,
    PRIMARY KEY(checksum, url_hash)
);

CREATE TABLE html (
    day DATE,
    domain_name TEXT,
    source_link TEXT,
    target_link TEXT,
    redirect_links LIST<TEXT>,
    status INT,
    content TEXT,
    headers TEXT,
    agent_version TEXT,
    update_time TIMESTAMP,
    PRIMARY KEY(day, domain_name, source_link)
);