commit 0c9ea2b4e3

initial

.gitignore (new file, vendored, 4 lines)
@@ -0,0 +1,4 @@
build
dist
*.egg-info
venv

LICENSE.txt (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Technical University of Kosice

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

MANIFEST.in (new file, 1 line)
@@ -0,0 +1 @@
recursive-include websucker *.sql

requirements.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
BeautifulSoup4
justext
cassandra-driver
python-dateutil
click
pycurl

setup.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="websucker", # Replace with your own username
    version="1.0.0",
    author="Daniel Hládek",
    author_email="dhladek@gmail.com",
    description="Web Crawler",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/hladek/websucker",
    packages=setuptools.find_packages(),
    # specified in MANIFEST
    include_package_data=True,
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "Topic :: Internet :: WWW/HTTP :: Indexing/Search"
    ],
    python_requires='>=3.6',
    entry_points={  # Optional
        'console_scripts': [
            'websuck=websucker.cli:cli',
        ],
    },
    install_requires=[
        "BeautifulSoup4",
        "justext",
        "cassandra-driver",
        "python-dateutil",
        "click",
        "pycurl",
        "greenstalk"
    ],

)

websucker/__init__.py (new file, 0 lines)

websucker/agent.py (new executable file, 364 lines)
@@ -0,0 +1,364 @@
#!/usr/bin/env python
#! -*- coding: utf-8 -*-
import urllib.parse
import urllib.error
import os
import os.path
import re
import datetime
import time
import sys
import tempfile
import pprint
import bs4

import pycurl
import urllib.robotparser


from websucker.parser import normalize_link,urlunparse


# Parses http refresh in header or on html meta
def get_refresh(ref,target_link):
    refresh = None
    tokens = ref.strip().split(";")
    if len(tokens) > 1 and tokens[1].lower().startswith("url="):
        refresh = urlunparse(normalize_link(
            tokens[1][4:].strip("\'"), target_link))
    return refresh

class Response:
    def __init__(self,url,headers,status,content,redirects,link_status):
        assert len(url) > 0
        assert url[0] != "/"
        self.url = url
        self.status = status
        self.content = content
        self.headers = headers
        self.redirects = redirects
        self.visited_time = datetime.date.today()
        self.bs = None
        self.link_status = link_status
        if content is not None and link_status == "good":
            self.bs = bs4.BeautifulSoup(content, "lxml")

    def __str__(self):
        return "{} {} {}".format(self.url,self.get_canonical(),self.link_status)

    def get_content(self):
        if self.content is None:
            print("NO CONTENT")
            print(self.url,self.redirects)
            return None
        self.content.seek(0)
        text = self.content.read()
        out = str(text,encoding="utf8",errors="replace")
        return out


    # HTML meta refresh redirect
    def get_metarefresh(self):
        if self.content is None:
            return None
        metarefresh = None
        t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})
        canonical = self.get_canonical()
        for tags in t:
            if "content" in tags.attrs:
                metarefresh = get_refresh(tags["content"],canonical)
        if metarefresh is not None:
            nl = normalize_link(metarefresh, canonical)
            print("Metarefresh")
            print(nl)
            metarefresh = urlunparse(nl)

        return metarefresh

    def get_canonical(self):
        r = None
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        if self.bs is not None:
            l = self.bs.find("link", rel="canonical", href=True)
            if l is not None:
                r = urlunparse(normalize_link(l["href"], last_link))
        if r is None:
            r = last_link
        r = urlunparse(normalize_link(r, last_link))
        assert len(r) > 0
        assert r[0] != "/"
        return r

    def get_redirects(self):
        if len(self.redirects) < 2:
            return []
        return self.redirects[0:-1]


class Connection:
    def __init__(self):
        self.c = pycurl.Curl()
        self.c.setopt(self.c.FOLLOWLOCATION, True)
#       self.c.setopt(self.c.VERBOSE, True)
        self.c.setopt(self.c.CONNECTTIMEOUT, 20)
        self.c.setopt(self.c.TIMEOUT, 20)
        self.c.setopt(self.c.FAILONERROR, True)
        self.c.setopt(self.c.HTTPHEADER, [
                      'Accept: text/html', 'Accept-Charset: UTF-8'])
        self.c.setopt(self.c.HEADERFUNCTION, self.header)
        self.c.setopt(self.c.USERAGENT, "Googlebot-News")
#        #self.c.setopt(pycurl.COOKIEJAR, 'cookie.txt')
#        #self.c.setopt(pycurl.COOKIEFILE, 'cookie.txt')
        self.robots = {}
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        self.max_redirect = 4

    # Stops processing if the content is not text
    # Records the Location and Refresh headers
    def header(self, data):
        if len(data) == 0:
            return None
        l = str(data, encoding="utf8")
        self.header_lines.append(l)
        s = l.find(" ")
        if s >= 1 and s < len(l):
            key = l[0:s - 1]
            value = l[s + 1:].rstrip()
            self.headers[key] = value
            if key.lower() == "refresh":
                self.add_redirect(value)
            elif key.lower() == "location":
                self.add_redirect(value)
            elif key == "Content-Type" and "text" not in value:
                # pycurl will then raise error 23, failed writing header
                return 0

    def __del__(self):
        self.c.close()

    def close(self):
        self.c.close()

    def add_redirect(self,link):
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        v = urlunparse(normalize_link(link, last_link))
        if v!=last_link and v not in set(self.redirects):
            self.redirects.append(v)

    """
    @returns content, link_status
    @throws pycurl.error
    """
    def _download(self, url):
        print("Downloading " + url)
        self.url = url
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        content = None
        link_status = "bad_connection"
        try:
            self.headers = {}
            del self.header_lines[:]
            content = tempfile.SpooledTemporaryFile()
            self.c.setopt(self.c.WRITEDATA, content)
            self.c.setopt(self.c.URL, url)
            self.c.perform()
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if self.status != 200:
                link_status = "bad_httpcode"
            elif "Content-Type" in self.headers and not self.headers["Content-Type"].startswith("text"):
                link_status = "bad_type"
            else:
                link_status = "good"
                content.seek(0)
        except pycurl.error as e:
            errno, message = e.args
            content = None
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            if errno == 23:
                # 23 means bad content type in the header
                link_status = "bad_type"
            elif errno == 22:
                link_status = "bad_httpcode"
            else:
                raise e
        except UnicodeDecodeError as e:
            content = None
            link_status = "bad_unicode"
        except UnicodeEncodeError as e:
            content = None
            link_status = "bad_unicode"
        sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
        tt = self.c.getinfo(self.c.TOTAL_TIME)
        print("{} Received {} bytes in {} s".format(self.status,sz,tt))
        return content, link_status

    # Throws pycurl.error
    def html_download2(self, url):
        dlink = url
        responses = []
        while len(responses) < 5:
            nl = normalize_link(dlink)
            url = urlunparse(nl)
            assert url.startswith("http")
            content, link_status = self._download(url)
            response = Response(url,"\r\n".join(self.header_lines),self.status,content,self.redirects,link_status)
            dlink = response.get_metarefresh()
            responses.append(response)
            if dlink is None:
                break
        return responses

    def is_robot_good(self, url):
        schema, domain, path, query = normalize_link(url)
        res = True
        if domain not in self.robots:
            roboturl = urlunparse((schema, domain, "robots.txt", ""))
            try:
                r = self._download(roboturl)
                if r[1] == "good":
                    c = r[0].read()
                    lines = str(c, errors="ignore", encoding="utf8").split("\n")
                    self.robots[domain] = urllib.robotparser.RobotFileParser()
                    self.robots[domain].parse(lines)
                else:
                    self.robots[domain] = None
            except pycurl.error as err:
                print(err)
        if domain in self.robots and self.robots[domain] is not None:
            res = self.robots[domain].can_fetch("Agent", url)
        return res

class ParsedDocument:
    def __init__(self, parser,work_link):
        self.parser = parser
        self.work_link = work_link

        self.content = None
        self.bs = None
        self.paragraph_checksums = None
        self.paragraph_sizes = None

        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def extract(self,content,bs):
        self.content = content
        self.bs = bs

        # Extract text and metatext
        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
        # Paragraph checksums
        pch,pszs = self.parser.calculate_checksums(self.body)
        self.paragraph_checksums = pch
        self.paragraph_sizes = pszs
        if bs is None:
            return
        self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_og(bs)

        # Extract links from the page
        base = self.work_link
        if bs.base is not None and "href" in bs.base.attrs:
            base = bs.base["href"]
        # Normalize links
        for l in bs.find_all("a", href=True):
            if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
                continue
            href = l["href"]
            try:
                nl = normalize_link(href, base)
                link = urlunparse(nl)
                if link == base:
                    continue
                self.link_set.add(link)
            except ValueError:
                pass

    def get_links(self):
        return self.link_set

    def get_follow_links(self):
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
                link = normalize_link(l,strip_query=self.parser.strip_query)
                follow_links.add(urlunparse(link))
        return follow_links


    def __str__(self):
        r = []
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if (len(self.body) < 20):
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)


def get_domains(arg):
    domains = []
    if arg == "-":
        for l in sys.stdin:
            domain = l.rstrip()
            assert(domain is not None)
            if len(domain) == 0:
                continue
            domains.append(domain)
    else:
        domains = arg.split(",")
    return domains

def visit_links(links,connection,parser,db):
    outlinks = []
    for work_link in links:
        responses = []
        if parser.is_link_good(work_link) and connection.is_robot_good(work_link):
            responses = connection.html_download2(work_link)
            time.sleep(4)
            db.index_responses(work_link,responses)
        if len(responses) > 0:
            lr = responses[-1]
            if lr.content is not None:
                target_link = lr.get_canonical()
                parsed = ParsedDocument(parser,target_link)
                parsed.extract(lr.content, lr.bs)
                db.index_content(target_link,parsed)
                outlinks += parsed.get_links()
    if len(outlinks) > 0:
        db.index_follow_links(parser,outlinks,connection)

def visit_domain(domain,parser,db):
    c = Connection()
    p = parser
    # Get links from frontpage
    # TODO Sitemap
    sitemap = "http://" + domain
    visit_links([sitemap],c,p,db)
    db.check_domain(domain)
    for i in range(p.crawl_rounds):
        # Visit links from frontpage
        links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
        visit_links(links,c,p,db)
        db.check_domain(domain)
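Note (not part of the commit): a minimal sketch of how the pieces in agent.py fit together without the Cassandra layer. It fetches one page with Connection, takes the last Response, and extracts links with ParsedDocument. It assumes the package is installed, that BaseParser from websucker/parser.py (referenced by this commit but not shown here) is usable with its defaults, and it uses http://example.com purely as a placeholder URL.

from websucker.agent import Connection, ParsedDocument
from websucker.parser import BaseParser  # module not included in this commit, assumed available

parser = BaseParser()          # assumed to provide is_link_good, strip_query, extract_raw_text, ...
connection = Connection()
# html_download2 follows HTTP redirects and HTML meta refreshes, up to 5 hops
responses = connection.html_download2("http://example.com")
last = responses[-1]
if last.link_status == "good" and last.content is not None:
    doc = ParsedDocument(parser, last.get_canonical())
    doc.extract(last.content, last.bs)   # fills body, title, link_set, paragraph checksums
    print(doc.title)
    for link in doc.get_follow_links():  # links filtered and normalized by the parser
        print(link)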
							
								
								
									
websucker/cli.py (new file, 130 lines)
@@ -0,0 +1,130 @@
from websucker.agent import Connection,visit_links,visit_domain
from websucker.agent import ParsedDocument
from websucker.parser import BaseParser
from websucker.parser import normalize_link,urlunparse
from websucker.db import Data
from websucker.db import get_schema
import click
import pprint




def create_database_from_context(ctx):
    return Data(ctx.obj["cassandra_keyspace"],ctx.obj["cassandra_host"],ctx.obj["cassandra_port"])

@click.group()
@click.pass_context
@click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
@click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
@click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)

@click.option("--justext-language",metavar="JUSTEXT_LANGUAGE",help="Target language (if defined, value read from JUSTEXT_LANGUAGE env variable)",envvar="JUSTEXT_LANGUAGE",default="English",show_default=True)
@click.option("--visit",is_flag=True)
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,justext_language,visit):
    ctx.ensure_object(dict)
    p = BaseParser()
    p.justext_language = justext_language
    ctx.obj["parser"] = p
    ctx.obj["cassandra_host"] = cassandra_host
    ctx.obj["cassandra_port"] = cassandra_port
    ctx.obj["cassandra_keyspace"] = cassandra_keyspace
    ctx.obj["visit"] = visit


@cli.command(help="Print domains")
@click.pass_context
@click.argument("count",type=int,default=20)
def all(ctx,count):
    p = ctx.obj["parser"]
    c = Connection()
    db = create_database_from_context(ctx)
    res = db.all_domains(count)
    for row in res:
        print(",".join(map(str,row)))
        if ctx.obj["visit"]:
            visit_domain(row[0],p,db)

@cli.command(help="Continue crawling of seen links from a domain")
@click.pass_context
@click.argument("domain")
def crawl(ctx, domain):
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
    visit_links(links,c,p,db)
    db.check_domain(domain)

@cli.command(help="find best domains")
@click.pass_context
@click.argument("count",type=int,default=20)
#@click.option("visit",is_flag=True)
def best(ctx, count):
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    domains = db.get_best_domains(count)
    for domain,gr in domains:
        print(domain,gr)
        if ctx.obj["visit"]:
            visit_domain(domain,p,db)


@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
@click.pass_context
@click.argument("count",type=int,default=20)
def unvisited(ctx, count):
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    domains = db.get_unvisited_domains(count)
    for domain in domains:
        print(domain)
        if ctx.obj["visit"]:
            visit_domain(domain,p,db)

@cli.command(help="Visit url, get links and crawl. Start here")
@click.pass_context
@click.argument("link")
def visit(ctx, link):
    db = create_database_from_context(ctx)
    p = ctx.obj["parser"]
    c = Connection()
    nl = normalize_link(link)
    domain=nl[1]
    visit_domain(domain,p,db)

@cli.command(help="Update domain statistics")
@click.pass_context
@click.argument("domain")
def check(ctx,domain):
    db = create_database_from_context(ctx)
    res = db.check_domain(domain)
    print(res)

@cli.command(help="Print daily report")
@click.pass_context
def report(ctx):
    db = create_database_from_context(ctx)
    db.daily_report()

@cli.command(help="Print keyspace schema")
def schema():
    schema = get_schema()
    print(schema)

@cli.command(help="Fetch given url (just for debug)")
@click.pass_context
@click.argument("urls")
def fetch(ctx,urls):
    parser = ctx.obj["parser"]
    # Visit first page
    connection = Connection()
    responses = connection.html_download2(urls)
    for res in responses:
        target_link = res.get_canonical()
        pd = ParsedDocument(parser,target_link)
        pd.extract(res.content, res.bs)
        print(pd)

if __name__ == "__main__":
    cli()
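Note (not part of the commit): the console_scripts entry point in setup.py maps the websuck command to this click group. A quick smoke test, sketched below under the assumption that the package and its requirements are installed, is to invoke the schema subcommand, which only prints the packaged schema.sql and therefore needs no running Cassandra instance.

from click.testing import CliRunner
from websucker.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["schema"])  # equivalent to running: websuck schema
print(result.exit_code)                  # 0 on success
print(result.output)                     # contents of the bundled schema.sql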
							
								
								
									
websucker/db.py (new file, 444 lines)
@@ -0,0 +1,444 @@
 | 
				
			|||||||
 | 
					import cassandra
 | 
				
			||||||
 | 
					import cassandra.cluster
 | 
				
			||||||
 | 
					import random
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					import pkg_resources
 | 
				
			||||||
 | 
					import datetime
 | 
				
			||||||
 | 
					from websucker.parser import normalize_link,urlunparse
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					VERSION = "sucker6"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_schema():
 | 
				
			||||||
 | 
					    with pkg_resources.resource_stream(__name__,"schema.sql") as f:
 | 
				
			||||||
 | 
					        schema = f.read()
 | 
				
			||||||
 | 
					        return str(schema,encoding="utf8")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Data:
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    Database of text documents
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
 | 
				
			||||||
 | 
					        # execution profile
 | 
				
			||||||
 | 
					        ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
 | 
				
			||||||
 | 
					        profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
 | 
				
			||||||
 | 
					        self.cluster = cassandra.cluster.Cluster([cassandra_host],port=cassandra_port,execution_profiles=profiles)
 | 
				
			||||||
 | 
					        self.session = self.cluster.connect(keyspace)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.check_document_select_query = self.session.prepare("SELECT count(url_hash) FROM paragraph_checksums WHERE checksum=?" )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.index_response_link_update = self.session.prepare("""
 | 
				
			||||||
 | 
					        UPDATE links SET 
 | 
				
			||||||
 | 
					        link_status ='redirect',
 | 
				
			||||||
 | 
					        redirect_target = ?,
 | 
				
			||||||
 | 
					        update_time = toTimestamp(now())
 | 
				
			||||||
 | 
					        WHERE
 | 
				
			||||||
 | 
					        domain_name=? AND
 | 
				
			||||||
 | 
					        url_path=? AND
 | 
				
			||||||
 | 
					        url_query=? 
 | 
				
			||||||
 | 
					        """)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.domain_quality_update = self.session.prepare("""
 | 
				
			||||||
 | 
					        UPDATE domain_quality SET 
 | 
				
			||||||
 | 
					            seen_count=?,
 | 
				
			||||||
 | 
					            good_size=?,
 | 
				
			||||||
 | 
					            good_count=?,
 | 
				
			||||||
 | 
					            good_probability=?,
 | 
				
			||||||
 | 
					            good_originality=?,
 | 
				
			||||||
 | 
					            average_good_characters=?,
 | 
				
			||||||
 | 
					            content_size=?,
 | 
				
			||||||
 | 
					            content_count=?,
 | 
				
			||||||
 | 
					            content_probability=?,
 | 
				
			||||||
 | 
					            content_originality=?,
 | 
				
			||||||
 | 
					            average_content_characters=?,
 | 
				
			||||||
 | 
					            fetched_count=?,
 | 
				
			||||||
 | 
					            average_fetched_good_characters=?,
 | 
				
			||||||
 | 
					            gain_ratio=?,
 | 
				
			||||||
 | 
					            update_time = toTimestamp(now())
 | 
				
			||||||
 | 
					        WHERE
 | 
				
			||||||
 | 
					            domain_name=? AND
 | 
				
			||||||
 | 
					            day=toDate(now())
 | 
				
			||||||
 | 
					        """)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.index_response_insert_html = self.session.prepare("""
 | 
				
			||||||
 | 
					INSERT INTO html(
 | 
				
			||||||
 | 
					  day,
 | 
				
			||||||
 | 
					  domain_name,
 | 
				
			||||||
 | 
					  source_link,
 | 
				
			||||||
 | 
					  target_link,
 | 
				
			||||||
 | 
					  redirect_links,
 | 
				
			||||||
 | 
					  status,
 | 
				
			||||||
 | 
					  headers,
 | 
				
			||||||
 | 
					  content,
 | 
				
			||||||
 | 
					  agent_version,
 | 
				
			||||||
 | 
					  update_time
 | 
				
			||||||
 | 
					  ) VALUES (toDate(now()),?,?,?,?,?,?,?,?,toTimestamp(now()));
 | 
				
			||||||
 | 
					""")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.index_content_link_insert = self.session.prepare("""
 | 
				
			||||||
 | 
					INSERT INTO links (
 | 
				
			||||||
 | 
					url_schema,
 | 
				
			||||||
 | 
					domain_name,
 | 
				
			||||||
 | 
					url_path,
 | 
				
			||||||
 | 
					url_query,
 | 
				
			||||||
 | 
					link_status,
 | 
				
			||||||
 | 
					update_time
 | 
				
			||||||
 | 
					) VALUES (?,?,?,?,'seen',?) IF NOT EXISTS
 | 
				
			||||||
 | 
					""")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.daily_links_insert = self.session.prepare("""
 | 
				
			||||||
 | 
					INSERT INTO daily_links (
 | 
				
			||||||
 | 
					day,
 | 
				
			||||||
 | 
					domain_name,
 | 
				
			||||||
 | 
					url_path,
 | 
				
			||||||
 | 
					url_query,
 | 
				
			||||||
 | 
					link_status,
 | 
				
			||||||
 | 
					body_size,
 | 
				
			||||||
 | 
					link_originality,
 | 
				
			||||||
 | 
					update_time
 | 
				
			||||||
 | 
					) VALUES (toDate(now()),?,?,?,?,?,?,toTimestamp(now()))
 | 
				
			||||||
 | 
					""")
 | 
				
			||||||
 | 
					        self.daily_links_select = self.session.prepare("""
 | 
				
			||||||
 | 
					SELECT 
 | 
				
			||||||
 | 
					domain_name,
 | 
				
			||||||
 | 
					link_status,
 | 
				
			||||||
 | 
					count(link_status) 
 | 
				
			||||||
 | 
					FROM daily_links WHERE day=toDate(now()) GROUP BY domain_name,link_status
 | 
				
			||||||
 | 
					""")
 | 
				
			||||||
 | 
					        # PArsed Content
 | 
				
			||||||
 | 
					        self.index_content_content_insert = self.session.prepare("""
 | 
				
			||||||
 | 
					INSERT INTO content(
 | 
				
			||||||
 | 
					  domain_name,
 | 
				
			||||||
 | 
					  target_link,
 | 
				
			||||||
 | 
					  links,
 | 
				
			||||||
 | 
					  title,
 | 
				
			||||||
 | 
					  description,
 | 
				
			||||||
 | 
					  section,
 | 
				
			||||||
 | 
					  authors,
 | 
				
			||||||
 | 
					  tags,
 | 
				
			||||||
 | 
					  article_published_time,
 | 
				
			||||||
 | 
					  text_date,
 | 
				
			||||||
 | 
					  body,
 | 
				
			||||||
 | 
					  body_size,
 | 
				
			||||||
 | 
					  agent_version,
 | 
				
			||||||
 | 
					  update_time
 | 
				
			||||||
 | 
					  ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?);
 | 
				
			||||||
 | 
					""")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.paragraph_checksums_insert = self.session.prepare("INSERT INTO paragraph_checksums (checksum,url_hash) VALUES(?,?)")
 | 
				
			||||||
 | 
					        self.index_content_links_update = self.session.prepare("UPDATE links SET link_status=?, link_originality=?,body_size=?,url_schema=? WHERE domain_name=? AND url_path = ? AND url_query=? ")
 | 
				
			||||||
 | 
					        self.check_domain_count = self.session.prepare("select count(url_path) from links where domain_name=? and link_status = ?")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.check_domain_size = self.session.prepare("select sum(body_size),sum(link_originality) from links where domain_name=? and link_status =?")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.domains_select = self.session.prepare("SELECT domain_name,seen_count,fetched_count,gain_ratio,average_fetched_good_characters FROM domain_quality PER PARTITION LIMIT 1")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def index_responses(self,source_link,responses):
 | 
				
			||||||
 | 
					        # Redirect links
 | 
				
			||||||
 | 
					        pl = normalize_link(source_link)
 | 
				
			||||||
 | 
					        for response in responses:
 | 
				
			||||||
 | 
					            tl = response.get_canonical()
 | 
				
			||||||
 | 
					            r = (
 | 
				
			||||||
 | 
					                tl,
 | 
				
			||||||
 | 
					                pl[1],
 | 
				
			||||||
 | 
					                pl[2],
 | 
				
			||||||
 | 
					                pl[3],
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            if pl != tl:
 | 
				
			||||||
 | 
					                res = self.session.execute(self.index_response_link_update,r)
 | 
				
			||||||
 | 
					            d = (
 | 
				
			||||||
 | 
					                pl[1],
 | 
				
			||||||
 | 
					                source_link,
 | 
				
			||||||
 | 
					                response.get_canonical(),
 | 
				
			||||||
 | 
					                response.redirects,
 | 
				
			||||||
 | 
					                response.status,
 | 
				
			||||||
 | 
					                response.headers,
 | 
				
			||||||
 | 
					                response.get_content(),
 | 
				
			||||||
 | 
					                VERSION,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            self.session.execute(self.index_response_insert_html,d)
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    def daily_report(self):
 | 
				
			||||||
 | 
					        rows = self.session.execute(self.daily_links_select)
 | 
				
			||||||
 | 
					        for row in rows:
 | 
				
			||||||
 | 
					            print(row[0],row[1],row[2])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def index_follow_links(self,parser,links,connection):
 | 
				
			||||||
 | 
					        # Index seen links
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        follow_links = set()
 | 
				
			||||||
 | 
					        for l in links:
 | 
				
			||||||
 | 
					            if parser.is_link_good(l):
 | 
				
			||||||
 | 
					                #if connection is not None and parser.listen_robot and not connection.is_robot_good(l):
 | 
				
			||||||
 | 
					                #    continue
 | 
				
			||||||
 | 
					                link = normalize_link(l,strip_query=parser.strip_query)
 | 
				
			||||||
 | 
					                follow_links.add(urlunparse(link))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        newlinkdomains = set()
 | 
				
			||||||
 | 
					        for link in follow_links:
 | 
				
			||||||
 | 
					            value = []
 | 
				
			||||||
 | 
					            nl = normalize_link(link)
 | 
				
			||||||
 | 
					            value +=  nl
 | 
				
			||||||
 | 
					            value.append(datetime.date.today())
 | 
				
			||||||
 | 
					            rows = self.session.execute(self.index_content_link_insert,value)
 | 
				
			||||||
 | 
					            row = rows.one()
 | 
				
			||||||
 | 
					            if row.applied:
 | 
				
			||||||
 | 
					                newlinkdomains.add(nl[1])
 | 
				
			||||||
 | 
					        for domain in newlinkdomains:
 | 
				
			||||||
 | 
					            self.check_domain(domain)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def index_content(self,target_link,parsed_document):
 | 
				
			||||||
 | 
					        nl = normalize_link(target_link)
 | 
				
			||||||
 | 
					        domain_name = nl[1]
 | 
				
			||||||
 | 
					        assert len(domain_name) > 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        pd = parsed_document
 | 
				
			||||||
 | 
					        body_length = 0
 | 
				
			||||||
 | 
					        if pd.body is not None:
 | 
				
			||||||
 | 
					            body_length = len(pd.body)
 | 
				
			||||||
 | 
					        value = ( 
 | 
				
			||||||
 | 
					          domain_name,
 | 
				
			||||||
 | 
					          target_link,
 | 
				
			||||||
 | 
					          pd.get_links(),
 | 
				
			||||||
 | 
					          pd.title,
 | 
				
			||||||
 | 
					          pd.description,
 | 
				
			||||||
 | 
					          pd.section,
 | 
				
			||||||
 | 
					          pd.authors,
 | 
				
			||||||
 | 
					          pd.tags,
 | 
				
			||||||
 | 
					          pd.article_published_time,
 | 
				
			||||||
 | 
					          pd.text_date,
 | 
				
			||||||
 | 
					          pd.body,
 | 
				
			||||||
 | 
					          body_length,
 | 
				
			||||||
 | 
					            VERSION,
 | 
				
			||||||
 | 
					          pd.current_time
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        content_future = self.session.execute_async(self.index_content_content_insert,value)
 | 
				
			||||||
 | 
					        # result later
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        link_status = "good"
 | 
				
			||||||
 | 
					        originality = 0
 | 
				
			||||||
 | 
					        tsz = 0
 | 
				
			||||||
 | 
					        if pd.body is None:
 | 
				
			||||||
 | 
					            link_status = "bad_parse"
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            tsz = len(pd.body)
 | 
				
			||||||
 | 
					            if tsz < 300:
 | 
				
			||||||
 | 
					                link_status = "bad_small"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if link_status == "good":
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            futures = []
 | 
				
			||||||
 | 
					            for pc,psz in zip(pd.paragraph_checksums,pd.paragraph_sizes):
 | 
				
			||||||
 | 
					                fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
 | 
				
			||||||
 | 
					                futures.append(fut)
 | 
				
			||||||
 | 
					            for fut in futures:
 | 
				
			||||||
 | 
					                fut.result()
 | 
				
			||||||
 | 
					            originality = self.check_document(pd.paragraph_checksums,pd.paragraph_sizes)
 | 
				
			||||||
 | 
					            if originality < 0.8:
 | 
				
			||||||
 | 
					                link_status = "bad_copy"
 | 
				
			||||||
 | 
					        print(nl)
 | 
				
			||||||
 | 
					        self.session.execute(self.index_content_links_update,(link_status,originality,tsz,nl[0],nl[1],nl[2],nl[3]))
 | 
				
			||||||
 | 
					        content_future.result()
 | 
				
			||||||
 | 
					        print("<<<< " + link_status + " " + str(originality))
 | 
				
			||||||
 | 
					        dl = (
 | 
				
			||||||
 | 
					            nl[1],
 | 
				
			||||||
 | 
					            nl[2],
 | 
				
			||||||
 | 
					            nl[3],
 | 
				
			||||||
 | 
					            link_status,
 | 
				
			||||||
 | 
					            tsz,
 | 
				
			||||||
 | 
					            originality
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        self.session.execute(self.daily_links_insert,dl)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def check_document(self,paragraph_checksums,paragraph_sizes):
 | 
				
			||||||
 | 
					       tsz = sum(paragraph_sizes)
 | 
				
			||||||
 | 
					       if tsz == 0:
 | 
				
			||||||
 | 
					           return 0
 | 
				
			||||||
 | 
					       copies = 0
 | 
				
			||||||
 | 
					       futures = []
 | 
				
			||||||
 | 
					       for pc,psz in zip(paragraph_checksums,paragraph_sizes):
 | 
				
			||||||
 | 
					           futures.append(self.session.execute_async(self.check_document_select_query,(pc,)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					       for fut,psz in zip(futures,paragraph_sizes):
 | 
				
			||||||
 | 
					           rows = fut.result()
 | 
				
			||||||
 | 
					           res = rows.one()[0]
 | 
				
			||||||
 | 
					           if res > 1:
 | 
				
			||||||
 | 
					               copies += psz
 | 
				
			||||||
 | 
					       return (tsz-copies)/tsz
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
    def check_domain(self, domain):
        # Recompute per-domain quality statistics from the link counters and
        # store them in the domain_quality table.
        assert len(domain) > 0
        seen_count = None
        good_size = None
        good_count = None
        good_probability = None
        good_originality = None
        average_good_characters = None
        content_size = None
        content_count = None
        content_probability = None
        content_originality = None
        average_content_characters = None
        fetched_count = None
        average_fetched_good_characters = None
        gain_ratio = None

        counts = {
            "good": 0,
            "bad_copy": 0,
            "bad_small": 0,
            "bad_httpcode": 0,
            "bad_type": 0,
            "bad_content": 0,
            "bad_parse": 0,
            "seen": 0
        }
        for k in counts.keys():
            res = self.session.execute(self.check_domain_count, (domain, k))
            co = res.one()[0]
            counts[k] = co

        seen_count = counts["seen"]
        good_count = counts["good"]
        content_count = counts["good"] + counts["bad_copy"] + counts["bad_small"]

        # Everything that was actually fetched, i.e. every status except "seen"
        fetched_count = sum(counts.values()) - counts["seen"]

        if fetched_count > 0:
            content_probability = content_count / fetched_count
            good_probability = good_count / fetched_count
            sizes = {
                "good": 0,
                "bad_copy": 0,
                "bad_small": 0
            }

            originalities = {}

            for k in sizes.keys():
                res = self.session.execute(self.check_domain_size, (domain, k))
                row = res.one()
                co = row[0]
                originalities[k] = row[1]
                sizes[k] = co
            good_size = sizes["good"]
            content_size = sum(sizes.values())
            if good_count > 0:
                good_originality = originalities["good"] / good_count
            if content_count > 0:
                content_originality = sum(originalities.values()) / content_count

        if good_count > 0:
            average_good_characters = good_size / good_count * good_originality
            average_fetched_good_characters = good_size * good_originality / fetched_count

            gain_ratio = average_fetched_good_characters / fetched_count

        if content_count > 0:
            average_content_characters = content_size / content_count

        #print(sizes)
        #print(originalities)
        uv = (
            seen_count,
            good_size,
            good_count,
            good_probability,
            good_originality,
            average_good_characters,
            content_size,
            content_count,
            content_probability,
            content_originality,
            average_content_characters,
            fetched_count,
            average_fetched_good_characters,
            gain_ratio,
            domain)
        if fetched_count > 0 or seen_count > 0:
            self.session.execute(self.domain_quality_update, uv)
        return average_fetched_good_characters

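    # Worked example with made-up counters: good=8, bad_copy=1, bad_small=1,
    # bad_httpcode=2, seen=20 gives fetched_count = 12, content_count = 10,
    # good_probability = 8/12 ≈ 0.67 and content_probability = 10/12 ≈ 0.83.
    # With good_size = 40000 characters and good_originality = 0.9 this yields
    # average_good_characters = 40000/8*0.9 = 4500,
    # average_fetched_good_characters = 40000*0.9/12 = 3000 and
    # gain_ratio = 3000/12 = 250.
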
    def all_domains(self, count):
        # Returns up to `count` full domain_quality rows that have been both
        # seen and fetched, sorted by average_fetched_good_characters.
        rows = self.session.execute(self.domains_select)
        domains = []
        for row in rows:
            domain = row[0]
            seen_count = row[1]
            fetched_count = row[2]
            gain_ratio = row[3]
            afg = row[4]
            if fetched_count and afg and seen_count:
                domains.append(tuple(row))
        l = len(domains)
        ss = min(l, count)
        res = []
        if ss > 0:
            # sort according to average fetched good characters (column 4)
            res = list(sorted(domains, key=lambda x: x[4], reverse=True))[0:ss]
        return res

    def get_best_domains(self, count):
        # get all domains
        rows = self.session.execute(self.domains_select)
        domains = []
        for row in rows:
            domain = row[0]
            seen_count = row[1]
            fetched_count = row[2]
            gain_ratio = row[3]
            afg = row[4]
            if seen_count and fetched_count and gain_ratio:
                domains.append((domain, gain_ratio))
        l = len(domains)
        ss = min(l, count)
        res = []
        if ss > 0:
            # sort according to ratio
            res = list(sorted(domains, key=lambda x: x[1], reverse=True))[0:ss]
        # returns a sorted list of (domain, gain_ratio) tuples
        return res

    def get_unvisited_domains(self, count):
        # get all domains
        rows = self.session.execute(self.domains_select)
        domains = []
        for row in rows:
            domain = row[0]
            seen_count = row[1]
            fetched_count = row[2]
            gain_ratio = row[3]
            afg = row[4]
            # seen at least once but never fetched
            if seen_count and not fetched_count:
                domains.append(domain)
        ss = min(len(domains), count)
        return random.sample(domains, ss)

    def get_visit_links(self, domain, recent_count, old_count, random_count):
        # Pick links at both ends of the update_time ordering plus a random
        # sample from the middle of the domain's "seen" links.
        dblinks = []
        rows = self.session.execute("SELECT url_schema, url_path, url_query, update_time FROM links WHERE domain_name=%s AND link_status='seen'", (domain,))
        for row in rows:
            link = urlunparse((row[0], domain, row[1], row[2]))
            dblinks.append((link, row[3]))

        visitlinks = []
        dblinks.sort(key=lambda x: x[1])
        random_links = []
        for i, (link, time) in enumerate(dblinks):
            #print(link,time)
            if i < recent_count:
                visitlinks.append(link)
            elif i >= len(dblinks) - old_count:
                visitlinks.append(link)
            else:
                random_links.append(link)
        sc = min(random_count, len(random_links))
        if sc > 0:
            visitlinks += random.sample(random_links, sc)
        return visitlinks
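
# A minimal scheduling sketch built only from the methods above; `db` stands
# for an instance of this module's Cassandra-backed class (its name and
# constructor arguments are not shown here and are assumed for illustration).
def schedule_crawl(db, best=100, unvisited=50):
    work = []
    # Domains that already produced good content, ordered by gain_ratio
    for domain, gain_ratio in db.get_best_domains(best):
        work.append(domain)
    # Plus a random sample of domains that were seen but never fetched
    work.extend(db.get_unvisited_domains(unvisited))
    # For every selected domain, take a mix of newest, oldest and random links
    return {domain: db.get_visit_links(domain, 5, 3, 10) for domain in work}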
							
								
								
									
335  websucker/parser.py  Normal file
@@ -0,0 +1,335 @@
import dateutil.parser
import justext
import re
import sys
import datetime

import lxml.etree
import urllib.parse
import os.path


datere = re.compile(r"\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}")
yearre = re.compile(r"\s\d{4}\s")

def urlunparse(parsed_url):
    schema, netloc, path, query = parsed_url
    return urllib.parse.urlunparse((schema, netloc, path, "", query, ""))

def normalize_link(link, base=None, strip_query=False):
    link = link.strip().replace(
                "\n", "").replace("\t", "").replace("\r", "")
    parsed_link = urllib.parse.urlparse(link)
    schema = parsed_link[0]
    netloc = parsed_link[1].strip().lower()
    path = parsed_link[2].strip()
    query = parsed_link[4]
    if strip_query:
        query = ""
    if path is None or len(path) == 0:
        path = "/"
    dirname, filename = os.path.split(path)
    if base is not None:
        parsed_base = urllib.parse.urlparse(base)
        if schema == "":
            schema = parsed_base[0]
        # If the link is relative
        if netloc == "":
            netloc = parsed_base[1]
            schema = parsed_base[0]
            bdir, bfile = os.path.split(parsed_base[2])
            if len(bdir) > 0 and bdir[0] != "." and len(dirname) > 0 and dirname[0] != "/":
                dirname = bdir + "/" + dirname
    # if len(dirname) == 0 or dirname[0] != '/':
    #    path = '/' + path
    dirname = os.path.normpath(dirname)
    dirname = dirname.lstrip("/").lstrip(".")
    path = dirname + "/" + filename
    return schema, netloc, path, query


def get_date(te):
    # Extract Slovak-style dates such as "12. 3. 2019" from text.
    dates = []
    words = []
    if te is None:
        te = ""
    for t in te.split():
        t = t.strip().lower().lstrip("0").replace("\r", "\n").replace("\n", "")
        if len(t) == 0:
            continue
        for i, m in enumerate(["jan", "feb", "mar", "apr", "máj", "jún", "júl", "aug", "sept", "okt", "nov", "dec"]):
            if t.startswith(m):
                t = str(i + 1) + "."
                break
        if t[0].isdigit():
            words.append(t)
    txt = " ".join(words)
    for st in re.findall(datere, txt):
        tokens = st.replace(" ", "").split(".")
        try:
            y = int(tokens[-1])
            if y < 2000 or y > 2020:
                continue
            m = 2
            d = 2
            if len(tokens) > 2:
                m = int(tokens[-2])
                d = int(tokens[-3])
            dates.append(datetime.date(y, m, d))
        except ValueError:
            pass
    return dates

class BaseParser:
    def __init__(self, verbose=False):
        self.strip_query = True
        self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
                          ".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
        self.skipchars = re.compile(r"[();:@& ]")
        self.store = True
        self.verbose = verbose
        self.domain_re = re.compile("^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\\.)+[A-Za-z]{2,6}$")
        self.listen_robot = True
        self.recent_links = 5
        self.old_links = 3
        self.random_links = 10
        self.crawl_rounds = 3
        self.skipdomains = set()
        self.allowdomains = set()
        self.skippaths = set(["sort", "search", "sortby", "gallery", "images", "pictures", "order", "add", "insert", "cart", "order", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery", "form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "gallery", "flog"])
        self.justext_language = "Slovak"

    def is_domain_good(self, domain):
        r = None
        # Netloc
        if ":" in domain:
            r = "Port in domain"
        elif len(domain) < 4:
            r = "Too short domain"
        elif len(domain) > 50:
            r = "Too long location"
        elif domain.startswith(".") or domain.endswith("."):
            r = "Malformed domain"
        elif not self.domain_re.match(domain):
            r = "Bad domain"
        else:
            da = False
            for d in self.allowdomains:
                if domain.endswith(d):
                    da = True
                    break
            if not da and len(self.allowdomains) > 0:
                r = "Domain not in allowlist"
            for d in self.skipdomains:
                if domain.endswith(d):
                    r = "In domain skiplist"
            for d in domain.split("."):
                if d in self.skippaths:
                    r = "Domain in skippath"
        if r is not None and self.verbose:
            print(domain + " " + r)
        return r is None

    # Argument - parsed URL
    def is_link_good(self, link):
        assert link is not None
        r = None
        if sys.getsizeof(link) > 1023:
            r = "Too long"
        try:
            schema, domain, path, query = normalize_link(link)
            if not schema.startswith("http"):
                r = "Bad schema"
            dg = self.is_domain_good(domain)
            if not dg:
                return False
            for c in link:
                if ord(c) >= 128:
                    r = "Bad domain character"
                    break
            for p in self.skipdomains:
                if domain.endswith(p):
                    r = "Bad domain"
                    break
            if ".b-" in domain:
                r = "Bad domain"
            if len(domain) > 127:
                r = "Too long domain"
            # Path
            for t in self.skiptypes:
                if path.lower().endswith(t):
                    r = "Bad type"
                    break
            if re.search(self.skipchars, path):
                r = "Bad path"
            for p in path.split("/"):
                if p in self.skippaths or "jpg" in p or "galeria" in p:
                    r = "Bad path"
                    break
        except ValueError:
            r = "Bad urlparse"
        return r is None

    def filter_links(self, links):
        # Filter links
        linkset = set()
        for link in links:
            if not self.is_link_good(link):
                continue
            link = urlunparse(normalize_link(link, strip_query=self.strip_query))
            linkset.add(link)

        return list(linkset)

    def extract_raw_text(self, content, current_time):
        # Run jusText boilerplate removal and return the plain text together
        # with the most recent date found in the text (or current_time).
        result = []
        rd = None
        paragraphs = []
        content.seek(0)
        try:
            c = content.read()
            paragraphs = justext.justext(c, justext.get_stoplist(self.justext_language), length_low=50, length_high=150)
            content.seek(0)
        except lxml.etree.XMLSyntaxError:
            print("XML Syntax parse error")
        except lxml.etree.ParserError:
            print("XML Parse parse error")
        except justext.core.JustextError:
            print("Justext error")
        except IndexError:
            print("XML error")
        except UnicodeDecodeError:
            print("Unicode Error")
        except TypeError:
            # Null in string
            print("String Error")
        except RuntimeError:
            # Maximum recursion depth
            print("Recursion Error")
        dates = []
        for p in paragraphs:
            # TODO - match URL for date
            if p is not None and p.text is not None and len(p.text) > 0:
                dat = get_date(p.text)
                for d in dat:
                    dates.append(d)
                if self.verbose:
                    print(p.class_type, p.links_density(), p.stopwords_density(
                        justext.get_stoplist(self.justext_language)), p.text)
                if not p.is_boilerplate:
                    result.append(p.text.strip())
        if len(dates) == 0:
            dates.append(current_time)
        rd = max(dates)
        rd = rd.isoformat()

        return "\n\n".join(result), rd

    # Extracts metainformation from HTML.
    # First it looks for name/content pairs in meta tags,
    # then it looks for OpenGraph properties.
    def extract_og(self, bs):
        tags = set()
        authors = set()
        title = ""
        description = ""
        section = ""
        article_published_time = ""

        for m in bs.find_all("meta", attrs={"name": True, "content": True}):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            name = m["name"].strip()
            if name == "keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "news_keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "author":
                authors.add(content)
            if name == "description":
                description = content

        for m in bs.find_all("meta", property=True, content=True):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            property = m["property"].strip()
            if property == "og:title":
                title = content
            if property == "article:published_time":
                try:
                    # Is it in ISO format?
                    d = dateutil.parser.parse(content)
                    article_published_time = d.isoformat()
                except ValueError:
                    pass
                except OverflowError:
                    pass
            if property == "article:author" and "admin" not in content.lower():
                authors.add(content)
            if property == "section":
                section = content
            if property == "tag":
                tags.add(content)
            if property == "og:description":
                description = content

        if len(title) < 2 and bs.h1 is not None:
            title = bs.h1.get_text(strip=True)
        if len(title) < 2 and bs.title is not None:
            title = bs.title.get_text(strip=True)
        if len(authors) == 0:
            for m in bs.find_all(property="author"):
                authors.add(m.get_text(strip=True))
        if len(authors) == 0:
            for m in bs.find_all(itemprop="author"):
                authors.add(m.get_text(strip=True))
        authors = set(filter(lambda x: len(x) > 2, authors))

        return tags, authors, title.strip(), article_published_time.strip(), description, section.strip()


    def calculate_checksums(self, text):
        """
        @return fingerprints of the paragraphs in text. Paragraphs are separated by a blank line.
        """
        checksums = []
        sizes = []
        hval = 0
        hsz = 0
        sz = 0
        for c in text:
            cv = ord(c)
            sz += 1
            if cv > 64:
                # Simple 31-bit rolling hash over letter-like characters
                hval += (hval << 3) + cv
                zv = hval >> 31
                hval &= 0x7fffffff
                hval += zv
                hsz += 1
            if c == "\n" and hsz > 0:
                # Only paragraphs with more than 100 hashed characters are fingerprinted
                if hsz > 100:
                    checksums.append(hval)
                    sizes.append(sz)
                sz = 0
                hsz = 0
        if hsz > 100:
            checksums.append(hval)
            sizes.append(sz)
        return checksums, sizes


class EnglishParser(BaseParser):
    def __init__(self):
        super(EnglishParser, self).__init__()
        self.justext_language = "English"
        self.allowdomains = set(["com", "org", "io"])

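
# A minimal, self-contained usage sketch of the helpers above; the URL, base
# and text below are made-up examples, everything else comes from this module.
if __name__ == "__main__":
    parser = BaseParser(verbose=False)
    base = "http://example.org/sekcia/index.html"
    raw = "/spravy/clanok.html?page=2\n"
    # normalize_link resolves the link against the base, lowercases the host
    # and optionally drops the query string
    parts = normalize_link(raw, base=base, strip_query=True)
    print(parts)              # ('http', 'example.org', 'spravy/clanok.html', '')
    print(urlunparse(parts))  # http://example.org/spravy/clanok.html
    print(parser.is_link_good(urlunparse(parts)))  # True
    # Paragraph fingerprints: only paragraphs with more than 100 hashed
    # characters get a checksum, so the toy paragraph is repeated.
    text = "Toto je dostatočne dlhý odsek textu. " * 5 + "\n"
    checksums, sizes = parser.calculate_checksums(text)
    print(checksums, sizes)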
							
								
								
									
35  websucker/queue.py  Normal file
@@ -0,0 +1,35 @@
import greenstalk
import random


MAX_PRIORITY = 0
MIN_PRIORITY = 4000000000

MAX_FLOAT_PRIORITY = 10000.0

def map_priority(p, max_priority):
    # Map a score in [0, max_priority] to a beanstalkd priority:
    # higher scores get lower (more urgent) priority numbers.
    p = p / max_priority
    return MIN_PRIORITY - (p * MIN_PRIORITY)

class BeanstalkdQueue:
    def __init__(self, host, port, tube):
        self.c = greenstalk.Client(host, port, use=tube, encoding="utf8")

    def queue_priority_domains(self, priority_domains):
        for domain, priority in priority_domains:
            p = priority / MAX_FLOAT_PRIORITY
            p = MIN_PRIORITY - (p * MIN_PRIORITY)
            # beanstalkd priorities are integers
            self.c.put(domain, int(p))

    def queue_random_domains(self, domains):
        for domain in domains:
            p = random.randint(MAX_PRIORITY, MIN_PRIORITY)
            self.c.put(domain, p)

    def consume_domains(self, callback):
        while True:
            job = self.c.reserve()
            domain = job.body
            self.c.delete(job)
            callback(domain)

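
# A minimal usage sketch for the queue above; the beanstalkd address and tube
# name are illustrative assumptions.
if __name__ == "__main__":
    q = BeanstalkdQueue("127.0.0.1", 11300, "websucker")
    # Domains with a higher average_fetched_good_characters score get a lower
    # beanstalkd priority number, i.e. they are reserved first.
    q.queue_priority_domains([("example.org", 2500.0), ("example.com", 100.0)])
    q.queue_random_domains(["example.net"])
    # Blocks forever, handing every reserved domain to the callback
    q.consume_domains(print)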
							
								
								
									
94  websucker/schema.sql  Normal file
@@ -0,0 +1,94 @@
DROP KEYSPACE IF EXISTS websucker;

CREATE KEYSPACE websucker
WITH replication = {'class':'SimpleStrategy', 'replication_factor' : 1};

USE websucker;

CREATE TABLE links (
    domain_name TEXT,
    url_path TEXT,
    url_query TEXT,
    url_schema TEXT,
    redirect_target TEXT,
    link_status TEXT,
    link_originality FLOAT,
    body_size INT,
    update_time TIMESTAMP,
    PRIMARY KEY(domain_name, url_path, url_query)
);

CREATE INDEX link_status_index ON links(link_status);

CREATE TABLE daily_links (
    day DATE,
    domain_name TEXT,
    url_path TEXT,
    url_query TEXT,
    link_status TEXT,
    body_size INT,
    link_originality FLOAT,
    update_time TIMESTAMP,
    PRIMARY KEY(day, domain_name, link_status, url_path, url_query)
);

CREATE TABLE domain_quality (
    domain_name TEXT,
    day DATE,
    seen_count INT,
    good_size INT,
    good_count INT,
    good_probability FLOAT,
    good_originality FLOAT,
    average_good_characters FLOAT,
    content_size INT,
    content_count INT,
    content_probability FLOAT,
    content_originality FLOAT,
    average_content_characters FLOAT,
    fetched_count INT,
    average_fetched_good_characters FLOAT,
    gain_ratio FLOAT,
    update_time TIMESTAMP STATIC,
    PRIMARY KEY(domain_name, day)
) WITH CLUSTERING ORDER BY (day DESC);


CREATE TABLE content (
    domain_name TEXT,
    target_link TEXT,
    agent_version TEXT,
    title TEXT,
    links SET<TEXT>,
    authors SET<TEXT>,
    tags SET<TEXT>,
    description TEXT,
    section TEXT,
    article_published_time TEXT,
    text_date TEXT,
    body TEXT,
    body_size INT,
    update_time TIMESTAMP,
    PRIMARY KEY(domain_name, target_link)
);

CREATE TABLE paragraph_checksums (
    checksum BIGINT,
    url_hash BIGINT,
    PRIMARY KEY(checksum, url_hash)
);

CREATE TABLE html (
    day DATE,
    domain_name TEXT,
    source_link TEXT,
    target_link TEXT,
    redirect_links LIST<TEXT>,
    status INT,
    content TEXT,
    headers TEXT,
    agent_version TEXT,
    update_time TIMESTAMP,
    PRIMARY KEY(day, domain_name, source_link)
);
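
# A minimal sketch (in Python, with the DataStax cassandra-driver) for loading
# the schema above into a local node; the contact point and the naive split on
# ";" are assumptions for illustration only.
from cassandra.cluster import Cluster

cluster = Cluster(["127.0.0.1"])
session = cluster.connect()
with open("websucker/schema.sql") as f:
    for statement in f.read().split(";"):
        if statement.strip():
            session.execute(statement)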