commit 0c9ea2b4e3
initial
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
build
|
||||
dist
|
||||
*.egg-info
|
||||
venv
|
21
LICENSE.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2020 Technical University of Kosice
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
1
MANIFEST.in
Normal file
@@ -0,0 +1 @@
|
||||
recursive-include websucker *.sql
|
6
requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
BeautifulSoup4
|
||||
justext
|
||||
cassandra-driver
|
||||
python-dateutil
|
||||
click
|
||||
pycurl
|
43
setup.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import setuptools
|
||||
|
||||
with open("README.md", "r") as fh:
|
||||
long_description = fh.read()
|
||||
|
||||
setuptools.setup(
|
||||
name="websucker", # Replace with your own username
|
||||
version="1.0.0",
|
||||
author="Daniel Hládek",
|
||||
author_email="dhladek@gmail.com",
|
||||
description="Web Crawler",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/hladek/websucker",
|
||||
packages=setuptools.find_packages(),
|
||||
# specified in MANIFEST
|
||||
include_package_data=True,
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Science/Research",
|
||||
"Topic :: Internet :: WWW/HTTP :: Indexing/Search"
|
||||
],
|
||||
python_requires='>=3.6',
|
||||
entry_points={ # Optional
|
||||
'console_scripts': [
|
||||
'websuck=websucker.cli:cli',
|
||||
],
|
||||
},
|
||||
install_requires=[
|
||||
"BeautifulSoup4",
|
||||
"justext",
|
||||
"cassandra-driver",
|
||||
"python-dateutil",
|
||||
"click",
|
||||
"pycurl",
|
||||
"greenstalk"
|
||||
],
|
||||
|
||||
)
|
||||
|
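Note: `include_package_data=True` together with the `recursive-include websucker *.sql` line in MANIFEST.in is what ships `schema.sql` with the package; `websucker/db.py` later reads it through `pkg_resources`. A minimal sketch to verify the data file is packaged (assumes the package has been installed, e.g. with `pip install .`):

import pkg_resources

# True if schema.sql was bundled with the installed websucker package
print(pkg_resources.resource_exists("websucker", "schema.sql"))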
0
websucker/__init__.py
Normal file
364
websucker/agent.py
Executable file
@@ -0,0 +1,364 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
import urllib.parse
|
||||
import urllib.error
|
||||
import os
|
||||
import os.path
|
||||
import re
|
||||
import datetime
|
||||
import time
|
||||
import sys
|
||||
import tempfile
|
||||
import pprint
|
||||
import bs4
|
||||
|
||||
import pycurl
|
||||
import urllib.robotparser
|
||||
|
||||
|
||||
from websucker.parser import normalize_link,urlunparse
|
||||
|
||||
|
||||
# Parses an HTTP refresh value from a header or an HTML meta tag
|
||||
def get_refresh(ref,target_link):
|
||||
refresh = None
|
||||
tokens = ref.strip().split(";")
|
||||
if len(tokens) > 1 and tokens[1].lower().startswith("url="):
|
||||
refresh = urlunparse(normalize_link(
|
||||
tokens[1][4:].strip("\'"), target_link))
|
||||
return refresh
|
||||
|
||||
class Response:
|
||||
def __init__(self,url,headers,status,content,redirects,link_status):
|
||||
assert len(url) > 0
|
||||
assert url[0] != "/"
|
||||
self.url = url
|
||||
self.status = status
|
||||
self.content = content
|
||||
self.headers = headers
|
||||
self.redirects = redirects
|
||||
self.visited_time = datetime.date.today()
|
||||
self.bs = None
|
||||
self.link_status = link_status
|
||||
if content is not None and link_status == "good":
|
||||
self.bs = bs4.BeautifulSoup(content, "lxml")
|
||||
|
||||
def __str__(self):
|
||||
return "{} {} {}".format(self.url,self.get_canonical(),self.link_status)
|
||||
|
||||
def get_content(self):
|
||||
if self.content is None:
|
||||
print("NO CONTENT")
|
||||
print(self.url,self.redirects)
|
||||
return None
|
||||
self.content.seek(0)
|
||||
text = self.content.read()
|
||||
out = str(text,encoding="utf8",errors="replace")
|
||||
return out
|
||||
|
||||
|
||||
# HTML meta-refresh redirect
|
||||
def get_metarefresh(self):
|
||||
if self.content is None:
|
||||
return None
|
||||
metarefresh = None
|
||||
t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})
|
||||
canonical = self.get_canonical()
|
||||
for tags in t:
|
||||
if "content" in tags:
|
||||
metarefresh = get_refresh(tags["content"],canonical)
|
||||
if metarefresh is not None:
|
||||
nl = normalize_link(metarefresh, canonical)
|
||||
print("Metarefresh")
|
||||
print(nl)
|
||||
metarefresh = urlunparse(nl)
|
||||
|
||||
return metarefresh
|
||||
|
||||
def get_canonical(self):
|
||||
r = None
|
||||
last_link = self.url
|
||||
if len(self.redirects) > 0:
|
||||
last_link = self.redirects[-1]
|
||||
if self.bs is not None:
|
||||
l = self.bs.find("link", rel="canonical", href=True)
|
||||
if l is not None:
|
||||
r = urlunparse(normalize_link(l["href"], last_link))
|
||||
if r is None:
|
||||
r = last_link
|
||||
r = urlunparse(normalize_link(r, last_link))
|
||||
assert len(r) > 0
|
||||
assert r[0] != "/"
|
||||
return r
|
||||
|
||||
def get_redirects(self):
|
||||
if len(self.redirects) <2 :
|
||||
return []
|
||||
return self.redirects[0:-1]
|
||||
|
||||
|
||||
class Connection:
|
||||
def __init__(self):
|
||||
self.c = pycurl.Curl()
|
||||
self.c.setopt(self.c.FOLLOWLOCATION, True)
|
||||
# self.c.setopt(self.c.VERBOSE, True)
|
||||
self.c.setopt(self.c.CONNECTTIMEOUT, 20)
|
||||
self.c.setopt(self.c.TIMEOUT, 20)
|
||||
self.c.setopt(self.c.FAILONERROR, True)
|
||||
self.c.setopt(self.c.HTTPHEADER, [
|
||||
'Accept: text/html', 'Accept-Charset: UTF-8'])
|
||||
self.c.setopt(self.c.HEADERFUNCTION, self.header)
|
||||
self.c.setopt(self.c.USERAGENT, "Googlebot-News")
|
||||
# #self.c.setopt(pycurl.COOKIEJAR, 'cookie.txt')
|
||||
# #self.c.setopt(pycurl.COOKIEFILE, 'cookie.txt')
|
||||
self.robots = {}
|
||||
self.headers = {}
|
||||
self.redirects = []
|
||||
self.header_lines = []
|
||||
self.status = 0
|
||||
self.max_redirect = 4
|
||||
|
||||
# Stops processing if the content is not text
|
||||
# records the Location and Refresh headers
|
||||
def header(self, data):
|
||||
if len(data) == 0:
|
||||
return None
|
||||
l = str(data, encoding="utf8")
|
||||
self.header_lines.append(l)
|
||||
s = l.find(" ")
|
||||
if s >= 1 and s < len(l):
|
||||
key = l[0:s - 1]
|
||||
value = l[s + 1:].rstrip()
|
||||
self.headers[key] = value
|
||||
if key.lower() == "refresh":
|
||||
self.add_redirect(value)
|
||||
elif key.lower() == "location":
|
||||
self.add_redirect(value)
|
||||
elif key == "Content-Type" and "text" not in value:
|
||||
# pycurl will then raise error 23, failed writing header
|
||||
return 0
|
||||
|
||||
def __del__(self):
|
||||
self.c.close()
|
||||
|
||||
def close(self):
|
||||
self.c.close()
|
||||
|
||||
def add_redirect(self,link):
|
||||
last_link = self.url
|
||||
if len(self.redirects) > 0:
|
||||
last_link = self.redirects[-1]
|
||||
v = urlunparse(normalize_link(link, last_link))
|
||||
if v!=last_link and v not in set(self.redirects):
|
||||
self.redirects.append(v)
|
||||
|
||||
"""
|
||||
@returns content, link_status
|
||||
@throws pycurl.error
|
||||
"""
|
||||
def _download(self, url):
|
||||
print("Downloading " + url)
|
||||
self.url = url
|
||||
self.headers = {}
|
||||
self.redirects = []
|
||||
self.header_lines = []
|
||||
self.status = 0
|
||||
content = None
|
||||
link_status = "bad_connection"
|
||||
try:
|
||||
self.headers = {}
|
||||
del self.header_lines[:]
|
||||
content = tempfile.SpooledTemporaryFile()
|
||||
self.c.setopt(self.c.WRITEDATA, content)
|
||||
self.c.setopt(self.c.URL, url)
|
||||
self.c.perform()
|
||||
self.status = self.c.getinfo(self.c.RESPONSE_CODE)
|
||||
if self.status != 200:
|
||||
link_status = "bad_httpcode"
|
||||
elif "Content-Type" in self.headers and not self.headers["Content-Type"].startswith("text"):
|
||||
link_status = "bad_type"
|
||||
else:
|
||||
link_status = "good"
|
||||
content.seek(0)
|
||||
except pycurl.error as e:
|
||||
errno, message = e.args
|
||||
content = None
|
||||
self.status = self.c.getinfo(self.c.RESPONSE_CODE)
|
||||
if errno == 23:
|
||||
# 23 means a bad content type in the header
|
||||
link_status = "bad_type"
|
||||
elif errno == 22:
|
||||
link_status = "bad_httpcode"
|
||||
else:
|
||||
raise e
|
||||
except UnicodeDecodeError as e:
|
||||
content = None
|
||||
link_status = "bad_unicode"
|
||||
except UnicodeEncodeError as e:
|
||||
content = None
|
||||
link_status = "bad_unicode"
|
||||
sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
|
||||
tt = self.c.getinfo(self.c.TOTAL_TIME)
|
||||
print("{} Received {} bytes in {} s".format(self.status,sz,tt))
|
||||
return content, link_status
|
||||
|
||||
# Throws pycurl.error
|
||||
def html_download2(self, url):
|
||||
dlink = url
|
||||
responses = []
|
||||
while len(responses) < 5:
|
||||
nl = normalize_link(dlink)
|
||||
url = urlunparse(nl)
|
||||
assert url.startswith("http")
|
||||
content, link_status = self._download(url)
|
||||
response = Response(url,"\r\n".join(self.header_lines),self.status,content,self.redirects,link_status)
|
||||
dlink = response.get_metarefresh()
|
||||
responses.append(response)
|
||||
if dlink is None:
|
||||
break
|
||||
return responses
|
||||
|
||||
def is_robot_good(self, url):
|
||||
schema, domain, path, query = normalize_link(url)
|
||||
res = True
|
||||
if domain not in self.robots:
|
||||
roboturl = urlunparse((schema, domain, "robots.txt", ""))
|
||||
try:
|
||||
r = self._download(roboturl)
|
||||
if r[1] == "good":
|
||||
c = r[0].read()
|
||||
lines = str(c, errors="ignore", encoding="utf8").split("\n")
|
||||
self.robots[domain] = urllib.robotparser.RobotFileParser()
|
||||
self.robots[domain].parse(lines)
|
||||
else:
|
||||
self.robots[domain] = None
|
||||
except pycurl.error as err:
|
||||
print(err)
|
||||
if domain in self.robots and self.robots[domain] is not None:
|
||||
res = self.robots[domain].can_fetch("Agent", url)
|
||||
return res
|
||||
|
||||
class ParsedDocument:
|
||||
def __init__(self, parser,work_link):
|
||||
self.parser = parser
|
||||
self.work_link = work_link
|
||||
|
||||
self.content = None
|
||||
self.bs = None
|
||||
self.paragraph_checksums = None
|
||||
self.paragraph_sizes = None
|
||||
|
||||
self.link_set = set()
|
||||
self.body = None
|
||||
self.text_date = None
|
||||
self.tags = None
|
||||
self.authors = None
|
||||
self.title = None
|
||||
self.description = None
|
||||
self.section = None
|
||||
self.article_published_time = None
|
||||
self.current_time = datetime.date.today()
|
||||
|
||||
def extract(self,content,bs):
|
||||
self.content = content
|
||||
self.bs = bs
|
||||
|
||||
# Extract text and metatext
|
||||
self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
|
||||
# Paragraph Checksums
|
||||
pch,pszs = self.parser.calculate_checksums(self.body)
|
||||
self.paragraph_checksums = pch
|
||||
self.paragraph_sizes = pszs
|
||||
if bs is None:
|
||||
return
|
||||
self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_og(bs)
|
||||
|
||||
# Extract links from the page
|
||||
base = self.work_link
|
||||
if bs.base is not None and "href" in bs.base.attrs:
|
||||
base = bs.base["href"]
|
||||
# Normalize links
|
||||
for l in bs.find_all("a", href=True):
|
||||
if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
|
||||
continue
|
||||
href = l["href"]
|
||||
try:
|
||||
nl = normalize_link(href, base)
|
||||
link = urlunparse(nl)
|
||||
if link == base:
|
||||
continue
|
||||
self.link_set.add(link)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
def get_links(self):
|
||||
return self.link_set
|
||||
|
||||
def get_follow_links(self):
|
||||
follow_links = set()
|
||||
for l in self.link_set:
|
||||
if self.parser.is_link_good(l):
|
||||
link = normalize_link(l,strip_query=self.parser.strip_query)
|
||||
follow_links.add(urlunparse(link))
|
||||
return follow_links
|
||||
|
||||
|
||||
def __str__(self):
|
||||
r = []
|
||||
if self.title is not None:
|
||||
r.append(self.title)
|
||||
if self.body is not None:
|
||||
if (len(self.body) < 20):
|
||||
r.append(self.body)
|
||||
else:
|
||||
r.append(self.body[0:20] + " ....")
|
||||
return ">>> ".join(r)
|
||||
|
||||
|
||||
def get_domains(arg):
|
||||
domains = []
|
||||
if arg == "-":
|
||||
for l in sys.stdin:
|
||||
domain = l.rstrip()
|
||||
assert(domain is not None)
|
||||
if len(domain) == 0:
|
||||
continue
|
||||
domains.append(domain)
|
||||
else:
|
||||
domains = arg.split(",")
|
||||
return domains
|
||||
|
||||
def visit_links(links,connection,parser,db):
|
||||
outlinks = []
|
||||
for work_link in links:
|
||||
responses = []
|
||||
if parser.is_link_good(work_link) and connection.is_robot_good(work_link):
|
||||
responses = connection.html_download2(work_link)
|
||||
time.sleep(4)
|
||||
db.index_responses(work_link,responses)
|
||||
if len(responses) > 0:
|
||||
lr = responses[-1]
|
||||
if lr.content is not None:
|
||||
target_link = lr.get_canonical()
|
||||
parsed = ParsedDocument(parser,target_link)
|
||||
parsed.extract(lr.content, lr.bs)
|
||||
db.index_content(target_link,parsed)
|
||||
outlinks += parsed.get_links()
|
||||
if len(outlinks) > 0:
|
||||
db.index_follow_links(parser,outlinks,connection)
|
||||
|
||||
def visit_domain(domain,parser,db):
|
||||
c = Connection()
|
||||
p = parser
|
||||
# Get links from frontpage
|
||||
# TODO Sitemap
|
||||
sitemap = "http://" + domain
|
||||
visit_links([sitemap],c,p,db)
|
||||
db.check_domain(domain)
|
||||
for i in range(p.crawl_rounds):
|
||||
# Visit links from frontpage
|
||||
links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
|
||||
visit_links(links,c,p,db)
|
||||
db.check_domain(domain)
|
||||
|
||||
|
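A minimal sketch of driving the classes above by hand, outside the CLI (assumes network access; `http://www.example.com` stands in for a real target, and the regular crawl path is `visit_domain`, which additionally needs a `websucker.db.Data` instance):

from websucker.agent import Connection, ParsedDocument
from websucker.parser import BaseParser

parser = BaseParser(verbose=True)
connection = Connection()

# html_download2 follows HTTP and meta-refresh redirects and returns one
# Response per hop; the last element holds the final page.
responses = connection.html_download2("http://www.example.com")
final = responses[-1]
print(final.link_status, final.get_canonical())

if final.bs is not None:
    document = ParsedDocument(parser, final.get_canonical())
    document.extract(final.content, final.bs)   # fills body, title, tags, link_set
    print(document.title, len(document.get_links()), "links")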
130
websucker/cli.py
Normal file
@@ -0,0 +1,130 @@
|
||||
from websucker.agent import Connection,visit_links,visit_domain
|
||||
from websucker.agent import ParsedDocument
|
||||
from websucker.parser import BaseParser
|
||||
from websucker.parser import normalize_link,urlunparse
|
||||
from websucker.db import Data
|
||||
from websucker.db import get_schema
|
||||
import click
|
||||
import pprint
|
||||
|
||||
|
||||
|
||||
def create_database_from_context(ctx):
|
||||
return Data(ctx.obj["cassandra_keyspace"],ctx.obj["cassandra_host"],ctx.obj["cassandra_port"])
|
||||
|
||||
@click.group()
|
||||
@click.pass_context
|
||||
@click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
|
||||
@click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
|
||||
@click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)
|
||||
|
||||
@click.option("--justext-language",metavar="JUSTEXT_LANGUAGE",help="Target language (if defined, value read from JUSTEXT_LANGUAGE env variable)",envvar="JUSTEXT_LANGUAGE",default="English",show_default=True)
|
||||
@click.option("--visit",is_flag=True)
|
||||
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,justext_language,visit):
|
||||
ctx.ensure_object(dict)
|
||||
p = BaseParser()
|
||||
p.justext_language = justext_language
|
||||
ctx.obj["parser"] = p
|
||||
ctx.obj["cassandra_host"] = cassandra_host
|
||||
ctx.obj["cassandra_port"] = cassandra_port
|
||||
ctx.obj["cassandra_keyspace"] = cassandra_keyspace
|
||||
ctx.obj["visit"] = visit
|
||||
|
||||
|
||||
@cli.command(help="Print domains")
|
||||
@click.pass_context
|
||||
@click.argument("count",type=int,default=20)
|
||||
def all(ctx,count):
|
||||
p = ctx.obj["parser"]
|
||||
c = Connection()
|
||||
db = create_database_from_context(ctx)
|
||||
res = db.all_domains(count)
|
||||
for row in res:
|
||||
print(",".join(map(str,row)))
|
||||
if ctx.obj["visit"]:
|
||||
visit_domain(row[0],p,db)
|
||||
|
||||
@cli.command(help="Continue crawling of seen links from a domain")
|
||||
@click.pass_context
|
||||
@click.argument("domain")
|
||||
def crawl(ctx, domain):
|
||||
db = create_database_from_context(ctx)
|
||||
p = ctx.obj["parser"]
|
||||
c = Connection()
|
||||
links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
|
||||
visit_links(links,c,p,db)
|
||||
db.check_domain(domain)
|
||||
|
||||
@cli.command(help="find best domains")
|
||||
@click.pass_context
|
||||
@click.argument("count",type=int,default=20)
|
||||
#@click.option("visit",is_flag=True)
|
||||
def best(ctx, count):
|
||||
db = create_database_from_context(ctx)
|
||||
p = ctx.obj["parser"]
|
||||
domains = db.get_best_domains(count)
|
||||
for domain,gr in domains:
|
||||
print(domain,gr)
|
||||
if ctx.obj["visit"]:
|
||||
visit_domain(domain,p,db)
|
||||
|
||||
|
||||
@cli.command(help="Find unvisited domains, Visit a site, get links and crawl")
|
||||
@click.pass_context
|
||||
@click.argument("count",type=int,default=20)
|
||||
def unvisited(ctx, count):
|
||||
db = create_database_from_context(ctx)
|
||||
p = ctx.obj["parser"]
|
||||
c = Connection()
|
||||
domains = db.get_unvisited_domains(count)
|
||||
for domain in domains:
|
||||
print(domain)
|
||||
if ctx.obj["visit"]:
|
||||
visit_domain(domain,p,db)
|
||||
|
||||
@cli.command(help="Visit url, get links and crawl. Start here")
|
||||
@click.pass_context
|
||||
@click.argument("link")
|
||||
def visit(ctx, link):
|
||||
db = create_database_from_context(ctx)
|
||||
p = ctx.obj["parser"]
|
||||
c = Connection()
|
||||
nl = normalize_link(link)
|
||||
domain=nl[1]
|
||||
visit_domain(domain,p,db)
|
||||
|
||||
@cli.command(help="Update domain statistics")
|
||||
@click.pass_context
|
||||
@click.argument("domain")
|
||||
def check(ctx,domain):
|
||||
db = create_database_from_context(ctx)
|
||||
res = db.check_domain(domain)
|
||||
print(res)
|
||||
|
||||
@cli.command(help="Print daily report")
|
||||
@click.pass_context
|
||||
def report(ctx):
|
||||
db = create_database_from_context(ctx)
|
||||
db.daily_report()
|
||||
|
||||
@cli.command(help="Print keyspace schema")
|
||||
def schema():
|
||||
schema = get_schema()
|
||||
print(schema)
|
||||
|
||||
@cli.command(help="Fetch given url (just for debug)")
|
||||
@click.pass_context
|
||||
@click.argument("urls")
|
||||
def fetch(ctx,urls):
|
||||
parser = ctx.obj["parser"]
|
||||
# Visit first page
|
||||
connection = Connection()
|
||||
responses = connection.html_download2(urls)
|
||||
for res in responses:
|
||||
target_link = res.get_canonical()
|
||||
pd = ParsedDocument(parser,target_link)
|
||||
pd.extract(res.content, res.bs)
|
||||
print(pd)
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
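These commands are what the `websuck` console script from setup.py dispatches to; the connection options can also be supplied through the CASSANDRA_HOST, CASSANDRA_PORT, CASSANDRA_KEYSPACE and JUSTEXT_LANGUAGE environment variables. A hedged sketch of invoking the Click group programmatically (equivalent to the shell command in the comment; the crawl commands additionally need a reachable Cassandra with the keyspace created):

from websucker.cli import cli

# Same as running:  websuck --visit best 5
# (group options go before the subcommand; --visit makes the command crawl
#  the domains it prints instead of only listing them)
cli(["--visit", "best", "5"])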
444
websucker/db.py
Normal file
@@ -0,0 +1,444 @@
|
||||
import cassandra
|
||||
import cassandra.cluster
|
||||
import random
|
||||
import os
|
||||
import pkg_resources
|
||||
import datetime
|
||||
from websucker.parser import normalize_link,urlunparse
|
||||
|
||||
VERSION = "sucker6"
|
||||
|
||||
|
||||
def get_schema():
|
||||
with pkg_resources.resource_stream(__name__,"schema.sql") as f:
|
||||
schema = f.read()
|
||||
return str(schema,encoding="utf8")
|
||||
|
||||
class Data:
|
||||
"""
|
||||
Database of text documents
|
||||
"""
|
||||
def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
|
||||
# execution profile
|
||||
ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
|
||||
profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
|
||||
self.cluster = cassandra.cluster.Cluster([cassandra_host],port=cassandra_port,execution_profiles=profiles)
|
||||
self.session = self.cluster.connect(keyspace)
|
||||
|
||||
self.check_document_select_query = self.session.prepare("SELECT count(url_hash) FROM paragraph_checksums WHERE checksum=?" )
|
||||
|
||||
|
||||
|
||||
self.index_response_link_update = self.session.prepare("""
|
||||
UPDATE links SET
|
||||
link_status ='redirect',
|
||||
redirect_target = ?,
|
||||
update_time = toTimestamp(now())
|
||||
WHERE
|
||||
domain_name=? AND
|
||||
url_path=? AND
|
||||
url_query=?
|
||||
""")
|
||||
|
||||
self.domain_quality_update = self.session.prepare("""
|
||||
UPDATE domain_quality SET
|
||||
seen_count=?,
|
||||
good_size=?,
|
||||
good_count=?,
|
||||
good_probability=?,
|
||||
good_originality=?,
|
||||
average_good_characters=?,
|
||||
content_size=?,
|
||||
content_count=?,
|
||||
content_probability=?,
|
||||
content_originality=?,
|
||||
average_content_characters=?,
|
||||
fetched_count=?,
|
||||
average_fetched_good_characters=?,
|
||||
gain_ratio=?,
|
||||
update_time = toTimestamp(now())
|
||||
WHERE
|
||||
domain_name=? AND
|
||||
day=toDate(now())
|
||||
""")
|
||||
|
||||
self.index_response_insert_html = self.session.prepare("""
|
||||
INSERT INTO html(
|
||||
day,
|
||||
domain_name,
|
||||
source_link,
|
||||
target_link,
|
||||
redirect_links,
|
||||
status,
|
||||
headers,
|
||||
content,
|
||||
agent_version,
|
||||
update_time
|
||||
) VALUES (toDate(now()),?,?,?,?,?,?,?,?,toTimestamp(now()));
|
||||
""")
|
||||
|
||||
self.index_content_link_insert = self.session.prepare("""
|
||||
INSERT INTO links (
|
||||
url_schema,
|
||||
domain_name,
|
||||
url_path,
|
||||
url_query,
|
||||
link_status,
|
||||
update_time
|
||||
) VALUES (?,?,?,?,'seen',?) IF NOT EXISTS
|
||||
""")
|
||||
|
||||
self.daily_links_insert = self.session.prepare("""
|
||||
INSERT INTO daily_links (
|
||||
day,
|
||||
domain_name,
|
||||
url_path,
|
||||
url_query,
|
||||
link_status,
|
||||
body_size,
|
||||
link_originality,
|
||||
update_time
|
||||
) VALUES (toDate(now()),?,?,?,?,?,?,toTimestamp(now()))
|
||||
""")
|
||||
self.daily_links_select = self.session.prepare("""
|
||||
SELECT
|
||||
domain_name,
|
||||
link_status,
|
||||
count(link_status)
|
||||
FROM daily_links WHERE day=toDate(now()) GROUP BY domain_name,link_status
|
||||
""")
|
||||
# Parsed content
|
||||
self.index_content_content_insert = self.session.prepare("""
|
||||
INSERT INTO content(
|
||||
domain_name,
|
||||
target_link,
|
||||
links,
|
||||
title,
|
||||
description,
|
||||
section,
|
||||
authors,
|
||||
tags,
|
||||
article_published_time,
|
||||
text_date,
|
||||
body,
|
||||
body_size,
|
||||
agent_version,
|
||||
update_time
|
||||
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?);
|
||||
""")
|
||||
|
||||
self.paragraph_checksums_insert = self.session.prepare("INSERT INTO paragraph_checksums (checksum,url_hash) VALUES(?,?)")
|
||||
self.index_content_links_update = self.session.prepare("UPDATE links SET link_status=?, link_originality=?,body_size=?,url_schema=? WHERE domain_name=? AND url_path = ? AND url_query=? ")
|
||||
self.check_domain_count = self.session.prepare("select count(url_path) from links where domain_name=? and link_status = ?")
|
||||
|
||||
self.check_domain_size = self.session.prepare("select sum(body_size),sum(link_originality) from links where domain_name=? and link_status =?")
|
||||
|
||||
self.domains_select = self.session.prepare("SELECT domain_name,seen_count,fetched_count,gain_ratio,average_fetched_good_characters FROM domain_quality PER PARTITION LIMIT 1")
|
||||
|
||||
|
||||
def index_responses(self,source_link,responses):
|
||||
# Redirect links
|
||||
pl = normalize_link(source_link)
|
||||
for response in responses:
|
||||
tl = response.get_canonical()
|
||||
r = (
|
||||
tl,
|
||||
pl[1],
|
||||
pl[2],
|
||||
pl[3],
|
||||
)
|
||||
if urlunparse(pl) != tl:
|
||||
res = self.session.execute(self.index_response_link_update,r)
|
||||
d = (
|
||||
pl[1],
|
||||
source_link,
|
||||
response.get_canonical(),
|
||||
response.redirects,
|
||||
response.status,
|
||||
response.headers,
|
||||
response.get_content(),
|
||||
VERSION,
|
||||
)
|
||||
self.session.execute(self.index_response_insert_html,d)
|
||||
|
||||
def daily_report(self):
|
||||
rows = self.session.execute(self.daily_links_select)
|
||||
for row in rows:
|
||||
print(row[0],row[1],row[2])
|
||||
|
||||
def index_follow_links(self,parser,links,connection):
|
||||
# Index seen links
|
||||
|
||||
follow_links = set()
|
||||
for l in links:
|
||||
if parser.is_link_good(l):
|
||||
#if connection is not None and parser.listen_robot and not connection.is_robot_good(l):
|
||||
# continue
|
||||
link = normalize_link(l,strip_query=parser.strip_query)
|
||||
follow_links.add(urlunparse(link))
|
||||
|
||||
newlinkdomains = set()
|
||||
for link in follow_links:
|
||||
value = []
|
||||
nl = normalize_link(link)
|
||||
value += nl
|
||||
value.append(datetime.date.today())
|
||||
rows = self.session.execute(self.index_content_link_insert,value)
|
||||
row = rows.one()
|
||||
if row.applied:
|
||||
newlinkdomains.add(nl[1])
|
||||
for domain in newlinkdomains:
|
||||
self.check_domain(domain)
|
||||
|
||||
|
||||
def index_content(self,target_link,parsed_document):
|
||||
nl = normalize_link(target_link)
|
||||
domain_name = nl[1]
|
||||
assert len(domain_name) > 1
|
||||
|
||||
pd = parsed_document
|
||||
body_length = 0
|
||||
if pd.body is not None:
|
||||
body_length = len(pd.body)
|
||||
value = (
|
||||
domain_name,
|
||||
target_link,
|
||||
pd.get_links(),
|
||||
pd.title,
|
||||
pd.description,
|
||||
pd.section,
|
||||
pd.authors,
|
||||
pd.tags,
|
||||
pd.article_published_time,
|
||||
pd.text_date,
|
||||
pd.body,
|
||||
body_length,
|
||||
VERSION,
|
||||
pd.current_time
|
||||
)
|
||||
content_future = self.session.execute_async(self.index_content_content_insert,value)
|
||||
# result later
|
||||
|
||||
link_status = "good"
|
||||
originality = 0
|
||||
tsz = 0
|
||||
if pd.body is None:
|
||||
link_status = "bad_parse"
|
||||
else:
|
||||
tsz = len(pd.body)
|
||||
if tsz < 300:
|
||||
link_status = "bad_small"
|
||||
|
||||
if link_status == "good":
|
||||
|
||||
futures = []
|
||||
for pc,psz in zip(pd.paragraph_checksums,pd.paragraph_sizes):
|
||||
fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
|
||||
futures.append(fut)
|
||||
for fut in futures:
|
||||
fut.result()
|
||||
originality = self.check_document(pd.paragraph_checksums,pd.paragraph_sizes)
|
||||
if originality < 0.8:
|
||||
link_status = "bad_copy"
|
||||
print(nl)
|
||||
self.session.execute(self.index_content_links_update,(link_status,originality,tsz,nl[0],nl[1],nl[2],nl[3]))
|
||||
content_future.result()
|
||||
print("<<<< " + link_status + " " + str(originality))
|
||||
dl = (
|
||||
nl[1],
|
||||
nl[2],
|
||||
nl[3],
|
||||
link_status,
|
||||
tsz,
|
||||
originality
|
||||
)
|
||||
self.session.execute(self.daily_links_insert,dl)
|
||||
|
||||
def check_document(self,paragraph_checksums,paragraph_sizes):
|
||||
tsz = sum(paragraph_sizes)
|
||||
if tsz == 0:
|
||||
return 0
|
||||
copies = 0
|
||||
futures = []
|
||||
for pc,psz in zip(paragraph_checksums,paragraph_sizes):
|
||||
futures.append(self.session.execute_async(self.check_document_select_query,(pc,)))
|
||||
|
||||
for fut,psz in zip(futures,paragraph_sizes):
|
||||
rows = fut.result()
|
||||
res = rows.one()[0]
|
||||
if res > 1:
|
||||
copies += psz
|
||||
return (tsz-copies)/tsz
|
||||
|
||||
|
||||
def check_domain(self, domain):
|
||||
assert len(domain) > 0
|
||||
seen_count = None
|
||||
good_size = None
|
||||
good_count = None
|
||||
good_probability = None
|
||||
good_originality = None
|
||||
average_good_characters = None
|
||||
content_size = None
|
||||
content_count = None
|
||||
content_probability = None
|
||||
content_originality = None
|
||||
average_content_characters = None
|
||||
fetched_count = None
|
||||
average_fetched_good_characters = None
|
||||
gain_ratio = None
|
||||
|
||||
counts = {
|
||||
"good":0,
|
||||
"bad_copy":0,
|
||||
"bad_small":0,
|
||||
"bad_httpcode":0,
|
||||
"bad_type":0,
|
||||
"bad_content":0,
|
||||
"bad_parse":0,
|
||||
"seen":0
|
||||
}
|
||||
for k in counts.keys():
|
||||
res = self.session.execute(self.check_domain_count,(domain,k))
|
||||
co = res.one()[0]
|
||||
counts[k]= co
|
||||
|
||||
seen_count = counts["seen"]
|
||||
good_count = counts["good"]
|
||||
content_count = counts["good"] + counts["bad_copy"] + counts["bad_small"]
|
||||
|
||||
fetched_count = sum(counts.values()) - counts["seen"]
|
||||
|
||||
if fetched_count > 0:
|
||||
content_probability = content_count / fetched_count
|
||||
good_probability = good_count / fetched_count
|
||||
sizes = {
|
||||
"good":0,
|
||||
"bad_copy":0,
|
||||
"bad_small":0
|
||||
}
|
||||
|
||||
originalities ={}
|
||||
|
||||
for k in sizes.keys():
|
||||
res = self.session.execute(self.check_domain_size,(domain,k))
|
||||
row = res.one()
|
||||
co =row[0]
|
||||
originalities[k] = row[1]
|
||||
sizes[k]= co
|
||||
good_size = sizes["good"]
|
||||
content_size = sum(sizes.values())
|
||||
if good_count > 0:
|
||||
good_originality = originalities["good"] / good_count
|
||||
if content_count > 0:
|
||||
content_originality = sum(originalities.values()) / content_count
|
||||
|
||||
if good_count > 0:
|
||||
average_good_characters = good_size / good_count * good_originality
|
||||
average_fetched_good_characters = good_size * good_originality / fetched_count
|
||||
|
||||
gain_ratio = average_fetched_good_characters / fetched_count
|
||||
|
||||
if content_count > 0:
|
||||
average_content_characters = content_size / content_count
|
||||
|
||||
#print(sizes)
|
||||
#print(originalities)
|
||||
uv = (
|
||||
seen_count,
|
||||
good_size,
|
||||
good_count,
|
||||
good_probability,
|
||||
good_originality,
|
||||
average_good_characters,
|
||||
content_size,
|
||||
content_count,
|
||||
content_probability,
|
||||
content_originality,
|
||||
average_content_characters,
|
||||
fetched_count,
|
||||
average_fetched_good_characters,
|
||||
gain_ratio,
|
||||
domain)
|
||||
if fetched_count > 0 or seen_count > 0:
|
||||
self.session.execute(self.domain_quality_update,uv)
|
||||
return average_fetched_good_characters
|
||||
|
||||
def all_domains(self,count):
|
||||
rows = self.session.execute(self.domains_select)
|
||||
domains = []
|
||||
for row in rows:
|
||||
domain = row[0]
|
||||
seen_count = row[1]
|
||||
fetched_count = row[2]
|
||||
gain_ratio = row[3]
|
||||
afg = row[4]
|
||||
if fetched_count and afg and seen_count:
|
||||
domains.append(tuple(row))
|
||||
l = len(domains)
|
||||
ss = min(l,count)
|
||||
res = []
|
||||
if ss > 0:
|
||||
# sort according to ratio
|
||||
res = list(sorted(domains,key=lambda x:x[4],reverse=True))[0:ss]
|
||||
# returns sorted list of tuples domain,gain_ratio
|
||||
return res
|
||||
|
||||
def get_best_domains(self,count):
|
||||
# get all domains
|
||||
rows = self.session.execute(self.domains_select)
|
||||
domains = []
|
||||
for row in rows:
|
||||
domain = row[0]
|
||||
seen_count = row[1]
|
||||
fetched_count = row[2]
|
||||
gain_ratio = row[3]
|
||||
afg = row[4]
|
||||
if seen_count and fetched_count and gain_ratio:
|
||||
domains.append((domain,gain_ratio))
|
||||
l = len(domains)
|
||||
ss = min(l,count)
|
||||
res = []
|
||||
if ss > 0:
|
||||
# sort according to ratio
|
||||
res = list(sorted(domains,key=lambda x:x[1],reverse=True))[0:ss]
|
||||
# returns sorted list of tuples domain,gain_ratio
|
||||
return res
|
||||
|
||||
def get_unvisited_domains(self,count):
|
||||
# get all domains
|
||||
rows = self.session.execute(self.domains_select)
|
||||
domains = []
|
||||
for row in rows:
|
||||
domain = row[0]
|
||||
seen_count = row[1]
|
||||
fetched_count = row[2]
|
||||
gain_ratio = row[3]
|
||||
afg = row[4]
|
||||
if seen_count and not fetched_count:
|
||||
domains.append(domain)
|
||||
ss = min(len(domains),count)
|
||||
return random.sample(domains,ss)
|
||||
|
||||
def get_visit_links(self,domain,recent_count,old_count,random_count):
|
||||
dblinks = []
|
||||
rows = self.session.execute("SELECT url_schema,url_path,url_query,update_time FROM links Where domain_name=%s AND link_status='seen'",(domain,))
|
||||
for row in rows:
|
||||
link = urlunparse((row[0],domain,row[1],row[2]))
|
||||
dblinks.append((link,row[3]))
|
||||
|
||||
visitlinks = []
|
||||
dblinks.sort(key=lambda x:x[1])
|
||||
random_links = []
|
||||
for i,(link,time) in enumerate(dblinks):
|
||||
#print(link,time)
|
||||
if i < recent_count:
|
||||
visitlinks.append(link)
|
||||
elif i >= len(dblinks) - old_count:
|
||||
visitlinks.append(link)
|
||||
else:
|
||||
random_links.append(link)
|
||||
sc = min(random_count,len(random_links))
|
||||
if sc > 0:
|
||||
visitlinks += random.sample(random_links,sc)
|
||||
return visitlinks
|
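For reference, a small worked example of the originality measure computed in check_document (illustrative numbers only): a paragraph counts as copied when its checksum already occurs more than once in paragraph_checksums, and index_content marks the link bad_copy when the resulting ratio drops below 0.8.

# Paragraph sizes in characters and whether each checksum was already seen
paragraph_sizes = [400, 350, 250]
already_seen    = [False, True, False]

total  = sum(paragraph_sizes)                                              # 1000
copies = sum(s for s, seen in zip(paragraph_sizes, already_seen) if seen)  # 350
originality = (total - copies) / total
print(originality)   # 0.65 -> below 0.8, so the link would be marked "bad_copy"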
335
websucker/parser.py
Normal file
@@ -0,0 +1,335 @@
|
||||
import dateutil.parser
|
||||
import justext
|
||||
import re
|
||||
import sys
|
||||
import datetime
|
||||
|
||||
import lxml.etree
|
||||
import urllib.parse
|
||||
import os.path
|
||||
|
||||
|
||||
datere = re.compile(r"\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}")
|
||||
yearre = re.compile(r"\s\d{4}\s")
|
||||
|
||||
def urlunparse(parsed_url):
|
||||
schema, netloc, path, query = parsed_url
|
||||
return urllib.parse.urlunparse((schema, netloc, path, "", query, ""))
|
||||
|
||||
def normalize_link(link, base=None,strip_query=False):
|
||||
|
||||
link = link.strip().replace(
|
||||
"\n", "").replace("\t", "").replace("\r", "")
|
||||
parsed_link = urllib.parse.urlparse(link)
|
||||
schema = parsed_link[0]
|
||||
netloc = parsed_link[1].strip().lower()
|
||||
path = parsed_link[2].strip()
|
||||
query = parsed_link[4]
|
||||
if strip_query:
|
||||
query = ""
|
||||
if path is None or len(path) == 0:
|
||||
path = "/"
|
||||
dirname, filename = os.path.split(path)
|
||||
if base is not None:
|
||||
parsed_base = urllib.parse.urlparse(base)
|
||||
if schema == "":
|
||||
schema = parsed_base[0]
|
||||
# If the link is relative
|
||||
if netloc == "":
|
||||
netloc = parsed_base[1]
|
||||
schema = parsed_base[0]
|
||||
bdir, bfile = os.path.split(parsed_base[2])
|
||||
if len(bdir) > 0 and bdir[0] != "." and len(dirname) > 0 and dirname[0] != "/":
|
||||
dirname = bdir + "/" + dirname
|
||||
# if len(dirname) == 0 or dirname[0] != '/':
|
||||
# path = '/' + path
|
||||
dirname = os.path.normpath(dirname)
|
||||
dirname = dirname.lstrip("/").lstrip(".")
|
||||
path = dirname + "/" + filename
|
||||
return schema, netloc, path, query
|
||||
|
||||
|
||||
def get_date(te):
|
||||
dates = []
|
||||
words = []
|
||||
if te is None:
|
||||
te = ""
|
||||
for t in te.split():
|
||||
t = t.strip().lower().lstrip("0").replace("\r", "\n").replace("\n", "")
|
||||
if len(t) == 0:
|
||||
continue
|
||||
for i, m in enumerate(["jan", "feb", "mar", "apr", "máj", "jún", "júl", "aug", "sept", "okt", "nov", "dec"]):
|
||||
if t.startswith(m):
|
||||
t = str(i + 1) + "."
|
||||
break
|
||||
if t[0].isdigit():
|
||||
words.append(t)
|
||||
txt = " ".join(words)
|
||||
for st in re.findall(datere, txt):
|
||||
tokens = st.replace(" ", "").split(".")
|
||||
try:
|
||||
y = int(tokens[-1])
|
||||
if y < 2000 or y > 2020:
|
||||
continue
|
||||
m = 2
|
||||
d = 2
|
||||
if len(tokens) > 2:
|
||||
m = int(tokens[-2])
|
||||
d = int(tokens[-3])
|
||||
dates.append(datetime.date(y, m, d))
|
||||
except ValueError:
|
||||
pass
|
||||
return dates
|
||||
|
||||
|
||||
class BaseParser:
|
||||
def __init__(self, verbose=False):
|
||||
self.strip_query = True
|
||||
self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
|
||||
".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
|
||||
self.skipchars = re.compile(r"[();:@& ]")
|
||||
self.store = True
|
||||
self.verbose = verbose
|
||||
self.domain_re = re.compile("^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\\.)+[A-Za-z]{2,6}$")
|
||||
self.listen_robot = True
|
||||
self.recent_links = 5
|
||||
self.old_links = 3
|
||||
self.random_links = 10
|
||||
self.crawl_rounds = 3
|
||||
self.skipdomains = set()
|
||||
self.allowdomains = set()
|
||||
self.skippaths = set(["sort", "search", "sortby" "gallery" "images", "pictures" "order", "add", "insert", "cart", "order", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery","form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "gallery", "flog"])
|
||||
self.justext_language = "Slovak"
|
||||
|
||||
def is_domain_good(self, domain):
|
||||
r = None
|
||||
# Netloc
|
||||
if ":" in domain:
|
||||
r = "Port in domain"
|
||||
elif len(domain) < 4:
|
||||
r = "Too short domain"
|
||||
elif len(domain) > 50:
|
||||
r = "Too long location"
|
||||
elif domain.startswith(".") or domain.endswith("."):
|
||||
r = "Malformed domain"
|
||||
elif not self.domain_re.match(domain):
|
||||
r = "Bad domain"
|
||||
else:
|
||||
da = False
|
||||
for d in self.allowdomains:
|
||||
if domain.endswith(d):
|
||||
da = True
|
||||
break
|
||||
if not da and len(self.allowdomains) > 0:
|
||||
r = "Domain not in allowlist"
|
||||
for d in self.skipdomains:
|
||||
if domain.endswith(d):
|
||||
r = "In domain skiplist"
|
||||
for d in domain.split("."):
|
||||
if d in self.skippaths:
|
||||
r = "Domain in skippath"
|
||||
if r is not None and self.verbose:
|
||||
print(domain + " " + r)
|
||||
return r is None
|
||||
|
||||
# Argument: a parsed URL
|
||||
def is_link_good(self, link):
|
||||
assert(link is not None)
|
||||
r = None
|
||||
if sys.getsizeof(link) > 1023:
|
||||
r = "Too long"
|
||||
try:
|
||||
schema, domain, path, query = normalize_link(link)
|
||||
if not schema.startswith("http"):
|
||||
r = "Bad schema"
|
||||
dg = self.is_domain_good(domain)
|
||||
if not dg:
|
||||
return False
|
||||
for c in link:
|
||||
if ord(c) >= 128:
|
||||
r = "Bad domain character"
|
||||
break
|
||||
for p in self.skipdomains:
|
||||
if domain.endswith(p):
|
||||
r = "Bad domain"
|
||||
break
|
||||
if ".b-" in domain:
|
||||
r = "Bad domain"
|
||||
if len(domain) > 127:
|
||||
r = "Too long path"
|
||||
# Path
|
||||
for t in self.skiptypes:
|
||||
if path.lower().endswith(t):
|
||||
r = "Bad type"
|
||||
break
|
||||
if re.search(self.skipchars, path):
|
||||
r = "Bad path"
|
||||
for p in path.split("/"):
|
||||
if p in self.skippaths or "jpg" in p or "galeria" in p:
|
||||
r = "Bad path"
|
||||
break
|
||||
except ValueError:
|
||||
r = "Bad urlparse"
|
||||
return r is None
|
||||
|
||||
def filter_links(self, links):
|
||||
# Filter links
|
||||
linkset = set()
|
||||
for link in links:
|
||||
if not self.is_link_good(link):
|
||||
continue
|
||||
link = urlunparse(normalize_link(link,strip_query=self.strip_query))
|
||||
linkset.add(link)
|
||||
|
||||
return list(linkset)
|
||||
|
||||
def extract_raw_text(self, content, current_time):
|
||||
result = []
|
||||
rd = None
|
||||
paragraphs = []
|
||||
content.seek(0)
|
||||
try:
|
||||
c = content.read()
|
||||
paragraphs = justext.justext(c, justext.get_stoplist(self.justext_language), length_low=50, length_high=150)
|
||||
content.seek(0)
|
||||
except lxml.etree.XMLSyntaxError:
|
||||
print("XML Syntax parse error")
|
||||
except lxml.etree.ParserError:
|
||||
|
||||
print("XML Parse parse error")
|
||||
except justext.core.JustextError:
|
||||
print("Justext error")
|
||||
except IndexError:
|
||||
print("XML error")
|
||||
except UnicodeDecodeError:
|
||||
print("Unicode Error")
|
||||
except TypeError:
|
||||
# Null byte in string
|
||||
print("String Error")
|
||||
except RuntimeError:
|
||||
# Maximum recursion depth
|
||||
print("Recursion Error")
|
||||
dates = []
|
||||
for p in paragraphs:
|
||||
# TODO - match URL for date
|
||||
if p is not None and p.text is not None and len(p.text) > 0:
|
||||
dat = get_date(p.text)
|
||||
for d in dat:
|
||||
dates.append(d)
|
||||
if self.verbose:
|
||||
print(p.class_type, p.links_density(), p.stopwords_density(
|
||||
justext.get_stoplist(self.justext_language)), p.text)
|
||||
if not p.is_boilerplate:
|
||||
result.append(p.text.strip())
|
||||
if len(dates) == 0:
|
||||
dates.append(current_time)
|
||||
if len(dates) > 0:
|
||||
rd = max(dates)
|
||||
rd = rd.isoformat()
|
||||
|
||||
return "\n\n".join(result), rd
|
||||
|
||||
# Extracts metainformation from the HTML
|
||||
# First it looks for name, content in meta tags
|
||||
# then it looks for opengraph
|
||||
def extract_og(self, bs):
|
||||
tags = set()
|
||||
authors = set()
|
||||
title = ""
|
||||
description = ""
|
||||
section = ""
|
||||
article_published_time = ""
|
||||
|
||||
for m in bs.find_all("meta", attrs={"name": True, "content": True}):
|
||||
content = m["content"].strip()
|
||||
if len(content) == 0:
|
||||
continue
|
||||
name = m["name"].strip()
|
||||
if name == "keywords":
|
||||
for t in content.split(","):
|
||||
if len(t.strip()) > 0:
|
||||
tags.add(t.strip())
|
||||
if name == "news_keywords":
|
||||
for t in content.split(","):
|
||||
if len(t.strip()) > 0:
|
||||
tags.add(t.strip())
|
||||
if name == "author":
|
||||
authors.add(content)
|
||||
if name == "description":
|
||||
description = content
|
||||
|
||||
for m in bs.find_all("meta", property=True, content=True):
|
||||
content = m["content"].strip()
|
||||
if len(content) == 0:
|
||||
continue
|
||||
property = m["property"].strip()
|
||||
if property == "og:title":
|
||||
title = content
|
||||
if property == "article:published_time":
|
||||
try:
|
||||
# Is it in ISO format?
|
||||
d = dateutil.parser.parse(content)
|
||||
article_published_time = d.isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
except OverflowError:
|
||||
pass
|
||||
if property == "article:author" and "admin" not in content.lower():
|
||||
authors.add(content)
|
||||
if property == "section":
|
||||
section = content
|
||||
if property == "tag":
|
||||
tags.add(content)
|
||||
if property == "og:description":
|
||||
description = content
|
||||
|
||||
if len(title) < 2 and bs.h1 is not None:
|
||||
title = bs.h1.get_text(strip=True)
|
||||
if len(title) < 2 and bs.title is not None:
|
||||
title = bs.title.get_text(strip=True)
|
||||
if len(authors) == 0:
|
||||
for m in bs.find_all(property="author"):
|
||||
authors.add(m.get_text(strip=True))
|
||||
if len(authors) == 0:
|
||||
for m in bs.find_all(itemprop="author"):
|
||||
authors.add(m.get_text(strip=True))
|
||||
authors = set(filter(lambda x: len(x) > 2, authors))
|
||||
|
||||
return tags,authors,title.strip(),article_published_time.strip(),description,section.strip()
|
||||
|
||||
|
||||
def calculate_checksums(self, text):
|
||||
"""
|
||||
@return fingerprints of the paragraphs in the text. Paragraphs are separated by a blank line.
|
||||
"""
|
||||
checksums = []
|
||||
sizes = []
|
||||
hval = 0
|
||||
hsz = 0
|
||||
sz = 0
|
||||
for c in text:
|
||||
cv = ord(c)
|
||||
sz += 1
|
||||
if cv > 64:
|
||||
hval += (hval << 3) + cv
|
||||
zv = hval >> 31
|
||||
hval &= 0x7fffffff
|
||||
hval += zv
|
||||
hsz += 1
|
||||
if c == "\n" and hsz > 0:
|
||||
if hsz > 100:
|
||||
checksums.append(hval)
|
||||
sizes.append(sz)
|
||||
sz = 0
|
||||
hsz = 0
|
||||
if hsz > 100:
|
||||
checksums.append(hval)
|
||||
sizes.append(sz)
|
||||
return checksums, sizes
|
||||
|
||||
class EnglishParser(BaseParser):
|
||||
def __init__(self):
|
||||
super(EnglishParser,self).__init__()
|
||||
self.justext_language = "English"
|
||||
self.allowdomains = set(["com","org","io"])
|
||||
|
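A short sketch of how normalize_link and urlunparse are meant to be used together (the printed values depend on the path-normalisation details above and are therefore not shown; the URLs are placeholders):

from websucker.parser import normalize_link, urlunparse

# Resolve a relative href against the page it was found on
nl = normalize_link("../sport/article.html?id=3",
                    base="http://www.example.com/news/index.html")
schema, netloc, path, query = nl
print(schema, netloc, path, query)

# Rebuild an absolute URL from the 4-tuple; strip_query=True is what the
# crawler uses when it stores follow links
print(urlunparse(normalize_link("http://www.example.com/a?page=2",
                                strip_query=True)))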
35
websucker/queue.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import greenstalk
|
||||
import random
|
||||
|
||||
|
||||
MAX_PRIORITY = 0
|
||||
MIN_PRIORITY = 4000000000
|
||||
|
||||
MAX_FLOAT_PRIORITY = 10000.0
|
||||
|
||||
def map_priority(p,max_priority):
|
||||
p = p / max_priority
|
||||
return MIN_PRIORITY - (p*MIN_PRIORITY)
|
||||
|
||||
class BeanstalkdQueue:
|
||||
def __init__(self,host,port,tube):
|
||||
self.c = greenstalk.Client(host,port,use=tube,encoding="utf8")
|
||||
|
||||
def queue_priority_domains(self,priority_domains):
|
||||
for domain,priority in priority_domains:
|
||||
p = priority / MAX_FLOAT_PRIORITY
|
||||
p = MIN_PRIORITY - (p*MIN_PRIORITY)
|
||||
self.c.put(domain,p)
|
||||
|
||||
def queue_random_domains(self,domains):
|
||||
for domain in domains:
|
||||
p = random.randint(MAX_PRIORITY,MIN_PRIORITY)
|
||||
self.c.put(domain,p)
|
||||
|
||||
def consume_domains(self,callback):
|
||||
while True:
|
||||
job = self.c.reserve()
|
||||
domain = job.body
|
||||
self.c.delete(job)
|
||||
callback(domain)
|
||||
|
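A quick illustration of the priority mapping: beanstalkd serves lower numbers first, so a domain score equal to MAX_FLOAT_PRIORITY maps to priority 0 (served first) and a score of 0 maps to MIN_PRIORITY (served last).

from websucker.queue import map_priority, MAX_FLOAT_PRIORITY

print(map_priority(MAX_FLOAT_PRIORITY, MAX_FLOAT_PRIORITY))  # 0.0
print(map_priority(0.0, MAX_FLOAT_PRIORITY))                 # 4000000000.0
print(map_priority(5000.0, MAX_FLOAT_PRIORITY))              # 2000000000.0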
94
websucker/schema.sql
Normal file
@@ -0,0 +1,94 @@
|
||||
DROP KEYSPACE websucker;
|
||||
|
||||
CREATE KEYSPACE websucker
|
||||
WITH replication = {'class':'SimpleStrategy', 'replication_factor' : 1};
|
||||
|
||||
USE websucker;
|
||||
|
||||
CREATE TABLE links (
|
||||
domain_name TEXT,
|
||||
url_path TEXT,
|
||||
url_query TEXT,
|
||||
url_schema TEXT,
|
||||
redirect_target TEXT,
|
||||
link_status TEXT,
|
||||
link_originality FLOAT,
|
||||
body_size INT,
|
||||
update_time TIMESTAMP,
|
||||
PRIMARY KEY(domain_name,url_path,url_query)
|
||||
);
|
||||
|
||||
CREATE INDEX link_status_index ON links(link_status);
|
||||
|
||||
CREATE TABLE daily_links (
|
||||
day DATE,
|
||||
domain_name TEXT,
|
||||
url_path TEXT,
|
||||
url_query TEXT,
|
||||
link_status TEXT,
|
||||
body_size INT,
|
||||
link_originality FLOAT,
|
||||
update_time TIMESTAMP,
|
||||
PRIMARY KEY(day,domain_name,link_status,url_path,url_query)
|
||||
);
|
||||
|
||||
CREATE TABLE domain_quality (
|
||||
domain_name TEXT,
|
||||
day DATE,
|
||||
seen_count INT,
|
||||
good_size INT,
|
||||
good_count INT,
|
||||
good_probability FLOAT,
|
||||
good_originality FLOAT,
|
||||
average_good_characters FLOAT,
|
||||
content_size INT,
|
||||
content_count INT,
|
||||
content_probability FLOAT,
|
||||
content_originality FLOAT,
|
||||
average_content_characters FLOAT,
|
||||
fetched_count INT,
|
||||
average_fetched_good_characters FLOAT,
|
||||
gain_ratio FLOAT,
|
||||
update_time TIMESTAMP STATIC,
|
||||
PRIMARY KEY(domain_name,day)
|
||||
) WITH CLUSTERING ORDER BY (day DESC);
|
||||
|
||||
|
||||
CREATE TABLE content (
|
||||
domain_name TEXT,
|
||||
target_link TEXT,
|
||||
agent_version TEXT,
|
||||
title TEXT,
|
||||
links SET<TEXT>,
|
||||
authors SET<TEXT>,
|
||||
tags SET<TEXT>,
|
||||
description TEXT,
|
||||
section TEXT,
|
||||
article_published_time TEXT,
|
||||
text_date TEXT,
|
||||
body TEXT,
|
||||
body_size INT,
|
||||
update_time TIMESTAMP,
|
||||
PRIMARY KEY(domain_name,target_link)
|
||||
);
|
||||
|
||||
CREATE TABLE paragraph_checksums (
|
||||
checksum BIGINT,
|
||||
url_hash BIGINT,
|
||||
PRIMARY KEY(checksum,url_hash)
|
||||
);
|
||||
|
||||
CREATE TABLE html (
|
||||
day DATE,
|
||||
domain_name TEXT,
|
||||
source_link TEXT,
|
||||
target_link TEXT,
|
||||
redirect_links LIST<TEXT>,
|
||||
status INT,
|
||||
content TEXT,
|
||||
headers TEXT,
|
||||
agent_version TEXT,
|
||||
update_time TIMESTAMP,
|
||||
PRIMARY KEY(day,domain_name,source_link)
|
||||
);
|
||||
|
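The same schema is available at runtime through get_schema() in websucker/db.py (and via the `websuck schema` command), so the keyspace can be bootstrapped without shipping the file separately. A hedged sketch, assuming a local Cassandra; note that the leading DROP KEYSPACE statement fails on a cluster where the keyspace does not exist yet, so it may need to be skipped on first run:

import cassandra.cluster
from websucker.db import get_schema

cluster = cassandra.cluster.Cluster(["127.0.0.1"], port=9042)
session = cluster.connect()

# The driver executes one statement at a time, so the script is split on ';'
for statement in get_schema().split(";"):
    if statement.strip():
        session.execute(statement)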