zz
This commit is contained in:
parent 1e3f8dcba6
commit cc0d720d1c
@@ -6,3 +6,5 @@ pycurl
 lz4
 lxml
 cassandra-driver
+trafilatura
+py3langid
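The two new requirements back the switch to trafilatura-based extraction: trafilatura strips boilerplate from downloaded HTML and py3langid identifies the language of the extracted text. A minimal sketch of how the two libraries are commonly combined; the sample HTML is made up and the snippet is illustrative, not part of this commit:

import trafilatura
import py3langid

html = "<html><body><p>Dobrý deň, toto je ukážkový odsek.</p></body></html>"
text = trafilatura.extract(html)            # boilerplate-free text, or None
if text:
    lang, score = py3langid.classify(text)  # e.g. ("sk", <confidence score>)
    print(lang, score)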
@@ -1,7 +1,7 @@
 from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
-from websucker.agent import ParsedDocument
 from websucker.parser import BaseParser
-from websucker.parser import normalize_link,urlunparse
+from websucker.parser import SoupParser
+from websucker.parser import TrafilaturaParser
 from websucker.parser import load_parser
 from websucker.db import Data
 from websucker.db import get_schema
@@ -36,7 +36,7 @@ def create_queue_from_context(ctx):
 @click.option("--queue",is_flag=True)
 def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,cassandra_username,cassandra_password,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
     ctx.ensure_object(dict)
-    p = BaseParser()
+    p = TrafilaturaParser()
     if parser is not None:
         assert os.path.isfile(parser)
     else:
@@ -220,7 +220,7 @@ def fetch(ctx,urls):
     responses = connection.html_download2(urls)
     for res in responses:
         target_link = res.get_canonical()
-        pd = parser.full_extract(res.content,res.bs,parser,target_link)
+        pd = parser.full_extract(res.content,res.bs,target_link)
         print(pd)
 
 if __name__ == "__main__":
@@ -14,7 +14,7 @@ import json
 VERSION = "sucker6"
 
 
-def calculate_checksums(self, text):
+def calculate_checksums(text):
     """
     @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
     """
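calculate_checksums is now a plain module-level function: it takes the extracted text and returns two parallel lists, one fingerprint and one size per paragraph, which the indexer later consumes as calculate_checksums(pd.body). A hedged sketch of that contract, assuming a blank-line paragraph split and Python's built-in hash as the fingerprint; the repository's actual implementation may differ:

def calculate_checksums_sketch(text):
    # Paragraphs are separated by a blank line, per the docstring above.
    checksums, sizes = [], []
    for paragraph in text.split("\n\n"):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        checksums.append(hash(paragraph))  # paragraph fingerprint
        sizes.append(len(paragraph))       # paragraph length in characters
    return checksums, sizes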
@@ -295,13 +295,7 @@ INSERT INTO content(
 
     def index_follow_links(self,parser,links,connection):
         # Index seen links
-        follow_links = set()
-        for l in links:
-            if parser.is_link_good(l):
-                #if connection is not None and parser.listen_robot and not connection.is_robot_good(l):
-                # continue
-                link = normalize_link(l,strip_query=parser.strip_query)
-                follow_links.add(urlunparse(link))
+        follow_links = parser.filter_links(links)
 
         newlinkdomains = set()
         newlinkcount = 0
@@ -342,9 +336,10 @@ INSERT INTO content(
             pd.text_date,
             pd.body,
             body_length,
             VERSION,
             pd.current_time
         )
+        print(value)
         content_future = self.session.execute_async(self.index_content_content_insert,value)
         # result later
 
@@ -361,7 +356,7 @@ INSERT INTO content(
         if link_status == "good":
 
             futures = []
-            paragraph_checksums,paragraph_sizes = calculate_checksums(pd.text)
+            paragraph_checksums,paragraph_sizes = calculate_checksums(pd.body)
             for pc,psz in zip(paragraph_checksums,paragraph_sizes):
                 fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
                 futures.append(fut)
@@ -146,10 +146,8 @@ class BaseParser:
         self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
                           ".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
         self.skipchars = re.compile(r"[();:@& ]")
-        self.store = True
         self.verbose = verbose
         self.domain_re = re.compile("^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\\.)+[A-Za-z]{2,6}$")
-        self.listen_robot = True
         self.recent_links = 5
         self.old_links = 3
         self.random_links = 10
@@ -157,22 +155,6 @@ class BaseParser:
         self.skipdomains = set()
         self.allowdomains = set()
         self.skippaths = set(["sort", "search", "sortby" "gallery" "images", "pictures" "order", "add", "insert", "cart", "order", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery","form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "gallery", "flog"])
-        self.justext_language = "Slovak"
-
-    def full_extract(self,content,bs,work_link):
-        """
-        Parse content and fill the object
-        """
-        pd = ParsedDocument()
-        pd.work_link = work_link
-
-        pd.current_time = datetime.date.today()
-        # Extract text and metatext
-        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
-        pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
-        pd.link_set = get_bs_links(work_link,bs)
-        return pd
-
 
     def is_domain_good(self, domain):
         r = None
@@ -237,7 +219,7 @@ class BaseParser:
             r = "Bad urlparse"
         return r is None
 
-    def filter_links(links):
+    def filter_links(self,links):
         # Filter links
         linkset = set()
         for link in links:
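filter_links becomes a proper instance method and now centralizes the per-link filtering that index_follow_links used to inline. A hedged sketch of how the body plausibly continues, built only from calls already visible in this diff (is_link_good, normalize_link, urlunparse, strip_query); the real method may differ:

def filter_links(self, links):
    linkset = set()
    for link in links:
        # Drop links that fail the quality checks, then normalize and deduplicate.
        if not self.is_link_good(link):
            continue
        nl = normalize_link(link, strip_query=self.strip_query)
        linkset.add(urlunparse(nl))
    return list(linkset)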
@@ -248,6 +230,52 @@ class BaseParser:
 
         return list(linkset)
 
+    def full_extract(self,content,bs,work_link):
+        pass
+
+import trafilatura
+import courlan
+
+class TrafilaturaParser(BaseParser):
+    def full_extract(self,content,bs,work_link):
+        content.seek(0)
+        content = content.read()
+        res = trafilatura.bare_extraction(content,url=work_link,with_metadata=True,target_language="sk",include_formatting=True)
+        print(res)
+        pd = ParsedDocument()
+        pd.work_link = work_link
+        pd.current_time = datetime.date.today()
+        # Extract text and metatext
+        pd.body = res["text"]
+        #pd.text_date
+        #pd.tags = res["tags"]
+        #pd.authors = res["author"]
+        pd.article_published_time = res["date"]
+        #pd.section = res["categories"]
+        pd.link_set = get_bs_links(work_link,bs)
+        return pd
+
+class SoupParser(BaseParser):
+    def __init__(self, verbose=False):
+        BaseParser.__init__(self,verbose)
+        self.justext_language = "Slovak"
+
+    def full_extract(self,content,bs,work_link):
+        """
+        Parse content and fill the object
+        """
+        pd = ParsedDocument()
+        pd.work_link = work_link
+
+        pd.current_time = datetime.date.today()
+        # Extract text and metatext
+        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
+        pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
+        pd.link_set = get_bs_links(work_link,bs)
+        return pd
+
+
+
     def extract_raw_text(self, content, current_time):
         result = []
         rd = None
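TrafilaturaParser delegates text and metadata extraction to trafilatura.bare_extraction and keeps BeautifulSoup only for link harvesting via get_bs_links. An illustrative usage sketch; the sample HTML, URL, and the lxml-backed BeautifulSoup call are assumptions, not code from this repository:

from io import BytesIO
from bs4 import BeautifulSoup
from websucker.parser import TrafilaturaParser

html = b"<html><body><p>Ukazkovy odsek textu.</p></body></html>"
content = BytesIO(html)             # full_extract expects a seekable buffer
bs = BeautifulSoup(html, "lxml")    # still needed for get_bs_links
pd = TrafilaturaParser().full_extract(content, bs, "https://example.org/clanok")
print(pd.body, pd.article_published_time)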
@@ -363,7 +391,7 @@ class BaseParser:
 
 
 
-class EnglishParser(BaseParser):
+class EnglishParser(SoupParser):
     def __init__(self):
         super(EnglishParser,self).__init__()
         self.justext_language = "English"
@@ -62,7 +62,7 @@ class Content(Model):
     article_published_time = columns.Text()
     text_date = columns.Text()
     body = columns.Text()
-    body_size = columns.Text()
+    body_size = columns.Integer()
     update_time = columns.DateTime()
     # PRIMARY KEY(domain_name,target_link),
 
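body_size moves from a text column to an integer column in the cqlengine model. A minimal sketch of the adjusted model for orientation; the primary-key columns are an assumption taken from the PRIMARY KEY comment above, and the real model has more columns:

from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model

class Content(Model):
    domain_name = columns.Text(primary_key=True)  # assumed partition key
    target_link = columns.Text(primary_key=True)  # assumed clustering column
    body = columns.Text()
    body_size = columns.Integer()                 # maps to a CQL int, previously text
    update_time = columns.DateTime()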