commit cc0d720d1c
parent 1e3f8dcba6

    zz
@@ -6,3 +6,5 @@ pycurl
 lz4
 lxml
 cassandra-driver
+trafilatura
+py3langid
@@ -1,7 +1,7 @@
 from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
 from websucker.agent import ParsedDocument
-from websucker.parser import BaseParser
 from websucker.parser import normalize_link,urlunparse
-from websucker.parser import SoupParser
+from websucker.parser import TrafilaturaParser
+from websucker.parser import load_parser
 from websucker.db import Data
 from websucker.db import get_schema
@@ -36,7 +36,7 @@ def create_queue_from_context(ctx):
 @click.option("--queue",is_flag=True)
 def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,cassandra_username,cassandra_password,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
     ctx.ensure_object(dict)
-    p = BaseParser()
+    p = TrafilaturaParser()
     if parser is not None:
         assert os.path.isfile(parser)
     else:
@@ -220,7 +220,7 @@ def fetch(ctx,urls):
     responses = connection.html_download2(urls)
     for res in responses:
         target_link = res.get_canonical()
-        pd = parser.full_extract(res.content,res.bs,parser,target_link)
+        pd = parser.full_extract(res.content,res.bs,target_link)
         print(pd)

 if __name__ == "__main__":
@@ -14,7 +14,7 @@ import json
 VERSION = "sucker6"


-def calculate_checksums(self, text):
+def calculate_checksums(text):
     """
     @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
     """
@@ -295,13 +295,7 @@ INSERT INTO content(

    def index_follow_links(self,parser,links,connection):
        # Index seen links
-       follow_links = set()
-       for l in links:
-           if parser.is_link_good(l):
-               #if connection is not None and parser.listen_robot and not connection.is_robot_good(l):
-               #    continue
-               link = normalize_link(l,strip_query=parser.strip_query)
-               follow_links.add(urlunparse(link))
+       follow_links = parser.filter_links(links)

        newlinkdomains = set()
        newlinkcount = 0
@@ -342,9 +336,10 @@ INSERT INTO content(
            pd.text_date,
            pd.body,
            body_length,
-           VERSION,
+           VERSION,
            pd.current_time
        )
+       print(value)
        content_future = self.session.execute_async(self.index_content_content_insert,value)
        # result later

@@ -361,7 +356,7 @@ INSERT INTO content(
        if link_status == "good":

            futures = []
-           paragraph_checksums,paragraph_sizes = calculate_checksums(pd.text)
+           paragraph_checksums,paragraph_sizes = calculate_checksums(pd.body)
            for pc,psz in zip(paragraph_checksums,paragraph_sizes):
                fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
                futures.append(fut)
@@ -146,10 +146,8 @@ class BaseParser:
        self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
                ".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
        self.skipchars = re.compile(r"[();:@& ]")
-       self.store = True
        self.verbose = verbose
        self.domain_re = re.compile("^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\\.)+[A-Za-z]{2,6}$")
-       self.listen_robot = True
        self.recent_links = 5
        self.old_links = 3
        self.random_links = 10
@@ -157,22 +155,6 @@ class BaseParser:
        self.skipdomains = set()
        self.allowdomains = set()
        self.skippaths = set(["sort", "search", "sortby" "gallery" "images", "pictures" "order", "add", "insert", "cart", "order", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery","form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "gallery", "flog"])
-       self.justext_language = "Slovak"
-
-   def full_extract(self,content,bs,work_link):
-       """
-       Parse content and fill the object
-       """
-       pd = ParsedDocument()
-       pd.work_link = work_link
-
-       pd.current_time = datetime.date.today()
-       # Extract text and metatext
-       pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
-       pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
-       pd.link_set = get_bs_links(work_link,bs)
-       return pd
-

    def is_domain_good(self, domain):
        r = None
@@ -237,7 +219,7 @@ class BaseParser:
            r = "Bad urlparse"
        return r is None

-   def filter_links(links):
+   def filter_links(self,links):
        # Filter links
        linkset = set()
        for link in links:
@@ -248,6 +230,52 @@ class BaseParser:

        return list(linkset)

+   def full_extract(self,content,bs,work_link):
+       pass
+
+import trafilatura
+import courlan
+
+class TrafilaturaParser(BaseParser):
+   def full_extract(self,content,bs,work_link):
+       content.seek(0)
+       content = content.read()
+       res = trafilatura.bare_extraction(content,url=work_link,with_metadata=True,target_language="sk",include_formatting=True)
+       print(res)
+       pd = ParsedDocument()
+       pd.work_link = work_link
+       pd.current_time = datetime.date.today()
+       # Extract text and metatext
+       pd.body = res["text"]
+       #pd.text_date
+       #pd.tags = res["tags"]
+       #pd.authors = res["author"]
+       pd.article_published_time = res["date"]
+       #pd.section = res["categories"]
+       pd.link_set = get_bs_links(work_link,bs)
+       return pd
+
+class SoupParser(BaseParser):
+   def __init__(self, verbose=False):
+       BaseParser.__init__(self,verbose)
+       self.justext_language = "Slovak"
+
+   def full_extract(self,content,bs,work_link):
+       """
+       Parse content and fill the object
+       """
+       pd = ParsedDocument()
+       pd.work_link = work_link
+
+       pd.current_time = datetime.date.today()
+       # Extract text and metatext
+       pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
+       pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
+       pd.link_set = get_bs_links(work_link,bs)
+       return pd
+
+
+
    def extract_raw_text(self, content, current_time):
        result = []
        rd = None
@@ -363,7 +391,7 @@ class BaseParser:



-class EnglishParser(BaseParser):
+class EnglishParser(SoupParser):
    def __init__(self):
        super(EnglishParser,self).__init__()
        self.justext_language = "English"
@@ -62,7 +62,7 @@ class Content(Model):
    article_published_time = columns.Text()
    text_date = columns.Text()
    body = columns.Text()
-   body_size = columns.Text()
+   body_size = columns.Integer()
    update_time = columns.DateTime()
    # PRIMARY KEY(domain_name,target_link),

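
A minimal usage sketch of the new TrafilaturaParser path introduced by this commit, assuming a websucker checkout with the change applied. The sample HTML, the example URL, and the BytesIO/BeautifulSoup setup are illustrative stand-ins for the crawler's real pycurl response buffer and parsed soup; they are not part of the diff.

# Hypothetical sketch: run TrafilaturaParser.full_extract on raw HTML.
# io.BytesIO mimics the seekable response buffer full_extract seeks and reads,
# and BeautifulSoup provides the bs object it only uses for link extraction.
import io
from bs4 import BeautifulSoup
from websucker.parser import TrafilaturaParser

html = b"<html><body><p>Toto je kratky testovaci odsek pre extrakciu textu.</p></body></html>"
content = io.BytesIO(html)            # stand-in for the downloaded response body
bs = BeautifulSoup(html, "lxml")      # passed through to get_bs_links only
parser = TrafilaturaParser()
pd = parser.full_extract(content, bs, "https://example.com/clanok")
# pd.body holds the text returned by trafilatura.bare_extraction; note that
# trafilatura may return None for trivially short or non-article pages, so a
# real crawled page is a more representative input than this toy snippet.
print(pd.body)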