Daniel Hládek 2023-03-06 16:29:30 +01:00
parent 1e3f8dcba6
commit cc0d720d1c
5 changed files with 60 additions and 35 deletions

View File

@@ -6,3 +6,5 @@ pycurl
lz4
lxml
cassandra-driver
trafilatura
py3langid
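The two new dependencies back the Trafilatura-based parser and the language identification used below. A quick, hedged smoke test of both libraries (the sample HTML is made up, and trafilatura.extract may return None for very short documents):

import trafilatura
import py3langid as langid

html = "<html><body><p>Toto je kratky testovaci odsek o pocasi na Slovensku.</p></body></html>"
text = trafilatura.extract(html)          # plain-text extraction, may be None on tiny pages
print(text)
lang, score = langid.classify(text or html)   # falls back to the raw HTML if extraction is empty
print(lang, score)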

View File

@@ -1,7 +1,7 @@
from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
from websucker.agent import ParsedDocument
from websucker.parser import BaseParser
from websucker.parser import normalize_link,urlunparse
from websucker.parser import SoupParser
from websucker.parser import TrafilaturaParser
from websucker.parser import load_parser
from websucker.db import Data
from websucker.db import get_schema
@@ -36,7 +36,7 @@ def create_queue_from_context(ctx):
@click.option("--queue",is_flag=True)
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,cassandra_username,cassandra_password,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
ctx.ensure_object(dict)
p = BaseParser()
p = TrafilaturaParser()
if parser is not None:
assert os.path.isfile(parser)
else:
@@ -220,7 +220,7 @@ def fetch(ctx,urls):
responses = connection.html_download2(urls)
for res in responses:
target_link = res.get_canonical()
pd = parser.full_extract(res.content,res.bs,parser,target_link)
pd = parser.full_extract(res.content,res.bs,target_link)
print(pd)
if __name__ == "__main__":
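Two things change in this file: the CLI default parser becomes TrafilaturaParser, and the fetch command drops the stray parser argument so the call matches the full_extract(self, content, bs, work_link) signature. A hedged sketch of the corrected loop, assuming connection and urls are set up by the surrounding fetch command as shown in the diff:

from websucker.parser import TrafilaturaParser

parser = TrafilaturaParser()
responses = connection.html_download2(urls)   # connection and urls come from the fetch command context
for res in responses:
    target_link = res.get_canonical()
    # old call passed parser twice: full_extract(res.content, res.bs, parser, target_link)
    pd = parser.full_extract(res.content, res.bs, target_link)
    print(pd)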

View File

@@ -14,7 +14,7 @@ import json
VERSION = "sucker6"
def calculate_checksums(self, text):
def calculate_checksums(text):
"""
@return fingerprints of paragraphs in the text. Paragraphs are separated by a blank line
"""
@@ -295,13 +295,7 @@ INSERT INTO content(
def index_follow_links(self,parser,links,connection):
# Index seen links
follow_links = set()
for l in links:
if parser.is_link_good(l):
#if connection is not None and parser.listen_robot and not connection.is_robot_good(l):
# continue
link = normalize_link(l,strip_query=parser.strip_query)
follow_links.add(urlunparse(link))
follow_links = parser.filter_links(links)
newlinkdomains = set()
newlinkcount = 0
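The deleted loop moves into the parser as filter_links (see the websucker.parser hunks below), with the commented-out robots check dropped. Based on the removed lines it does roughly the following (a sketch reconstructed from the deleted loop, using normalize_link and urlunparse as imported in the parser module, not the exact new body):

def filter_links(self, links):
    # Keep links that pass is_link_good, normalized and de-duplicated.
    linkset = set()
    for l in links:
        if self.is_link_good(l):
            link = normalize_link(l, strip_query=self.strip_query)
            linkset.add(urlunparse(link))
    return list(linkset)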
@@ -342,9 +336,10 @@ INSERT INTO content(
pd.text_date,
pd.body,
body_length,
VERSION,
VERSION,
pd.current_time
)
print(value)
content_future = self.session.execute_async(self.index_content_content_insert,value)
# result later
@@ -361,7 +356,7 @@ INSERT INTO content(
if link_status == "good":
futures = []
paragraph_checksums,paragraph_sizes = calculate_checksums(pd.text)
paragraph_checksums,paragraph_sizes = calculate_checksums(pd.body)
for pc,psz in zip(paragraph_checksums,paragraph_sizes):
fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
futures.append(fut)

View File

@@ -146,10 +146,8 @@ class BaseParser:
self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
self.skipchars = re.compile(r"[();:@& ]")
self.store = True
self.verbose = verbose
self.domain_re = re.compile("^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\\.)+[A-Za-z]{2,6}$")
self.listen_robot = True
self.recent_links = 5
self.old_links = 3
self.random_links = 10
@@ -157,22 +155,6 @@ class BaseParser:
self.skipdomains = set()
self.allowdomains = set()
self.skippaths = set(["sort", "search", "sortby", "gallery", "images", "pictures", "order", "add", "insert", "cart", "order", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery", "form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "gallery", "flog"])
self.justext_language = "Slovak"
def full_extract(self,content,bs,work_link):
"""
Parse content and fill the object
"""
pd = ParsedDocument()
pd.work_link = work_link
pd.current_time = datetime.date.today()
# Extract text and metatext
pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
pd.link_set = get_bs_links(work_link,bs)
return pd
def is_domain_good(self, domain):
r = None
@@ -237,7 +219,7 @@ class BaseParser:
r = "Bad urlparse"
return r is None
def filter_links(links):
def filter_links(self,links):
# Filter links
linkset = set()
for link in links:
@@ -248,6 +230,52 @@ class BaseParser:
return list(linkset)
def full_extract(self,content,bs,work_link):
pass
import trafilatura
import courlan
class TrafilaturaParser(BaseParser):
def full_extract(self,content,bs,work_link):
content.seek(0)
content = content.read()
res = trafilatura.bare_extraction(content,url=work_link,with_metadata=True,target_language="sk",include_formatting=True)
print(res)
pd = ParsedDocument()
pd.work_link = work_link
pd.current_time = datetime.date.today()
# Extract text and metatext
pd.body = res["text"]
#pd.text_date
#pd.tags = res["tags"]
#pd.authors = res["author"]
pd.article_published_time = res["date"]
#pd.section = res["categories"]
pd.link_set = get_bs_links(work_link,bs)
return pd
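For the assignments left commented out above, the dictionary returned by bare_extraction (with trafilatura versions contemporary to this commit it is a plain dict) also carries the corresponding metadata fields. A hedged sketch for inspecting them on a real page (the URL is a placeholder):

import trafilatura

downloaded = trafilatura.fetch_url("https://example.com/")   # placeholder URL
if downloaded:
    res = trafilatura.bare_extraction(downloaded, with_metadata=True)
    if res:
        # fields the commented-out assignments above could draw from
        for key in ("title", "author", "date", "tags", "categories"):
            print(key, res.get(key))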
class SoupParser(BaseParser):
def __init__(self, verbose=False):
BaseParser.__init__(self,verbose)
self.justext_language = "Slovak"
def full_extract(self,content,bs,work_link):
"""
Parse content and fill the object
"""
pd = ParsedDocument()
pd.work_link = work_link
pd.current_time = datetime.date.today()
# Extract text and metatext
pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
pd.link_set = get_bs_links(work_link,bs)
return pd
def extract_raw_text(self, content, current_time):
result = []
rd = None
@@ -363,7 +391,7 @@ class BaseParser:
class EnglishParser(BaseParser):
class EnglishParser(SoupParser):
def __init__(self):
super(EnglishParser,self).__init__()
self.justext_language = "English"
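With full_extract reduced to a stub on BaseParser, the concrete choices are SoupParser (the justext/BeautifulSoup path, Slovak by default), EnglishParser (the same path with the English stoplist), and TrafilaturaParser. A hedged selection sketch mirroring how the CLI now picks its default:

from websucker.parser import SoupParser, EnglishParser, TrafilaturaParser

# BaseParser.full_extract is a stub; instantiate one of the concrete parsers.
parsers = {
    "soup": SoupParser(),
    "english": EnglishParser(),
    "trafilatura": TrafilaturaParser(),
}
parser = parsers["trafilatura"]    # the new CLI default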

View File

@@ -62,7 +62,7 @@ class Content(Model):
article_published_time = columns.Text()
text_date = columns.Text()
body = columns.Text()
body_size = columns.Text()
body_size = columns.Integer()
update_time = columns.DateTime()
# PRIMARY KEY(domain_name,target_link),
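body_size now stores the integer body length computed in the database module instead of its textual form. Recent Cassandra versions do not allow changing the type of an existing column, so the new type only takes effect on a freshly created table; an existing keyspace needs manual handling (for example a rebuilt table). A hedged sketch of re-syncing the model on a fresh keyspace (the connection details, keyspace name, and websucker.schema module path are assumptions):

from cassandra.cqlengine import connection
from cassandra.cqlengine.management import sync_table
from websucker.schema import Content   # module path assumed, not shown in the diff

connection.setup(["127.0.0.1"], "websucker", protocol_version=4)   # assumed host and keyspace
sync_table(Content)   # creates body_size as int on a fresh table only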