commit cc0d720d1c
parent 1e3f8dcba6

    zz

@@ -6,3 +6,5 @@ pycurl
 lz4
 lxml
 cassandra-driver
+trafilatura
+py3langid

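Note: trafilatura and py3langid are the two new runtime dependencies; the rest of this commit wires trafilatura in as the default extractor. A quick, hedged smoke test of both packages (the sample strings are mine, and the top-level classify call assumes py3langid keeps the langid.py API):

    import trafilatura
    import py3langid as langid

    html = "<html><body><p>Krátky testovací odsek o ničom konkrétnom.</p></body></html>"

    # Trafilatura: main-content extraction from raw HTML (may return None on tiny pages)
    print(trafilatura.extract(html))

    # py3langid: (language code, score) for a piece of text
    print(langid.classify("Toto je krátka slovenská veta."))
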
@@ -1,7 +1,7 @@
 from websucker.agent import Connection,visit_links,visit_domain,process_domains,work_domains
 from websucker.agent import ParsedDocument
-from websucker.parser import BaseParser
 from websucker.parser import normalize_link,urlunparse
-from websucker.parser import SoupParser
+from websucker.parser import TrafilaturaParser
+from websucker.parser import load_parser
 from websucker.db import Data
 from websucker.db import get_schema

@@ -36,7 +36,7 @@ def create_queue_from_context(ctx):
 @click.option("--queue",is_flag=True)
 def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,cassandra_username,cassandra_password,beanstalkd_tube,beanstalkd_host,beanstalkd_port,parser,visit,queue):
     ctx.ensure_object(dict)
-    p = BaseParser()
+    p = TrafilaturaParser()
     if parser is not None:
         assert os.path.isfile(parser)
     else:

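Note: the CLI now defaults to TrafilaturaParser instead of BaseParser. A minimal sketch of the selection logic this hunk implies, assuming the elided branch loads a custom parser through the newly imported load_parser (that call is not visible in the diff):

    import os
    from websucker.parser import TrafilaturaParser, load_parser

    def select_parser(parser_path=None):
        # Default extractor after this commit
        p = TrafilaturaParser()
        if parser_path is not None:
            # A custom parser definition must point at an existing file
            assert os.path.isfile(parser_path)
            # Assumed: load_parser builds a parser instance from that file
            p = load_parser(parser_path)
        return p
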
@@ -220,7 +220,7 @@ def fetch(ctx,urls):
     responses = connection.html_download2(urls)
     for res in responses:
         target_link = res.get_canonical()
-        pd = parser.full_extract(res.content,res.bs,parser,target_link)
+        pd = parser.full_extract(res.content,res.bs,target_link)
         print(pd)

 if __name__ == "__main__":

@@ -14,7 +14,7 @@ import json
 VERSION = "sucker6"


-def calculate_checksums(self, text):
+def calculate_checksums(text):
     """
     @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
     """

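Note: calculate_checksums drops its stray self parameter and becomes a plain module-level helper. Its body is not part of this diff; below is a minimal sketch of what the docstring and the call site later in this file (paragraph_checksums,paragraph_sizes = calculate_checksums(pd.body)) imply, with the blank-line split and the use of Python's built-in hash being assumptions:

    def calculate_checksums(text):
        """
        @return fingerprints of paragraphs in text; paragraphs are separated by a blank line
        """
        checksums, sizes = [], []
        for paragraph in text.split("\n\n"):   # assumed paragraph boundary
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            checksums.append(hash(paragraph))  # assumed fingerprint function
            sizes.append(len(paragraph))
        return checksums, sizes
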
@@ -295,13 +295,7 @@ INSERT INTO content(

     def index_follow_links(self,parser,links,connection):
         # Index seen links
-        follow_links = set()
-        for l in links:
-            if parser.is_link_good(l):
-                #if connection is not None and parser.listen_robot and not connection.is_robot_good(l):
-                #    continue
-                link = normalize_link(l,strip_query=parser.strip_query)
-                follow_links.add(urlunparse(link))
+        follow_links = parser.filter_links(links)

         newlinkdomains = set()
         newlinkcount = 0

@@ -342,9 +336,10 @@ INSERT INTO content(
           pd.text_date,
           pd.body,
           body_length,
-            VERSION,
+          VERSION,
           pd.current_time
         )
         print(value)
         content_future = self.session.execute_async(self.index_content_content_insert,value)
         # result later

 | 
			
		||||
        if link_status == "good":
 | 
			
		||||
 | 
			
		||||
            futures = []
 | 
			
		||||
            paragraph_checksums,paragraph_sizes = calculate_checksums(pd.text)
 | 
			
		||||
            paragraph_checksums,paragraph_sizes = calculate_checksums(pd.body)
 | 
			
		||||
            for pc,psz in zip(paragraph_checksums,paragraph_sizes):
 | 
			
		||||
                fut = self.session.execute_async(self.paragraph_checksums_insert,(pc,hash(nl[1] + "/" + nl[2] + "?" + nl[3])))
 | 
			
		||||
                futures.append(fut)
 | 
			
		||||
 | 
			
		||||
@ -146,10 +146,8 @@ class BaseParser:
 | 
			
		||||
        self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
 | 
			
		||||
                          ".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
 | 
			
		||||
        self.skipchars = re.compile(r"[();:@& ]")
 | 
			
		||||
        self.store = True
 | 
			
		||||
        self.verbose = verbose
 | 
			
		||||
        self.domain_re = re.compile("^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\\.)+[A-Za-z]{2,6}$")
 | 
			
		||||
        self.listen_robot = True
 | 
			
		||||
        self.recent_links = 5
 | 
			
		||||
        self.old_links = 3
 | 
			
		||||
        self.random_links = 10
 | 
			
		||||
@ -157,22 +155,6 @@ class BaseParser:
 | 
			
		||||
        self.skipdomains = set()
 | 
			
		||||
        self.allowdomains = set()
 | 
			
		||||
        self.skippaths = set(["sort", "search", "sortby" "gallery" "images", "pictures" "order", "add", "insert", "cart", "order", "filter",   "ssid", "session", "print", "rss", "feed", "login", "register", "delivery","form", "file", "img", "shopby", "foto", "orderby", "desc",  "eshop", "e-shop", "email", "gallery", "flog"])
 | 
			
		||||
        self.justext_language = "Slovak"
 | 
			
		||||
 | 
			
		||||
    def full_extract(self,content,bs,work_link):
 | 
			
		||||
        """
 | 
			
		||||
        Parse content and fill the object
 | 
			
		||||
        """
 | 
			
		||||
        pd = ParsedDocument()
 | 
			
		||||
        pd.work_link = work_link
 | 
			
		||||
 | 
			
		||||
        pd.current_time = datetime.date.today()
 | 
			
		||||
        # Extract text and metatext
 | 
			
		||||
        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
 | 
			
		||||
        pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
 | 
			
		||||
        pd.link_set = get_bs_links(work_link,bs)
 | 
			
		||||
        return pd
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    def is_domain_good(self, domain):
 | 
			
		||||
        r = None
 | 
			
		||||
@ -237,7 +219,7 @@ class BaseParser:
 | 
			
		||||
            r = "Bad urlparse"
 | 
			
		||||
        return r is None
 | 
			
		||||
    
 | 
			
		||||
    def filter_links(links):
 | 
			
		||||
    def filter_links(self,links):
 | 
			
		||||
        # Filter links
 | 
			
		||||
        linkset = set()
 | 
			
		||||
        for link in links:
 | 
			
		||||
@@ -248,6 +230,52 @@ class BaseParser:

         return list(linkset)

+    def full_extract(self,content,bs,work_link):
+        pass
+
+import trafilatura
+import courlan
+
+class TrafilaturaParser(BaseParser):
+    def full_extract(self,content,bs,work_link):
+        content.seek(0)
+        content = content.read()
+        res = trafilatura.bare_extraction(content,url=work_link,with_metadata=True,target_language="sk",include_formatting=True)
+        print(res)
+        pd = ParsedDocument()
+        pd.work_link = work_link
+        pd.current_time = datetime.date.today()
+        # Extract text and metatext
+        pd.body = res["text"]
+        #pd.text_date
+        #pd.tags = res["tags"]
+        #pd.authors = res["author"]
+        pd.article_published_time = res["date"]
+        #pd.section = res["categories"]
+        pd.link_set = get_bs_links(work_link,bs)
+        return pd
+
+class SoupParser(BaseParser):
+    def __init__(self, verbose=False):
+        BaseParser.__init__(self,verbose)
+        self.justext_language = "Slovak"
+
+    def full_extract(self,content,bs,work_link):
+        """
+        Parse content and fill the object
+        """
+        pd = ParsedDocument()
+        pd.work_link = work_link
+
+        pd.current_time = datetime.date.today()
+        # Extract text and metatext
+        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
+        pd.tags,pd.authors,pd.title,pd.article_published_time, pd.description,pd.section = self.extract_meta(bs)
+        pd.link_set = get_bs_links(work_link,bs)
+        return pd
+
+
+
     def extract_raw_text(self, content, current_time):
         result = []
         rd = None

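Note: the new TrafilaturaParser hands extraction to trafilatura.bare_extraction and reads only the text and date fields of the result. A standalone sketch of that call with the same keyword arguments (the sample HTML and URL are mine; the dict-style access mirrors the code above and assumes a trafilatura version where bare_extraction returns a mapping):

    import trafilatura

    html = """<html><body>
      <h1>Titulok</h1>
      <p>Prvý odsek článku.</p>
      <p>Druhý odsek článku.</p>
    </body></html>"""

    res = trafilatura.bare_extraction(
        html,
        url="https://example.org/clanok",   # hypothetical URL
        with_metadata=True,
        target_language="sk",
        include_formatting=True,
    )

    if res is not None:
        print(res["text"])   # what full_extract stores as pd.body
        print(res["date"])   # what full_extract stores as pd.article_published_time
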
@@ -363,7 +391,7 @@ class BaseParser:



-class EnglishParser(BaseParser):
+class EnglishParser(SoupParser):
     def __init__(self):
         super(EnglishParser,self).__init__()
         self.justext_language = "English"

 | 
			
		||||
    article_published_time = columns.Text()
 | 
			
		||||
    text_date = columns.Text()
 | 
			
		||||
    body = columns.Text()
 | 
			
		||||
    body_size = columns.Text()
 | 
			
		||||
    body_size = columns.Integer()
 | 
			
		||||
    update_time = columns.DateTime()
 | 
			
		||||
#    PRIMARY KEY(domain_name,target_link),
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||