Compare commits

..

No commits in common. "75840f6d21d517a4344f4f1ce0166c103c229462" and "123191b0f47a90ca7dd27e5bf2072e4886ca5b33" have entirely different histories.

View File

@@ -93,6 +93,15 @@ def get_link_doc(link,status="frontlink"):
domain = courlan.extract_domain(link) domain = courlan.extract_domain(link)
return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()} return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
def generic_visit(hostname):
    """Discover links to visit for ``hostname``, trying three sources in turn.

    Feed discovery is tried first, then a sitemap search, and finally a
    focused crawl that is given the links ``get_visited_links`` reports
    for this host as already known.

    Args:
        hostname: Passed unchanged to ``get_visited_links`` and to each
            trafilatura discovery helper.

    Returns:
        The result of the last discovery step that was attempted, i.e. the
        first step that produced a non-empty result, or the focused-crawl
        result when the earlier steps found nothing.
    """
    known_links = set(get_visited_links(hostname))
    visit_links = trafilatura.find_feed_urls(hostname)
    # An empty result is as useless as None, so either one triggers the
    # next fallback (the original only fell back on None).
    if not visit_links:
        visit_links = trafilatura.sitemap_search(hostname)
    if not visit_links:
        # The original passed the undefined name `dommain` here, which would
        # raise NameError; the host handed to this function is the only seed
        # available, matching the two calls above.
        # NOTE(review): focused_crawler may return a (to_visit, known_links)
        # pair rather than a flat list of links - confirm and unpack if so.
        visit_links = trafilatura.focused_crawler(hostname, known_links=known_links)
    # The original dropped the discovered links; return them to the caller.
    return visit_links
def fetch_pages(link_batch): def fetch_pages(link_batch):
htmls = [] htmls = []
@@ -110,12 +119,12 @@ def fetch_pages(link_batch):
good = False good = False
LOGGER.error('not a 200 response: %s for URL %s', response.status, url) LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
elif response.data is None or len(response.data) < MINFILESIZE: elif response.data is None or len(response.data) < MINFILESIZE:
LOGGER.error('too small/incorrect for URL %s', link) LOGGER.error('too small/incorrect for URL %s', url)
good = False good = False
# raise error instead? # raise error instead?
elif len(response.data) > MAXFILESIZE: elif len(response.data) > MAXFILESIZE:
good = False good = False
LOGGER.error('too large: length %s for URL %s', len(response.data), link) LOGGER.error('too large: length %s for URL %s', len(response.data), url)
if good: if good:
html = trafilatura.utils.decode_response(response) html = trafilatura.utils.decode_response(response)
final_link = response.url final_link = response.url
@@ -238,6 +247,7 @@ def index_links(db,extracted_links):
except pymongo.errors.DuplicateKeyError as ex: except pymongo.errors.DuplicateKeyError as ex:
pass pass
def get_links(db,hostname,status,batch_size=BATCHSIZE): def get_links(db,hostname,status,batch_size=BATCHSIZE):
linkcol = db["links"] linkcol = db["links"]
res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size) res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
@@ -300,29 +310,6 @@ def createdb():
htmlcol = db["html"] htmlcol = db["html"]
htmlcol.create_index("url",unique=True) htmlcol.create_index("url",unique=True)
@cli.command()
@click.argument("link")
def parseurl(link):
    """Fetch ``link``, run trafilatura's bare extraction on it and pretty-print the result."""
    page = trafilatura.fetch_url(link, decode=True)
    extracted = trafilatura.bare_extraction(page)
    # pprint is only needed by this command, so it is imported locally.
    from pprint import pprint as show
    show(extracted)
@cli.command()
@click.argument("link")
def externaldomains(link):
    """Print the distinct domains of the external links found on a page.

    The page at ``link`` is fetched, its external links are extracted with
    courlan (restricted by the module-level ``LANGUAGE``), each link is run
    through ``courlan.check_url``, and the domain of every accepted link is
    printed once, in sorted order.

    Args:
        link: URL of the page to inspect; also used as the reference URL
            when extracting links.
    """
    html = trafilatura.fetch_url(link, decode=True)
    external_links = courlan.extract_links(html, link, external_bool=True, language=LANGUAGE)
    domains = set()
    for candidate in external_links:
        checked = courlan.check_url(candidate)
        if checked is None:
            # Rejected URL: skip it. The original used `pass` here and then
            # tried to unpack None, which raises TypeError.
            continue
        # Only the domain is needed; the cleaned URL is discarded, and the
        # `link` argument is no longer overwritten inside the loop.
        _, domain = checked
        domains.add(domain)
    # Sorted so the output order is stable between runs.
    for domain in sorted(domains):
        print(domain)
@cli.command() @cli.command()
@click.argument("start_link") @click.argument("start_link")
def parseurl(start_link): def parseurl(start_link):