Daniel Hládek 2023-03-12 09:50:22 +01:00
parent 3993a61899
commit 44fbf6b755


@@ -9,10 +9,14 @@ import courlan
 import urllib
 from datetime import datetime
 import click
+import logging as LOGGER
+import os
-LANGUAGE="sk"
-DOMAIN = "sk"
-BATCHSIZE=10
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",10)
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
 MAXFILESIZE=10000000
 MINTEXTSIZE=200
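
For reference, os.getenv returns a string whenever the variable is set in the environment, so a numeric setting such as SUCKER_BATCHSIZE would normally need an explicit cast before arithmetic use. A minimal sketch of such a cast, reusing the variable names from the hunk above (the int() cast itself is an assumption, not taken from the commit):

    import os
    # os.getenv yields a str when the variable is set, so cast numeric settings
    BATCHSIZE = int(os.getenv("SUCKER_BATCHSIZE", "10"))
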
@@ -48,7 +52,7 @@ def calculate_checksums(text):
 def is_robot_good(link,rules):
     # check robots.txt rules
-    if rules is not None and not rules.can_fetch("*", llink):
+    if rules is not None and not rules.can_fetch("*", link):
         return False
     return True
@ -61,10 +65,10 @@ def is_link_good(link):
print(llink,ldomain) print(llink,ldomain)
# domain rules # domain rules
if not ldomain.endswith(DOMAIN): if not ldomain.endswith(DOMAIN):
print("bad domain") LOGGER.debug("bad domain")
return None return None
if courlan.is_not_crawlable(llink): if courlan.is_not_crawlable(llink):
print("not crawlable") LOGGER.debug("not crawlable")
return None return None
return llink return llink
@@ -112,14 +116,14 @@ def fetch_pages(link_batch):
         good = True
         if response.status != 200:
             good = False
-            #LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+            LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
         elif response.data is None or len(response.data) < MINFILESIZE:
-            #LOGGER.error('too small/incorrect for URL %s', url)
+            LOGGER.error('too small/incorrect for URL %s', url)
             good = False
             # raise error instead?
         elif len(response.data) > MAXFILESIZE:
             good = False
-            #LOGGER.error('too large: length %s for URL %s', len(response.data), url)
+            LOGGER.error('too large: length %s for URL %s', len(response.data), url)
         if good:
             html = trafilatura.utils.decode_response(response)
             final_link = response.url
@@ -133,12 +137,12 @@ def fetch_pages(link_batch):
 def fetch_robot(base_url):
     rules = urllib.robotparser.RobotFileParser()
-    rules.set_url(base_url + '/robots.txt')
+    rules.set_url("https://" + base_url + '/robots.txt')
     # exceptions happening here
     try:
         rules.read()
     except Exception as exc:
-        #LOGGER.error('cannot read robots.txt: %s', exc)
+        LOGGER.error('cannot read robots.txt: %s', exc)
         rules = None
     return rules
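
For context, a minimal standalone sketch of the urllib.robotparser flow that fetch_robot and is_robot_good rely on; the host and page URL here are illustrative, not from the project:

    import urllib.robotparser

    rules = urllib.robotparser.RobotFileParser()
    rules.set_url("https://example.org/robots.txt")   # illustrative host
    try:
        rules.read()        # download and parse robots.txt
    except Exception:
        rules = None        # unreadable robots.txt means no usable rules
    if rules is None or rules.can_fetch("*", "https://example.org/some/page"):
        print("allowed to fetch")
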
@@ -166,25 +170,31 @@ def index_pages(db,domain,extracted_pages):
     links = []
     for original_link,final_link,html,doc in extracted_pages:
         state = "good"
+        link = original_link
+        if original_link != final_link:
+            linkcol.insert_one(get_link_doc(original_link,"redirect"))
+            link = final_link
         if html is None:
             state = "html_error"
         elif doc is None:
             state = "content_error"
-        if original_link != final_link:
-            linkcol.insert_one(get_link_doc(final_link,state))
-            state = "redirect"
-        linkcol.update_one({"url":original_link},{"$set":{"status":state}})
         if doc is not None:
-            if html is not None:
-                htmlcol.insert_one({"url":final_link,"html":html,"html_size":len(html),"created_at":datetime.utcnow()})
             checksums,sizes = calculate_checksums(doc["text"])
-            doc["created_at"] = datetime.utcnow()
             doc["text_size"] = len(doc["text"])
             doc["paragraph_checksums"] = checksums
             doc["paragraph_sizes"] = sizes
+            if len(checksums) < 1:
+                state = "trash"
+        if state == "good":
+            htdoc = get_link_doc(link,state)
+            htdoc["html"] = html
+            htdoc["html_size"] = len(html)
+            htmlcol.insert_one(htdoc)
+            doc.update(get_link_doc(link,"good"))
             # todo extract links
             print(doc)
             contentcol.insert_one(doc)
+        linkcol.update_one({"url":original_link},{"$set":{"status":state}})
 def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
@@ -245,14 +255,31 @@ def link_summary(db,domain):
     linkcol = db["links"]
     #res = linkcol.distinct("domain",{"hostname":domain})
+    # count links
     res = linkcol.aggregate([
         {"$match":{"host":domain}},
         {"$group":{"_id":"$status","count":{"$sum":1}}},
     ])
     for item in res:
         print(item)
+    contentcol = db["content"]
+    res = contentcol.aggregate([
+        {"$match":{"hostname":domain}},
+        {"$group":{"_id":None,"text_size_sum":{"$sum":"text_size"}}},
+    ])
+    for item in res:
+        print(item)
-def create_indices(db):
+global DB
+@click.group()
+def cli():
+    pass
+@cli.command()
+def dropdb():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
     linkcol = db["links"]
     linkcol.create_index({"url":1},{"name":"url"})
     linkcol.create_index({"host":1,"status":1},{"name":"hostname_status"})
@@ -263,17 +290,14 @@ def create_indices(db):
     htmlcol = db["html"]
     htmlcol.create_index({"url":1})
-@click.group()
-def cli():
-    pass
-@click.command()
+@cli.command()
 @click.argument("start_link")
-def simple_visit(start_link):
+def visit(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
     start_link,domain = courlan.check_url(start_link)
-    myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
-    db=myclient["crawler"]
     rules = fetch_robot(domain)
+    print(rules)
     batch_size = BATCHSIZE
     navigation_links = get_links(db,domain,"navigation",batch_size)
     if start_link is not None:
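
For context, courlan.check_url as used at the top of visit returns a cleaned (url, domain) pair, or None when the link is rejected. A minimal sketch with an illustrative URL:

    import courlan

    checked = courlan.check_url("https://www.example.org/page?utm_source=x")
    if checked is not None:
        start_link, domain = checked
        print(start_link, domain)   # cleaned URL and its domain
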