This commit is contained in:
Daniel Hládek 2023-04-12 14:35:35 +02:00
parent 725b61d9bb
commit ad52af705b
3 changed files with 39 additions and 26 deletions

View File

@ -2,7 +2,7 @@ FROM python:3.9
RUN mkdir /app
COPY requirements.txt /app
RUN pip install -r /app/requirements.txt
COPY *.py /app
COPY *.py /app/
WORKDIR /app
# redis config
ENV REDIS_URL="redis://localhost:6379/"

View File

@ -25,9 +25,13 @@ def classify(start_link):
mongocrawler.classify(start_link)
@cli.command()
@click.argument("start_link")
def visit(start_link):
mongocrawler.visit(start_link)
@click.argument("hostname")
def visit(hostname):
mongocrawler.visit(hostname)
# CLI subcommand `summary`: takes no arguments and delegates straight to
# mongocrawler.crawl_summary().
@cli.command()
def summary():
mongocrawler.crawl_summary()
# Run the click command group only when this file is executed as a script.
if __name__ == "__main__":
cli()

View File

@ -9,7 +9,8 @@ import trafilatura.external
import sys
import courlan
import urllib
from datetime import datetime
from datetime import datetime as dat
import datetime
import click
import logging as LOGGER
import os
@ -107,7 +108,7 @@ def get_link_doc(link:str,status="frontlink")->dict:
assert r is not None
link,host = r
domain = courlan.extract_domain(link)
return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
def fetch_page(link:str)->(str,str):
@ -256,7 +257,7 @@ def index_pages(db,hostname,extracted_pages):
batchdoc = {
"host": linkdoc["host"],
"domain": linkdoc["domain"],
"created_at": datetime.utcnow(),
"created_at": dat.utcnow(),
"good_document_count":good_document_count,
"document_count":document_count,
"text_size":text_size,
@ -280,8 +281,7 @@ def get_bs_links(link,html):
base = bs.base["href"]
base = urllib.parse.urlparse(courlan.normalize_url(base))
external_links = set()
internal_links = set()
links = set()
# Normalizacia linkov
for l in bs.find_all("a", href=True):
if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
@ -305,19 +305,13 @@ def get_bs_links(link,html):
if path.endswith(")"):
# javascript
continue
external = True
if parsed.netloc == base.netloc:
external = False
href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
href = courlan.normalize_url(href)
if external:
external_links.add(href)
else:
internal_links.add(href)
links.add(href)
except ValueError as err:
print(err)
pass
return internal_links,external_links
return links
def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
links = {}
@ -326,12 +320,12 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
status = default_status
if html is None or len(html) < 256:
continue
internal_links, external_links = get_bs_links(final_link,html)
page_links = get_bs_links(final_link,html)
#external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
#internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
#print(extracted_links)
for link in internal_links:
if not is_robot_good(link,rules):
for link in page_links:
if not courlan.is_external(link,final_link) and not is_robot_good(link,rules):
badrobot += 1
continue
status = str(default_status)
@ -362,7 +356,7 @@ def index_links(db,extracted_links):
pass
else:
print("updating " + link,status)
linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":datetime.utcnow()}})
linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":dat.utcnow()}})
def get_link_features(link):
a, urlpath = courlan.get_host_and_path(link)
@ -501,7 +495,7 @@ def link_summary(db,hostname):
res = linkcol.aggregate([
{"$match":{"host":hostname}},
{"$group":{"_id":"$status",
"count":{"$count":{}},
"count":{"$sum":1},
}
},
])
@ -551,7 +545,7 @@ def link_summary(db,hostname):
info["total_good_characters"] = text_size
info["average_good_characters"] = good_document_characters
info["average_fetch_characters"] = fetch_average_characters
domaincol = db["domain"]
domaincol = db["domains"]
domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
res = domaincol.find_one({"host":hostname})
print(res)
@ -634,7 +628,7 @@ def createdb():
htmlcol.create_index("html_md5",unique=True)
domaincol = db["domains"]
domaincol.create_index("host",unique=True)
domaincol.create_index("average_fetch_characters",unique=True)
domaincol.create_index("average_fetch_characters")
batchcol = db["batches"]
batchcol.create_index("host")
batchcol.create_index("created_at")
@ -684,12 +678,12 @@ def classify(start_link):
cl.train(trainset)
cl.test(testset)
def visit(start_link):
def visit(hostname):
myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME]
start_link,hostname = courlan.check_url(start_link)
batch_size = BATCHSIZE
rules = fetch_robot(hostname)
start_link = "https://" + hostname
# renew front links
front_links = fetch_front_links(start_link,rules)
index_links(db,front_links)
@ -711,3 +705,18 @@ def visit(start_link):
index_pages(db,hostname,extracted_pages)
link_summary(db,hostname)
def crawl_summary():
    """Print a short report of recent crawl batches and indexed domains.

    Connects to MongoDB using the module-level ``CONNECTION`` and ``DBNAME``
    settings and prints to stdout:

    * up to 20 documents from the ``batches`` collection whose ``created_at``
      is newer than 24 hours ago, sorted by ``average_fetch_characters``;
    * up to 100 documents from the ``domains`` collection, sorted by
      ``average_fetch_characters``.

    Returns:
        None. All output goes to stdout.
    """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    batchcol = db["batches"]
    # ``created_at`` is stored as a naive UTC timestamp (written with
    # ``utcnow()``), so the cutoff has to be computed in UTC too rather than
    # in local time.
    yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
    print(yesterday)
    # Filter on the cutoff itself. ``yesterday.utcnow()`` would invoke the
    # ``utcnow`` classmethod and yield the *current* time, so the query would
    # only match batches created in the future, i.e. nothing.
    # NOTE(review): the sort direction is the driver default; if the highest
    # ``average_fetch_characters`` should come first, pass an explicit
    # descending direction - confirm the intended order.
    res = batchcol.find(
        {"created_at": {"$gt": yesterday}},
        limit=20,
    ).sort("average_fetch_characters")
    print(">>>> Batches")
    for item in res:
        # NOTE(review): assumes every batch document carries "url" and
        # "average_fetch_characters" keys - verify against the writer.
        print(item["url"], item["average_fetch_characters"])
    domaincol = db["domains"]
    print(">>>> Best domains")
    res = domaincol.find({}, limit=100).sort("average_fetch_characters")
    for item in res:
        print(item)