Compare commits


2 Commits

SHA1        Message   Date
f5dc1f42cf  zz        2023-04-12 14:50:18 +02:00
ad52af705b  zz        2023-04-12 14:35:35 +02:00
3 changed files with 50 additions and 26 deletions

File 1 of 3: Dockerfile

@@ -2,7 +2,7 @@ FROM python:3.9
 RUN mkdir /app
 COPY requirements.txt /app
 RUN pip install -r /app/requirements.txt
-COPY *.py /app
+COPY *.py /app/
 WORKDIR /app
 # redis config
 ENV REDIS_URL="redis://localhost:6379/"

File 2 of 3: CLI entry point (click commands)

@@ -25,9 +25,13 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("start_link")
-def visit(start_link):
-    mongocrawler.visit(start_link)
+@click.argument("hostname")
+def visit(hostname):
+    mongocrawler.visit(hostname)
+
+@cli.command()
+def summary():
+    mongocrawler.crawl_summary()
 
 if __name__ == "__main__":
     cli()
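
The visit command now takes a bare hostname, and the new summary command exposes the crawl roll-up added at the bottom of this diff. A minimal sketch of exercising both through click's test runner follows; the module name "cli" is an assumption, while the click group object "cli" is the one defined in the file above.

    # sketch: drive the renamed and new commands with click's test runner
    from click.testing import CliRunner
    from cli import cli  # assumed module name

    runner = CliRunner()

    # "visit" now expects a bare hostname instead of a full start link
    result = runner.invoke(cli, ["visit", "www.example.com"])
    print(result.exit_code, result.output)

    # "summary" takes no arguments and prints crawl statistics
    result = runner.invoke(cli, ["summary"])
    print(result.exit_code, result.output)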

File 3 of 3: crawler module (mongocrawler)

@@ -9,7 +9,8 @@ import trafilatura.external
 import sys
 import courlan
 import urllib
-from datetime import datetime
+from datetime import datetime as dat
+import datetime
 import click
 import logging as LOGGER
 import os
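
The datetime class is now aliased to dat so that the datetime module itself can also be imported: the class supplies utcnow() for the stored timestamps, while the module supplies timedelta for the date cutoff in the new crawl_summary. A minimal illustration of the two names side by side:

    from datetime import datetime as dat   # the class, used for timestamps
    import datetime                         # the module, used for timedelta

    now = dat.utcnow()                              # naive UTC timestamp, as in get_link_doc
    yesterday = now - datetime.timedelta(days=1)    # timedelta lives in the module
    print(now, yesterday)
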
@@ -107,7 +108,7 @@ def get_link_doc(link:str,status="frontlink")->dict:
     assert r is not None
     link,host = r
     domain = courlan.extract_domain(link)
-    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
+    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
 
 
 def fetch_page(link:str)->(str,str):
@@ -256,7 +257,7 @@ def index_pages(db,hostname,extracted_pages):
         batchdoc = {
             "host": linkdoc["host"],
             "domain": linkdoc["domain"],
-            "created_at": datetime.utcnow(),
+            "created_at": dat.utcnow(),
             "good_document_count":good_document_count,
             "document_count":document_count,
             "text_size":text_size,
@@ -280,8 +281,7 @@ def get_bs_links(link,html):
         base = bs.base["href"]
     base = urllib.parse.urlparse(courlan.normalize_url(base))
-    external_links = set()
-    internal_links = set()
+    links = set()
 
     # Normalizacia linkov
     for l in bs.find_all("a", href=True):
         if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
@@ -305,19 +305,13 @@ def get_bs_links(link,html):
             if path.endswith(")"):
                 # javascript
                 continue
-            external = True
-            if parsed.netloc == base.netloc:
-                external = False
             href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
             href = courlan.normalize_url(href)
-            if external:
-                external_links.add(href)
-            else:
-                internal_links.add(href)
+            links.add(href)
         except ValueError as err:
             print(err)
             pass
-    return internal_links,external_links
+    return links
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
@@ -326,12 +320,12 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
         status = default_status
         if html is None or len(html) < 256:
             continue
-        internal_links, external_links = get_bs_links(final_link,html)
+        page_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
         #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
-        for link in internal_links:
-            if not is_robot_good(link,rules):
+        for link in page_links:
+            if not courlan.is_external(link,final_link) and not is_robot_good(link,rules):
                 badrobot += 1
                 continue
             status = str(default_status)
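
get_bs_links now returns one flat set, and the internal/external decision moves to the caller: a link is kept only if it is not external to the fetched page (courlan.is_external) and is allowed by robots.txt (is_robot_good). A minimal sketch of splitting such a set back into the two groups, in case a caller still needs both; it assumes only courlan, used the same way as in the diff:

    import courlan

    def split_links(page_links, reference_link):
        # is_external(url, reference) is True when url points off the reference's site
        internal = {l for l in page_links if not courlan.is_external(l, reference_link)}
        external = set(page_links) - internal
        return internal, external

    # example: split_links({"https://example.com/a", "https://other.org/b"}, "https://example.com/")
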
@@ -362,7 +356,7 @@ def index_links(db,extracted_links):
             pass
         else:
             print("updating " + link,status)
-            linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":datetime.utcnow()}})
+            linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":dat.utcnow()}})
 
 def get_link_features(link):
     a, urlpath = courlan.get_host_and_path(link)
@@ -501,7 +495,7 @@ def link_summary(db,hostname):
     res = linkcol.aggregate([
         {"$match":{"host":hostname}},
         {"$group":{"_id":"$status",
-            "count":{"$count":{}},
+            "count":{"$sum":1},
             }
         },
     ])
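
Inside a $group stage, {"$sum": 1} is the portable way to count the documents in each group; the {"$count": {}} accumulator it replaces is only accepted inside $group on MongoDB 5.0 and newer. A minimal sketch of the same per-status count; the hostname is a placeholder and linkcol is the link collection handle used above:

    # sketch: per-status link counts for one host
    pipeline = [
        {"$match": {"host": "www.example.com"}},   # placeholder hostname
        {"$group": {
            "_id": "$status",
            "count": {"$sum": 1},                  # documents per status
        }},
    ]
    # for row in linkcol.aggregate(pipeline):
    #     print(row["_id"], row["count"])
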
@@ -551,7 +545,7 @@ def link_summary(db,hostname):
     info["total_good_characters"] = text_size
     info["average_good_characters"] = good_document_characters
     info["average_fetch_characters"] = fetch_average_characters
-    domaincol = db["domain"]
+    domaincol = db["domains"]
     domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
     res = domaincol.find_one({"host":hostname})
     print(res)
@@ -634,7 +628,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index("average_fetch_characters",unique=True)
+    domaincol.create_index("average_fetch_characters")
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
@@ -684,12 +678,12 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def visit(start_link):
+def visit(hostname):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    start_link,hostname = courlan.check_url(start_link)
     batch_size = BATCHSIZE
     rules = fetch_robot(hostname)
+    start_link = "https://" + hostname
     # renew front links
     front_links = fetch_front_links(start_link,rules)
     index_links(db,front_links)
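
visit() now receives the hostname directly and derives the start link as "https://" + hostname, instead of extracting it from a full URL with courlan.check_url. A minimal sketch of calling the new entry point; the module is assumed importable as mongocrawler and the hostname is a placeholder:

    import mongocrawler   # assumed import name, matching the CLI file above

    hostname = "www.example.com"   # placeholder host
    mongocrawler.visit(hostname)   # start_link = "https://" + hostname is built inside
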
@@ -711,3 +705,29 @@ def visit(start_link):
     index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
 
+def crawl_summary():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    batchcol = db["batches"]
+    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
+    print(yesterday)
+    res = batchcol.find({"created_at":{"$lt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters")
+    res = batchcol.aggregate([
+        {"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
+        {"$group":{"_id":"$host",
+            "document_count":{"$sum":{"document_count":1}},
+            "good_document_count":{"$sum":{"good_document_count":1}},
+            "batch_size":{"$sum":{"batch_size":1}},
+            "count":{"$sum":1},
+            }
+        },
+    ])
+    print(">>>> Batches")
+    for item in res:
+        print(item)
+        #print(item["host"],item["document_count"],item["good_document_count"],item["created_at"])
+    domaincol = db["domains"]
+    print(">>>> Best domains")
+    res = domaincol.find({},limit=100).sort("average_fetch_characters")
+    for item in res:
+        print(item)
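
The new crawl_summary rolls up batches per host and then lists the best domains by average_fetch_characters. Two details are worth noting when reading it: yesterday.utcnow() calls the datetime.utcnow classmethod through the instance, so it evaluates to the current UTC time rather than anything derived from yesterday; and {"$sum": {"document_count": 1}} passes a document to $sum, which ignores non-numeric values, whereas summing a stored field is normally written {"$sum": "$document_count"}. A minimal, self-contained sketch under the assumption that the intent is to sum the stored counters for batches older than one day; the connection string and database name are placeholders:

    # sketch: per-host roll-up of batches created more than a day ago
    import datetime
    import pymongo

    CONNECTION = "mongodb://localhost:27017"   # placeholder, the real value lives in the module
    DBNAME = "crawler"                         # placeholder database name

    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    batchcol = db["batches"]

    cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=1)
    pipeline = [
        {"$match": {"created_at": {"$lt": cutoff}}},
        {"$group": {
            "_id": "$host",
            "document_count": {"$sum": "$document_count"},            # sum the stored field
            "good_document_count": {"$sum": "$good_document_count"},
            "batch_size": {"$sum": "$batch_size"},
            "count": {"$sum": 1},                                     # number of batches per host
        }},
    ]
    for item in batchcol.aggregate(pipeline):
        print(item)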