Compare commits


No commits in common. "f5dc1f42cfbb63afda0121df382ae41c3e1bc81c" and "725b61d9bbaaa33c21e9f071bf2362eb2e0f9b81" have entirely different histories.

3 changed files with 26 additions and 50 deletions

View File

@@ -2,7 +2,7 @@ FROM python:3.9
 RUN mkdir /app
 COPY requirements.txt /app
 RUN pip install -r /app/requirements.txt
-COPY *.py /app/
+COPY *.py /app
 WORKDIR /app
 # redis config
 ENV REDIS_URL="redis://localhost:6379/"

View File

@@ -25,13 +25,9 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("hostname")
-def visit(hostname):
-    mongocrawler.visit(hostname)
-
-@cli.command()
-def summary():
-    mongocrawler.crawl_summary()
+@click.argument("start_link")
+def visit(start_link):
+    mongocrawler.visit(start_link)
 
 if __name__ == "__main__":
     cli()
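The renamed argument changes how the command is invoked: visit now expects a full start URL rather than a bare hostname, and the summary subcommand is removed. A minimal sketch of the resulting click interface, assuming the cli group and the mongocrawler import are as in the rest of the file (the filename cli.py below is only illustrative):

import click
import mongocrawler

@click.group()
def cli():
    pass

@cli.command()
@click.argument("start_link")
def visit(start_link):
    # Crawl starting from a full URL rather than a hostname.
    mongocrawler.visit(start_link)

if __name__ == "__main__":
    cli()

Invocation would then look like "python cli.py visit https://example.com/" instead of "python cli.py visit example.com".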

View File

@@ -9,8 +9,7 @@ import trafilatura.external
 import sys
 import courlan
 import urllib
-from datetime import datetime as dat
-import datetime
+from datetime import datetime
 import click
 import logging as LOGGER
 import os
@@ -108,7 +107,7 @@ def get_link_doc(link:str,status="frontlink")->dict:
     assert r is not None
     link,host = r
     domain = courlan.extract_domain(link)
-    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
+    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
 
 def fetch_page(link:str)->(str,str):
@@ -257,7 +256,7 @@ def index_pages(db,hostname,extracted_pages):
     batchdoc = {
         "host": linkdoc["host"],
         "domain": linkdoc["domain"],
-        "created_at": dat.utcnow(),
+        "created_at": datetime.utcnow(),
         "good_document_count":good_document_count,
         "document_count":document_count,
         "text_size":text_size,
@@ -281,7 +280,8 @@ def get_bs_links(link,html):
         base = bs.base["href"]
         base = urllib.parse.urlparse(courlan.normalize_url(base))
-    links = set()
+    external_links = set()
+    internal_links = set()
     # Normalizacia linkov
     for l in bs.find_all("a", href=True):
         if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
@@ -305,13 +305,19 @@ def get_bs_links(link,html):
                 if path.endswith(")"):
                     # javascript
                     continue
+            external = True
+            if parsed.netloc == base.netloc:
+                external = False
             href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
             href = courlan.normalize_url(href)
-            links.add(href)
+            if external:
+                external_links.add(href)
+            else:
+                internal_links.add(href)
         except ValueError as err:
             print(err)
             pass
-    return links
+    return internal_links,external_links
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
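The behavioral change here is that get_bs_links now separates links by whether their network location matches the base page instead of returning one combined set. A small self-contained sketch of that netloc comparison, using illustrative URLs rather than code from the repository:

import urllib.parse

def split_by_netloc(base_url, hrefs):
    # Mirror the parsed.netloc == base.netloc test used in get_bs_links:
    # same netloc -> internal link, different netloc -> external link.
    base = urllib.parse.urlparse(base_url)
    internal_links, external_links = set(), set()
    for href in hrefs:
        parsed = urllib.parse.urlparse(href)
        if parsed.netloc == base.netloc:
            internal_links.add(href)
        else:
            external_links.add(href)
    return internal_links, external_links

print(split_by_netloc("https://example.com/index.html",
                      ["https://example.com/about", "https://other.example.org/"]))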
@@ -320,12 +326,12 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
         status = default_status
         if html is None or len(html) < 256:
             continue
-        page_links = get_bs_links(final_link,html)
+        internal_links, external_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
         #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
-        for link in page_links:
-            if not courlan.is_external(link,final_link) and not is_robot_good(link,rules):
+        for link in internal_links:
+            if not is_robot_good(link,rules):
                 badrobot += 1
                 continue
             status = str(default_status)
@@ -356,7 +362,7 @@ def index_links(db,extracted_links):
             pass
         else:
             print("updating " + link,status)
-            linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":dat.utcnow()}})
+            linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":datetime.utcnow()}})
 
 def get_link_features(link):
     a, urlpath = courlan.get_host_and_path(link)
@@ -495,7 +501,7 @@ def link_summary(db,hostname):
     res = linkcol.aggregate([
         {"$match":{"host":hostname}},
         {"$group":{"_id":"$status",
-            "count":{"$sum":1},
+            "count":{"$count":{}},
             }
         },
     ])
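Both accumulators produce a per-status document count; {"$count": {}} is the dedicated $group accumulator added in MongoDB 5.0, while {"$sum": 1} also works on older servers. A hedged pymongo sketch of the same per-status count (connection string, database, and collection names are illustrative, not taken from the repository):

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
linkcol = client["crawler"]["links"]

# Count links per status for one host; {"$count": {}} requires MongoDB 5.0+.
res = linkcol.aggregate([
    {"$match": {"host": "example.com"}},
    {"$group": {"_id": "$status", "count": {"$count": {}}}},
])
for row in res:
    print(row["_id"], row["count"])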
@@ -545,7 +551,7 @@ def link_summary(db,hostname):
     info["total_good_characters"] = text_size
     info["average_good_characters"] = good_document_characters
     info["average_fetch_characters"] = fetch_average_characters
-    domaincol = db["domains"]
+    domaincol = db["domain"]
     domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
     res = domaincol.find_one({"host":hostname})
     print(res)
@@ -628,7 +634,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index("average_fetch_characters")
+    domaincol.create_index("average_fetch_characters",unique=True)
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
@@ -678,12 +684,12 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def visit(hostname):
+def visit(start_link):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
+    start_link,hostname = courlan.check_url(start_link)
     batch_size = BATCHSIZE
     rules = fetch_robot(hostname)
-    start_link = "https://" + hostname
     # renew front links
     front_links = fetch_front_links(start_link,rules)
     index_links(db,front_links)
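With the new signature the hostname is derived from the start URL via courlan.check_url, which returns a (cleaned URL, host) tuple or None for URLs it rejects; the new code unpacks the result directly, so a rejected URL would raise a TypeError. A hedged sketch with an illustrative URL:

import courlan

start_link = "https://example.com/"        # illustrative input
result = courlan.check_url(start_link)     # expected: (cleaned URL, host) or None
if result is not None:
    start_link, hostname = result
    print(start_link, hostname)
else:
    print("start link rejected")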
@@ -705,29 +711,3 @@ def visit(hostname):
     index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
-
-def crawl_summary():
-    myclient = pymongo.MongoClient(CONNECTION)
-    db=myclient[DBNAME]
-    batchcol = db["batches"]
-    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
-    print(yesterday)
-    res = batchcol.find({"created_at":{"$lt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters")
-    res = batchcol.aggregate([
-        {"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
-        {"$group":{"_id":"$host",
-            "document_count":{"$sum":{"document_count":1}},
-            "good_document_count":{"$sum":{"good_document_count":1}},
-            "batch_size":{"$sum":{"batch_size":1}},
-            "count":{"$sum":1},
-            }
-        },
-    ])
-    print(">>>> Batches")
-    for item in res:
-        print(item)
-    #print(item["host"],item["document_count"],item["good_document_count"],item["created_at"])
-    domaincol = db["domains"]
-    print(">>>> Best domains")
-    res = domaincol.find({},limit=100).sort("average_fetch_characters")
-    for item in res:
-        print(item)