Compare commits
2 Commits: 725b61d9bb ... f5dc1f42cf

SHA1
f5dc1f42cf
ad52af705b
```diff
@@ -2,7 +2,7 @@ FROM python:3.9
 RUN mkdir /app
 COPY requirements.txt /app
 RUN pip install -r /app/requirements.txt
-COPY *.py /app
+COPY *.py /app/
 WORKDIR /app
 # redis config
 ENV REDIS_URL="redis://localhost:6379/"
```
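The only change in this hunk adds a trailing slash to the COPY destination: per the Dockerfile COPY rules, when a wildcard source matches more than one file the destination must be a directory and must end with a slash, so `COPY *.py /app/` copies every matching script into /app instead of failing the build.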
mongo/cli.py (10 changed lines)
```diff
@@ -25,9 +25,13 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("start_link")
-def visit(start_link):
-    mongocrawler.visit(start_link)
+@click.argument("hostname")
+def visit(hostname):
+    mongocrawler.visit(hostname)
 
+@cli.command()
+def summary():
+    mongocrawler.crawl_summary()
+
 if __name__ == "__main__":
     cli()
```
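The new `summary` subcommand simply forwards to `mongocrawler.crawl_summary()`, which is added at the end of this diff. A minimal sketch of exercising it without a shell, using click's test runner; the import path `mongo.cli` is an assumption based on the file listing above:

```python
# Sketch only: assumes mongo/ is an importable package exposing the click group `cli`.
from click.testing import CliRunner
from mongo.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["summary"])  # dispatches to mongocrawler.crawl_summary()
print(result.output)
```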
```diff
@@ -9,7 +9,8 @@ import trafilatura.external
 import sys
 import courlan
 import urllib
-from datetime import datetime
+from datetime import datetime as dat
+import datetime
 import click
 import logging as LOGGER
 import os
```
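These import hunks and the rest of the diff are in the mongocrawler module that mongo/cli.py calls into; its file path is not shown in this capture. Aliasing the class frees the name `datetime` for the module itself, so later hunks can keep using the class (now as `dat`) while the new `crawl_summary()` uses `datetime.timedelta`. A short illustration of the two names, nothing project-specific assumed:

```python
from datetime import datetime as dat
import datetime

now = dat.utcnow()                             # the datetime class, aliased
yesterday = now - datetime.timedelta(days=1)   # the datetime module
print(now, yesterday)
```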
```diff
@@ -107,7 +108,7 @@ def get_link_doc(link:str,status="frontlink")->dict:
     assert r is not None
     link,host = r
     domain = courlan.extract_domain(link)
-    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
+    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
 
 
 def fetch_page(link:str)->(str,str):
@@ -256,7 +257,7 @@ def index_pages(db,hostname,extracted_pages):
         batchdoc = {
             "host": linkdoc["host"],
             "domain": linkdoc["domain"],
-            "created_at": datetime.utcnow(),
+            "created_at": dat.utcnow(),
             "good_document_count":good_document_count,
             "document_count":document_count,
             "text_size":text_size,
@@ -280,8 +281,7 @@ def get_bs_links(link,html):
         base = bs.base["href"]
         base = urllib.parse.urlparse(courlan.normalize_url(base))
 
-    external_links = set()
-    internal_links = set()
+    links = set()
     # Normalizacia linkov
     for l in bs.find_all("a", href=True):
         if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
@@ -305,19 +305,13 @@ def get_bs_links(link,html):
             if path.endswith(")"):
                 # javascript
                 continue
-            external = True
-            if parsed.netloc == base.netloc:
-                external = False
             href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
             href = courlan.normalize_url(href)
-            if external:
-                external_links.add(href)
-            else:
-                internal_links.add(href)
+            links.add(href)
         except ValueError as err:
             print(err)
             pass
-    return internal_links,external_links
+    return links
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
```
```diff
@@ -326,12 +320,12 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
         status = default_status
         if html is None or len(html) < 256:
             continue
-        internal_links, external_links = get_bs_links(final_link,html)
+        page_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
         #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
-        for link in internal_links:
-            if not is_robot_good(link,rules):
+        for link in page_links:
+            if not courlan.is_external(link,final_link) and not is_robot_good(link,rules):
                 badrobot += 1
                 continue
             status = str(default_status)
```
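Together with the get_bs_links change above, link classification moves from the parser to the caller: get_bs_links now returns one set of normalized hrefs, and extract_links keeps only the links that are not external to the fetched page. A small standalone sketch of that filter, assuming courlan's `is_external(url, reference)` exactly as used in the hunk:

```python
import courlan

final_link = "https://example.org/index.html"   # placeholder page URL
page_links = {
    "https://example.org/about",
    "https://other.net/page",
}

# Mirror of the new extract_links condition: drop links pointing off-site.
internal = [l for l in page_links if not courlan.is_external(l, final_link)]
print(internal)   # expected: only the example.org link
```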
```diff
@@ -362,7 +356,7 @@ def index_links(db,extracted_links):
             pass
         else:
             print("updating " + link,status)
-            linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":datetime.utcnow()}})
+            linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":dat.utcnow()}})
 
 def get_link_features(link):
     a, urlpath = courlan.get_host_and_path(link)
@@ -501,7 +495,7 @@ def link_summary(db,hostname):
     res = linkcol.aggregate([
         {"$match":{"host":hostname}},
         {"$group":{"_id":"$status",
-            "count":{"$count":{}},
+            "count":{"$sum":1},
             }
         },
     ])
```
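Both accumulators count the documents in each group; `{"$sum": 1}` is the long-standing form, while the `{"$count": {}}` group accumulator requires MongoDB 5.0 or newer, so the swap keeps the per-status summary working on older servers. A minimal pymongo sketch of the resulting pipeline; the connection string, database, collection, and hostname are placeholders:

```python
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")   # placeholder connection
linkcol = client["crawler"]["links"]                          # placeholder names

pipeline = [
    {"$match": {"host": "example.org"}},                      # placeholder hostname
    {"$group": {"_id": "$status", "count": {"$sum": 1}}},     # documents per status
]
for row in linkcol.aggregate(pipeline):
    print(row["_id"], row["count"])
```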
```diff
@@ -551,7 +545,7 @@ def link_summary(db,hostname):
     info["total_good_characters"] = text_size
     info["average_good_characters"] = good_document_characters
     info["average_fetch_characters"] = fetch_average_characters
-    domaincol = db["domain"]
+    domaincol = db["domains"]
     domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
     res = domaincol.find_one({"host":hostname})
     print(res)
@@ -634,7 +628,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index("average_fetch_characters",unique=True)
+    domaincol.create_index("average_fetch_characters")
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
```
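Dropping `unique=True` on the average_fetch_characters index matters because several domain documents can legitimately share the same average (and documents missing the field all index as null), which a unique index would reject with DuplicateKeyError. A small pymongo illustration with placeholder data:

```python
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")    # placeholder connection
domaincol = client["crawler"]["domains"]                       # names taken from the diff

domaincol.create_index("average_fetch_characters")             # non-unique: duplicates allowed
domaincol.insert_many([
    {"host": "a.example", "average_fetch_characters": 1200},
    {"host": "b.example", "average_fetch_characters": 1200},   # same value is now accepted
])
```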
```diff
@@ -684,12 +678,12 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def visit(start_link):
+def visit(hostname):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    start_link,hostname = courlan.check_url(start_link)
     batch_size = BATCHSIZE
     rules = fetch_robot(hostname)
+    start_link = "https://" + hostname
     # renew front links
     front_links = fetch_front_links(start_link,rules)
     index_links(db,front_links)
```
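visit() now takes a bare hostname and builds the start URL itself, instead of deriving the hostname from a full URL via courlan.check_url, matching the renamed click argument in mongo/cli.py. A sketch of the new call pattern, assuming the module is importable as `mongocrawler` the way the CLI imports it:

```python
import mongocrawler

# Before this change: mongocrawler.visit("https://example.org/")
mongocrawler.visit("example.org")   # the function now prepends "https://" itself
```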
```diff
@@ -711,3 +705,29 @@ def visit(start_link):
     index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
 
+def crawl_summary():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    batchcol = db["batches"]
+    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
+    print(yesterday)
+    res = batchcol.find({"created_at":{"$lt": yesterday.utcnow()}},limit=20).sort("average_fetch_characters")
+    res = batchcol.aggregate([
+        {"$match":{"created_at":{"$lt": yesterday.utcnow()}}},
+        {"$group":{"_id":"$host",
+            "document_count":{"$sum":{"document_count":1}},
+            "good_document_count":{"$sum":{"good_document_count":1}},
+            "batch_size":{"$sum":{"batch_size":1}},
+            "count":{"$sum":1},
+            }
+        },
+    ])
+    print(">>>> Batches")
+    for item in res:
+        print(item)
+        #print(item["host"],item["document_count"],item["good_document_count"],item["created_at"])
+    domaincol = db["domains"]
+    print(">>>> Best domains")
+    res = domaincol.find({},limit=100).sort("average_fetch_characters")
+    for item in res:
+        print(item)
```
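For reference, the conventional pymongo shape of the per-host rollup that crawl_summary() aims at sums field values with a "$field" expression inside $group. The sketch below is illustrative only (placeholder connection and cutoff), not part of the diff:

```python
import datetime
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")    # placeholder connection
batchcol = client["crawler"]["batches"]                        # collection name from the diff

cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=1)
pipeline = [
    {"$match": {"created_at": {"$lt": cutoff}}},
    {"$group": {
        "_id": "$host",
        "document_count": {"$sum": "$document_count"},         # sum a numeric field per host
        "good_document_count": {"$sum": "$good_document_count"},
        "batch_size": {"$sum": "$batch_size"},
        "count": {"$sum": 1},                                  # number of batches per host
    }},
]
for row in batchcol.aggregate(pipeline):
    print(row)
```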