zz
parent 37a115fb94
commit 8e8d4b9625
@@ -1,6 +1,8 @@
 import click
 import mongocrawler
 import rq
+import redis
+import sys
 import os
 
 REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
@@ -154,21 +154,16 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
         # exceptions happening here
     return rules
 
+def extract_page(final_link,html):
+    doc = None
+    if html is not None:
+        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
+        if doc is not None:
+            if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
+                # text too small
+                doc = None
+    return doc
 
-def extract_pages(link_batch:list,responses:list)->list:
-    out = []
-    for original_link,(final_link,html) in zip(link_batch,responses):
-        doc = None
-        assert original_link is not None
-        if html is not None:
-            doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
-            if doc is not None:
-                if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
-                    # text too small
-                    doc = None
-
-        out.append((original_link,final_link,html,doc))
-    return out
 
 def set_content_checksums(doc):
     text = doc["text"]
@@ -186,85 +181,82 @@ def set_content_checksums(doc):
         sentences += 1
     doc["sentences_count"] = sentences
 
-def index_pages(db,hostname,extracted_pages):
+def index_page(db,original_link,final_link,html,doc):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
     checkcol = db["check"]
-    links = []
-    # stats of the batch
-    good_document_count = 0
-    document_count = 0
-    text_size = 0
-    good_text_size = 0
-    original_text_size = 0
-    for original_link,final_link,html,doc in extracted_pages:
-        document_count += 1
-        state = "good"
-        link = original_link
-        if original_link != final_link:
-            print(original_link,final_link)
-            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
-            link = final_link
-        if html is None:
-            state = "html_error"
-        elif doc is None:
-            state = "content_error"
-        if doc is not None:
-            set_content_checksums(doc)
-            tsz = doc["text_size"]
-            text_size += tsz
-            psz = doc["paragraph_sizes_sum"]
-            if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
-                state = "small"
-            # check copy
-            if state == "good":
-                origsz = 0
-                for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
-                    # index paragraph checksums
-                    nd = checkcol.find_one({"_id":chs})
-                    if nd is None:
-                        origsz += paragraph_size
+    state = "good"
+    link = original_link
+    if original_link != final_link:
+        print(original_link,final_link)
+        linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
+        link = final_link
+    if html is None:
+        state = "html_error"
+    elif doc is None:
+        state = "content_error"
+    if doc is not None:
+        set_content_checksums(doc)
+        tsz = doc["text_size"]
+        psz = doc["paragraph_sizes_sum"]
+        if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+            state = "small"
+        # check copy
+        if state == "good":
+            origsz = 0
+            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
+                # index paragraph checksums
+                nd = checkcol.find_one({"_id":chs})
+                if nd is None:
+                    origsz += paragraph_size
+            doc["original_text_size"] = origsz
 
-                if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
-                    state = "copy"
-                original_text_size += origsz
-                print(origsz)
+            if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
+                state = "copy"
+            print(origsz)
+    if state == "good":
+        htdoc = get_link_doc(link,state)
+        htdoc["html"] = html
+        htdoc["html_size"] = len(html)
+        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
+        # can be revisited - upsert
+        del htdoc["url"]
+        htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
+        doc.update(get_link_doc(link,"good"))
+        # todo extract links
+        print(link,doc)
+        del doc["url"]
+        contentcol.update_one({"url":link},{"$set":doc},upsert=True)
+        for chs in doc["paragraph_checksums"]:
+            try:
+                checkcol.insert_one({"_id":chs})
+            except pymongo.errors.DuplicateKeyError as err:
+                pass
+
+    linkdoc = get_link_doc(link,state)
+    del linkdoc["url"]
+    linkcol.update_one({"url":link},{"$set":linkdoc})
+    return state
+
+def save_batch_info(db,host,states,docs):
+    good_document_count = 0
+    original_text_size = 0
+    batch_size = 0
+    _,domain = courlan.get_hostinfo(host)
+    for state,doc in zip(states,docs):
+        batch_size += 1
         if state == "good":
             good_document_count += 1
-            good_text_size += doc["text_size"]
-            htdoc = get_link_doc(link,state)
-            htdoc["html"] = html
-            htdoc["html_size"] = len(html)
-            htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
-            # can be revisited - upsert
-            del htdoc["url"]
-            htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
-            doc.update(get_link_doc(link,"good"))
-            # todo extract links
-            print(link,doc)
-            del doc["url"]
-            contentcol.update_one({"url":link},{"$set":doc},upsert=True)
-            for chs in doc["paragraph_checksums"]:
-                try:
-                    checkcol.insert_one({"_id":chs})
-                except pymongo.errors.DuplicateKeyError as err:
-                    pass
-
-        linkdoc = get_link_doc(link,state)
-        del linkdoc["url"]
-        linkcol.update_one({"url":link},{"$set":linkdoc})
+            original_text_size += doc["original_text_size"]
     batchdoc = {
-        "host": linkdoc["host"],
-        "domain": linkdoc["domain"],
+        "host": host,
+        "domain": domain,
         "created_at": dat.utcnow(),
         "good_document_count":good_document_count,
-        "document_count":document_count,
-        "text_size":text_size,
-        "good_text_size":good_text_size,
         "original_text_size":original_text_size,
-        "batch_size": BATCHSIZE,
-        "average_fetch_characters": text_size / BATCHSIZE,
+        "good_prob": good_document_count / batch_size,
+        "batch_size": batch_size,
     }
     db["batches"].insert_one(batchdoc)
     print(batchdoc)
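A note on the copy check introduced above: origsz accumulates the sizes of paragraphs whose checksum is not yet stored in the check collection, so 1 - origsz/tsz is the share of the page's text that has already been indexed elsewhere. A minimal sketch of that arithmetic, with made-up sizes and an assumed TEXT_TRASH_RATIO value, purely to illustrate the threshold:

# Illustrative sketch only -- the sizes and the TEXT_TRASH_RATIO value here are assumptions, not taken from the repository.
TEXT_TRASH_RATIO = 0.6
tsz = 1000      # total extracted text size of the page
origsz = 300    # combined size of paragraphs whose checksums were not seen before
duplicate_share = 1 - (origsz / tsz)    # 0.7: most paragraphs were already indexed
state = "copy" if duplicate_share > TEXT_TRASH_RATIO else "good"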
@@ -699,10 +691,23 @@ def visit(hostname):
     responses = []
     for link in links:
         responses.append(fetch_page(link))
-    extracted_pages = extract_pages(links,responses)
+    extracted_pages = []
+    for original_link,(final_link,html) in zip(links,responses):
+        doc = None
+        assert original_link is not None
+        doc = extract_page(final_link,html)
+        extracted_pages.append((original_link,final_link,html,doc))
+
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
     index_links(db,extracted_links)
-    index_pages(db,hostname,extracted_pages)
+    final_states = []
+    docs = []
+    for original_link,final_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,final_link,html,doc)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
+
     link_summary(db,hostname)
 
 def crawl_summary():
@@ -719,17 +724,28 @@ def crawl_summary():
             "batch_count":{"$sum":"$batch_size"},
             "text_size":{"$sum":"$text_size"},
             "original_text_size":{"$sum":"$original_text_size"},
-            "count":{"$sum":1},
             }
         },
+        {"$sort":{"original_text_size":-1}},
     ])
     print(">>>> Batches")
-    headers = ["_id","document_count","good_document_count","count","batch_count","text_size","original_text_size"]
+    headers = ["_id","document_count","good_document_count","batch_count","text_size","original_text_size"]
     print("\t".join(headers))
     for item in res:
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
 
+import binascii
+
+def import_html():
+    myclient = pymongo.MongoClient(CONNECTION)
+    for l in sys.stdin:
+        hdoc = json.loads(l)
+        url = hdoc["url"]
+        html = bs4.BeautifulSoup(binascii.b2a_qp(hdoc["quoted_html"])).prettify()
+        doc = extract_pages(url,html)
+        index_page(db,url,url,html,doc)
+
 def sample_domains():
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
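Taken together, the commit replaces the batch-oriented extract_pages/index_pages pair with a per-page pipeline. A minimal sketch of how the pieces fit together, assuming mongocrawler's fetch_page and get_link_doc helpers behave as elsewhere in the module; the process_batch name is only illustrative:

# Sketch only: process_batch is a hypothetical wrapper around the functions touched by this commit.
def process_batch(db, hostname, links):
    final_states = []
    docs = []
    for original_link in links:
        final_link, html = fetch_page(original_link)    # html is None on fetch errors
        doc = extract_page(final_link, html)            # None when html is missing or the text is too small
        state = index_page(db, original_link, final_link, html, doc)
        final_states.append(state)
        docs.append(doc)
    # per-host batch statistics: good_prob, original_text_size, batch_size, ...
    save_batch_info(db, hostname, final_states, docs)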