Commit a8f5b149f2 (parent a7d048c952) — commit message: "works".
@ -172,7 +172,6 @@ def fetch_page(link:str)->(str,str):
|
||||
html = None
|
||||
if response is not None :
|
||||
good = True
|
||||
print(response)
|
||||
if response.status != 200:
|
||||
good = False
|
||||
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
|
||||
def extract_page(final_link, html):
    """Extract main text and metadata from *html* via trafilatura.

    NOTE(review): reconstructed from a diff fragment; the ``def`` line was
    only visible in the hunk header — confirm the signature and any code
    between it and the ``if html`` guard against the full file.

    Returns the trafilatura document dict, or None when *html* is None,
    extraction fails, or the cleaned text is shorter than MIN_TEXT_SIZE.
    """
    # Initialize so the final `return doc` is defined even when html is None
    # (in the fragment, doc was only assigned inside the guard).
    doc = None
    if html is not None:
        doc = trafilatura.bare_extraction(
            html,
            url=final_link,
            with_metadata=True,
            include_formatting=False,
            target_language=LANGUAGE,
            favor_precision=True,
        )
        if doc is not None:
            # If text is present but too small, it may be dominated by
            # table/pagination junk: strip those lines and re-measure.
            # (Original tested `not "text" in doc or ...` and then indexed
            # doc["text"], raising KeyError when the key was absent.)
            if "text" in doc and len(doc["text"]) < MIN_TEXT_SIZE:
                # filter out tables and pagination rows
                good_lines = [
                    line
                    for line in doc["text"].split("\n")
                    if not (line.startswith("|")
                            or line.startswith("1 2 3 4")
                            or line.startswith("12345"))
                ]
                doc["text"] = "\n".join(good_lines)
            # text missing or still too small -> reject the page
            if "text" not in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                doc = None
    return doc
|
||||
|
||||
@ -642,7 +649,8 @@ def parseurl(link):
|
||||
print(rules.crawl_delay("*"))
|
||||
html = trafilatura.fetch_url(link,decode=True)
|
||||
get_bs_links(link,html)
|
||||
doc = trafilatura.bare_extraction(html)
|
||||
doc = extract_page(link,html)
|
||||
if doc is not None:
|
||||
import pprint
|
||||
pprint.pprint(doc)
|
||||
internal_links, external_links = get_bs_links(link,html)
|
||||
@ -719,6 +727,21 @@ def visit(hostname,filter_content=True):
|
||||
def crawl_summary():
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
contentcol = db["content"]
|
||||
res = contentcol.aggregate([
|
||||
{"$group":{"_id":None,"total_text_size":{"$sum":"$text_size"}}}
|
||||
])
|
||||
print(">>>>> Total text size in content")
|
||||
for item in res:
|
||||
print(item)
|
||||
linkscol = db["links"]
|
||||
# find counts of link statuses
|
||||
res = linkscol.aggregate([
|
||||
{"$group":{"_id":"$status","count":{"$sum":1}}}
|
||||
])
|
||||
print(">>>>> Link status counts")
|
||||
for item in res:
|
||||
print(item["_id"],item["count"])
|
||||
batchcol = db["batches"]
|
||||
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
|
||||
print(yesterday)
|
||||
@ -732,6 +755,7 @@ def crawl_summary():
|
||||
}
|
||||
},
|
||||
{"$sort":{"original_text_size":-1}},
|
||||
{"$limit":100},
|
||||
])
|
||||
print(">>>> Batches")
|
||||
headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
|
||||
@ -739,29 +763,41 @@ def crawl_summary():
|
||||
for item in res:
|
||||
values = [str(item[x]) for x in headers]
|
||||
print("\t".join(values))
|
||||
contentcol = db["content"]
|
||||
res = contentcol.aggregate([
|
||||
{"$group":{"_id":None,total_text_size:{"$sum":"$text_size"}}}
|
||||
])
|
||||
print(">>>>> Total text size in content")
|
||||
for item in res:
|
||||
print(res)
|
||||
|
||||
def extr(hdoc):
    """Decode and extract one stored HTML document.

    Parameters
    ----------
    hdoc : dict with "url" and "quoted_html" keys; "quoted_html" holds the
        quoted-printable-encoded bytes of the raw page.

    Returns whatever extract_page returns: a document dict, or None when
    extraction fails. Used as the worker function for the multiprocessing
    pool in import_html.
    """
    url = hdoc["url"]
    # a2b_qp reverses the quoted-printable encoding applied at store time
    html = binascii.a2b_qp(hdoc["quoted_html"])
    doc = extract_page(url, html)
    return doc
|
||||
|
||||
def import_html():
    """Read JSON documents (one per line) from stdin and index them.

    Each line must decode to a dict with "url" and "quoted_html" keys.
    URLs already present in the "content" collection are skipped; new
    documents are accumulated in batches of 128, extracted in parallel
    with a process pool, and indexed via index_page.

    NOTE(review): reconstructed from a diff fragment whose +/- markers were
    stripped. An older single-document loop (`for l in sys.stdin:` with a
    per-line BeautifulSoup + extract_page + index_page body) appeared
    interleaved with this batched version and is assumed to be the removed
    side of the diff — confirm against the full file.
    """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    contentcol = db["content"]
    buffer = []
    counter = 0

    def _process_batch():
        # Extract the buffered documents in parallel, index the good ones,
        # then reset the buffer for the next batch.
        nonlocal counter
        from multiprocessing import Pool
        with Pool(8) as p:
            outs = p.map(extr, buffer)
        for hdoc, doc in zip(buffer, outs):
            if doc is None:
                print("bad html" + hdoc["url"])
                continue
            status = index_page(db, hdoc["url"], hdoc["url"], hdoc["quoted_html"], doc)
            counter += 1
            print(">>> " + str(counter) + " " + hdoc["url"] + " " + status)
        del buffer[:]

    for i, l in enumerate(sys.stdin):
        hdoc = json.loads(l)
        url = hdoc["url"]
        # skip documents that were already indexed
        r = contentcol.find_one({"url": url}, projection=["_id"])
        if r is not None:
            print(">>>>" + str(i) + " copy: " + url)
            continue
        buffer.append(hdoc)
        if len(buffer) < 128:
            continue
        _process_batch()
    # Fix: the original dropped a trailing partial batch (< 128 docs at
    # EOF) — flush whatever is left.
    if buffer:
        _process_batch()
|
||||
|
||||
|
||||
def sample_domains():
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
|
(End of visible diff; remaining lines were web-UI footer controls, not file content.)