commit a8f5b149f2
parent a7d048c952

    works
@@ -172,7 +172,6 @@ def fetch_page(link:str)->(str,str):
     html = None
     if response is not None :
         good = True
-        print(response)
         if response.status != 200:
             good = False
             LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
@@ -212,8 +211,16 @@ def extract_page(final_link,html):
     if html is not None:
         doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
         if doc is not None:
-            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
+            lines = doc["text"].split("\n")
+            # filter out tables
+            good_lines = []
+            for line in lines:
+                if line.startswith("|") or line.startswith("1 2 3 4") or line.startswith("12345"):
+                    continue
+                good_lines.append(line)
+            doc["text"] = "\n".join(good_lines)
             # text too small
+            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                 doc = None
     return doc
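
The new lines in extract_page drop anything that looks like a Markdown table row or a pagination run before the minimum-size check, so boilerplate no longer counts toward MIN_TEXT_SIZE. A minimal standalone sketch of the same idea; the helper name strip_table_lines and the sample input are illustrative, not part of the patch:

    # Hypothetical standalone version of the line filter added above.
    def strip_table_lines(text: str) -> str:
        good_lines = []
        for line in text.split("\n"):
            # skip Markdown-style table rows and pagination runs
            if line.startswith("|") or line.startswith("1 2 3 4") or line.startswith("12345"):
                continue
            good_lines.append(line)
        return "\n".join(good_lines)

    print(strip_table_lines("real paragraph\n| col1 | col2 |\n1 2 3 4 5 next"))
    # prints: real paragraph
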
@@ -642,7 +649,8 @@ def parseurl(link):
     print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
     get_bs_links(link,html)
-    doc = trafilatura.bare_extraction(html)
+    doc = extract_page(link,html)
+    if doc is not None:
     import pprint
     pprint.pprint(doc)
     internal_links, external_links = get_bs_links(link,html)
@@ -719,6 +727,21 @@ def visit(hostname,filter_content=True):
 def crawl_summary():
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
+    contentcol = db["content"]
+    res = contentcol.aggregate([
+        {"$group":{"_id":None,"total_text_size":{"$sum":"$text_size"}}}
+    ])
+    print(">>>>> Total text size in content")
+    for item in res:
+        print(item)
+    linkscol = db["links"]
+    # find counts of link statuses
+    res = linkscol.aggregate([
+        {"$group":{"_id":"$status","count":{"$sum":1}}}
+    ])
+    print(">>>>> Link status counts")
+    for item in res:
+        print(item["_id"],item["count"])
     batchcol = db["batches"]
     yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
     print(yesterday)
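
The two pipelines added to crawl_summary() are plain $group aggregations: grouping on "_id": None collapses the content collection into a single bucket whose text_size values are summed, while grouping on "$status" counts links per distinct status. For readers who want to check the numbers by hand, a rough client-side equivalent follows; the connection string and database name are placeholders for the script's CONNECTION and DBNAME:

    # Sketch only: same figures computed client-side, assuming the
    # "content" and "links" collections used in the patch.
    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017")  # CONNECTION in the real script
    db = client["crawler"]                                      # DBNAME in the real script

    # total text size across all content documents
    total = sum(d.get("text_size", 0) for d in db["content"].find({}, ["text_size"]))
    print("total_text_size", total)

    # per-status link counts
    for status in db["links"].distinct("status"):
        print(status, db["links"].count_documents({"status": status}))

The server-side $group versions in the patch avoid pulling every document to the client, which matters at crawl scale.
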
@@ -732,6 +755,7 @@ def crawl_summary():
             }
         },
         {"$sort":{"original_text_size":-1}},
+        {"$limit":100},
     ])
     print(">>>> Batches")
     headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
@@ -739,29 +763,41 @@ def crawl_summary():
     for item in res:
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
-    contentcol = db["content"]
-    res = contentcol.aggregate([
-        {"$group":{"_id":None,total_text_size:{"$sum":"$text_size"}}}
-    ])
-    print(">>>>> Total text size in content")
-    for item in res:
-        print(res)
 
+def extr(hdoc):
+    url = hdoc["url"]
+    html = binascii.a2b_qp(hdoc["quoted_html"])
+    doc = extract_page(url,html)
+    return doc
+
 def import_html():
     myclient= pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    for l in sys.stdin:
+    contentcol = db["content"]
+    buffer = []
+    counter = 0
+    for i,l in enumerate(sys.stdin):
         hdoc = json.loads(l)
         url = hdoc["url"]
-        # beautifusoup is to unify encoding
-        html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
-        doc = extract_page(url,html)
-        if doc is not None:
-            print("------=====-")
-            print(doc)
-            status = index_page(db,url,url,html,doc)
-            print(status)
+        r = contentcol.find_one({"url":url},projection=["_id"])
+        if r is not None:
+            print(">>>>" + str(i) + " copy: " + url)
+            continue
+        buffer.append(hdoc)
+        if len(buffer) < 128:
+            continue
+        from multiprocessing import Pool
+        with Pool(8) as p:
+            outs = p.map(extr,buffer)
+        for hdoc,doc in zip(buffer,outs):
+            if doc is None:
+                print("bad html" + hdoc["url"])
+                continue
+            status = index_page(db,hdoc["url"],hdoc["url"],hdoc["quoted_html"],doc)
+            counter += 1
+            print( ">>> " + str(counter) + " " + hdoc["url"] + " " + status)
+        del buffer[:]
 
 
 def sample_domains():
     myclient = pymongo.MongoClient(CONNECTION)
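
import_html() now skips URLs already present in the content collection, buffers 128 raw documents at a time, and extracts each batch in parallel with multiprocessing.Pool. Pool.map pickles both its arguments and the mapped callable, which is presumably why extr is defined at module level rather than as a nested function. A minimal sketch of the same batch-then-map pattern; process_one, process_stream, and records are illustrative names, not from the patch:

    from multiprocessing import Pool

    def process_one(record):
        # stand-in for extr(): turn one raw record into a parsed document (or None)
        return record.upper() if record else None

    def process_stream(records, batch_size=128, workers=8):
        buffer = []
        for record in records:
            buffer.append(record)
            if len(buffer) < batch_size:
                continue
            with Pool(workers) as pool:
                for record, doc in zip(buffer, pool.map(process_one, buffer)):
                    if doc is None:
                        continue  # mirror the "bad html" branch in the patch
                    print(doc)
            del buffer[:]
        return buffer  # leftover records smaller than one full batch

    if __name__ == "__main__":
        print(process_stream(["a", "b", "c"], batch_size=2, workers=2))

One thing worth noting: as far as the hunk shows, documents still sitting in buffer when stdin ends (fewer than 128) are never flushed; the sketch returns that tail so a caller can decide how to handle it.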