This commit is contained in:
Daniel Hládek 2023-04-13 16:37:32 +02:00
parent 44dc4be8c3
commit 4a42078bef
2 changed files with 15 additions and 4 deletions

View File

@@ -47,11 +47,16 @@ def sampledomains():
@cli.command(help="Enqueue a list of links into redis queue for crawling")
def enqueue():
    # Reads links from standard input, one per line, and schedules
    # mongocrawler.visit for each of them on an RQ queue connected to
    # REDIS_URL. Every link and the object returned by enqueue() are printed.
    # TODO: select queues
    q = rq.Queue(connection=redis.from_url(REDIS_URL))
    for line in sys.stdin:
        link = line.strip()
        if not link:
            # A blank line carries no link; do not schedule an empty visit.
            continue
        print(link)
        r = q.enqueue(mongocrawler.visit, link)
        print(r)
@cli.command(help="Import HTML documents using mongocrawler.import_html")
def importhtml():
    # Thin command-line wrapper; all work is delegated to the crawler module.
    mongocrawler.import_html()
# Start the command-line interface only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()

View File

@@ -250,6 +250,7 @@ def index_page(db,original_link,final_link,html,doc):
origsz = 0
for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
# index paragraph checksums
print(checkcol)
nd = checkcol.find_one({"_id":chs})
if nd is None:
origsz += paragraph_size
@@ -736,15 +737,20 @@ def crawl_summary():
print("\t".join(values))
import binascii
import json
def import_html():
    """Index HTML documents read from standard input.

    Each non-blank input line is parsed as a JSON object whose ``url`` and
    ``quoted_html`` fields are read. The quoted-printable HTML is decoded,
    prettified with BeautifulSoup and passed to extract_page(). When that
    returns a document, the document is printed, handed to index_page()
    (with the URL used as both the original and the final link), and the
    value index_page() returns is printed as well.
    """
    myclient = pymongo.MongoClient(CONNECTION)
    db = myclient[DBNAME]
    for line in sys.stdin:
        if not line.strip():
            # json.loads() raises on an empty string; skip blank lines.
            continue
        hdoc = json.loads(line)
        url = hdoc["url"]
        # a2b_qp decodes quoted-printable data back to the raw HTML bytes;
        # b2a_qp would encode the already-encoded text a second time.
        raw_html = binascii.a2b_qp(hdoc["quoted_html"])
        html = BeautifulSoup(raw_html).prettify()
        doc = extract_page(url, html)
        if doc is not None:
            print(doc)
            status = index_page(db, url, url, html, doc)
            print(status)
def sample_domains():
myclient = pymongo.MongoClient(CONNECTION)