zz
commit 1e3f8dcba6 (parent 1752d5c776)
@@ -1,23 +1,45 @@
import pymongo
import trafilatura
import trafilatura.feeds
import trafilatura.sitemaps
import trafilatura.spider
import sys
from datetime import datetime

def index_page(db, url, content, extracted_page):
    # store the raw HTML and the extracted content in separate collections
    htmldb = db["html"]
    htmldb.insert_one({"url": url, "content": content})
    contentdb = db["content"]
    contentdb.insert_one(extracted_page)

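# Illustrative sketch (not part of this commit): how index_page might be called against a
# local MongoDB instance. The connection URI, database name and sample values are assumptions.
def index_example_page():
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client["crawler"]
    index_page(db, "https://example.com/", "<html>...</html>",
               {"url": "https://example.com/", "text": "example text"})
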
def calculate_checksums(text):
    """
    Return a fingerprint (checksum) and a size for every paragraph in text.
    Paragraphs are separated by newlines; only paragraphs with more than 100
    hashed characters get a fingerprint.
    """
    checksums = []
    sizes = []
    hval = 0  # rolling hash of the current paragraph
    hsz = 0   # number of characters folded into the hash
    sz = 0    # total characters in the current paragraph
    for c in text:
        cv = ord(c)
        sz += 1
        if cv > 64:
            # only characters with code point above 64 contribute to the 31-bit rolling hash
            hval += (hval << 3) + cv
            zv = hval >> 31
            hval &= 0x7fffffff
            hval += zv
            hsz += 1
        if c == "\n" and hsz > 0:
            if hsz > 100:
                checksums.append(hval)
                sizes.append(sz)
            # reset the per-paragraph state
            hval = 0
            sz = 0
            hsz = 0
    if hsz > 100:
        checksums.append(hval)
        sizes.append(sz)
    return checksums, sizes

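# Illustrative sketch (not part of this commit): calculate_checksums returns two parallel
# lists, one fingerprint and one size per sufficiently long paragraph. The sample text is an
# assumption; paragraphs with 100 or fewer hashed characters are skipped.
def checksum_example():
    text = ("A" * 150) + "\n" + "short paragraph\n" + ("B" * 200) + "\n"
    checksums, sizes = calculate_checksums(text)
    # two long paragraphs -> len(checksums) == len(sizes) == 2; the short one is skipped
    return checksums, sizes
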
def fetch_pages(link_batch):
    docs = []
    for link in link_batch:
        # previously initialised to "unvisited"; the default is now "html_error"
        # until the fetch succeeds
        link_doc = {"url": link, "status": "html_error"}
        rr = trafilatura.fetch_url(link, decode=True)
        if rr is not None:
            link_doc["status"] = "html_ok"
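# Illustrative sketch (not part of this commit): fetching a single url and extracting its main
# text with trafilatura, mirroring the fetch_url(decode=True) call above. trafilatura.extract
# returns the extracted text (or None); the helper itself is an assumption.
def fetch_and_extract(url):
    html = trafilatura.fetch_url(url, decode=True)
    if html is None:
        return None
    return trafilatura.extract(html)
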
@@ -39,14 +61,36 @@ def extract_pages(link_docs):
    return content, extracted_links

def index_pages(pagedb, pages_list):
    contentcol = pagedb["content"]
    for page in pages_list:
        # fingerprint each paragraph of the extracted text
        # (previously a single "checksums" field; now paragraph_checksums plus paragraph_sizes)
        checksums, sizes = calculate_checksums(page["text"])
        page["paragraph_checksums"] = checksums
        page["paragraph_sizes"] = sizes
    contentcol.insert_many(pages_list)

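# Illustrative sketch (not part of this commit): once paragraph_checksums are stored, repeated
# paragraphs can be found by counting how many documents share a checksum. Collection and field
# names follow index_pages above; the helper itself is an assumption.
def count_duplicate_paragraphs(pagedb, checksum):
    contentcol = pagedb["content"]
    # matching a scalar against the array field counts documents containing that checksum
    return contentcol.count_documents({"paragraph_checksums": checksum})
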
def process_pages(db, fetched_pages):
    content_pages, extracted_links = extract_pages(fetched_pages)
    contentcol = db["content"]
    contentcol.insert_many(content_pages)
    linkcol = db["links"]
    extracted = []
    for link in extracted_links:
        extracted.append({"url": link, "status": "backlink"})
    # should fail if the link already exists (see the sketch below for one way to enforce this)
    linkcol.insert_many(extracted)

    html_pages = []
    for page in fetched_pages:
        linkcol.update_one({"url": page["url"]}, {"$set": {"status": "visited"}})
        if "html" in page:
            html_pages.append({"url": page["url"], "html": page["html"], "update_time": datetime.now()})
            del page["html"]
    htmlcol = db["html"]
    htmlcol.insert_many(html_pages)

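# Illustrative sketch (not part of this commit): the "should fail if the link already exists"
# requirement can be enforced with a unique index on links.url; with ordered=False the batch
# insert then skips duplicates instead of aborting on the first one. Creating the index and
# this helper are assumptions, not something the commit does.
from pymongo.errors import BulkWriteError

def insert_links_skip_duplicates(db, link_docs):
    linkcol = db["links"]
    linkcol.create_index("url", unique=True)
    try:
        linkcol.insert_many(link_docs, ordered=False)
    except BulkWriteError:
        # documents with an already-indexed url were rejected, the rest were inserted
        pass
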
def get_visited_links(domain):
    # stub: visited links are not looked up yet
    return []

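# Illustrative sketch (not part of this commit): one way get_visited_links could be filled in,
# assuming links are stored by process_pages above and that a url can be matched by its domain.
# The database handle, the regex match and the helper name are assumptions.
import re

def get_visited_links_for_domain(db, domain):
    linkcol = db["links"]
    pattern = "^https?://" + re.escape(domain)
    return [doc["url"] for doc in linkcol.find({"status": "visited", "url": {"$regex": pattern}})]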