zz
parent 1752d5c776
commit 1e3f8dcba6
@@ -1,23 +1,45 @@
 import pymongo
 import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
 import trafilatura.spider
 import sys

-def index_page(db,url,content,extracted_page):
-    htmlldb = db["html"]
-    htmldb.insert_one({"url":ulr,"content":content})
-    contentdb = db["content"]
-    contentdb.insert_one(extracted_page)
-    pass
+def calculate_checksums(self, text):
+    """
+    @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
+    """
+    checksums = []
+    sizes = []
+    hval = 0
+    hsz = 0
+    sz = 0
+    for c in text:
+        cv = ord(c)
+        sz += 1
+        if cv > 64:
+            hval += (hval << 3) + cv
+            zv = hval >> 31
+            hval &= 0x7fffffff
+            hval += zv
+            hsz += 1
+        if c == "\n" and hsz > 0:
+            if hsz > 100:
+                checksums.append(hval)
+                sizes.append(sz)
+            sz = 0
+            hsz = 0
+    if hsz > 100:
+        checksums.append(hval)
+        sizes.append(sz)
+    return checksums, sizes

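A minimal usage sketch of the new fingerprinting helper, assuming it is kept as the module-level function shown above (its unused `self` parameter then just takes a dummy argument); the sample strings are hypothetical:

# Hedged usage sketch: calculate_checksums fingerprints each newline-terminated
# block containing more than 100 characters with ord(c) > 64 (roughly, letters).
sample = (
    "A sufficiently long first paragraph, padded with extra words so that it clearly "
    "exceeds the one hundred letter threshold applied by the hashing loop above.\n"
    "\n"
    "A second, equally long paragraph that should receive its own fingerprint because "
    "paragraphs in the crawled text are expected to be separated by a blank line.\n"
)
checksums, sizes = calculate_checksums(None, sample)  # dummy value for the unused self
print(checksums)  # running 31-bit hash value captured at the end of each long block
print(sizes)      # number of characters in each fingerprinted block
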
 def fetch_pages(link_batch):
     docs = []
     for link in link_batch:
-        link_doc = {"url":link,"status": "unvisited"}
+        link_doc = {"url":link,"status": "html_error"}
         rr = trafilatura.fetch_url(page,decode=True)
         if rr is not None:
             link_doc["status"] = "html_ok"
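As committed, the loop above fetches `page`, which is never defined (the loop variable is `link`), and nothing is ever appended to `docs`. A hedged sketch of the apparent intent, keeping the raw HTML for later extraction (the `decode=True` keyword of `trafilatura.fetch_url` exists only in older trafilatura releases):

import trafilatura

def fetch_pages(link_batch):
    docs = []
    for link in link_batch:
        link_doc = {"url": link, "status": "html_error"}
        # fetch the current link; decode=True asks older trafilatura versions for str output
        rr = trafilatura.fetch_url(link, decode=True)
        if rr is not None:
            link_doc["status"] = "html_ok"
            link_doc["html"] = rr  # assumption: downstream steps expect the HTML here
        docs.append(link_doc)
    return docs
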
@@ -39,14 +61,36 @@ def extract_pages(link_docs):
     return content, extracted_links

 def index_pages(pagedb,pages_list):
-    mycol = pagedb["content"]
+    contentcol = pagedb["content"]
     for page in page_list:
         # get paragraph checksums
-        checksums = get_checksums(page["text"])
-        page["checksums"] = checksums
-        x = mycol.insert_many(pages_list)
+        checksums,sizes = get_checksums(page["text"])
+        page["paragraph_checksums"] = checksums
+        page["paragraph_sizes"] = sizes
+    x = contentcol.insert_many(pages_list)
     page_hashes = []
-    pass
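Two inconsistencies remain in the new body: it calls `get_checksums` although the helper added above is named `calculate_checksums`, and it iterates `page_list` while the parameter is `pages_list`. A hedged sketch of a consistent version, assuming the whole annotated batch is meant to be inserted once after the loop:

def index_pages(pagedb, pages_list):
    contentcol = pagedb["content"]
    for page in pages_list:
        # annotate each page with paragraph fingerprints for later deduplication
        checksums, sizes = calculate_checksums(None, page["text"])  # dummy self, see above
        page["paragraph_checksums"] = checksums
        page["paragraph_sizes"] = sizes
    # insert the annotated batch in a single call
    contentcol.insert_many(pages_list)
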
+
+def process_pages(db,fetched_pages):
+    content_pages, extracted_links = extract_pages(fetched_pages)
+    contentcol = db["content"]
+    contentcol.insertMany(content_pages)
+    linkcol = db["links"]
+    extracted = []
+    for link in extracted_links:
+        extracted.append({"url":link,"status":"backlink"})
+    # shuld fail if link already exists
+    linkcol.insertMany(extracted)
+
+    html_pages = []
+    for page in fetched_pages:
+        linkcol.updateOne({"url":page["url"]},{"$set":{"status":"visited"}})
+        if "html" in page:
+            html_pages.append({"url":page["url"],"html":page["html"],"update_time":datetime.now()})
+            del page["html"]
+    htmlcol = db["html"]
+    htmlcol.insertMany(html_pages)
+
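PyMongo exposes snake_case collection methods (`insert_many`, `update_one`) rather than the JavaScript-style `insertMany`/`updateOne` used above, and `datetime.now()` needs an import. A hedged sketch with those fixes; rejecting already-known links assumes a unique index on `url` in the `links` collection:

from datetime import datetime

def process_pages(db, fetched_pages):
    content_pages, extracted_links = extract_pages(fetched_pages)
    contentcol = db["content"]
    contentcol.insert_many(content_pages)

    linkcol = db["links"]
    extracted = [{"url": link, "status": "backlink"} for link in extracted_links]
    if extracted:
        # should fail for links that already exist (assumes a unique index on "url")
        linkcol.insert_many(extracted)

    html_pages = []
    for page in fetched_pages:
        # mark the fetched link as visited
        linkcol.update_one({"url": page["url"]}, {"$set": {"status": "visited"}})
        if "html" in page:
            html_pages.append({"url": page["url"], "html": page["html"],
                               "update_time": datetime.now()})
            del page["html"]
    if html_pages:
        db["html"].insert_many(html_pages)
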
 def get_visited_links(domain):
     return []
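The stub above is left returning an empty list. One hypothetical way to fill it in, assuming it is given a database handle like the other helpers (the extra `db` parameter and the substring match on the domain are both assumptions, not part of this commit):

def get_visited_links(db, domain):
    # hypothetical query: URLs in the links collection already marked as visited
    linkcol = db["links"]
    return [doc["url"] for doc in linkcol.find({"status": "visited"})
            if domain in doc["url"]]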