diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 53c830a..4b49508 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -1,23 +1,46 @@
 import pymongo
-
-
 import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
 import trafilatura.spider
 import sys
+from datetime import datetime
 
-def index_page(db,url,content,extracted_page):
-    htmlldb = db["html"]
-    htmldb.insert_one({"url":ulr,"content":content})
-    contentdb = db["content"]
-    contentdb.insert_one(extracted_page)
-    pass
+
+def calculate_checksums(text):
+    """
+    @return fingerprints of paragraphs in the text. Paragraphs are separated by a blank line.
+    """
+    checksums = []
+    sizes = []
+    hval = 0
+    hsz = 0
+    sz = 0
+    for c in text:
+        cv = ord(c)
+        sz += 1
+        if cv > 64:
+            hval += (hval << 3) + cv
+            zv = hval >> 31
+            hval &= 0x7fffffff
+            hval += zv
+            hsz += 1
+        if c == "\n" and hsz > 0:
+            if hsz > 100:
+                checksums.append(hval)
+                sizes.append(sz)
+            sz = 0
+            hsz = 0
+    if hsz > 100:
+        checksums.append(hval)
+        sizes.append(sz)
+    return checksums, sizes
+
 
 def fetch_pages(link_batch):
     docs = []
     for link in link_batch:
-        link_doc = {"url":link,"status": "unvisited"}
-        rr = trafilatura.fetch_url(page,decode=True)
+        link_doc = {"url":link,"status": "html_error"}
+        rr = trafilatura.fetch_url(link,decode=True)
         if rr is not None:
             link_doc["status"] = "html_ok"
@@ -39,14 +62,36 @@ def extract_pages(link_docs):
     return content, extracted_links
 
 def index_pages(pagedb,pages_list):
-    mycol = pagedb["content"]
-    for page in page_list:
+    contentcol = pagedb["content"]
+    for page in pages_list:
         # get paragraph checksums
-        checksums = get_checksums(page["text"])
-        page["checksums"] = checksums
-        x = mycol.insert_many(pages_list)
+        checksums, sizes = calculate_checksums(page["text"])
+        page["paragraph_checksums"] = checksums
+        page["paragraph_sizes"] = sizes
+    x = contentcol.insert_many(pages_list)
     page_hashes = []
-    pass
+
+def process_pages(db,fetched_pages):
+    content_pages, extracted_links = extract_pages(fetched_pages)
+    contentcol = db["content"]
+    contentcol.insert_many(content_pages)
+    linkcol = db["links"]
+    extracted = []
+    for link in extracted_links:
+        extracted.append({"url":link,"status":"backlink"})
+    # should fail if the link already exists
+    linkcol.insert_many(extracted)
+
+    html_pages = []
+    for page in fetched_pages:
+        linkcol.update_one({"url":page["url"]},{"$set":{"status":"visited"}})
+        if "html" in page:
+            html_pages.append({"url":page["url"],"html":page["html"],"update_time":datetime.now()})
+            del page["html"]
+    htmlcol = db["html"]
+    htmlcol.insert_many(html_pages)
+
+
 
 def get_visited_links(domain):
     return []
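
For review, a minimal sketch of how the new `calculate_checksums` might be exercised; the import path and the sample text are assumptions for illustration, not part of the patch:

```python
# Illustrative only: assumes the patched module is importable as mongocwarler,
# and uses made-up sample text.
from mongocwarler import calculate_checksums

text = "short line\n" + ("lorem ipsum dolor sit amet " * 10 + "\n") * 2

# A fingerprint is emitted at a newline once more than 100 characters with
# code point > 64 (roughly, letters) have accumulated, so the short first
# line produces nothing and each long line produces one checksum.
checksums, sizes = calculate_checksums(text)
print(len(checksums), sizes)  # 2 fingerprints, one per long line
```

Note that, despite the docstring, the loop fingerprints at every newline rather than only at blank lines; worth confirming which behavior is intended.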
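
The `# should fail if the link already exists` comment depends on a unique index on `url`; a plain `insert_many` will not reject duplicates on its own. A sketch of the setup this assumes, with a local MongoDB instance and a placeholder database name:

```python
import pymongo
from pymongo.errors import BulkWriteError

client = pymongo.MongoClient("mongodb://localhost:27017/")  # placeholder URI
db = client["crawler"]  # database name is an assumption
linkcol = db["links"]

# Without this unique index, insert_many happily inserts duplicate urls.
linkcol.create_index("url", unique=True)

batch = [
    {"url": "https://example.com/a", "status": "backlink"},
    {"url": "https://example.com/a", "status": "backlink"},  # duplicate
]
try:
    # ordered=False keeps inserting past duplicates instead of aborting
    # at the first duplicate-key error.
    linkcol.insert_many(batch, ordered=False)
except BulkWriteError:
    pass  # duplicate urls were rejected; the rest were written
```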