webcheck commit: r434 - webcheck
[Date Prev][Date Next]
[Thread Prev][Thread Next]
webcheck commit: r434 - webcheck
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r434 - webcheck
- Date: Sun, 11 Sep 2011 17:33:56 +0200 (CEST)
Author: arthur
Date: Sun Sep 11 17:33:55 2011
New Revision: 434
URL: http://arthurdejong.org/viewvc/webcheck?revision=434&view=revision
Log:
show a better estimate of the number of links remaining
Modified:
webcheck/crawler.py
Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Sun Sep 11 17:33:25 2011 (r433)
+++ webcheck/crawler.py Sun Sep 11 17:33:55 2011 (r434)
@@ -232,7 +232,7 @@
def get_links_to_crawl(self, session):
links = session.query(db.Link).filter(db.Link.fetched == None)
- return links.filter(db.Link.yanked == None)[:100]
+ return links.filter(db.Link.yanked == None)
def crawl(self):
"""Crawl the website based on the urls specified with
@@ -259,6 +259,9 @@
self.get_link(session, url)
# add some URLs from the database that haven't been fetched
tocheck = self.get_links_to_crawl(session)
+ remaining = tocheck.count()
+ tocheck = tocheck[:100]
+ remaining -= len(tocheck)
# repeat until we have nothing more to check
while tocheck:
# choose a link from the tocheck list
@@ -268,6 +271,9 @@
# see if there are any more links to check
if not tocheck:
tocheck = self.get_links_to_crawl(session)
+ remaining = tocheck.count()
+ tocheck = tocheck[:100]
+ remaining -= len(tocheck)
# skip link it there is nothing to check
if link.yanked or link.fetched:
continue
@@ -282,7 +288,8 @@
debugio.debug('crawler.crawl(): sleeping %s seconds' %
config.WAIT_BETWEEN_REQUESTS)
time.sleep(config.WAIT_BETWEEN_REQUESTS)
- debugio.debug('crawler.crawl(): items left to check: %d' % len(tocheck))
+ debugio.debug('crawler.crawl(): items left to check: %d' %
+ (remaining + len(tocheck)))
session.commit()
def fetch(self, link):
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits/
- webcheck commit: r434 - webcheck,
Commits of the webcheck project