webcheck commit: r459 - in webcheck: . webcheck

Author: arthur
Date: Fri Nov  4 10:13:40 2011
New Revision: 459
URL: http://arthurdejong.org/viewvc/webcheck?revision=459&view=revision

Log:
implement a MAX_DEPTH configuration option to limit crawling depth, based on
a patch by Devin Bayer

Modified:
   webcheck/AUTHORS
   webcheck/webcheck/config.py
   webcheck/webcheck/crawler.py
   webcheck/webcheck/db.py

Modified: webcheck/AUTHORS
==============================================================================
--- webcheck/AUTHORS    Fri Oct 14 16:10:58 2011        (r458)
+++ webcheck/AUTHORS    Fri Nov  4 10:13:40 2011        (r459)
@@ -25,3 +25,4 @@
 Chris Shenton <Chris.Shenton@nasa.gov>
 Robert M. Jansen <dutch12154@yahoo.com>
 Henning Sielaff <hsielaff@eformation.de>
+Devin Bayer <l@t-0.be>

Modified: webcheck/webcheck/config.py
==============================================================================
--- webcheck/webcheck/config.py Fri Oct 14 16:10:58 2011        (r458)
+++ webcheck/webcheck/config.py Fri Nov  4 10:13:40 2011        (r459)
@@ -50,6 +50,9 @@
 # the -w command line option.
 WAIT_BETWEEN_REQUESTS = 0
 
+# Maximum number of links to follow from the specified base URLs.
+MAX_DEPTH = None
+
 # Redirect depth, the number of redirects to follow. This is the state of the
 # -r command line option.
 REDIRECT_DEPTH = 5
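
For reference, a minimal sketch of enabling the new option, assuming it is
set by editing the configuration module like the neighbouring options (the
value 2 here is illustrative):

    # Follow links at most two hops away from the specified base URLs.
    # None (the default) or 0 leaves crawling unlimited, since the crawler
    # only applies the filter when config.MAX_DEPTH is truthy.
    MAX_DEPTH = 2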

Modified: webcheck/webcheck/crawler.py
==============================================================================
--- webcheck/webcheck/crawler.py        Fri Oct 14 16:10:58 2011        (r458)
+++ webcheck/webcheck/crawler.py        Fri Nov  4 10:13:40 2011        (r459)
@@ -254,6 +254,8 @@
 
     def get_links_to_crawl(self, session):
         links = session.query(Link).filter(Link.fetched == None)
+        if config.MAX_DEPTH:
+            links = links.filter(Link.depth <= config.MAX_DEPTH)
         return links.filter(Link.yanked == None)
 
     def crawl(self):
@@ -406,7 +408,8 @@
             link.depth = 0
         session.commit()
         while count > 0:
-            logger.debug('%d links at depth %d', count, depth)
+            logger.debug('%d links at depth %d%s', count, depth,
+                         ' (max)' if depth == config.MAX_DEPTH else '')
             # update the depth of all links without a depth that have a
             # parent with the previous depth
             qry = session.query(Link).filter(Link.depth == None)
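
The new filter composes with the existing fetched/yanked conditions. Note
that links at exactly MAX_DEPTH are still fetched; only their children (at
MAX_DEPTH + 1) fall outside the query. Below is a self-contained sketch of
the same query pattern against a cut-down stand-in for webcheck's Link model
(the schema here is illustrative, not webcheck's actual one):

    from sqlalchemy import Column, DateTime, Integer, String, create_engine
    from sqlalchemy.orm import declarative_base, sessionmaker

    Base = declarative_base()

    class Link(Base):
        # simplified stand-in for webcheck's Link model
        __tablename__ = 'links'
        id = Column(Integer, primary_key=True)
        url = Column(String)
        fetched = Column(DateTime)  # None until the page was retrieved
        yanked = Column(String)     # None unless the link was excluded
        depth = Column(Integer, default=0)

    MAX_DEPTH = 2  # stands in for config.MAX_DEPTH

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()
    session.add_all([Link(url='http://example.com/', depth=0),
                     Link(url='http://example.com/a', depth=2),
                     Link(url='http://example.com/a/b', depth=3)])

    links = session.query(Link).filter(Link.fetched == None)
    if MAX_DEPTH:
        links = links.filter(Link.depth <= MAX_DEPTH)
    for link in links.filter(Link.yanked == None):
        print(link.url)  # the depth-3 link is not returned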

Modified: webcheck/webcheck/db.py
==============================================================================
--- webcheck/webcheck/db.py     Fri Oct 14 16:10:58 2011        (r458)
+++ webcheck/webcheck/db.py     Fri Nov  4 10:13:40 2011        (r459)
@@ -92,7 +92,7 @@
 
     # crawling information
     redirectdepth = Column(Integer, default=0)
-    depth = Column(Integer)
+    depth = Column(Integer, default=0)
 
     @staticmethod
     def clean_url(url):
@@ -108,8 +108,13 @@
         # try to find the link
         instance = session.query(Link).filter_by(url=url).first()
         if not instance:
-            instance = Link(url=url)
+            if config.MAX_DEPTH and self.depth >= config.MAX_DEPTH:
+                logger.debug('link %s too deep', url)
+            instance = Link(url=url, depth=self.depth + 1)
             session.add(instance)
+        else:
+            # we may have discovered a shorter path
+            instance.depth = min(instance.depth, self.depth + 1)
         # mark that we were looking for an anchor/fragment
         if fragment:
             instance.add_reqanchor(self, fragment)
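
The depth bookkeeping above does two things: a newly discovered link gets
its parent's depth plus one, and a link reached again through a shorter
chain has its depth lowered. Note that, as committed, a link past the limit
is still recorded (the check only logs); it is the crawler's depth filter
above that keeps it from being fetched. A minimal dictionary-based sketch of
the same logic (names and values here are illustrative):

    MAX_DEPTH = 2
    links = {}  # url -> depth

    def follow_link(parent_depth, url):
        # register url as discovered from a page at parent_depth
        if url not in links:
            if MAX_DEPTH and parent_depth >= MAX_DEPTH:
                print('link %s too deep' % url)
            links[url] = parent_depth + 1
        else:
            # we may have discovered a shorter path
            links[url] = min(links[url], parent_depth + 1)

    follow_link(0, 'http://example.com/a')  # recorded at depth 1
    follow_link(2, 'http://example.com/b')  # logged as too deep, depth 3
    follow_link(1, 'http://example.com/b')  # shorter path found, now depth 2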