webcheck branch master updated. 1.10.4-64-g24e191f

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "webcheck".

The branch, master has been updated
       via  24e191f42e45b408d1b34210dcedb710d201a669 (commit)
       via  a24b222e1290c68c905ae629a95d92a2aea305d4 (commit)
       via  3eba4a4fc19a94a8b8cca9e57595bf5f1d4b0740 (commit)
       via  07172e0cd582b89437f94fde3307a0e8e81b6ee9 (commit)
      from  ca8e3e45cbb498bd628ceefce18e55949738402c (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://arthurdejong.org/git/webcheck/commit/?id=24e191f42e45b408d1b34210dcedb710d201a669

commit 24e191f42e45b408d1b34210dcedb710d201a669
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Fri Sep 20 17:02:28 2013 +0200

    Initialise crawler with a configuration
    
    This changes the constructor to accept the crawler configuration as a
    dict. For now this is combined with the configuration in the config
    module, but the goal is to replace that module completely.
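
For illustration, a minimal sketch of how a crawler can now be set up, based
on the Crawler constructor and default_cfg shown in the diff below; the site
URL and output directory are made-up example values:

    from webcheck.crawler import Crawler

    # the constructor completes this dict with default_cfg internally,
    # so only the settings we care about need to be passed in
    cfg = {
        'base_urls': ['http://www.example.org/'],  # made-up site
        'output_dir': 'webcheck-report',           # made-up directory
        'wait': 0.5,
    }

    crawler = Crawler(cfg)
    crawler.crawl()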

diff --git a/run.py b/run.py
index e608ec0..eee0801 100755
--- a/run.py
+++ b/run.py
@@ -4,7 +4,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011, 2013 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -25,25 +25,17 @@
 
 """Alternative entry_point for development."""
 
-import sys, os, logging
+import os
+import sys
 
-from webcheck.crawler import Crawler
-from webcheck.cmd import parse_args, main
-from webcheck import config
+from webcheck.cmd import parser, main
 
-# Whether to produce profiling information. This is for development
-# purposes and as such undocumented.
-# http://docs.python.org/lib/profile.html
-PROFILE = False
 
 if __name__ == '__main__':
     try:
-        # initialize crawler object
-        crawler = Crawler()
-        # parse command-line arguments
-        parse_args(crawler)
-        if PROFILE or '--profile' in sys.argv:
-            fname = os.path.join(config.OUTPUT_DIR, 'webcheck.prof')
+        args = parser.parse_args()
+        if args.profile:
+            fname = os.path.join(args.output_dir, 'webcheck.prof')
             try:
                 import cProfile
             except ImportError:
@@ -53,12 +45,12 @@ if __name__ == '__main__':
                 sqltap.start()
             except ImportError:
                 pass
-            cProfile.run('main(crawler)', fname)
+            cProfile.run('main(vars(args))', fname)
             if 'sqltap' in locals():
                 statistics = sqltap.collect()
-                sqltap.report(statistics, os.path.join(config.OUTPUT_DIR, 'sqltap.html'))
+                sqltap.report(statistics, os.path.join(args.output_dir, 'sqltap.html'))
         else:
-            main(crawler)
+            main(vars(args))
     except KeyboardInterrupt:
         sys.stderr.write('Interrupted\n')
         sys.exit(1)
diff --git a/webcheck/cmd.py b/webcheck/cmd.py
index fd1a0f3..7020a20 100755
--- a/webcheck/cmd.py
+++ b/webcheck/cmd.py
@@ -27,19 +27,12 @@
 
 import argparse
 import logging
-import os
-import urllib
-import urlparse
 
 import webcheck
 import webcheck.monkeypatch
 from webcheck.crawler import Crawler, default_cfg
 
 
-# The loglevel to use for the logger that is configured.
-LOGLEVEL = logging.INFO
-
-
 version_string = '''
 webcheck %s
 Written by Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.
@@ -122,40 +115,18 @@ parser.add_argument(
 parser.set_defaults(**default_cfg)
 
 
-def parse_args(crawler):
-    """Parse command-line arguments."""
-    # these global options are set here
-    global LOGLEVEL
-    args = parser.parse_args()
-    for pattern in args.internal:
-        crawler.add_internal_re(pattern)
-    for pattern in args.external:
-        crawler.add_external_re(pattern)
-    for pattern in args.yank:
-        crawler.add_yanked_re(pattern)
-    config.BASE_URLS_ONLY = args.base_only
-    config.AVOID_EXTERNAL_LINKS = args.avoid_external
-    config.USE_ROBOTS = not(args.ignore_robots)
-    if args.quiet:
-        LOGLEVEL = logging.WARNING
-    elif args.debug:
-        LOGLEVEL = logging.DEBUG
-    config.OUTPUT_DIR = args.output
-    config.CONTINUE = getattr(args, 'continue')
-    config.OVERWRITE_FILES = args.force
-    config.REDIRECT_DEPTH = args.redirects
-    config.MAX_DEPTH = args.max_depth
-    config.WAIT_BETWEEN_REQUESTS = args.wait
-    for arg in args.urls:
-        # if it does not look like a url it is probably a local file
-        if urlparse.urlsplit(arg)[0] == '':
-            arg = 'file://' + urllib.pathname2url(os.path.abspath(arg))
-        crawler.add_base(arg)
-
-
-def main(crawler):
+def main(cfg):
     """Main program."""
-    logging.basicConfig(format='webcheck: %(levelname)s: %(message)s', level=LOGLEVEL)
+    # configure logging
+    if cfg.get('quiet', False):
+        level = logging.WARNING
+    elif cfg.get('debug', False):
+        level = logging.DEBUG
+    else:
+        level = logging.INFO
+    logging.basicConfig(format='webcheck: %(levelname)s: %(message)s', level=level)
+    # set up crawler and go
+    crawler = Crawler(cfg)
     logging.info('checking site....')
     crawler.crawl()
     logging.info('done.')
@@ -169,6 +140,5 @@ def main(crawler):
 
 def entry_point():
     """setuptools entry point"""
-    crawler = Crawler()
-    parse_args(crawler)
-    main(crawler)
+    args = parser.parse_args()
+    main(vars(args))
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index 3710bbc..749485a 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -104,6 +104,17 @@ default_cfg = dict(
 default_cfg.update({'continue': config.CONTINUE})
 
 
+class Config(object):
+
+    def __init__(self, *args, **kwargs):
+        self.update(*args, **kwargs)
+
+    def update(self, *args, **kwargs):
+        for arg in args:
+            vars(self).update(arg)
+        vars(self).update(kwargs)
+
+
 class Crawler(object):
     """Class to represent gathered data of a site.
 
@@ -113,25 +124,52 @@ class Crawler(object):
       plugins    - a list of plugin modules used by the crawler
     """
 
-    def __init__(self):
+    def __init__(self, cfg):
         """Creates an instance of the Crawler class and initializes the
         state of the site."""
-        # list of internal urls
-        self._internal_urls = set()
+        # complete the configuration
+        self.cfg = Config(default_cfg)
+        self.cfg.update(cfg)
         # list of regexps considered internal
         self._internal_res = {}
+        for pattern in self.cfg.internal:
+            self._internal_res[pattern] = re.compile(pattern, re.IGNORECASE)
         # list of regexps considered external
         self._external_res = {}
+        for pattern in self.cfg.external:
+            self._external_res[pattern] = re.compile(pattern, re.IGNORECASE)
         # list of regexps matching links that should not be checked
         self._yanked_res = {}
-        # map of scheme+netloc to robot handleds
+        for pattern in self.cfg.yank:
+            self._yanked_res[pattern] = re.compile(pattern, re.IGNORECASE)
+        # update other configuration
+        config.BASE_URLS_ONLY = self.cfg.base_only
+        config.AVOID_EXTERNAL_LINKS = self.cfg.avoid_external
+        config.USE_ROBOTS = not(self.cfg.ignore_robots)
+        config.OUTPUT_DIR = self.cfg.output_dir
+        config.CONTINUE = getattr(self.cfg, 'continue')
+        config.OVERWRITE_FILES = self.cfg.force
+        config.REDIRECT_DEPTH = self.cfg.redirects
+        config.MAX_DEPTH = self.cfg.max_depth
+        config.WAIT_BETWEEN_REQUESTS = self.cfg.wait
+        # map of scheme+netloc to robot parsers
         self._robotparsers = {}
-        # list of base urls (these are the internal urls to start from)
-        self.bases = []
         # load the plugins
         self.plugins = [
             __import__(plugin, globals(), locals(), [plugin])
             for plugin in config.PLUGINS]
+        # add base urls
+        self._internal_urls = set()
+        for url in self.cfg.base_urls:
+            # if it does not look like a url it is probably a local file
+            if urlparse.urlsplit(url)[0] == '':
+                url = 'file://' + urllib.pathname2url(os.path.abspath(url))
+            # clean the URL and add it
+            url = Link.clean_url(url)
+            if url not in self._internal_urls:
+                self._internal_urls.add(url)
+        # list of base link objects
+        self.bases = []
 
     def setup_database(self):
         if hasattr(self, 'database_configed'):
@@ -148,31 +186,6 @@ class Crawler(object):
         Base.metadata.create_all(engine)
         # TODO: schema migraton goes here
 
-    def add_base(self, url):
-        """Add the given url and consider all urls below it to be internal.
-        These links are all marked for checking with the crawl() function."""
-        # ensure we have a connection to the database
-        self.setup_database()
-        # clean the URL and add it
-        url = Link.clean_url(url)
-        if url not in self._internal_urls:
-            self._internal_urls.add(url)
-
-    def add_internal_re(self, exp):
-        """Adds the gived regular expression as a pattern to match internal
-        urls."""
-        self._internal_res[exp] = re.compile(exp, re.IGNORECASE)
-
-    def add_external_re(self, exp):
-        """Adds the gived regular expression as a pattern to match external
-        urls."""
-        self._external_res[exp] = re.compile(exp, re.IGNORECASE)
-
-    def add_yanked_re(self, exp):
-        """Adds the gived regular expression as a pattern to match urls that
-        will not be checked at all."""
-        self._yanked_res[exp] = re.compile(exp, re.IGNORECASE)
-
     def _is_internal(self, url):
         """Check whether the specified url is external or internal. This
         uses the urls marked with add_base() and the regular expressions

http://arthurdejong.org/git/webcheck/commit/?id=a24b222e1290c68c905ae629a95d92a2aea305d4

commit a24b222e1290c68c905ae629a95d92a2aea305d4
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Fri Sep 20 15:50:03 2013 +0200

    Expose configured plugins via crawler.plugins
    
    This avoids having module loading code in different places.
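
For illustration, the consumer pattern this enables, assuming a crawler
instance set up as in the sketch further above; callers iterate the
already-imported plugin modules instead of re-importing config.PLUGINS:

    # crawler.plugins is filled once in Crawler.__init__
    for plugin in crawler.plugins:
        if hasattr(plugin, 'postprocess'):
            plugin.postprocess(crawler)
        if hasattr(plugin, 'generate'):
            plugin.generate(crawler)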

diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index a516fbc..3710bbc 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -110,7 +110,8 @@ class Crawler(object):
     The available properties of this class are:
 
       bases      - a list of base link object
-   """
+      plugins    - a list of plugin modules used by the crawler
+    """
 
     def __init__(self):
         """Creates an instance of the Crawler class and initializes the
@@ -127,6 +128,10 @@ class Crawler(object):
         self._robotparsers = {}
         # list of base urls (these are the internal urls to start from)
         self.bases = []
+        # load the plugins
+        self.plugins = [
+            __import__(plugin, globals(), locals(), [plugin])
+            for plugin in config.PLUGINS]
 
     def setup_database(self):
         if hasattr(self, 'database_configed'):
@@ -429,24 +434,20 @@ class Crawler(object):
             depth += 1
             # TODO: also handle embeds
         # see if any of the plugins want to do postprocessing
-        for plugin in config.PLUGINS:
-            # import the plugin
-            pluginmod = __import__(plugin, globals(), locals(), [plugin])
-            if hasattr(pluginmod, 'postprocess'):
-                logger.info(plugin)
-                pluginmod.postprocess(self)
+        for plugin in self.plugins:
+            if hasattr(plugin, 'postprocess'):
+                logger.info(plugin.__name__)
+                plugin.postprocess(self)
 
     def generate(self):
         """Generate pages for plugins."""
         # ensure we have a connection to the database
         self.setup_database()
         # call all the plugins
-        for plugin in config.PLUGINS:
-            # import the plugin
-            pluginmod = __import__(plugin, globals(), locals(), [plugin])
-            if hasattr(pluginmod, 'generate'):
-                logger.info(plugin)
-                pluginmod.generate(self)
+        for plugin in self.plugins:
+            if hasattr(plugin, 'generate'):
+                logger.info(plugin.__name__)
+                plugin.generate(self)
         # install theme files
         install_file('webcheck.css', True)
         install_file('fancytooltips/fancytooltips.js', True)
diff --git a/webcheck/plugins/__init__.py b/webcheck/plugins/__init__.py
index 9978b85..4c534fd 100644
--- a/webcheck/plugins/__init__.py
+++ b/webcheck/plugins/__init__.py
@@ -3,7 +3,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2009, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2009, 2011, 2013 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -43,11 +43,9 @@ fields:
 
 Pluings can use the functions exported by this module."""
 
-import sys
 import time
 
 from sqlalchemy.orm import joinedload
-from sqlalchemy.orm.session import object_session
 
 import webcheck
 from webcheck import config
@@ -160,25 +158,23 @@ def print_parents(fp, link, indent='     '):
       indent + '</div>\n')
 
 
-def _print_navbar(fp, selected):
+def _print_navbar(fp, selected, crawler):
     """Return an html fragement representing the navigation bar for a page."""
     fp.write('  <ul class="navbar">\n')
-    for plugin in config.PLUGINS:
-        # import the plugin
-        pluginmod = __import__(plugin, globals(), locals(), [plugin])
+    for plugin in crawler.plugins:
         # skip if no outputfile
-        if not hasattr(pluginmod, '__outputfile__'):
+        if not hasattr(plugin, '__outputfile__'):
             continue
         # generate a link to the plugin page
         selected = ''
-        if pluginmod == selected:
+        if plugin == selected:
             selected = ' class="selected"'
         fp.write(
           '   <li><a href="%(pluginfile)s"%(selected)s title="%(description)s">%(title)s</a></li>\n'
-          % {'pluginfile':  pluginmod.__outputfile__,
+          % {'pluginfile':  plugin.__outputfile__,
              'selected':    selected,
-             'title':       htmlescape(pluginmod.__title__),
-             'description': htmlescape(pluginmod.__doc__)})
+             'title':       htmlescape(plugin.__title__),
+             'description': htmlescape(plugin.__doc__)})
     fp.write('  </ul>\n')
 
 
@@ -209,7 +205,7 @@ def open_html(plugin, crawler):
          'siteurl':     base.url,
          'version':     webcheck.__version__})
     # write navigation bar
-    _print_navbar(fp, plugin)
+    _print_navbar(fp, plugin, crawler)
     # write plugin heading
     fp.write('  <h2>%s</h2>\n' % htmlescape(plugin.__title__))
     # write plugin contents
diff --git a/webcheck/plugins/about.py b/webcheck/plugins/about.py
index b0b3ac3..25a2c62 100644
--- a/webcheck/plugins/about.py
+++ b/webcheck/plugins/about.py
@@ -3,7 +3,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011, 2013 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -31,7 +31,6 @@ __outputfile__ = 'about.html'
 import time
 
 import webcheck
-from webcheck import config
 from webcheck.db import Session, Link
 import webcheck.plugins
 
@@ -78,7 +77,7 @@ def generate(crawler):
       '    particular purpose. See the source for further details.\n'
       '   </p>\n'
       '   <p>\n'
-      '    Copyright &copy; 1998-2011 Albert Hopkins (marduk),\n'
+      '    Copyright &copy; 1998-2013 Albert Hopkins (marduk),\n'
       '    Mike W. Meyer and Arthur de Jong\n'
       '   </p>\n'
       '   <p>\n'
@@ -101,14 +100,13 @@ def generate(crawler):
     fp.write(
       '   <h3>Plugins</h3>\n'
       '   <ul>\n')
-    for plugin in config.PLUGINS:
-        pluginmod = __import__(plugin, globals(), locals(), [plugin])
+    for plugin in crawler.plugins:
         fp.write(
           '    <li>\n'
           '     <strong>%s</strong><br />\n'
-          % webcheck.plugins.htmlescape(pluginmod.__title__))
-        if hasattr(pluginmod, '__doc__'):
-            fp.write('     %s<br />\n' % webcheck.plugins.htmlescape(pluginmod.__doc__))
+          % webcheck.plugins.htmlescape(plugin.__title__))
+        if hasattr(plugin, '__doc__'):
+            fp.write('     %s<br />\n' % webcheck.plugins.htmlescape(plugin.__doc__))
         fp.write('    </li>\n')
     fp.write(
       '   </ul>\n')

http://arthurdejong.org/git/webcheck/commit/?id=3eba4a4fc19a94a8b8cca9e57595bf5f1d4b0740

commit 3eba4a4fc19a94a8b8cca9e57595bf5f1d4b0740
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Fri Sep 20 15:49:57 2013 +0200

    Get default configuration from config module
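
For illustration, a small sketch of the effect, assuming the webcheck
package is importable; the parser defaults now come from default_cfg
(built from the config module) instead of being repeated per option:

    from webcheck.cmd import parser

    # defaults were applied once via parser.set_defaults(**default_cfg)
    args = parser.parse_args(['http://www.example.org/'])
    print args.redirects                 # config.REDIRECT_DEPTH unless -r given
    print getattr(args, 'continue')      # 'continue' is a keyword, hence getattr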

diff --git a/webcheck/cmd.py b/webcheck/cmd.py
index 03e4cf0..fd1a0f3 100755
--- a/webcheck/cmd.py
+++ b/webcheck/cmd.py
@@ -33,8 +33,7 @@ import urlparse
 
 import webcheck
 import webcheck.monkeypatch
-from webcheck import config
-from webcheck.crawler import Crawler
+from webcheck.crawler import Crawler, default_cfg
 
 
 # The loglevel to use for the logger that is configured.
@@ -75,13 +74,13 @@ parser = argparse.ArgumentParser(
 parser.add_argument(
     '-V', '--version', action=VersionAction)
 parser.add_argument(
-    '-i', '--internal', metavar='PATTERN', action='append', default=[],
+    '-i', '--internal', metavar='PATTERN', action='append',
     help='mark URLs matching PATTERN as internal')
 parser.add_argument(
-    '-x', '--external', metavar='PATTERN', action='append', default=[],
+    '-x', '--external', metavar='PATTERN', action='append',
     help='mark URLs matching PATTERN as external')
 parser.add_argument(
-    '-y', '--yank', metavar='PATTERN', action='append', default=[],
+    '-y', '--yank', metavar='PATTERN', action='append',
     help='do not check URLs matching PATTERN')
 parser.add_argument(
     '-b', '--base-only', action='store_true',
@@ -99,7 +98,7 @@ parser.add_argument(
     '-d', '--debug', action='store_true',
     help='show programmer-level debug information')
 parser.add_argument(
-    '-o', '--output', dest='output_dir', metavar='DIRECTORY', default=config.OUTPUT_DIR,
+    '-o', '--output', dest='output_dir', metavar='DIRECTORY',
     help='store the generated reports in the specified directory')
 parser.add_argument(
     '-c', '--continue', action='store_true',
@@ -108,18 +107,19 @@ parser.add_argument(
     '-f', '--force', action='store_true',
     help='overwrite files without asking')
 parser.add_argument(
-    '-r', '--redirects', metavar='N', type=int, default=config.REDIRECT_DEPTH,
+    '-r', '--redirects', metavar='N', type=int,
     help='the number of redirects webcheck should follow, 0 implies to follow all redirects')
 parser.add_argument(
-    '-l', '--max-depth', '--levels', metavar='N', type=int, default=config.MAX_DEPTH,
+    '-l', '--max-depth', '--levels', metavar='N', type=int,
     help='maximum depth of links to follow from base urls')
 parser.add_argument(
-    '-w', '--wait', metavar='SECONDS', type=float, default=config.WAIT_BETWEEN_REQUESTS,
+    '-w', '--wait', metavar='SECONDS', type=float,
     help='wait SECONDS between retrievals')
 parser.add_argument(
     '--profile', action='store_true', help=argparse.SUPPRESS)
 parser.add_argument(
     'base_urls', metavar='URL', nargs='+')
+parser.set_defaults(**default_cfg)
 
 
 def parse_args(crawler):
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index 0099399..a516fbc 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -3,7 +3,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2011, 2013 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -94,6 +94,16 @@ _spacepattern = re.compile(' ')
 _anchorpattern = re.compile('#([^#]+)$')
 
 
+# get default configuration
+default_cfg = dict(
+    internal=[], external=[], yank=[], base_only=config.BASE_URLS_ONLY,
+    avoid_external=config.AVOID_EXTERNAL_LINKS, ignore_robots=not(config.USE_ROBOTS),
+    output=config.OUTPUT_DIR, force=config.OVERWRITE_FILES,
+    redirects=config.REDIRECT_DEPTH, max_depth=config.MAX_DEPTH,
+    wait=config.WAIT_BETWEEN_REQUESTS)
+default_cfg.update({'continue': config.CONTINUE})
+
+
 class Crawler(object):
     """Class to represent gathered data of a site.
 

http://arthurdejong.org/git/webcheck/commit/?id=07172e0cd582b89437f94fde3307a0e8e81b6ee9

commit 07172e0cd582b89437f94fde3307a0e8e81b6ee9
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Fri Sep 20 15:45:53 2013 +0200

    Use the argparse Python module
    
    This greatly simplifies the command line parsing.
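
For context, a stripped-down sketch of the argparse pattern the new
webcheck/cmd.py uses; only a few of the real options are shown and the
default value below is arbitrary:

    import argparse

    parser = argparse.ArgumentParser(
        description='Generate a report for the given URLs.')
    parser.add_argument('-q', '--quiet', '--silent', action='store_true',
                        help='suppress progress messages')
    parser.add_argument('-r', '--redirects', metavar='N', type=int, default=5,
                        help='number of redirects to follow, 0 follows all')
    parser.add_argument('base_urls', metavar='URL', nargs='+')

    args = parser.parse_args()
    # usage errors, --help output and type conversion are handled by argparse,
    # replacing the getopt loop and the print_usage()/print_help() helpers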

diff --git a/webcheck/cmd.py b/webcheck/cmd.py
index 42ad650..03e4cf0 100755
--- a/webcheck/cmd.py
+++ b/webcheck/cmd.py
@@ -4,7 +4,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011, 2013 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -25,11 +25,9 @@
 
 """This is the main webcheck module."""
 
-import getopt
+import argparse
 import logging
 import os
-import re
-import sys
 import urllib
 import urlparse
 
@@ -38,163 +36,139 @@ import webcheck.monkeypatch
 from webcheck import config
 from webcheck.crawler import Crawler
 
+
 # The loglevel to use for the logger that is configured.
 LOGLEVEL = logging.INFO
 
 
-def print_version():
-    """Print version information."""
-    sys.stdout.write(
-      'webcheck %(version)s\n'
-      'Written by Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.\n'
-      '\n'
-      'Copyright (C) 1998-2011\n'
-      'Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.\n'
-      'This is free software; see the source for copying conditions.  There is NO\n'
-      'warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n'
-      % {'version': webcheck.__version__})
-
-
-def print_usage():
-    """Print short usage information."""
-    sys.stderr.write(
-      'Usage: webcheck [OPTION]... URL...\n')
-
-
-def print_tryhelp():
-    """Print friendly pointer to more information."""
-    sys.stderr.write(
-      'Try \'webcheck --help\' for more information.\n')
-
-
-def print_help():
-    """Print the option list."""
-    sys.stdout.write(
-      'Usage: webcheck [OPTION]... URL...\n'
-      'Generate a report for the given URLs\n'
-      '\n'
-      '  -i, --internal=PATTERN mark URLs matching PATTERN as internal\n'
-      '  -x, --external=PATTERN mark URLs matching PATTERN as external\n'
-      '  -y, --yank=PATTERN     do not check URLs matching PATTERN\n'
-      '  -b, --base-only        base URLs only: consider any URL not starting\n'
-      '                         with any of the base URLs to be external\n'
-      '  -a, --avoid-external   do not check external URLs\n'
-      '      --ignore-robots    do not retrieve and parse robots.txt files\n'
-      '  -q, --quiet, --silent  suppress progress messages\n'
-      '  -d, --debug            do programmer-level debugging\n'
-      '  -o, --output=DIRECTORY store the generated reports in the specified\n'
-      '                         directory\n'
-      '  -c, --continue         try to continue from a previous run\n'
-      '  -f, --force            overwrite files without asking\n'
-      '  -r, --redirects=N      the number of redirects webcheck should follow,\n'
-      '                         0 implies to follow all redirects (default=%(redirects)d)\n'
-      '  -l, --levels=N         maximum depth of links to follow from base urls (default=inf)\n'
-      '  -w, --wait=SECONDS     wait SECONDS between retrievals\n'
-      '  -V, --version          output version information and exit\n'
-      '  -h, --help             display this help and exit\n'
-      % {'redirects': config.REDIRECT_DEPTH})
+version_string = '''
+webcheck %s
+Written by Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.
+
+Copyright (C) 1998-2013
+Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+'''.strip() % webcheck.__version__
+
+
+class VersionAction(argparse.Action):
+
+    def __init__(self, option_strings, dest,
+                 help='output version information and exit'):
+        super(VersionAction, self).__init__(
+            option_strings=option_strings,
+            dest=argparse.SUPPRESS,
+            default=argparse.SUPPRESS,
+            nargs=0,
+            help=help)
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        print version_string
+        parser.exit()
+
+
+# set up command line parser
+parser = argparse.ArgumentParser(
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    description='Generate a report for the given URLs.')
+parser.add_argument(
+    '-V', '--version', action=VersionAction)
+parser.add_argument(
+    '-i', '--internal', metavar='PATTERN', action='append', default=[],
+    help='mark URLs matching PATTERN as internal')
+parser.add_argument(
+    '-x', '--external', metavar='PATTERN', action='append', default=[],
+    help='mark URLs matching PATTERN as external')
+parser.add_argument(
+    '-y', '--yank', metavar='PATTERN', action='append', default=[],
+    help='do not check URLs matching PATTERN')
+parser.add_argument(
+    '-b', '--base-only', action='store_true',
+    help='base URLs only: consider any URL not starting with any of the base URLs to be external')
+parser.add_argument(
+    '-a', '--avoid-external', action='store_true',
+    help='do not check external URLs')
+parser.add_argument(
+    '--ignore-robots', action='store_true',
+    help='do not retrieve or parse robots.txt files')
+parser.add_argument(
+    '-q', '--quiet', '--silent', action='store_true',
+    help='suppress progress messages')
+parser.add_argument(
+    '-d', '--debug', action='store_true',
+    help='show programmer-level debug information')
+parser.add_argument(
+    '-o', '--output', dest='output_dir', metavar='DIRECTORY', default=config.OUTPUT_DIR,
+    help='store the generated reports in the specified directory')
+parser.add_argument(
+    '-c', '--continue', action='store_true',
+    help='try to continue from a previous run')
+parser.add_argument(
+    '-f', '--force', action='store_true',
+    help='overwrite files without asking')
+parser.add_argument(
+    '-r', '--redirects', metavar='N', type=int, default=config.REDIRECT_DEPTH,
+    help='the number of redirects webcheck should follow, 0 implies to follow all redirects')
+parser.add_argument(
+    '-l', '--max-depth', '--levels', metavar='N', type=int, default=config.MAX_DEPTH,
+    help='maximum depth of links to follow from base urls')
+parser.add_argument(
+    '-w', '--wait', metavar='SECONDS', type=float, default=config.WAIT_BETWEEN_REQUESTS,
+    help='wait SECONDS between retrievals')
+parser.add_argument(
+    '--profile', action='store_true', help=argparse.SUPPRESS)
+parser.add_argument(
+    'base_urls', metavar='URL', nargs='+')
+
 
 def parse_args(crawler):
     """Parse command-line arguments."""
     # these global options are set here
     global LOGLEVEL
-    try:
-        optlist, args = getopt.gnu_getopt(sys.argv[1:],
-          'i:x:y:l:baqdo:cfr:u:w:Vh',
-          ('internal=', 'external=', 'yank=', 'base-only', 'avoid-external',
-           'ignore-robots',
-           'quiet', 'silent', 'debug', 'profile', 'output=', 'continue',
-           'force', 'redirects=', 'levels=', 'wait=', 'version', 'help'))
-        internal_urls = []
-        external_urls = []
-        yank_urls = []
-        for flag, arg in optlist:
-            if flag in ('-i', '--internal'):
-                internal_urls.append(arg)
-            elif flag in ('-x', '--external'):
-                external_urls.append(arg)
-            elif flag in ('-y', '--yank'):
-                yank_urls.append(arg)
-            elif flag in ('-b', '--base-only'):
-                config.BASE_URLS_ONLY = True
-            elif flag in ('-a', '--avoid-external'):
-                config.AVOID_EXTERNAL_LINKS = True
-            elif flag in ('--ignore-robots',):
-                config.USE_ROBOTS = False
-            elif flag in ('-q', '--quiet', '--silent'):
-                LOGLEVEL = logging.WARNING
-            elif flag in ('-d', '--debug'):
-                LOGLEVEL = logging.DEBUG
-            elif flag in ('--profile',):
-                # undocumented on purpose
-                pass
-            elif flag in ('-o', '--output'):
-                config.OUTPUT_DIR = arg
-            elif flag in ('-c', '--continue'):
-                config.CONTINUE = True
-            elif flag in ('-f', '--force'):
-                config.OVERWRITE_FILES = True
-            elif flag in ('-r', '--redirects'):
-                config.REDIRECT_DEPTH = int(arg)
-            elif flag in ('-l', '--levels'):
-                config.MAX_DEPTH = int(arg)
-            elif flag in ('-w', '--wait'):
-                config.WAIT_BETWEEN_REQUESTS = float(arg)
-            elif flag in ('-V', '--version'):
-                print_version()
-                sys.exit(0)
-            elif flag in ('-h', '--help'):
-                print_help()
-                sys.exit(0)
-        if len(args) == 0 and not config.CONTINUE:
-            print_usage()
-            print_tryhelp()
-            sys.exit(1)
-        # add configuration to site
-        for pattern in internal_urls:
-            crawler.add_internal_re(pattern)
-        for pattern in external_urls:
-            crawler.add_external_re(pattern)
-        for pattern in yank_urls:
-            crawler.add_yanked_re(pattern)
-        for arg in args:
-            # if it does not look like a url it is probably a local file
-            if urlparse.urlsplit(arg)[0] == '':
-                arg = 'file://' + urllib.pathname2url(os.path.abspath(arg))
-            crawler.add_base(arg)
-    except getopt.error, reason:
-        sys.stderr.write('webcheck: %s\n' % reason)
-        print_tryhelp()
-        sys.exit(1)
-    except re.error, e:
-        sys.stderr.write('webcheck: %s\n' % str(e))
-        sys.exit(1)
+    args = parser.parse_args()
+    for pattern in args.internal:
+        crawler.add_internal_re(pattern)
+    for pattern in args.external:
+        crawler.add_external_re(pattern)
+    for pattern in args.yank:
+        crawler.add_yanked_re(pattern)
+    config.BASE_URLS_ONLY = args.base_only
+    config.AVOID_EXTERNAL_LINKS = args.avoid_external
+    config.USE_ROBOTS = not(args.ignore_robots)
+    if args.quiet:
+        LOGLEVEL = logging.WARNING
+    elif args.debug:
+        LOGLEVEL = logging.DEBUG
+    config.OUTPUT_DIR = args.output
+    config.CONTINUE = getattr(args, 'continue')
+    config.OVERWRITE_FILES = args.force
+    config.REDIRECT_DEPTH = args.redirects
+    config.MAX_DEPTH = args.max_depth
+    config.WAIT_BETWEEN_REQUESTS = args.wait
+    for arg in args.urls:
+        # if it does not look like a url it is probably a local file
+        if urlparse.urlsplit(arg)[0] == '':
+            arg = 'file://' + urllib.pathname2url(os.path.abspath(arg))
+        crawler.add_base(arg)
 
 
 def main(crawler):
     """Main program."""
-    # configure logging
-    logging.basicConfig(format='webcheck: %(levelname)s: %(message)s', level=LOGLEVEL)
-    # crawl through the website
     logging.info('checking site....')
-    crawler.crawl()  # this will take a while
+    crawler.crawl()
     logging.info('done.')
-    # do postprocessing (building site structure, etc)
     logging.info('postprocessing....')
     crawler.postprocess()
     logging.info('done.')
-    # now we can write out the files
-    # start with the frame-description page
     logging.info('generating reports...')
-    # for every plugin, generate a page
     crawler.generate()
     logging.info('done.')
 
+
 def entry_point():
     """setuptools entry point"""
-    # initialize crawler object
     crawler = Crawler()
-    # parse command-line arguments
     parse_args(crawler)
-    # run the main program
     main(crawler)

-----------------------------------------------------------------------

Summary of changes:
 run.py                       |   28 ++---
 webcheck/cmd.py              |  258 +++++++++++++++++-------------------------
 webcheck/crawler.py          |  112 +++++++++++-------
 webcheck/plugins/__init__.py |   22 ++--
 webcheck/plugins/about.py    |   14 +--
 5 files changed, 194 insertions(+), 240 deletions(-)


hooks/post-receive
-- 
webcheck
-- 
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits/