webcheck branch master updated. 1.10.4-64-g24e191f
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck branch master updated. 1.10.4-64-g24e191f
- Date: Fri, 20 Sep 2013 22:56:47 +0200 (CEST)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "webcheck".
The branch, master has been updated
via 24e191f42e45b408d1b34210dcedb710d201a669 (commit)
via a24b222e1290c68c905ae629a95d92a2aea305d4 (commit)
via 3eba4a4fc19a94a8b8cca9e57595bf5f1d4b0740 (commit)
via 07172e0cd582b89437f94fde3307a0e8e81b6ee9 (commit)
from ca8e3e45cbb498bd628ceefce18e55949738402c (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://arthurdejong.org/git/webcheck/commit/?id=24e191f42e45b408d1b34210dcedb710d201a669
commit 24e191f42e45b408d1b34210dcedb710d201a669
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Fri Sep 20 17:02:28 2013 +0200
Initialise crawler with a configuration
This changes the constructor to accept a dict holding the crawler's
configuration. For now this is combined with the configuration in the config
module, but the goal is to replace that module completely.
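As an illustration (not part of the commit), a minimal sketch of constructing
a Crawler directly from a plain dict; the key names follow the default_cfg
dict and the argparse dest names in the diffs below, and the URL, pattern and
directory are placeholders:

    from webcheck.crawler import Crawler

    cfg = {
        'internal': [],                            # extra regexps marking URLs as internal
        'yank': [r'\.pdf$'],                       # placeholder pattern: do not check PDF links
        'output_dir': 'webcheck-report',           # placeholder output directory
        'base_urls': ['http://www.example.com/'],  # placeholder site to check
    }
    # keys that are left out fall back to the values in default_cfg
    crawler = Crawler(cfg)

run.py reaches the same point via main(vars(args)) after argparse parsing, as
the diff below shows.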
diff --git a/run.py b/run.py
index e608ec0..eee0801 100755
--- a/run.py
+++ b/run.py
@@ -4,7 +4,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011, 2013 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -25,25 +25,17 @@
"""Alternative entry_point for development."""
-import sys, os, logging
+import os
+import sys
-from webcheck.crawler import Crawler
-from webcheck.cmd import parse_args, main
-from webcheck import config
+from webcheck.cmd import parser, main
-# Whether to produce profiling information. This is for development
-# purposes and as such undocumented.
-# http://docs.python.org/lib/profile.html
-PROFILE = False
if __name__ == '__main__':
try:
- # initialize crawler object
- crawler = Crawler()
- # parse command-line arguments
- parse_args(crawler)
- if PROFILE or '--profile' in sys.argv:
- fname = os.path.join(config.OUTPUT_DIR, 'webcheck.prof')
+ args = parser.parse_args()
+ if args.profile:
+ fname = os.path.join(args.output_dir, 'webcheck.prof')
try:
import cProfile
except ImportError:
@@ -53,12 +45,12 @@ if __name__ == '__main__':
sqltap.start()
except ImportError:
pass
- cProfile.run('main(crawler)', fname)
+ cProfile.run('main(vars(args))', fname)
if 'sqltap' in locals():
statistics = sqltap.collect()
- sqltap.report(statistics, os.path.join(config.OUTPUT_DIR, 'sqltap.html'))
+ sqltap.report(statistics, os.path.join(args.output_dir, 'sqltap.html'))
else:
- main(crawler)
+ main(vars(args))
except KeyboardInterrupt:
sys.stderr.write('Interrupted\n')
sys.exit(1)
diff --git a/webcheck/cmd.py b/webcheck/cmd.py
index fd1a0f3..7020a20 100755
--- a/webcheck/cmd.py
+++ b/webcheck/cmd.py
@@ -27,19 +27,12 @@
import argparse
import logging
-import os
-import urllib
-import urlparse
import webcheck
import webcheck.monkeypatch
from webcheck.crawler import Crawler, default_cfg
-# The loglevel to use for the logger that is configured.
-LOGLEVEL = logging.INFO
-
-
version_string = '''
webcheck %s
Written by Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.
@@ -122,40 +115,18 @@ parser.add_argument(
parser.set_defaults(**default_cfg)
-def parse_args(crawler):
- """Parse command-line arguments."""
- # these global options are set here
- global LOGLEVEL
- args = parser.parse_args()
- for pattern in args.internal:
- crawler.add_internal_re(pattern)
- for pattern in args.external:
- crawler.add_external_re(pattern)
- for pattern in args.yank:
- crawler.add_yanked_re(pattern)
- config.BASE_URLS_ONLY = args.base_only
- config.AVOID_EXTERNAL_LINKS = args.avoid_external
- config.USE_ROBOTS = not(args.ignore_robots)
- if args.quiet:
- LOGLEVEL = logging.WARNING
- elif args.debug:
- LOGLEVEL = logging.DEBUG
- config.OUTPUT_DIR = args.output
- config.CONTINUE = getattr(args, 'continue')
- config.OVERWRITE_FILES = args.force
- config.REDIRECT_DEPTH = args.redirects
- config.MAX_DEPTH = args.max_depth
- config.WAIT_BETWEEN_REQUESTS = args.wait
- for arg in args.urls:
- # if it does not look like a url it is probably a local file
- if urlparse.urlsplit(arg)[0] == '':
- arg = 'file://' + urllib.pathname2url(os.path.abspath(arg))
- crawler.add_base(arg)
-
-
-def main(crawler):
+def main(cfg):
"""Main program."""
- logging.basicConfig(format='webcheck: %(levelname)s: %(message)s', level=LOGLEVEL)
+ # configure logging
+ if cfg.get('quiet', False):
+ level = logging.WARNING
+ elif cfg.get('debug', False):
+ level = logging.DEBUG
+ else:
+ level = logging.INFO
+ logging.basicConfig(format='webcheck: %(levelname)s: %(message)s', level=level)
+ # set up crawler and go
+ crawler = Crawler(cfg)
logging.info('checking site....')
crawler.crawl()
logging.info('done.')
@@ -169,6 +140,5 @@ def main(crawler):
def entry_point():
"""setuptools entry point"""
- crawler = Crawler()
- parse_args(crawler)
- main(crawler)
+ args = parser.parse_args()
+ main(vars(cfg))
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index 3710bbc..749485a 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -104,6 +104,17 @@ default_cfg = dict(
default_cfg.update({'continue': config.CONTINUE})
+class Config(object):
+
+ def __init__(self, *args, **kwargs):
+ self.update(*args, **kwargs)
+
+ def update(self, *args, **kwargs):
+ for arg in args:
+ vars(self).update(arg)
+ vars(self).update(kwargs)
+
+
class Crawler(object):
"""Class to represent gathered data of a site.
@@ -113,25 +124,52 @@ class Crawler(object):
plugins - a list of plugin modules used by the crawler
"""
- def __init__(self):
+ def __init__(self, cfg):
"""Creates an instance of the Crawler class and initializes the
state of the site."""
- # list of internal urls
- self._internal_urls = set()
+ # complete the configuration
+ self.cfg = Config(default_cfg)
+ self.cfg.update(cfg)
# list of regexps considered internal
self._internal_res = {}
+ for pattern in self.cfg.internal:
+ self._internal_res[pattern] = re.compile(pattern, re.IGNORECASE)
# list of regexps considered external
self._external_res = {}
+ for pattern in self.cfg.external:
+ self._external_res[pattern] = re.compile(pattern, re.IGNORECASE)
# list of regexps matching links that should not be checked
self._yanked_res = {}
- # map of scheme+netloc to robot handleds
+ for pattern in self.cfg.yank:
+ self._yanked_res[pattern] = re.compile(pattern, re.IGNORECASE)
+ # update other configuration
+ config.BASE_URLS_ONLY = self.cfg.base_only
+ config.AVOID_EXTERNAL_LINKS = self.cfg.avoid_external
+ config.USE_ROBOTS = not(self.cfg.ignore_robots)
+ config.OUTPUT_DIR = self.cfg.output_dir
+ config.CONTINUE = getattr(self.cfg, 'continue')
+ config.OVERWRITE_FILES = self.cfg.force
+ config.REDIRECT_DEPTH = self.cfg.redirects
+ config.MAX_DEPTH = self.cfg.max_depth
+ config.WAIT_BETWEEN_REQUESTS = self.cfg.wait
+ # map of scheme+netloc to robot parsers
self._robotparsers = {}
- # list of base urls (these are the internal urls to start from)
- self.bases = []
# load the plugins
self.plugins = [
__import__(plugin, globals(), locals(), [plugin])
for plugin in config.PLUGINS]
+ # add base urls
+ self._internal_urls = set()
+ for url in self.cfg.base_urls:
+ # if it does not look like a url it is probably a local file
+ if urlparse.urlsplit(url)[0] == '':
+ url = 'file://' + urllib.pathname2url(os.path.abspath(url))
+ # clean the URL and add it
+ url = Link.clean_url(url)
+ if url not in self._internal_urls:
+ self._internal_urls.add(url)
+ # list of base link objects
+ self.bases = []
def setup_database(self):
if hasattr(self, 'database_configed'):
@@ -148,31 +186,6 @@ class Crawler(object):
Base.metadata.create_all(engine)
# TODO: schema migraton goes here
- def add_base(self, url):
- """Add the given url and consider all urls below it to be internal.
- These links are all marked for checking with the crawl() function."""
- # ensure we have a connection to the database
- self.setup_database()
- # clean the URL and add it
- url = Link.clean_url(url)
- if url not in self._internal_urls:
- self._internal_urls.add(url)
-
- def add_internal_re(self, exp):
- """Adds the gived regular expression as a pattern to match internal
- urls."""
- self._internal_res[exp] = re.compile(exp, re.IGNORECASE)
-
- def add_external_re(self, exp):
- """Adds the gived regular expression as a pattern to match external
- urls."""
- self._external_res[exp] = re.compile(exp, re.IGNORECASE)
-
- def add_yanked_re(self, exp):
- """Adds the gived regular expression as a pattern to match urls that
- will not be checked at all."""
- self._yanked_res[exp] = re.compile(exp, re.IGNORECASE)
-
def _is_internal(self, url):
"""Check whether the specified url is external or internal. This
uses the urls marked with add_base() and the regular expressions
http://arthurdejong.org/git/webcheck/commit/?id=a24b222e1290c68c905ae629a95d92a2aea305d4
commit a24b222e1290c68c905ae629a95d92a2aea305d4
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Fri Sep 20 15:50:03 2013 +0200
Expose configured plugins via crawler.plugins
This avoids having module loading code in different places.
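A rough, generic sketch of the pattern being centralised here (the helper
names are illustrative, not webcheck API): each plugin module is imported
once by its dotted name, and optional hooks are only called on modules that
define them:

    def load_plugins(plugin_names):
        """Import each dotted module name once and return the module objects."""
        return [__import__(name, globals(), locals(), [name])
                for name in plugin_names]

    def run_hook(plugins, hook, *args):
        """Call an optional hook on every plugin module that defines it."""
        for plugin in plugins:
            func = getattr(plugin, hook, None)
            if func is not None:
                func(*args)

In the commit itself the import happens once in Crawler.__init__ and
postprocess() and generate() simply iterate over self.plugins.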
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index a516fbc..3710bbc 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -110,7 +110,8 @@ class Crawler(object):
The available properties of this class are:
bases - a list of base link object
- """
+ plugins - a list of plugin modules used by the crawler
+ """
def __init__(self):
"""Creates an instance of the Crawler class and initializes the
@@ -127,6 +128,10 @@ class Crawler(object):
self._robotparsers = {}
# list of base urls (these are the internal urls to start from)
self.bases = []
+ # load the plugins
+ self.plugins = [
+ __import__(plugin, globals(), locals(), [plugin])
+ for plugin in config.PLUGINS]
def setup_database(self):
if hasattr(self, 'database_configed'):
@@ -429,24 +434,20 @@ class Crawler(object):
depth += 1
# TODO: also handle embeds
# see if any of the plugins want to do postprocessing
- for plugin in config.PLUGINS:
- # import the plugin
- pluginmod = __import__(plugin, globals(), locals(), [plugin])
- if hasattr(pluginmod, 'postprocess'):
- logger.info(plugin)
- pluginmod.postprocess(self)
+ for plugin in self.plugins:
+ if hasattr(plugin, 'postprocess'):
+ logger.info(plugin.__name__)
+ plugin.postprocess(self)
def generate(self):
"""Generate pages for plugins."""
# ensure we have a connection to the database
self.setup_database()
# call all the plugins
- for plugin in config.PLUGINS:
- # import the plugin
- pluginmod = __import__(plugin, globals(), locals(), [plugin])
- if hasattr(pluginmod, 'generate'):
- logger.info(plugin)
- pluginmod.generate(self)
+ for plugin in self.plugins:
+ if hasattr(plugin, 'generate'):
+ logger.info(plugin.__name__)
+ plugin.generate(self)
# install theme files
install_file('webcheck.css', True)
install_file('fancytooltips/fancytooltips.js', True)
diff --git a/webcheck/plugins/__init__.py b/webcheck/plugins/__init__.py
index 9978b85..4c534fd 100644
--- a/webcheck/plugins/__init__.py
+++ b/webcheck/plugins/__init__.py
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2009, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2009, 2011, 2013 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -43,11 +43,9 @@ fields:
Pluings can use the functions exported by this module."""
-import sys
import time
from sqlalchemy.orm import joinedload
-from sqlalchemy.orm.session import object_session
import webcheck
from webcheck import config
@@ -160,25 +158,23 @@ def print_parents(fp, link, indent=' '):
indent + '</div>\n')
-def _print_navbar(fp, selected):
+def _print_navbar(fp, selected, crawler):
"""Return an html fragement representing the navigation bar for a page."""
fp.write(' <ul class="navbar">\n')
- for plugin in config.PLUGINS:
- # import the plugin
- pluginmod = __import__(plugin, globals(), locals(), [plugin])
+ for plugin in crawler.plugins:
# skip if no outputfile
- if not hasattr(pluginmod, '__outputfile__'):
+ if not hasattr(plugin, '__outputfile__'):
continue
# generate a link to the plugin page
selected = ''
- if pluginmod == selected:
+ if plugin == selected:
selected = ' class="selected"'
fp.write(
' <li><a href="%(pluginfile)s"%(selected)s title="%(description)s">%(title)s</a></li>\n'
- % {'pluginfile': pluginmod.__outputfile__,
+ % {'pluginfile': plugin.__outputfile__,
'selected': selected,
- 'title': htmlescape(pluginmod.__title__),
- 'description': htmlescape(pluginmod.__doc__)})
+ 'title': htmlescape(plugin.__title__),
+ 'description': htmlescape(plugin.__doc__)})
fp.write(' </ul>\n')
@@ -209,7 +205,7 @@ def open_html(plugin, crawler):
'siteurl': base.url,
'version': webcheck.__version__})
# write navigation bar
- _print_navbar(fp, plugin)
+ _print_navbar(fp, plugin, crawler)
# write plugin heading
fp.write(' <h2>%s</h2>\n' % htmlescape(plugin.__title__))
# write plugin contents
diff --git a/webcheck/plugins/about.py b/webcheck/plugins/about.py
index b0b3ac3..25a2c62 100644
--- a/webcheck/plugins/about.py
+++ b/webcheck/plugins/about.py
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011, 2013 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -31,7 +31,6 @@ __outputfile__ = 'about.html'
import time
import webcheck
-from webcheck import config
from webcheck.db import Session, Link
import webcheck.plugins
@@ -78,7 +77,7 @@ def generate(crawler):
' particular purpose. See the source for further details.\n'
' </p>\n'
' <p>\n'
- ' Copyright © 1998-2011 Albert Hopkins (marduk),\n'
+ ' Copyright © 1998-2013 Albert Hopkins (marduk),\n'
' Mike W. Meyer and Arthur de Jong\n'
' </p>\n'
' <p>\n'
@@ -101,14 +100,13 @@ def generate(crawler):
fp.write(
' <h3>Plugins</h3>\n'
' <ul>\n')
- for plugin in config.PLUGINS:
- pluginmod = __import__(plugin, globals(), locals(), [plugin])
+ for plugin in crawler.plugins:
fp.write(
' <li>\n'
' <strong>%s</strong><br />\n'
- % webcheck.plugins.htmlescape(pluginmod.__title__))
- if hasattr(pluginmod, '__doc__'):
- fp.write(' %s<br />\n' % webcheck.plugins.htmlescape(pluginmod.__doc__))
+ % webcheck.plugins.htmlescape(plugin.__title__))
+ if hasattr(plugin, '__doc__'):
+ fp.write(' %s<br />\n' % webcheck.plugins.htmlescape(plugin.__doc__))
fp.write(' </li>\n')
fp.write(
' </ul>\n')
http://arthurdejong.org/git/webcheck/commit/?id=3eba4a4fc19a94a8b8cca9e57595bf5f1d4b0740
commit 3eba4a4fc19a94a8b8cca9e57595bf5f1d4b0740
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Fri Sep 20 15:49:57 2013 +0200
Get default configuration from config module
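A small self-contained sketch of the approach (the values are placeholders,
not webcheck's actual defaults): the defaults live in a single dict and are
applied to the parser with set_defaults(), so the same dict can also seed a
configuration when no command line is parsed:

    import argparse

    default_cfg = {'redirects': 5, 'wait': 0.1, 'output_dir': 'webcheck'}

    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--redirects', metavar='N', type=int)
    parser.add_argument('-w', '--wait', metavar='SECONDS', type=float)
    parser.add_argument('-o', '--output', dest='output_dir', metavar='DIRECTORY')
    parser.set_defaults(**default_cfg)

    # options not given on the command line keep the values from the dict
    args = parser.parse_args(['-w', '2'])
    assert args.wait == 2.0 and args.redirects == 5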
diff --git a/webcheck/cmd.py b/webcheck/cmd.py
index 03e4cf0..fd1a0f3 100755
--- a/webcheck/cmd.py
+++ b/webcheck/cmd.py
@@ -33,8 +33,7 @@ import urlparse
import webcheck
import webcheck.monkeypatch
-from webcheck import config
-from webcheck.crawler import Crawler
+from webcheck.crawler import Crawler, default_cfg
# The loglevel to use for the logger that is configured.
@@ -75,13 +74,13 @@ parser = argparse.ArgumentParser(
parser.add_argument(
'-V', '--version', action=VersionAction)
parser.add_argument(
- '-i', '--internal', metavar='PATTERN', action='append', default=[],
+ '-i', '--internal', metavar='PATTERN', action='append',
help='mark URLs matching PATTERN as internal')
parser.add_argument(
- '-x', '--external', metavar='PATTERN', action='append', default=[],
+ '-x', '--external', metavar='PATTERN', action='append',
help='mark URLs matching PATTERN as external')
parser.add_argument(
- '-y', '--yank', metavar='PATTERN', action='append', default=[],
+ '-y', '--yank', metavar='PATTERN', action='append',
help='do not check URLs matching PATTERN')
parser.add_argument(
'-b', '--base-only', action='store_true',
@@ -99,7 +98,7 @@ parser.add_argument(
'-d', '--debug', action='store_true',
help='show programmer-level debug information')
parser.add_argument(
- '-o', '--output', dest='output_dir', metavar='DIRECTORY', default=config.OUTPUT_DIR,
+ '-o', '--output', dest='output_dir', metavar='DIRECTORY',
help='store the generated reports in the specified directory')
parser.add_argument(
'-c', '--continue', action='store_true',
@@ -108,18 +107,19 @@ parser.add_argument(
'-f', '--force', action='store_true',
help='overwrite files without asking')
parser.add_argument(
- '-r', '--redirects', metavar='N', type=int, default=config.REDIRECT_DEPTH,
+ '-r', '--redirects', metavar='N', type=int,
help='the number of redirects webcheck should follow, 0 implies to follow all redirects')
parser.add_argument(
- '-l', '--max-depth', '--levels', metavar='N', type=int, default=config.MAX_DEPTH,
+ '-l', '--max-depth', '--levels', metavar='N', type=int,
help='maximum depth of links to follow from base urls')
parser.add_argument(
- '-w', '--wait', metavar='SECONDS', type=float, default=config.WAIT_BETWEEN_REQUESTS,
+ '-w', '--wait', metavar='SECONDS', type=float,
help='wait SECONDS between retrievals')
parser.add_argument(
'--profile', action='store_true', help=argparse.SUPPRESS)
parser.add_argument(
'base_urls', metavar='URL', nargs='+')
+parser.set_defaults(**default_cfg)
def parse_args(crawler):
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index 0099399..a516fbc 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2011, 2013 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -94,6 +94,16 @@ _spacepattern = re.compile(' ')
_anchorpattern = re.compile('#([^#]+)$')
+# get default configuration
+default_cfg = dict(
+ internal=[], external=[], yank=[], base_only=config.BASE_URLS_ONLY,
+ avoid_external=config.AVOID_EXTERNAL_LINKS, ignore_robots=not(config.USE_ROBOTS),
+ output=config.OUTPUT_DIR, force=config.OVERWRITE_FILES,
+ redirects=config.REDIRECT_DEPTH, max_depth=config.MAX_DEPTH,
+ wait=config.WAIT_BETWEEN_REQUESTS)
+default_cfg.update({'continue': config.CONTINUE})
+
+
class Crawler(object):
"""Class to represent gathered data of a site.
http://arthurdejong.org/git/webcheck/commit/?id=07172e0cd582b89437f94fde3307a0e8e81b6ee9
commit 07172e0cd582b89437f94fde3307a0e8e81b6ee9
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Fri Sep 20 15:45:53 2013 +0200
Use the argparse Python module
This greatly simplifies the command line parsing.
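For readers less familiar with argparse, a minimal standalone sketch of the
declarative style that replaces the getopt loop below (the option names
mirror the real ones, the parse_args() arguments are just sample input);
--help output and usage errors come for free, which is what allows the
print_usage(), print_tryhelp() and print_help() helpers to be dropped:

    import argparse

    parser = argparse.ArgumentParser(
        description='Generate a report for the given URLs.')
    parser.add_argument('-i', '--internal', metavar='PATTERN', action='append',
                        default=[], help='mark URLs matching PATTERN as internal')
    parser.add_argument('-q', '--quiet', '--silent', action='store_true',
                        help='suppress progress messages')
    parser.add_argument('base_urls', metavar='URL', nargs='+')

    args = parser.parse_args(['-q', '-i', 'intranet', 'http://www.example.com/'])
    assert args.quiet and args.internal == ['intranet']
    assert args.base_urls == ['http://www.example.com/']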
diff --git a/webcheck/cmd.py b/webcheck/cmd.py
index 42ad650..03e4cf0 100755
--- a/webcheck/cmd.py
+++ b/webcheck/cmd.py
@@ -4,7 +4,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011, 2013 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -25,11 +25,9 @@
"""This is the main webcheck module."""
-import getopt
+import argparse
import logging
import os
-import re
-import sys
import urllib
import urlparse
@@ -38,163 +36,139 @@ import webcheck.monkeypatch
from webcheck import config
from webcheck.crawler import Crawler
+
# The loglevel to use for the logger that is configured.
LOGLEVEL = logging.INFO
-def print_version():
- """Print version information."""
- sys.stdout.write(
- 'webcheck %(version)s\n'
- 'Written by Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.\n'
- '\n'
- 'Copyright (C) 1998-2011\n'
- 'Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.\n'
- 'This is free software; see the source for copying conditions. There is NO\n'
- 'warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n'
- % {'version': webcheck.__version__})
-
-
-def print_usage():
- """Print short usage information."""
- sys.stderr.write(
- 'Usage: webcheck [OPTION]... URL...\n')
-
-
-def print_tryhelp():
- """Print friendly pointer to more information."""
- sys.stderr.write(
- 'Try \'webcheck --help\' for more information.\n')
-
-
-def print_help():
- """Print the option list."""
- sys.stdout.write(
- 'Usage: webcheck [OPTION]... URL...\n'
- 'Generate a report for the given URLs\n'
- '\n'
- ' -i, --internal=PATTERN mark URLs matching PATTERN as internal\n'
- ' -x, --external=PATTERN mark URLs matching PATTERN as external\n'
- ' -y, --yank=PATTERN do not check URLs matching PATTERN\n'
- ' -b, --base-only base URLs only: consider any URL not starting\n'
- ' with any of the base URLs to be external\n'
- ' -a, --avoid-external do not check external URLs\n'
- ' --ignore-robots do not retrieve and parse robots.txt files\n'
- ' -q, --quiet, --silent suppress progress messages\n'
- ' -d, --debug do programmer-level debugging\n'
- ' -o, --output=DIRECTORY store the generated reports in the specified\n'
- ' directory\n'
- ' -c, --continue try to continue from a previous run\n'
- ' -f, --force overwrite files without asking\n'
- ' -r, --redirects=N the number of redirects webcheck should follow,\n'
- ' 0 implies to follow all redirects (default=%(redirects)d)\n'
- ' -l, --levels=N maximum depth of links to follow from base urls (default=inf)\n'
- ' -w, --wait=SECONDS wait SECONDS between retrievals\n'
- ' -V, --version output version information and exit\n'
- ' -h, --help display this help and exit\n'
- % {'redirects': config.REDIRECT_DEPTH})
+version_string = '''
+webcheck %s
+Written by Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.
+
+Copyright (C) 1998-2013
+Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.
+This is free software; see the source for copying conditions. There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+'''.strip() % webcheck.__version__
+
+
+class VersionAction(argparse.Action):
+
+ def __init__(self, option_strings, dest,
+ help='output version information and exit'):
+ super(VersionAction, self).__init__(
+ option_strings=option_strings,
+ dest=argparse.SUPPRESS,
+ default=argparse.SUPPRESS,
+ nargs=0,
+ help=help)
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ print version_string
+ parser.exit()
+
+
+# set up command line parser
+parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description='Generate a report for the given URLs.')
+parser.add_argument(
+ '-V', '--version', action=VersionAction)
+parser.add_argument(
+ '-i', '--internal', metavar='PATTERN', action='append', default=[],
+ help='mark URLs matching PATTERN as internal')
+parser.add_argument(
+ '-x', '--external', metavar='PATTERN', action='append', default=[],
+ help='mark URLs matching PATTERN as external')
+parser.add_argument(
+ '-y', '--yank', metavar='PATTERN', action='append', default=[],
+ help='do not check URLs matching PATTERN')
+parser.add_argument(
+ '-b', '--base-only', action='store_true',
+ help='base URLs only: consider any URL not starting with any of the base URLs to be external')
+parser.add_argument(
+ '-a', '--avoid-external', action='store_true',
+ help='do not check external URLs')
+parser.add_argument(
+ '--ignore-robots', action='store_true',
+ help='do not retrieve or parse robots.txt files')
+parser.add_argument(
+ '-q', '--quiet', '--silent', action='store_true',
+ help='suppress progress messages')
+parser.add_argument(
+ '-d', '--debug', action='store_true',
+ help='show programmer-level debug information')
+parser.add_argument(
+ '-o', '--output', dest='output_dir', metavar='DIRECTORY', default=config.OUTPUT_DIR,
+ help='store the generated reports in the specified directory')
+parser.add_argument(
+ '-c', '--continue', action='store_true',
+ help='try to continue from a previous run')
+parser.add_argument(
+ '-f', '--force', action='store_true',
+ help='overwrite files without asking')
+parser.add_argument(
+ '-r', '--redirects', metavar='N', type=int, default=config.REDIRECT_DEPTH,
+ help='the number of redirects webcheck should follow, 0 implies to follow all redirects')
+parser.add_argument(
+ '-l', '--max-depth', '--levels', metavar='N', type=int, default=config.MAX_DEPTH,
+ help='maximum depth of links to follow from base urls')
+parser.add_argument(
+ '-w', '--wait', metavar='SECONDS', type=float, default=config.WAIT_BETWEEN_REQUESTS,
+ help='wait SECONDS between retrievals')
+parser.add_argument(
+ '--profile', action='store_true', help=argparse.SUPPRESS)
+parser.add_argument(
+ 'base_urls', metavar='URL', nargs='+')
+
def parse_args(crawler):
"""Parse command-line arguments."""
# these global options are set here
global LOGLEVEL
- try:
- optlist, args = getopt.gnu_getopt(sys.argv[1:],
- 'i:x:y:l:baqdo:cfr:u:w:Vh',
- ('internal=', 'external=', 'yank=', 'base-only', 'avoid-external',
- 'ignore-robots',
- 'quiet', 'silent', 'debug', 'profile', 'output=', 'continue',
- 'force', 'redirects=', 'levels=', 'wait=', 'version', 'help'))
- internal_urls = []
- external_urls = []
- yank_urls = []
- for flag, arg in optlist:
- if flag in ('-i', '--internal'):
- internal_urls.append(arg)
- elif flag in ('-x', '--external'):
- external_urls.append(arg)
- elif flag in ('-y', '--yank'):
- yank_urls.append(arg)
- elif flag in ('-b', '--base-only'):
- config.BASE_URLS_ONLY = True
- elif flag in ('-a', '--avoid-external'):
- config.AVOID_EXTERNAL_LINKS = True
- elif flag in ('--ignore-robots',):
- config.USE_ROBOTS = False
- elif flag in ('-q', '--quiet', '--silent'):
- LOGLEVEL = logging.WARNING
- elif flag in ('-d', '--debug'):
- LOGLEVEL = logging.DEBUG
- elif flag in ('--profile',):
- # undocumented on purpose
- pass
- elif flag in ('-o', '--output'):
- config.OUTPUT_DIR = arg
- elif flag in ('-c', '--continue'):
- config.CONTINUE = True
- elif flag in ('-f', '--force'):
- config.OVERWRITE_FILES = True
- elif flag in ('-r', '--redirects'):
- config.REDIRECT_DEPTH = int(arg)
- elif flag in ('-l', '--levels'):
- config.MAX_DEPTH = int(arg)
- elif flag in ('-w', '--wait'):
- config.WAIT_BETWEEN_REQUESTS = float(arg)
- elif flag in ('-V', '--version'):
- print_version()
- sys.exit(0)
- elif flag in ('-h', '--help'):
- print_help()
- sys.exit(0)
- if len(args) == 0 and not config.CONTINUE:
- print_usage()
- print_tryhelp()
- sys.exit(1)
- # add configuration to site
- for pattern in internal_urls:
- crawler.add_internal_re(pattern)
- for pattern in external_urls:
- crawler.add_external_re(pattern)
- for pattern in yank_urls:
- crawler.add_yanked_re(pattern)
- for arg in args:
- # if it does not look like a url it is probably a local file
- if urlparse.urlsplit(arg)[0] == '':
- arg = 'file://' + urllib.pathname2url(os.path.abspath(arg))
- crawler.add_base(arg)
- except getopt.error, reason:
- sys.stderr.write('webcheck: %s\n' % reason)
- print_tryhelp()
- sys.exit(1)
- except re.error, e:
- sys.stderr.write('webcheck: %s\n' % str(e))
- sys.exit(1)
+ args = parser.parse_args()
+ for pattern in args.internal:
+ crawler.add_internal_re(pattern)
+ for pattern in args.external:
+ crawler.add_external_re(pattern)
+ for pattern in args.yank:
+ crawler.add_yanked_re(pattern)
+ config.BASE_URLS_ONLY = args.base_only
+ config.AVOID_EXTERNAL_LINKS = args.avoid_external
+ config.USE_ROBOTS = not(args.ignore_robots)
+ if args.quiet:
+ LOGLEVEL = logging.WARNING
+ elif args.debug:
+ LOGLEVEL = logging.DEBUG
+ config.OUTPUT_DIR = args.output
+ config.CONTINUE = getattr(args, 'continue')
+ config.OVERWRITE_FILES = args.force
+ config.REDIRECT_DEPTH = args.redirects
+ config.MAX_DEPTH = args.max_depth
+ config.WAIT_BETWEEN_REQUESTS = args.wait
+ for arg in args.urls:
+ # if it does not look like a url it is probably a local file
+ if urlparse.urlsplit(arg)[0] == '':
+ arg = 'file://' + urllib.pathname2url(os.path.abspath(arg))
+ crawler.add_base(arg)
def main(crawler):
"""Main program."""
- # configure logging
logging.basicConfig(format='webcheck: %(levelname)s: %(message)s',
level=LOGLEVEL)
- # crawl through the website
logging.info('checking site....')
- crawler.crawl() # this will take a while
+ crawler.crawl()
logging.info('done.')
- # do postprocessing (building site structure, etc)
logging.info('postprocessing....')
crawler.postprocess()
logging.info('done.')
- # now we can write out the files
- # start with the frame-description page
logging.info('generating reports...')
- # for every plugin, generate a page
crawler.generate()
logging.info('done.')
+
def entry_point():
"""setuptools entry point"""
- # initialize crawler object
crawler = Crawler()
- # parse command-line arguments
parse_args(crawler)
- # run the main program
main(crawler)
-----------------------------------------------------------------------
Summary of changes:
run.py | 28 ++---
webcheck/cmd.py | 258 +++++++++++++++++-------------------------
webcheck/crawler.py | 112 +++++++++++-------
webcheck/plugins/__init__.py | 22 ++--
webcheck/plugins/about.py | 14 +--
5 files changed, 194 insertions(+), 240 deletions(-)
hooks/post-receive
--
webcheck
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits/