webcheck commit: r427 - in webcheck: . parsers parsers/html plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r427 - in webcheck: . parsers parsers/html plugins
- Date: Thu, 18 Aug 2011 23:22:29 +0200 (CEST)
Author: arthur
Date: Thu Aug 18 23:22:26 2011
New Revision: 427
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=427
Log:
make source code changes to follow PEP 8 more closely
Modified:
webcheck/config.py
webcheck/crawler.py
webcheck/db.py
webcheck/debugio.py
webcheck/monkeypatch.py
webcheck/myurllib.py
webcheck/parsers/__init__.py
webcheck/parsers/css.py
webcheck/parsers/html/__init__.py
webcheck/parsers/html/beautifulsoup.py
webcheck/parsers/html/calltidy.py
webcheck/parsers/html/htmlparser.py
webcheck/plugins/__init__.py
webcheck/plugins/about.py
webcheck/plugins/anchors.py
webcheck/plugins/badlinks.py
webcheck/plugins/external.py
webcheck/plugins/images.py
webcheck/plugins/new.py
webcheck/plugins/notchkd.py
webcheck/plugins/notitles.py
webcheck/plugins/old.py
webcheck/plugins/problems.py
webcheck/plugins/sitemap.py
webcheck/plugins/size.py
webcheck/plugins/urllist.py
webcheck/webcheck.py
Modified: webcheck/config.py
==============================================================================
--- webcheck/config.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/config.py Thu Aug 18 23:22:26 2011 (r427)
@@ -64,19 +64,19 @@
REDIRECT_DEPTH = 5
# The list of plugins that will be used to generate the report.
-PLUGINS = [ 'anchors',
- 'sitemap',
- 'urllist',
- 'images',
- 'external',
- 'notchkd',
- 'badlinks',
- 'old',
- 'new',
- 'size',
- 'notitles',
- 'problems',
- 'about' ]
+PLUGINS = ['anchors',
+ 'sitemap',
+ 'urllist',
+ 'images',
+ 'external',
+ 'notchkd',
+ 'badlinks',
+ 'old',
+ 'new',
+ 'size',
+ 'notitles',
+ 'problems',
+ 'about']
# Whether to overwrite files without asking. This is the state of the -f
# command line option.
@@ -107,11 +107,11 @@
# A list of names that will be checked when encountering an file:///
# directory. This file will be picked up instead of the directory list.
-FILE_INDEXES = [ 'index.html', 'index.htm' ]
+FILE_INDEXES = ['index.html', 'index.htm']
# A list of names that will be checked when encountering an ftp://
# directory. This file will be picked up instead of the directory list.
-FTP_INDEXES = [ 'index.html', 'index.htm' ]
+FTP_INDEXES = ['index.html', 'index.htm']
# Whether to fetch robots.txt files and do checking based on the information
# present in those files (normally matching links are yanked).
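
For context, the config.py hunks above mainly drop the padding spaces that used to sit just inside list brackets; PEP 8 recommends no whitespace immediately inside brackets or parentheses. A minimal illustrative snippet, not taken from the commit:

    # discouraged: padding whitespace just inside the brackets
    indexes = [ 'index.html', 'index.htm' ]
    # preferred PEP 8 spelling
    indexes = ['index.html', 'index.htm']
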
Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/crawler.py Thu Aug 18 23:22:26 2011 (r427)
@@ -70,7 +70,8 @@
pass
atexit.register(cookiejar.save, ignore_discard=False, ignore_expires=False)
# set up our custom opener that sets a meaningful user agent
- opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar), NoRedirectHandler())
+ opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar),
+ NoRedirectHandler())
opener.addheaders = [
('User-agent', 'webcheck %s' % config.VERSION),
]
@@ -147,7 +148,7 @@
if config.BASE_URLS_ONLY:
# the url must start with one of the _internal_urls
for i in self._internal_urls:
- res |= (i==url[:len(i)])
+ res |= (i == url[:len(i)])
else:
# the netloc must match a netloc of an _internal_url
netloc = urlparse.urlsplit(url)[1]
@@ -169,18 +170,19 @@
netloc."""
# only some schemes have a meaningful robots.txt file
if scheme != 'http' and scheme != 'https':
- debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % scheme)
+ debugio.debug('crawler._get_robotparser() '
+ 'called with unsupported scheme (%s)' % scheme)
return None
# split out the key part of the url
location = urlparse.urlunsplit((scheme, netloc, '', '', ''))
# try to create a new robotparser if we don't already have one
- if not self._robotparsers.has_key(location):
+ if location not in self._robotparsers:
debugio.info(' getting robots.txt for %s' % location)
self._robotparsers[location] = None
try:
rp = robotparser.RobotFileParser()
rp.set_url(urlparse.urlunsplit(
- (scheme, netloc, '/robots.txt', '', '') ))
+ (scheme, netloc, '/robots.txt', '', '')))
rp.read()
self._robotparsers[location] = rp
except (TypeError, IOError, httplib.HTTPException):
@@ -277,7 +279,8 @@
session.commit()
# sleep between requests if configured
if config.WAIT_BETWEEN_REQUESTS > 0:
- debugio.debug('crawler.crawl(): sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS)
+ debugio.debug('crawler.crawl(): sleeping %s seconds' %
+ config.WAIT_BETWEEN_REQUESTS)
time.sleep(config.WAIT_BETWEEN_REQUESTS)
debugio.debug('crawler.crawl(): items left to check: %d' %
len(tocheck))
session.commit()
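
Besides whitespace, the crawler.py hunk replaces dict.has_key() with the in operator. has_key() was deprecated in Python 2 and removed entirely in Python 3, so the membership test is the forward-compatible spelling. A small stand-alone sketch of the pattern (the variable names are illustrative, not webcheck's):

    robotparsers = {}
    location = 'http://example.org'
    # old spelling, Python 2 only: robotparsers.has_key(location)
    if location not in robotparsers:
        robotparsers[location] = None
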
Modified: webcheck/db.py
==============================================================================
--- webcheck/db.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/db.py Thu Aug 18 23:22:26 2011 (r427)
@@ -177,7 +177,7 @@
if self.anchors.filter(Anchor.anchor == anchor).first():
self.add_pageproblem(
'anchor/id "%(anchor)s" defined multiple times'
- % { 'anchor': anchor })
+ % {'anchor': anchor})
else:
self.anchors.append(Anchor(anchor=anchor))
Modified: webcheck/debugio.py
==============================================================================
--- webcheck/debugio.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/debugio.py Thu Aug 18 23:22:26 2011 (r427)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -32,30 +32,34 @@
# log levels that can be used
ERROR = 0
-WARN = 1
-INFO = 2
+WARN = 1
+INFO = 2
DEBUG = 3
# initialize logging at default level
loglevel = INFO
+
def debug(msg):
"""Log the message to stderr if loglevel will allow it."""
if loglevel >= DEBUG:
- sys.stderr.write('webcheck: DEBUG: '+str(msg)+'\n')
+ sys.stderr.write('webcheck: DEBUG: ' + str(msg) + '\n')
+
def info(msg):
"""Log the message to stdout if loglevel will allow it."""
if loglevel >= INFO:
- sys.stdout.write('webcheck: '+str(msg)+'\n')
+ sys.stdout.write('webcheck: ' + str(msg) + '\n')
sys.stdout.flush()
+
def warn(msg):
"""Log a warning to stderr if loglevel will allow it."""
if loglevel >= WARN:
- sys.stderr.write('webcheck: Warning: '+str(msg)+'\n')
+ sys.stderr.write('webcheck: Warning: ' + str(msg) + '\n')
+
def error(msg):
"""Log an error to stderr if loglevel will allow it."""
if loglevel >= ERROR:
- sys.stderr.write('webcheck: Error: '+str(msg)+'\n')
+ sys.stderr.write('webcheck: Error: ' + str(msg) + '\n')
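
The debugio.py changes apply two more PEP 8 conventions: top-level function definitions are separated by two blank lines, and binary operators such as + get one space on each side. A hedged, self-contained sketch of the same layout (not the actual module):

    import sys


    def info(msg):
        """Print an informational message to stdout."""
        sys.stdout.write('example: ' + str(msg) + '\n')


    def warn(msg):
        """Print a warning to stderr."""
        sys.stderr.write('example: Warning: ' + str(msg) + '\n')
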
Modified: webcheck/monkeypatch.py
==============================================================================
--- webcheck/monkeypatch.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/monkeypatch.py Thu Aug 18 23:22:26 2011 (r427)
@@ -28,6 +28,7 @@
__all__ = []
+
# This monkeypatches RuleLine.applies_to to support * and $ characters in
# robots.txt path names.
def my_applies_to(ruleline, filename):
Modified: webcheck/myurllib.py
==============================================================================
--- webcheck/myurllib.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/myurllib.py Thu Aug 18 23:22:26 2011 (r427)
@@ -54,6 +54,7 @@
# pattern for leading dots
_leadingdotpattern = re.compile('^(/\.\.)*')
+
def _unescape_printable(match):
"""Helper function for _normalize_escapes() to perform the expansion of
html entity refs that are normal printable (but not reserver)
@@ -65,6 +66,7 @@
# transform remaining escapes to uppercase
return match.group(1).upper()
+
def _normalize_escapes(url):
"""Ensure that escaping in the url is consistent. Any reserved characters
are left alone. Any characters that are printable but are escaped are
@@ -75,9 +77,10 @@
# url encode any nonprintable or problematic characters (but not reserved
# characters) so we're left with a string with everything that needs to be
# quoted as such
- url = _urlprobpattern.sub(lambda x:'%%%02X' % ord(x.group(1)), url)
+ url = _urlprobpattern.sub(lambda x: '%%%02X' % ord(x.group(1)), url)
return url
+
def _urlclean(url):
"""Clean the url of uneccesary parts."""
# make escaping consistent
@@ -85,9 +88,9 @@
# split the url in useful parts
(scheme, netloc, path, query, fragment) = urlparse.urlsplit(url)
# remove any leading /../ parts
- if scheme in ( 'http', 'https' ):
+ if scheme in ('http', 'https'):
path = _leadingdotpattern.sub('', path)
- if scheme in ( 'http', 'https', 'ftp' ):
+ if scheme in ('http', 'https', 'ftp'):
# http(s) urls should have a non-empty path
if path == '':
path = '/'
@@ -104,13 +107,14 @@
if netloc[-1:] == ':':
netloc = netloc[:-1]
if userpass is not None:
- netloc = userpass+'@'+netloc
+ netloc = userpass + '@' + netloc
# get rid of double slashes in some paths
- if ( scheme == 'file' ):
+ if scheme == 'file':
path = _doubleslashpattern.sub('/', path)
# put the url back together again
return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
+
def normalizeurl(url):
"""Return a normalized URL."""
return _urlclean(url)
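
In myurllib.py the changes are again cosmetic: a space after the colon in lambda expressions, spaces around + when concatenating, and no parentheses around a plain comparison. A tiny example of the lambda spacing (the regular expression is a simplified stand-in, not webcheck's actual pattern):

    import re

    _problem = re.compile('([^-a-zA-Z0-9._~()/:?#&=%])')
    # PEP 8 expects a space after the colon in a lambda
    quoted = _problem.sub(lambda m: '%%%02X' % ord(m.group(1)), 'a b')
    # quoted is now 'a%20b'
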
Modified: webcheck/parsers/__init__.py
==============================================================================
--- webcheck/parsers/__init__.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/parsers/__init__.py Thu Aug 18 23:22:26 2011 (r427)
@@ -1,7 +1,7 @@
# __init__.py - general content-type parser interface
#
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -34,24 +34,27 @@
# a map of mimetypes to modules
_parsermodules = {}
+
def _init_modules():
"""Initialize the modules."""
# go throught all known modules to probe the content-types
# (do this only once)
for mod in _modules:
- parser = __import__('parsers.'+mod, globals(), locals(), [mod])
+ parser = __import__('parsers.' + mod, globals(), locals(), [mod])
for mimetype in parser.mimetypes:
_parsermodules[mimetype] = parser
+
def get_parsermodule(mimetype):
"""Look up the correct module for the specified mimetype."""
if _parsermodules == {}:
_init_modules()
# check if we have a supported content-type
- if _parsermodules.has_key(mimetype):
+ if mimetype in _parsermodules:
return _parsermodules[mimetype]
return None
+
def get_mimetypes():
"""Return a list of supported mime types that can be parsed
by the installed parsers."""
Modified: webcheck/parsers/css.py
==============================================================================
--- webcheck/parsers/css.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/parsers/css.py Thu Aug 18 23:22:26 2011 (r427)
@@ -30,15 +30,16 @@
import re
# pattern for matching /* ... */ comments in css
-_commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE|re.DOTALL)
+_commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE | re.DOTALL)
# pattern for matching @import "url" statments in css
_importpattern = re.compile('@import\s+["\']([^"\']*)["\']',
- re.IGNORECASE|re.DOTALL)
+ re.IGNORECASE | re.DOTALL)
# pattern for matching url(...) in css
_urlpattern = re.compile('url\(["\']?(.*?)["\']?\)')
+
def parse(content, link, base=None):
"""Parse the specified content and extract information for crawling the
site further."""
Modified: webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/parsers/html/__init__.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/parsers/html/__init__.py Thu Aug 18 23:22:26 2011 (r427)
@@ -35,6 +35,7 @@
# pattern for matching all html entities
_entitypattern = re.compile('&(#[0-9]{1,6}|[a-zA-Z]{2,10});')
+
def htmlescape(txt, inattr=False):
"""HTML escape the given string and return an ASCII clean string with
known entities and character entities for the other values.
@@ -54,21 +55,22 @@
out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
else:
out += '"'
- elif htmlentitydefs.codepoint2name.has_key(ord(c)):
+ elif ord(c) in htmlentitydefs.codepoint2name:
out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
elif ord(c) > 126:
- out += '&#%d;'% ord(c)
+ out += '&#%d;' % ord(c)
elif inattr and c == u'\n':
out += ' '
else:
out += c.encode('utf-8')
return out
+
def _unescape_entity(match):
"""Helper function for htmlunescape().
This funcion unescapes a html entity, it is passed to the sub()
function."""
- if htmlentitydefs.name2codepoint.has_key(match.group(1)):
+ if match.group(1) in htmlentitydefs.name2codepoint:
# we have a named entity, return proper character
return unichr(htmlentitydefs.name2codepoint[match.group(1)])
elif match.group(1)[0] == '#':
@@ -78,6 +80,7 @@
# we have something else, just keep the original
return match.group(0)
+
def htmlunescape(txt):
"""This function unescapes a html encoded string.
This function returns a unicode string."""
@@ -92,6 +95,7 @@
# we're done
return txt
+
def _parsefunction(content, link):
# we find a suitable parse function
global _parsefunction
@@ -102,12 +106,14 @@
_parsefunction = parsers.html.beautifulsoup.parse
except ImportError:
# fall back to legacy HTMLParser parser
- debugio.warn('falling back to the legacy HTML parser, consider installing BeautifulSoup')
+ debugio.warn('falling back to the legacy HTML parser, '
+ 'consider installing BeautifulSoup')
import parsers.html.htmlparser
_parsefunction = parsers.html.htmlparser.parse
# call the actual parse function
_parsefunction(content, link)
+
def parse(content, link):
"""Parse the specified content and extract an url list, a list of images a
title and an author. The content is assumed to contain HMTL."""
Modified: webcheck/parsers/html/beautifulsoup.py
==============================================================================
--- webcheck/parsers/html/beautifulsoup.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/parsers/html/beautifulsoup.py Thu Aug 18 23:22:26 2011 (r427)
@@ -40,7 +40,9 @@
# check BeautifulSoup find() function for bugs
if BeautifulSoup.BeautifulSoup('<foo>').find('foo', bar=True):
import debugio
- debugio.warn('using buggy version of BeautifulSoup (%s)' % BeautifulSoup.__version__)
+ debugio.warn('using buggy version of BeautifulSoup (%s)' %
+ BeautifulSoup.__version__)
+
def parse(content, link):
"""Parse the specified content and extract an url list, a list of images a
@@ -67,21 +69,24 @@
base = link.url
# <link rel="TYPE" href="URL">
for l in soup.findAll('link', rel=True, href=True):
- if l['rel'].lower() in ('stylesheet', 'alternate stylesheet', 'icon', 'shortcut icon'):
+ if l['rel'].lower() in ('stylesheet', 'alternate stylesheet', 'icon',
+ 'shortcut icon'):
embed = myurllib.normalizeurl(htmlunescape(l['href']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <meta name="author" content="AUTHOR">
- author = soup.find('meta', attrs={'name': re.compile("^author$", re.I), 'content': True})
+ author = soup.find('meta', attrs={'name': re.compile("^author$", re.I),
+ 'content': True})
if author and author['content']:
link.author = htmlunescape(author['content']).strip()
# <meta http-equiv="refresh" content="0;url=URL">
- refresh = soup.find('meta', attrs={'http-equiv': _refreshhttpequivpattern, 'content': True})
+ refresh = soup.find('meta', attrs={'http-equiv': _refreshhttpequivpattern,
+ 'content': True})
if refresh and refresh['content']:
try:
child = _refershcontentpattern.search(refresh['content']).group(1)
except AttributeError:
- pass # ignore cases where refresh header parsing causes problems
+ pass # ignore cases where refresh header parsing causes problems
else:
link.add_child(urlparse.urljoin(base, child))
# <img src="URL">
@@ -100,7 +105,8 @@
# get anchor name
a_name = myurllib.normalizeurl(htmlunescape(a['name']).strip())
# if both id and name are used they should be the same
- if a.has_key('id') and a_name != myurllib.normalizeurl(htmlunescape(a['id']).strip()):
+ if 'id' in a and \
+ a_name != myurllib.normalizeurl(htmlunescape(a['id']).strip()):
link.add_pageproblem(
'anchors defined in name and id attributes do not match')
# add the id anchor anyway
@@ -110,7 +116,7 @@
# <ANY id="ID">
for elem in soup.findAll(id=True):
# skip anchor that have a name
- if elem.name == 'a' and elem.has_key('name'):
+ if elem.name == 'a' and 'name' in elem:
continue
# add the anchor
link.add_anchor(myurllib.normalizeurl(htmlunescape(elem['id']).strip()))
@@ -142,7 +148,7 @@
# <applet code="URL" [archive="URL"]...>
for applet in soup.findAll('applet', code=True):
# if applet has archive tag check that
- if applet.has_key('archive'):
+ if 'archive' in applet:
embed = myurllib.normalizeurl(htmlunescape(applet['archive']).strip())
else:
embed = myurllib.normalizeurl(htmlunescape(applet['code']).strip())
@@ -154,7 +160,9 @@
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <embed><param name="movie" value="url"></embed>
- for param in soup.findAll('param', attrs={'name': re.compile("^movie$", re.I), 'value': True}):
+ for param in soup.findAll('param', attrs={
+ 'name': re.compile("^movie$", re.I),
+ 'value': True}):
embed = myurllib.normalizeurl(htmlunescape(param['value']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
@@ -175,7 +183,7 @@
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <body|table|td background="url">
- for t in soup.findAll( ('body', 'table', 'td'), background=True):
+ for t in soup.findAll(('body', 'table', 'td'), background=True):
embed = myurllib.normalizeurl(htmlunescape(t['background']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
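
Most of the beautifulsoup.py hunks only reflow long statements: the arguments of a call that ran past the 79-character limit are continued on the next line, aligned under the opening bracket. A toy example of that continuation style (the function and values are made up for illustration):

    def describe(url, status, title=None):
        """Return a one-line summary; continuation lines align under the bracket."""
        return '%s [%s] %s' % (url.strip(),
                               status,
                               title or '(no title)')

    print(describe('http://example.org/ ', 200))
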
Modified: webcheck/parsers/html/calltidy.py
==============================================================================
--- webcheck/parsers/html/calltidy.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/parsers/html/calltidy.py Thu Aug 18 23:22:26 2011 (r427)
@@ -24,6 +24,7 @@
import config
import parsers.html
+
def parse(content, link):
"""Parse the specified content with tidy and add any errors to the
link."""
Modified: webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/parsers/html/htmlparser.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/parsers/html/htmlparser.py Thu Aug 18 23:22:26 2011 (r427)
@@ -45,6 +45,7 @@
# pattern for matching the encoding part of an xml declaration
_encodingpattern = re.compile('^xml .*encoding="([^"]*)"', re.I)
+
class _MyHTMLParser(HTMLParser.HTMLParser):
"""A simple subclass of HTMLParser.HTMLParser continuing after errors
and gathering some information from the parsed content."""
@@ -81,9 +82,9 @@
# (characters are escaped in myurllib.normalizeurl())
if _spacepattern.search(url):
self.link.add_pageproblem(
- what+' contains unescaped spaces: '+url+', '+self._location() )
+ what + ' contains unescaped spaces: ' + url + ', ' + self._location())
# replace &#nnn; entity refs with proper characters
- url = _charentitypattern.sub(lambda x:chr(int(x.group(1))), url)
+ url = _charentitypattern.sub(lambda x: chr(int(x.group(1))), url)
return myurllib.normalizeurl(url)
def error(self, message):
@@ -91,7 +92,7 @@
# construct error message
message += ', ' + self._location()
# store error message
- debugio.debug('parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: '+message)
+ debugio.debug('parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: ' + message)
if self.errmsg is None:
self.errmsg = message
# increment error count
@@ -115,48 +116,48 @@
if tag == 'title':
self.collect = ''
# <base href="URL">
- elif tag == 'base' and attrs.has_key('href'):
+ elif tag == 'base' and 'href' in attrs:
self.base = self._cleanurl(attrs['href'])
# <link rel="type" href="URL">
- elif tag == 'link' and attrs.has_key('rel') and attrs.has_key('href'):
+ elif tag == 'link' and 'rel' in attrs and 'href' in attrs:
if attrs['rel'].lower() in ('stylesheet', 'alternate stylesheet',
'icon', 'shortcut icon'):
self.embedded.append(self._cleanurl(attrs['href']))
# <meta name="author" content="AUTHOR">
- elif tag == 'meta' and attrs.has_key('name') and attrs.has_key('content') and attrs['name'].lower() == 'author':
+ elif tag == 'meta' and 'name' in attrs and 'content' in attrs and attrs['name'].lower() == 'author':
if self.author is None:
self.author = attrs['content']
# <meta http-equiv="refresh" content="0;url=URL">
- elif tag == 'meta' and attrs.has_key('http-equiv') and attrs.has_key('content') and attrs['http-equiv'].lower() == 'refresh':
- pass # TODO: implement
+ elif tag == 'meta' and 'http-equiv' in attrs and 'content' in attrs and attrs['http-equiv'].lower() == 'refresh':
+ pass # TODO: implement
# <meta http-equiv="content-type" content="text/html; charset=utf-8" />
- elif tag == 'meta' and attrs.has_key('http-equiv') and attrs.has_key('content') and attrs['http-equiv'].lower() == 'content-type':
+ elif tag == 'meta' and 'http-equiv' in attrs and 'content' in attrs and attrs['http-equiv'].lower() == 'content-type':
try:
self.link.set_encoding(_charsetpattern.search(attrs['content']).group(1))
except AttributeError:
# ignore cases where encoding is not set in header
pass
# <img src="url">
- elif tag == 'img' and attrs.has_key('src'):
+ elif tag == 'img' and 'src' in attrs:
self.embedded.append(self._cleanurl(attrs['src']))
# <a href="url" name="anchor" id="anchor">
elif tag == 'a':
# <a href="url">
- if attrs.has_key('href'):
+ if 'href' in attrs:
self.children.append(self._cleanurl(attrs['href']))
# <a name="anchor">
a_name = None
- if attrs.has_key('name'):
+ if 'name' in attrs:
a_name = self._cleanurl(attrs['name'], 'anchor')
# <a id="anchor">
a_id = None
- if attrs.has_key('id'):
+ if 'id' in attrs:
a_id = self._cleanurl(attrs['id'], 'anchor')
# if both id and name are used they should be the same
if a_id and a_name and a_id != a_name:
# add problem
self.link.add_pageproblem(
'anchors defined in name and id attributes do not match %(location)s'
- % { 'location': self._location() })
+ % {'location': self._location()})
elif a_id == a_name:
# ignore id if it's the same as name
a_id = None
@@ -165,8 +166,8 @@
if a_name in self.anchors:
self.link.add_pageproblem(
'anchor "%(anchor)s" defined again %(location)s'
- % { 'anchor': a_name,
- 'location': self._location() })
+ % {'anchor': a_name,
+ 'location': self._location()})
else:
self.anchors.append(a_name)
# <a id="anchor">
@@ -174,40 +175,40 @@
if a_id in self.anchors:
self.link.add_pageproblem(
'anchor "%(anchor)s" defined again %(location)s'
- % { 'anchor': a_id,
- 'location': self._location() })
+ % {'anchor': a_id,
+ 'location': self._location()})
else:
self.anchors.append(a_id)
# <frameset><frame src="url"...>...</frameset>
- elif tag == 'frame' and attrs.has_key('src'):
+ elif tag == 'frame' and 'src' in attrs:
self.embedded.append(self._cleanurl(attrs['src']))
# <map><area href="url"...>...</map>
- elif tag == 'area' and attrs.has_key('href'):
+ elif tag == 'area' and 'href' in attrs:
self.children.append(self._cleanurl(attrs['href']))
# <applet archive="URL"...>
- elif tag == 'applet' and attrs.has_key('archive'):
+ elif tag == 'applet' and 'archive' in attrs:
self.embedded.append(self._cleanurl(attrs['archive']))
# <applet code="URL"...>
- elif tag == 'applet' and attrs.has_key('code'):
+ elif tag == 'applet' and 'code' in attrs:
self.embedded.append(self._cleanurl(attrs['code']))
# <embed src="url"...>
- elif tag == 'embed' and attrs.has_key('src'):
+ elif tag == 'embed' and 'src' in attrs:
self.embedded.append(self._cleanurl(attrs['src']))
# <embed><param name="movie" value="url"></embed>
- elif tag == 'param' and attrs.has_key('name') and attrs.has_key('value'):
+ elif tag == 'param' and 'name' in attrs and 'value' in attrs:
if attrs['name'].lower() == 'movie':
self.embedded.append(self._cleanurl(attrs['value']))
# <style>content</style>
elif tag == 'style':
self.collect = ''
# <script src="url">
- elif tag == 'script' and attrs.has_key('src'):
+ elif tag == 'script' and 'src' in attrs:
self.embedded.append(self._cleanurl(attrs['src']))
# <body|table|td background="url">
- elif tag in ('body', 'table', 'td') and attrs.has_key('background'):
+ elif tag in ('body', 'table', 'td') and 'background' in attrs:
self.embedded.append(self._cleanurl(attrs['background']))
# pick up any tags with a style attribute
- if attrs.has_key('style'):
+ if 'style' in attrs:
# delegate handling of inline css to css module
import parsers.css
parsers.css.parse(attrs['style'], self.link, self.base)
@@ -230,13 +231,13 @@
def handle_charref(self, name):
"""Handle character references (e.g. A) by passing the data to
handle_data()."""
- self.handle_data('&#'+name+';')
+ self.handle_data('&#' + name + ';')
# TODO: do not pass ; if plain text does not contain it?
def handle_entityref(self, name):
"""Handle entity references (e.g. é) by passing the data to
handle_data()."""
- self.handle_data('&'+name+';')
+ self.handle_data('&' + name + ';')
# TODO: do not pass ; if plain text does not contain it?
def handle_pi(self, data):
@@ -247,6 +248,7 @@
except AttributeError:
pass
+
def _maketxt(txt, encoding):
"""Return an unicode text of the specified string do correct character
conversions and replacing html entities with normal characters."""
@@ -259,6 +261,7 @@
# fall back to locale's encoding
return htmlunescape(unicode(txt, errors='replace'))
+
def parse(content, link):
"""Parse the specified content and extract an url list, a list of images a
title and an author. The content is assumed to contain HMTL."""
Modified: webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/__init__.py Thu Aug 18 23:22:26 2011 (r427)
@@ -54,12 +54,14 @@
# reference function from html module
htmlescape = parsers.html.htmlescape
+
def get_title(link):
"""Returns the title of a link if it is set otherwise returns url."""
if link.title is None or link.title == '':
return link.url
return link.title
+
def _floatformat(f):
"""Return a float as a string while trying to keep it within three
characters."""
@@ -69,26 +71,29 @@
txt = txt[:txt.find('.')]
return txt
+
def get_size(i):
"""Return the size in bytes as a readble string."""
K = 1024
- M = K*1024
- G = M*1024
- if i > 1024*1024*999:
- return _floatformat(float(i)/float(G))+'G'
- elif i > 1024*999:
- return _floatformat(float(i)/float(M))+'M'
+ M = K * 1024
+ G = M * 1024
+ if i > 1024 * 1024 * 999:
+ return _floatformat(float(i) / float(G)) + 'G'
+ elif i > 1024 * 999:
+ return _floatformat(float(i) / float(M)) + 'M'
elif i >= 1024:
- return _floatformat(float(i)/float(K))+'K'
+ return _floatformat(float(i) / float(K)) + 'K'
else:
return '%d' % i
+
def _mk_unicode(txt):
"""Returns a unicode instance of the string."""
if not isinstance(txt, unicode):
txt = unicode(txt)
return txt
+
def get_info(link):
"""Return a string with a summary of the information in the link."""
info = u'url: %s\n' % _mk_unicode(link.url)
@@ -133,6 +138,7 @@
# trim trailing newline
return info.strip()
+
def make_link(link, title=None):
"""Return an <a>nchor to a url with title. If url is in the Linklist and
is external, insert "class=external" in the <a> tag."""
@@ -147,7 +153,13 @@
if config.REPORT_LINKS_IN_NEW_WINDOW:
target = 'target="_blank" '
# gather some information about the link to report
- return '<a href="'+htmlescape(link.url, True)+'" '+target+'class="'+cssclass+'" title="'+htmlescape(get_info(link), True)+'">'+htmlescape(title)+'</a>'
+ return '<a href="%(url)s" %(target)sclass="%(cssclass)s" title="%(info)s">%(title)s</a>' % \
+ dict(url=htmlescape(link.url, True),
+ target=target,
+ cssclass=cssclass,
+ info=htmlescape(get_info(link), True),
+ title=htmlescape(title))
+
def print_parents(fp, link, indent=' '):
"""Write a list of parents to the output file descriptor.
@@ -158,24 +170,25 @@
return
parents.sort(lambda a, b: cmp(a.title, b.title) or cmp(a.url, b.url))
fp.write(
- indent+'<div class="parents">\n'+
- indent+' referenced from:\n'+
- indent+' <ul>\n' )
+ indent + '<div class="parents">\n' +
+ indent + ' referenced from:\n' +
+ indent + ' <ul>\n')
more = 0
if len(parents) > config.PARENT_LISTLEN + 1:
more = len(parents) - config.PARENT_LISTLEN
parents = parents[:config.PARENT_LISTLEN]
for parent in parents:
fp.write(
- indent+' <li>%(parent)s</li>\n'
- % { 'parent': make_link(parent) })
+ indent + ' <li>%(parent)s</li>\n'
+ % {'parent': make_link(parent)})
if more:
fp.write(
- indent+' <li>%(more)d more...</li>\n'
- % { 'more': more })
+ indent + ' <li>%(more)d more...</li>\n'
+ % {'more': more})
fp.write(
- indent+' </ul>\n'+
- indent+'</div>\n' )
+ indent + ' </ul>\n' +
+ indent + '</div>\n')
+
def open_file(filename, istext=True, makebackup=False):
"""This returns an open file object which can be used for writing. This
@@ -189,8 +202,8 @@
os.mkdir(config.OUTPUT_DIR)
except OSError, (errno, strerror):
debugio.error('error creating directory %(dir)s: %(strerror)s' %
- { 'dir': config.OUTPUT_DIR,
- 'strerror': strerror })
+ {'dir': config.OUTPUT_DIR,
+ 'strerror': strerror})
sys.exit(1)
# build the output file name
fname = os.path.join(config.OUTPUT_DIR, filename)
@@ -198,7 +211,7 @@
if os.path.exists(fname):
if makebackup:
# create backup of original (overwriting previous backup)
- os.rename(fname, fname+'~')
+ os.rename(fname, fname + '~')
elif not config.OVERWRITE_FILES:
# ask to overwrite
try:
@@ -221,10 +234,11 @@
return open(fname, 'wb')
except IOError, (errno, strerror):
debugio.error('error creating output file %(fname)s: %(strerror)s' %
- { 'fname': fname,
- 'strerror': strerror })
+ {'fname': fname,
+ 'strerror': strerror})
sys.exit(1)
+
def _print_navbar(fp, plugin):
"""Return an html fragement representing the navigation bar for a page."""
fp.write(' <ul class="navbar">\n')
@@ -240,12 +254,13 @@
selected = ' class="selected"'
fp.write(
' <li><a href="%(pluginfile)s"%(selected)s
title="%(description)s">%(title)s</a></li>\n'
- % { 'pluginfile' : report.__outputfile__,
- 'selected' : selected,
- 'title' : htmlescape(report.__title__),
- 'description': htmlescape(report.__doc__) })
+ % {'pluginfile': report.__outputfile__,
+ 'selected': selected,
+ 'title': htmlescape(report.__title__),
+ 'description': htmlescape(report.__doc__)})
fp.write(' </ul>\n')
+
def open_html(plugin, site):
"""Print an html fragment for the start of an html page."""
# open the file
@@ -268,10 +283,10 @@
' </head>\n'
' <body>\n'
' <h1 class="basename">Webcheck report for <a
href="%(siteurl)s">%(sitetitle)s</a></h1>\n'
- % { 'sitetitle': htmlescape(get_title(base)),
- 'plugintitle': htmlescape(plugin.__title__),
- 'siteurl': base.url,
- 'version': config.VERSION })
+ % {'sitetitle': htmlescape(get_title(base)),
+ 'plugintitle': htmlescape(plugin.__title__),
+ 'siteurl': base.url,
+ 'version': config.VERSION})
# write navigation bar
_print_navbar(fp, plugin)
# write plugin heading
@@ -280,6 +295,7 @@
fp.write(' <div class="content">\n')
return fp
+
def close_html(fp):
"""Print an html fragment as footer of an html page."""
fp.write(' </div>\n')
@@ -290,11 +306,12 @@
' </p>\n'
' </body>\n'
'</html>\n'
- % { 'time': htmlescape(time.ctime(time.time())),
- 'homepage': config.HOMEPAGE,
- 'version': htmlescape(config.VERSION) })
+ % {'time': htmlescape(time.ctime(time.time())),
+ 'homepage': config.HOMEPAGE,
+ 'version': htmlescape(config.VERSION)})
fp.close()
+
def generate(site):
"""Generate pages for plugins."""
for p in config.PLUGINS:
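
The make_link() rewrite above trades a long chain of string concatenation for a single %-style format string fed from a dict, so each interpolated value gets a name and the statement can be wrapped cleanly. A minimal sketch of the idiom with placeholder values (not webcheck's real markup):

    values = dict(url='http://example.org/', cssclass='external',
                  info='url: http://example.org/', title='Example')
    html = ('<a href="%(url)s" class="%(cssclass)s" '
            'title="%(info)s">%(title)s</a>' % values)
    print(html)
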
Modified: webcheck/plugins/about.py
==============================================================================
--- webcheck/plugins/about.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/about.py Thu Aug 18 23:22:26 2011 (r427)
@@ -55,10 +55,10 @@
' This report was generated on %(time)s, a total of %(numurls)d\n'
' links were found.\n'
' </p>\n\n'
- % { 'version': plugins.htmlescape(config.VERSION),
- 'time': plugins.htmlescape(time.ctime(time.time())),
- 'numurls': site.links.count(),
- 'homepage': config.HOMEPAGE } )
+ % {'version': plugins.htmlescape(config.VERSION),
+ 'time': plugins.htmlescape(time.ctime(time.time())),
+ 'numurls': site.links.count(),
+ 'homepage': config.HOMEPAGE})
# output copyright information
fp.write(
' <h3>Copyright</h3>\n'
@@ -94,7 +94,7 @@
' Copyright © 2003-2005 Stuart Langridge, Paul McLanahan,\n'
' Peter Janes, Brad Choate, Dunstan Orchard, Ethan Marcotte,\n'
' Mark Wubben and Victor Kulinski\n'
- ' </p>\n\n' )
+ ' </p>\n\n')
# output plugin information
fp.write(
' <h3>Plugins</h3>\n'
@@ -104,10 +104,10 @@
fp.write(
' <li>\n'
' <strong>%s</strong><br />\n'
- % plugins.htmlescape(report.__title__) )
+ % plugins.htmlescape(report.__title__))
if hasattr(report, '__doc__'):
fp.write(' %s<br />\n' % plugins.htmlescape(report.__doc__))
fp.write(' </li>\n')
fp.write(
- ' </ul>\n' )
+ ' </ul>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/anchors.py
==============================================================================
--- webcheck/plugins/anchors.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/anchors.py Thu Aug 18 23:22:26 2011 (r427)
@@ -45,5 +45,5 @@
anchor.parent.add_pageproblem(
u'bad link: %(url)s#%(anchor)s: unknown anchor'
% {'url': link.url,
- 'anchor': anchor })
+ 'anchor': anchor})
# FIXME: commit changes in session
Modified: webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/plugins/badlinks.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/badlinks.py Thu Aug 18 23:22:26 2011 (r427)
@@ -42,26 +42,26 @@
fp.write(
' <p class="description">\n'
' There were no problems retrieving links from the website.\n'
- ' </p>\n' )
+ ' </p>\n')
plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
' These links could not be retrieved during the crawling of the website.\n'
' </p>\n'
- ' <ol>\n' )
+ ' <ol>\n')
for link in links:
# list the link
fp.write(
' <li>\n'
' %(badurl)s\n'
' <ul class="problems">\n'
- % { 'badurl': plugins.make_link(link,link.url) })
+ % {'badurl': plugins.make_link(link, link.url)})
# list the problems
for problem in link.linkproblems:
fp.write(
' <li>%(problem)s</li>\n'
- % { 'problem': plugins.htmlescape(problem) })
+ % {'problem': plugins.htmlescape(problem)})
fp.write(
' </ul>\n')
# present a list of parents
@@ -73,5 +73,5 @@
fp.write(
' </li>\n')
fp.write(
- ' </ol>\n' )
+ ' </ol>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/external.py
==============================================================================
--- webcheck/plugins/external.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/external.py Thu Aug 18 23:22:26 2011 (r427)
@@ -42,7 +42,7 @@
fp.write(
' <p class="description">'
' No external links were found on the website.'
- ' </p>\n' )
+ ' </p>\n')
plugins.close_html(fp)
return
fp.write(
@@ -50,16 +50,16 @@
' This is the list of all external urls encountered during the'
' examination of the website.'
' </p>\n'
- ' <ol>\n' )
+ ' <ol>\n')
for link in links:
fp.write(
' <li>\n'
' %(link)s\n'
- % { 'link': plugins.make_link(link) })
+ % {'link': plugins.make_link(link)})
# present a list of parents
plugins.print_parents(fp, link, ' ')
fp.write(
' </li>\n')
fp.write(
- ' </ol>\n' )
+ ' </ol>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/images.py
==============================================================================
--- webcheck/plugins/images.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/images.py Thu Aug 18 23:22:26 2011 (r427)
@@ -47,16 +47,16 @@
' <p class="description">\n'
' No images were linked on the website.\n'
' </p>\n'
- ' <ol>\n' )
+ ' <ol>\n')
plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
' This is the list of all images found linked on the website.\n'
' </p>\n'
- ' <ol>\n' )
+ ' <ol>\n')
for link in links:
fp.write(' <li>%s</li>\n' % plugins.make_link(link, link.url))
fp.write(
- ' </ol>\n' )
+ ' </ol>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/new.py
==============================================================================
--- webcheck/plugins/new.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/new.py Thu Aug 18 23:22:26 2011 (r427)
@@ -37,6 +37,7 @@
SECS_PER_DAY = 60 * 60 * 24
+
def generate(site):
"""Output the list of recently modified pages to the specified file
descriptor."""
# the time for which links are considered new
@@ -51,7 +52,7 @@
' <p class="description">\n'
' No pages were found that were modified within the last %(new)d days.\n'
' </p>\n'
- % { 'new': config.REPORT_WHATSNEW_URL_AGE })
+ % {'new': config.REPORT_WHATSNEW_URL_AGE})
plugins.close_html(fp)
return
fp.write(
@@ -59,9 +60,9 @@
' These pages have been recently modified (within %(new)d days).\n'
' </p>\n'
' <ul>\n'
- % { 'new': config.REPORT_WHATSNEW_URL_AGE })
+ % {'new': config.REPORT_WHATSNEW_URL_AGE})
for link in links:
- age = (time.time()-link.mtime)/SECS_PER_DAY
+ age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
' <li>\n'
' %(link)s\n'
@@ -69,7 +70,7 @@
' <li>age: %(age)d days</li>\n'
' </ul>\n'
' </li>\n'
- % { 'link': plugins.make_link(link),
- 'age': age })
+ % {'link': plugins.make_link(link),
+ 'age': age})
fp.write(' </ul>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/plugins/notchkd.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/notchkd.py Thu Aug 18 23:22:26 2011 (r427)
@@ -42,7 +42,7 @@
fp.write(
' <p class="description">\n'
' All links have been checked.\n'
- ' </p>\n' )
+ ' </p>\n')
plugins.close_html(fp)
return
fp.write(
@@ -55,11 +55,11 @@
fp.write(
' <li>\n'
' %(link)s\n'
- % { 'link': plugins.make_link(link, link.url) })
+ % {'link': plugins.make_link(link, link.url)})
# present a list of parents
plugins.print_parents(fp, link, ' ')
fp.write(
' </li>\n')
fp.write(
- ' </ol>\n' )
+ ' </ol>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/notitles.py
==============================================================================
--- webcheck/plugins/notitles.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/notitles.py Thu Aug 18 23:22:26 2011 (r427)
@@ -40,14 +40,14 @@
# get all internal pages without a title
links = site.links.filter_by(is_page=True, is_internal=True)
links = links.filter(or_(char_length(db.Link.title) == 0,
- db.Link.title ==None)).order_by(db.Link.url)
+ db.Link.title == None)).order_by(db.Link.url)
# present results
fp = plugins.open_html(plugins.notitles, site)
if not links:
fp.write(
' <p class="description">\n'
' All pages had a title specified.\n'
- ' </p>\n' )
+ ' </p>\n')
plugins.close_html(fp)
return
fp.write(
@@ -59,8 +59,8 @@
for link in links:
fp.write(
' <li>%(link)s</li>\n'
- % { 'link': plugins.make_link(link,link.url) })
+ % {'link': plugins.make_link(link, link.url)})
link.add_pageproblem('missing title')
fp.write(
- ' </ol>\n' )
+ ' </ol>\n')
plugins.close_html(fp)
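
One detail worth noting in the notitles.py hunk: the comparison stays db.Link.title == None rather than the usual 'is None'. Inside a SQLAlchemy filter expression that is intentional, because the overloaded == on a column is what produces the SQL IS NULL clause; 'is None' would be evaluated by Python itself and never build the right query. A hedged sketch of the distinction (the model and column names are invented, not webcheck's schema):

    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker

    Base = declarative_base()


    class Link(Base):
        __tablename__ = 'links'
        id = Column(Integer, primary_key=True)
        title = Column(String)


    engine = create_engine('sqlite:///:memory:')
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()
    # == None is rendered by SQLAlchemy as "title IS NULL"
    untitled = session.query(Link).filter(Link.title == None).all()
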
Modified: webcheck/plugins/old.py
==============================================================================
--- webcheck/plugins/old.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/old.py Thu Aug 18 23:22:26 2011 (r427)
@@ -37,6 +37,7 @@
SECS_PER_DAY = 60 * 60 * 24
+
def generate(site):
"""Output the list of outdated pages to the specified file descriptor."""
# the time for which links are considered old
@@ -51,7 +52,7 @@
' <p class="description">\n'
' No pages were found that were older than %(old)d days old.\n'
' </p>\n'
- % { 'old': config.REPORT_WHATSOLD_URL_AGE })
+ % {'old': config.REPORT_WHATSOLD_URL_AGE})
plugins.close_html(fp)
return
fp.write(
@@ -60,7 +61,7 @@
' days) and may be outdated.\n'
' </p>\n'
' <ul>\n'
- % {'old': config.REPORT_WHATSOLD_URL_AGE })
+ % {'old': config.REPORT_WHATSOLD_URL_AGE})
for link in links:
age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
@@ -70,10 +71,10 @@
' <li>age: %(age)d days</li>\n'
' </ul>\n'
' </li>\n'
- % { 'link': plugins.make_link(link),
- 'age': age })
+ % {'link': plugins.make_link(link),
+ 'age': age})
# add link to problem database
link.add_pageproblem('this page is %d days old' % age)
fp.write(
- ' </ul>\n' )
+ ' </ul>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/problems.py Thu Aug 18 23:22:26 2011 (r427)
@@ -41,12 +41,13 @@
name = name.lower()
import re
# strip any leading non alpha characters
- name = re.sub('^[^a-z]*','',name)
+ name = re.sub('^[^a-z]*', '', name)
# remove any non-allowed characters
- name = re.sub('[^a-z0-9_:.]+','-',name)
+ name = re.sub('[^a-z0-9_:.]+', '-', name)
# we're done
return name
+
def generate(site):
"""Output the overview of problems to the given file descriptor."""
# make a list of problems per author
@@ -61,7 +62,7 @@
else:
author = unicode('Unknown')
# store the problem
- if problem_db.has_key(author):
+ if author in problem_db:
problem_db[author].append(link)
else:
problem_db[author] = [link]
@@ -70,7 +71,7 @@
fp.write(
' <p class="description">\n'
' No problems were found on this site, hurray.\n'
- ' </p>\n' )
+ ' </p>\n')
plugins.close_html(fp)
return
# print description
@@ -78,7 +79,7 @@
' <p class="description">\n'
' This is an overview of all the problems on the site, grouped by\n'
' author.\n'
- ' </p>\n' )
+ ' </p>\n')
# get a list of authors
authors = problem_db.keys()
authors.sort()
@@ -88,8 +89,8 @@
for author in authors:
fp.write(
' <li><a href="#author_%(authorref)s">Author:
%(author)s</a></li>\n'
- % { 'authorref': plugins.htmlescape(_mk_id(author)),
- 'author': plugins.htmlescape(author) })
+ % {'authorref': plugins.htmlescape(_mk_id(author)),
+ 'author': plugins.htmlescape(author)})
fp.write(' </ul>\n')
# generate problem report
fp.write(' <ul>\n')
@@ -98,8 +99,8 @@
' <li id="author_%(authorref)s">\n'
' Author: %(author)s\n'
' <ul>\n'
- % { 'authorref': plugins.htmlescape(_mk_id(author)),
- 'author': plugins.htmlescape(author) })
+ % {'authorref': plugins.htmlescape(_mk_id(author)),
+ 'author': plugins.htmlescape(author)})
# sort pages by url
problem_db[author].sort(lambda a, b: cmp(a.url, b.url))
# list problems for this author
@@ -109,19 +110,19 @@
' <li>\n'
' %(link)s\n'
' <ul class="problems">\n'
- % { 'link': plugins.make_link(link) })
+ % {'link': plugins.make_link(link)})
# list the problems
for problem in link.pageproblems.order_by(db.PageProblem.message):
fp.write(
' <li>%(problem)s</li>\n'
- % { 'problem': plugins.htmlescape(problem) })
+ % {'problem': plugins.htmlescape(problem)})
# end the list item
fp.write(
' </ul>\n'
- ' </li>\n' )
+ ' </li>\n')
fp.write(
' </ul>\n'
- ' </li>\n' )
+ ' </li>\n')
fp.write(
- ' </ul>\n' )
+ ' </ul>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/plugins/sitemap.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/sitemap.py Thu Aug 18 23:22:26 2011 (r427)
@@ -52,6 +52,7 @@
(embed.depth == None or embed.depth > link.depth):
add_pagechildren(embed, children, explored)
+
def _explore(fp, link, explored, depth=0, indent=' '):
"""Recursively do a breadth first traversal of the graph of links on the
site. Prints the html results to the file descriptor."""
@@ -76,6 +77,7 @@
fp.write(indent + ' </ul>\n')
fp.write(indent + '</li>\n')
+
def generate(site):
"""Output the sitemap to the specified file descriptor."""
fp = plugins.open_html(plugins.sitemap, site)
@@ -84,10 +86,10 @@
' <p class="description">\n'
' This an overview of the crawled site.\n'
' </p>\n'
- ' <ul>\n' )
+ ' <ul>\n')
explored = set(x.id for x in site.bases)
for l in site.bases:
_explore(fp, l, explored)
fp.write(
- ' </ul>\n' )
+ ' </ul>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/size.py
==============================================================================
--- webcheck/plugins/size.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/size.py Thu Aug 18 23:22:26 2011 (r427)
@@ -53,12 +53,13 @@
link.total_size = size
return link.total_size
+
def generate(site):
"""Output the list of large pages to the given file descriptor."""
# get all internal pages and get big links
links = site.links.filter_by(is_page=True, is_internal=True)
- links = [ x for x in links
- if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024 ]
+ links = [x for x in links
+ if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024]
# sort links by size (biggest first)
links.sort(lambda a, b: cmp(b.total_size, a.total_size))
# present results
@@ -68,7 +69,7 @@
' <p class="description">\n'
' No pages over %(size)dK were found.\n'
' </p>\n'
- % { 'size': config.REPORT_SLOW_URL_SIZE })
+ % {'size': config.REPORT_SLOW_URL_SIZE})
plugins.close_html(fp)
return
fp.write(
@@ -77,7 +78,7 @@
' slow to download.\n'
' </p>\n'
' <ul>\n'
- % { 'size': config.REPORT_SLOW_URL_SIZE })
+ % {'size': config.REPORT_SLOW_URL_SIZE})
for link in links:
size = plugins.get_size(link.total_size)
fp.write(
@@ -87,11 +88,11 @@
' <li>size: %(size)s</li>\n'
' </ul>\n'
' </li>\n'
- % { 'link': plugins.make_link(link),
- 'size': size })
+ % {'link': plugins.make_link(link),
+ 'size': size})
link.add_pageproblem(
'this page and its components is %(size)s'
- % { 'size': size })
+ % {'size': size})
fp.write(
- ' </ul>\n' )
+ ' </ul>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/urllist.py
==============================================================================
--- webcheck/plugins/urllist.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/plugins/urllist.py Thu Aug 18 23:22:26 2011 (r427)
@@ -39,10 +39,10 @@
' the website. It lists internal as well as external and\n'
' non-examined urls.\n'
' </p>\n'
- ' <ol>\n' )
+ ' <ol>\n')
links = site.links.order_by(db.Link.url)
for link in links:
fp.write(' <li>' + plugins.make_link(link, link.url) + '</li>\n')
fp.write(
- ' </ol>\n' )
+ ' </ol>\n')
plugins.close_html(fp)
Modified: webcheck/webcheck.py
==============================================================================
--- webcheck/webcheck.py Wed Aug 10 22:42:57 2011 (r426)
+++ webcheck/webcheck.py Thu Aug 18 23:22:26 2011 (r427)
@@ -58,18 +58,21 @@
'Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.\n'
'This is free software; see the source for copying conditions. There is NO\n'
'warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n'
- % { 'version': __version__ })
+ % {'version': __version__})
+
def print_usage():
"""Print short usage information."""
sys.stderr.write(
'Usage: webcheck [OPTION]... URL...\n')
+
def print_tryhelp():
"""Print friendly pointer to more information."""
sys.stderr.write(
'Try \'webcheck --help\' for more information.\n')
+
def print_help():
"""Print the option list."""
sys.stdout.write(
@@ -96,7 +99,8 @@
' -w, --wait=SECONDS wait SECONDS between retrievals\n'
' -V, --version output version information and exit\n'
' -h, --help display this help and exit\n'
- % { 'redirects': config.REDIRECT_DEPTH } )
+ % {'redirects': config.REDIRECT_DEPTH})
+
def parse_args(site):
"""Parse command-line arguments."""
@@ -151,7 +155,7 @@
elif flag in ('-h', '--help'):
print_help()
sys.exit(0)
- if len(args)==0 and not config.CONTINUE:
+ if len(args) == 0 and not config.CONTINUE:
print_usage()
print_tryhelp()
sys.exit(1)
@@ -186,6 +190,7 @@
sys.stderr.write('webcheck: %s\n' % str(e))
sys.exit(1)
+
def install_file(source, text=False):
"""Install the given file in the output directory.
If the text flag is set to true it is assumed the file is text,
@@ -226,23 +231,24 @@
sfp = open(source, mode)
except IOError, (errno, strerror):
debugio.error('%(fname)s: %(strerror)s' %
- { 'fname': source,
- 'strerror': strerror })
+ {'fname': source,
+ 'strerror': strerror})
sys.exit(1)
# create file in output directory (with overwrite question)
- tfp = plugins.open_file(os.path.basename(source));
+ tfp = plugins.open_file(os.path.basename(source))
# copy contents
shutil.copyfileobj(sfp, tfp)
# close files
tfp.close()
sfp.close()
+
def main(site):
"""Main program."""
# crawl through the website
debugio.info('checking site....')
crawler.setup_urllib2()
- site.crawl() # this will take a while
+ site.crawl() # this will take a while
debugio.info('done.')
# do postprocessing (building site structure, etc)
debugio.info('postprocessing....')
@@ -259,6 +265,7 @@
install_file('favicon.ico', False)
debugio.info('done.')
+
if __name__ == '__main__':
try:
# initialize site object
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits