webcheck commit: r457 - in webcheck: . webcheck webcheck/parsers/html
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r457 - in webcheck: . webcheck webcheck/parsers/html
- Date: Fri, 14 Oct 2011 14:57:09 +0200 (CEST)
Author: arthur
Date: Fri Oct 14 14:57:08 2011
New Revision: 457
URL: http://arthurdejong.org/viewvc/webcheck?revision=457&view=revision
Log:
switch to using the logging framework
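For context, the pattern this revision adopts throughout the codebase is Python's standard logging module: each module creates its own logger with logging.getLogger(__name__) and only the command-line entry point installs a handler via logging.basicConfig(). A minimal sketch of that shape (the module and function below are made up for illustration; the format string and level are the ones used in cmd.py):

    # somemodule.py -- hypothetical module following the r457 pattern
    import logging

    # one logger per module, named after the module's import path
    logger = logging.getLogger(__name__)

    def check_site(url):
        # arguments are passed separately so the string is only
        # formatted when the message is actually emitted
        logger.info('checking %s', url)

    if __name__ == '__main__':
        # the entry point configures the root logger exactly once
        logging.basicConfig(format='webcheck: %(levelname)s: %(message)s',
                            level=logging.INFO)
        check_site('http://example.com/')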
Deleted:
webcheck/webcheck/debugio.py
Modified:
webcheck/cmd.py
webcheck/webcheck/crawler.py
webcheck/webcheck/db.py
webcheck/webcheck/parsers/html/__init__.py
webcheck/webcheck/parsers/html/beautifulsoup.py
webcheck/webcheck/parsers/html/htmlparser.py
webcheck/webcheck/util.py
Modified: webcheck/cmd.py
==============================================================================
--- webcheck/cmd.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/cmd.py Fri Oct 14 14:57:08 2011 (r457)
@@ -25,6 +25,8 @@
"""This is the main webcheck module."""
+import getopt
+import logging
import os
import re
import sys
@@ -33,10 +35,12 @@
import webcheck
import webcheck.monkeypatch
-from webcheck import config, debugio, Crawler
+from webcheck import config, Crawler
-debugio.loglevel = debugio.INFO
+# The loglevel to use for the logger that is configured.
+LOGLEVEL = logging.INFO
+
# Whether to produce profiling information. This is for development
# purposes and as such undocumented.
@@ -98,7 +102,9 @@
def parse_args(crawler):
"""Parse command-line arguments."""
- import getopt
+ # these global options are set here
+ global PROFILE
+ global LOGLEVEL
try:
optlist, args = getopt.gnu_getopt(sys.argv[1:],
'i:x:y:l:baqdo:cfr:u:w:Vh',
@@ -123,12 +129,11 @@
elif flag in ('--ignore-robots',):
config.USE_ROBOTS = False
elif flag in ('-q', '--quiet', '--silent'):
- debugio.loglevel = debugio.ERROR
+ LOGLEVEL = logging.WARNING
elif flag in ('-d', '--debug'):
- debugio.loglevel = debugio.DEBUG
+ LOGLEVEL = logging.DEBUG
elif flag in ('--profile',):
# undocumented on purpose
- global PROFILE
PROFILE = True
elif flag in ('-o', '--output'):
config.OUTPUT_DIR = arg
@@ -174,19 +179,19 @@
def main(crawler):
"""Main program."""
# crawl through the website
- debugio.info('checking site....')
+ logging.info('checking site....')
crawler.crawl() # this will take a while
- debugio.info('done.')
+ logging.info('done.')
# do postprocessing (building site structure, etc)
- debugio.info('postprocessing....')
+ logging.info('postprocessing....')
crawler.postprocess()
- debugio.info('done.')
+ logging.info('done.')
# now we can write out the files
# start with the frame-description page
- debugio.info('generating reports...')
+ logging.info('generating reports...')
# for every plugin, generate a page
crawler.generate()
- debugio.info('done.')
+ logging.info('done.')
if __name__ == '__main__':
@@ -195,6 +200,8 @@
crawler = Crawler()
# parse command-line arguments
parse_args(crawler)
+ # configure logging
+ logging.basicConfig(format='webcheck: %(levelname)s: %(message)s',
+ level=LOGLEVEL)
# run the main program
if PROFILE:
fname = os.path.join(config.OUTPUT_DIR, 'webcheck.prof')
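The cmd.py hunk above keeps the existing -q and -d flags but maps them onto standard logging levels, and basicConfig() is only called after argument parsing so the chosen level takes effect. Reduced to the relevant control flow (option handling simplified here, names follow the patch):

    import logging
    import sys

    # default level; overridden by command-line flags as in cmd.py
    LOGLEVEL = logging.INFO

    for flag in sys.argv[1:]:
        if flag in ('-q', '--quiet', '--silent'):
            LOGLEVEL = logging.WARNING
        elif flag in ('-d', '--debug'):
            LOGLEVEL = logging.DEBUG

    # configure the root logger once the level is known
    logging.basicConfig(format='webcheck: %(levelname)s: %(message)s',
                        level=LOGLEVEL)
    logging.info('checking site....')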
Modified: webcheck/webcheck/crawler.py
==============================================================================
--- webcheck/webcheck/crawler.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/crawler.py Fri Oct 14 14:57:08 2011 (r457)
@@ -31,6 +31,7 @@
import cookielib
import datetime
import httplib
+import logging
import os
import re
import robotparser
@@ -38,7 +39,7 @@
import urllib2
import urlparse
-from webcheck import config, debugio
+from webcheck import config
from webcheck.db import Session, Base, Link, truncate_db
from webcheck.util import install_file
import webcheck.parsers
@@ -46,6 +47,9 @@
from sqlalchemy import create_engine
+logger = logging.getLogger(__name__)
+
+
class RedirectError(urllib2.HTTPError):
def __init__(self, url, code, msg, hdrs, fp, newurl):
@@ -189,14 +193,13 @@
netloc."""
# only some schemes have a meaningful robots.txt file
if scheme != 'http' and scheme != 'https':
- debugio.debug('crawler._get_robotparser() '
- 'called with unsupported scheme (%s)' % scheme)
+ logger.debug('called with unsupported scheme (%s)', scheme)
return None
# split out the key part of the url
location = urlparse.urlunsplit((scheme, netloc, '', '', ''))
# try to create a new robotparser if we don't already have one
if location not in self._robotparsers:
- debugio.info(' getting robots.txt for %s' % location)
+ logger.info('getting robots.txt for %s', location)
self._robotparsers[location] = None
try:
rp = robotparser.RobotFileParser()
@@ -298,17 +301,17 @@
session.commit()
# sleep between requests if configured
if config.WAIT_BETWEEN_REQUESTS > 0:
- debugio.debug('crawler.crawl(): sleeping %s seconds' %
- config.WAIT_BETWEEN_REQUESTS)
+ logger.debug('sleeping %s seconds',
+ config.WAIT_BETWEEN_REQUESTS)
time.sleep(config.WAIT_BETWEEN_REQUESTS)
- debugio.debug('crawler.crawl(): items left to check: %d' %
+ logger.debug('items left to check: %d' %
(remaining + len(tocheck)))
session.commit()
def fetch(self, link):
"""Attempt to fetch the url (if not yanked) and fill in link
attributes (based on is_internal)."""
- debugio.info(' %s' % link.url)
+ logger.info(link.url)
# mark the link as fetched to avoid loops
link.fetched = datetime.datetime.now()
# see if we can import the proper module for this scheme
@@ -331,37 +334,31 @@
return response
except RedirectError, e:
link.status = str(e.code)
- debugio.info(' ' + str(e))
+ logger.info(str(e))
if e.code == 301:
link.add_linkproblem(str(e))
link.add_redirect(e.newurl)
- return
except urllib2.HTTPError, e:
link.status = str(e.code)
- debugio.info(' ' + str(e))
+ logger.info(str(e))
link.add_linkproblem(str(e))
- return
except urllib2.URLError, e:
- debugio.info(' ' + str(e))
+ logger.info(str(e))
link.add_linkproblem(str(e))
- return
except KeyboardInterrupt:
# handle this in a higher-level exception handler
raise
except Exception, e:
# handle all other exceptions
- debugio.warn('unknown exception caught: ' + str(e))
+ logger.exception('unknown exception caught: ' + str(e))
link.add_linkproblem('error reading HTTP response: %s' % str(e))
- import traceback
- traceback.print_exc()
- return
def parse(self, link, response):
"""Parse the fetched response."""
# find a parser for the content-type
parsermodule = webcheck.parsers.get_parsermodule(link.mimetype)
if parsermodule is None:
- debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % link.mimetype)
+ logger.debug('unsupported content-type: %s', link.mimetype)
return
try:
# skip parsing of content if we were returned nothing
@@ -369,16 +366,14 @@
if content is None:
return
# parse the content
- debugio.debug('crawler.Link.fetch(): parsing using %s' % parsermodule.__name__)
+ logger.debug('parsing using %s', parsermodule.__name__)
parsermodule.parse(content, link)
except KeyboardInterrupt:
# handle this in a higher-level exception handler
raise
except Exception, e:
- import traceback
- traceback.print_exc()
- debugio.warn('problem parsing page: ' + str(e))
- link.add_pageproblem('problem parsing page: ' + str(e))
+ logger.exception('problem parsing page: %s', str(e))
+ link.add_pageproblem('problem parsing page: %s' % str(e))
def postprocess(self):
"""Do some basic post processing of the collected data, including
@@ -392,15 +387,15 @@
for url in self._internal_urls:
link = self.get_link(session, url).follow_link()
if not link:
- debugio.warn('base link %s redirects to nowhere' % url)
+ logger.warn('base link %s redirects to nowhere', url)
continue
# add the link to bases
- debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
+ logger.debug('adding %s to bases', link.url)
self.bases.append(link)
# if we got no bases, just use the first internal one
if not self.bases:
link = session.query(Link).filter(Link.is_internal == True).first()
- debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % link.url)
+ logger.debug('fallback to adding %s to bases', link.url)
self.bases.append(link)
# do a breadth first traversal of the website to determine depth
session.query(Link).update(dict(depth=None), synchronize_session=False)
@@ -411,7 +406,7 @@
link.depth = 0
session.commit()
while count > 0:
- debugio.debug('crawler.postprocess(): %d links at depth %d' % (count, depth))
+ logger.debug('%d links at depth %d', count, depth)
# update the depth of all links without a depth that have a
# parent with the previous depth
qry = session.query(Link).filter(Link.depth == None)
@@ -425,7 +420,7 @@
# import the plugin
pluginmod = __import__(plugin, globals(), locals(), [plugin])
if hasattr(pluginmod, 'postprocess'):
- debugio.info(' ' + plugin)
+ logger.info(plugin)
pluginmod.postprocess(self)
def generate(self):
@@ -437,7 +432,7 @@
# import the plugin
pluginmod = __import__(plugin, globals(), locals(), [plugin])
if hasattr(pluginmod, 'generate'):
- debugio.info(' ' + plugin)
+ logger.info(plugin)
pluginmod.generate(self)
# install theme files
install_file('webcheck.css', True)
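One behavioural detail in the crawler.py hunks above: the old code logged str(e) and then called traceback.print_exc() separately, while the new code uses logger.exception(), which records the message at ERROR level and appends the current traceback automatically (it must be called from an exception handler). A self-contained illustration with a made-up failing function:

    import logging

    logging.basicConfig(format='webcheck: %(levelname)s: %(message)s',
                        level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    def fetch(url):
        # stand-in for the real fetch logic
        raise ValueError('connection refused')

    try:
        fetch('http://example.com/')
    except Exception as e:
        # logs the message plus the full traceback, replacing the
        # separate traceback.print_exc() call from the old code
        logger.exception('unknown exception caught: %s', str(e))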
Modified: webcheck/webcheck/db.py
==============================================================================
--- webcheck/webcheck/db.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/db.py Fri Oct 14 14:57:08 2011 (r457)
@@ -20,6 +20,7 @@
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
+import logging
import urlparse
from sqlalchemy.ext.declarative import declarative_base
@@ -29,10 +30,14 @@
from sqlalchemy.orm.session import object_session
from sqlalchemy.sql.expression import union
-from webcheck import config, debugio
+from webcheck import config
from webcheck.myurllib import normalizeurl
+logger = logging.getLogger(__name__)
+
+
+
# provide session and schema classes
Session = sessionmaker()
Base = declarative_base()
@@ -116,12 +121,11 @@
the encoding is supported."""
if not self.encoding and encoding:
try:
- debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
+ logger.debug('crawler.Link.set_encoding(%r)', encoding)
unicode('just some random text', encoding, 'replace')
self.encoding = encoding
except Exception, e:
- import traceback
- traceback.print_exc()
+ logger.exception('unknown encoding: %s', encoding)
self.add_pageproblem('unknown encoding: %s' % encoding)
def add_redirect(self, url):
Modified: webcheck/webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/webcheck/parsers/html/__init__.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/parsers/html/__init__.py Fri Oct 14 14:57:08 2011 (r457)
@@ -25,9 +25,13 @@
back to loading the legacy HTMLParser parser."""
import htmlentitydefs
+import logging
import re
-from webcheck import debugio, config
+from webcheck import config
+
+
+logger = logging.getLogger(__name__)
# the list of mimetypes this module should be able to handle
@@ -95,12 +99,12 @@
try:
# try BeautifulSoup parser first
import webcheck.parsers.html.beautifulsoup
- debugio.debug('webcheck.parsers.html.parse(): the BeautifulSoup parser is ok')
+ logger.debug('the BeautifulSoup parser is ok')
_parsefunction = webcheck.parsers.html.beautifulsoup.parse
except ImportError:
# fall back to legacy HTMLParser parser
- debugio.warn('falling back to the legacy HTML parser, '
- 'consider installing BeautifulSoup')
+ logger.warn('falling back to the legacy HTML parser, '
+ 'consider installing BeautifulSoup')
import webcheck.parsers.html.htmlparser
_parsefunction = webcheck.parsers.html.htmlparser.parse
# call the actual parse function
@@ -116,9 +120,9 @@
if config.TIDY_OPTIONS:
try:
import webcheck.parsers.html.calltidy
- debugio.debug('webcheck.parsers.html.parse(): the Tidy parser is ok')
+ logger.debug('the Tidy parser is ok')
webcheck.parsers.html.calltidy.parse(content, link)
except ImportError:
- debugio.warn('tidy library (python-utidylib) is unavailable')
+ logger.warn('tidy library (python-utidylib) is unavailable')
# remove config to only try once
config.TIDY_OPTIONS = None
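The tidy-related hunk above keeps its existing try-once behaviour: if the optional tidy bindings (python-utidylib) cannot be imported, a warning now goes through the module logger and config.TIDY_OPTIONS is cleared so the import is not retried for every page. The general shape of that pattern, with a hypothetical options flag standing in for config.TIDY_OPTIONS:

    import logging

    logger = logging.getLogger(__name__)

    # hypothetical stand-in for config.TIDY_OPTIONS
    TIDY_OPTIONS = {'output_xhtml': 1}

    def check_with_tidy(content):
        global TIDY_OPTIONS
        if not TIDY_OPTIONS:
            return
        try:
            import tidy  # optional dependency (python-utidylib)
            tidy.parseString(content, **TIDY_OPTIONS)
        except ImportError:
            logger.warn('tidy library (python-utidylib) is unavailable')
            # clear the options so the import is only attempted once
            TIDY_OPTIONS = None

    check_with_tidy('<html><body>hello</body></html>')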
Modified: webcheck/webcheck/parsers/html/beautifulsoup.py
==============================================================================
--- webcheck/webcheck/parsers/html/beautifulsoup.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/parsers/html/beautifulsoup.py Fri Oct 14 14:57:08 2011 (r457)
@@ -24,6 +24,7 @@
BeautifulSoup HTML parser and is more flexible than the legacy HTMLParser
module."""
+import logging
import re
import urlparse
@@ -34,6 +35,9 @@
import webcheck.parsers.css
+logger = logging.getLogger(__name__)
+
+
# pattern for matching http-equiv and content part of
# <meta http-equiv="refresh" content="0;url=URL">
_refreshhttpequivpattern = re.compile('^refresh$', re.I)
@@ -41,9 +45,8 @@
# check BeautifulSoup find() function for bugs
if BeautifulSoup.BeautifulSoup('<foo>').find('foo', bar=True):
- from webcheck import debugio
- debugio.warn('using buggy version of BeautifulSoup (%s)' %
- BeautifulSoup.__version__)
+ logger.warn('using buggy version of BeautifulSoup (%s)',
+ BeautifulSoup.__version__)
def parse(content, link):
Modified: webcheck/webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/webcheck/parsers/html/htmlparser.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/parsers/html/htmlparser.py Fri Oct 14 14:57:08 2011 (r457)
@@ -26,15 +26,18 @@
will only handle properly formatted HTML."""
import HTMLParser
+import logging
import re
import urlparse
-from webcheck import debugio
from webcheck.myurllib import normalizeurl
from webcheck.parsers.html import htmlunescape
import webcheck.parsers.css
+logger = logging.getLogger(__name__)
+
+
# pattern for matching numeric html entities
_charentitypattern = re.compile('&#([0-9]{1,3});')
@@ -94,7 +97,7 @@
# construct error message
message += ', ' + self._location()
# store error message
- debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: ' + message)
+ logger.debug('problem parsing html: %s', message)
if self.errmsg is None:
self.errmsg = message
# increment error count
@@ -107,7 +110,7 @@
try:
return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
except AssertionError:
- debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.check_for_whole_start_tag(): caught assertion error')
+ logger.exception('caught assertion error')
return None
def handle_starttag(self, tag, attrs):
@@ -257,7 +260,7 @@
try:
return htmlunescape(unicode(txt, encoding, 'replace'))
except (LookupError, TypeError, ValueError), e:
- debugio.warn('page has unknown encoding: %s' % str(encoding))
+ logger.warn('page has unknown encoding: %s', str(encoding))
# fall back to locale's encoding
return htmlunescape(unicode(txt, errors='replace'))
@@ -272,13 +275,13 @@
parser.close()
except Exception, e:
# ignore (but log) all errors
- debugio.debug('webcheck.parsers.html.htmlparser.parse(): caught exception: ' + str(e))
+ logger.exception('caught exception: %s', str(e))
# check for parser errors
if parser.errmsg is not None:
- debugio.debug('webcheck.parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg)
+ logger.debug('problem parsing html: %s', parser.errmsg)
link.add_pageproblem('problem parsing html: %s' % parser.errmsg)
# dump encoding
- debugio.debug('webcheck.parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding))
+ logger.debug('html encoding: %s', str(link.encoding))
# flag that the link contains a valid page
link.is_page = True
# save the title
Modified: webcheck/webcheck/util.py
==============================================================================
--- webcheck/webcheck/util.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/util.py Fri Oct 14 14:57:08 2011 (r457)
@@ -22,13 +22,17 @@
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
+import logging
import os
import shutil
import sys
import urllib
import urlparse
-from webcheck import config, debugio
+from webcheck import config
+
+
+logger = logging.getLogger(__name__)
def open_file(filename, istext=True, makebackup=False):
@@ -52,7 +56,7 @@
res = raw_input('webcheck: overwrite %s? [y]es, [a]ll, [q]uit: ' % fname)
except EOFError:
# bail out in case raw_input() failed
- debugio.error('error reading response')
+ logger.exception('error reading response')
res = 'q'
res = res.lower() + ' '
if res[0] == 'a':
@@ -95,7 +99,7 @@
# test if source and target are the same
source = os.path.realpath(source)
if source == os.path.realpath(target):
- debugio.warn('attempt to overwrite %(fname)s with itself' % {'fname': source})
+ logger.warn('attempt to overwrite %s with itself', source)
return
# open the input file
sfp = open(source, mode)