webcheck commit: r457 - in webcheck: . webcheck webcheck/parsers/html
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r457 - in webcheck: . webcheck webcheck/parsers/html
- Date: Fri, 14 Oct 2011 14:57:09 +0200 (CEST)
Author: arthur
Date: Fri Oct 14 14:57:08 2011
New Revision: 457
URL: http://arthurdejong.org/viewvc/webcheck?revision=457&view=revision
Log:
switch to using the logging framework
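For context, the pattern this revision adopts throughout the codebase is Python's standard logging module: each module creates its own logger with logging.getLogger(__name__) and only the command-line entry point installs a handler via logging.basicConfig(). A minimal sketch of that shape (the module and function below are made up for illustration; the format string and level are the ones used in cmd.py):

    # somemodule.py -- hypothetical module following the r457 pattern
    import logging

    # one logger per module, named after the module's import path
    logger = logging.getLogger(__name__)

    def check_site(url):
        # arguments are passed separately so the string is only
        # formatted when the message is actually emitted
        logger.info('checking %s', url)

    if __name__ == '__main__':
        # the entry point configures the root logger exactly once
        logging.basicConfig(format='webcheck: %(levelname)s: %(message)s',
                            level=logging.INFO)
        check_site('http://example.com/')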
Deleted:
webcheck/webcheck/debugio.py
Modified:
webcheck/cmd.py
webcheck/webcheck/crawler.py
webcheck/webcheck/db.py
webcheck/webcheck/parsers/html/__init__.py
webcheck/webcheck/parsers/html/beautifulsoup.py
webcheck/webcheck/parsers/html/htmlparser.py
webcheck/webcheck/util.py
Modified: webcheck/cmd.py
==============================================================================
--- webcheck/cmd.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/cmd.py Fri Oct 14 14:57:08 2011 (r457)
@@ -25,6 +25,8 @@
"""This is the main webcheck module."""
+import getopt
+import logging
import os
import re
import sys
@@ -33,10 +35,12 @@
import webcheck
import webcheck.monkeypatch
-from webcheck import config, debugio, Crawler
+from webcheck import config, Crawler
-debugio.loglevel = debugio.INFO
+# The loglevel to use for the logger that is configured.
+LOGLEVEL = logging.INFO
+
# Whether to produce profiling information. This is for development
# purposes and as such undocumented.
@@ -98,7 +102,9 @@
def parse_args(crawler):
"""Parse command-line arguments."""
- import getopt
+ # these global options are set here
+ global PROFILE
+ global LOGLEVEL
try:
optlist, args = getopt.gnu_getopt(sys.argv[1:],
'i:x:y:l:baqdo:cfr:u:w:Vh',
@@ -123,12 +129,11 @@
elif flag in ('--ignore-robots',):
config.USE_ROBOTS = False
elif flag in ('-q', '--quiet', '--silent'):
- debugio.loglevel = debugio.ERROR
+ LOGLEVEL = logging.WARNING
elif flag in ('-d', '--debug'):
- debugio.loglevel = debugio.DEBUG
+ LOGLEVEL = logging.DEBUG
elif flag in ('--profile',):
# undocumented on purpose
- global PROFILE
PROFILE = True
elif flag in ('-o', '--output'):
config.OUTPUT_DIR = arg
@@ -174,19 +179,19 @@
def main(crawler):
"""Main program."""
# crawl through the website
- debugio.info('checking site....')
+ logging.info('checking site....')
crawler.crawl() # this will take a while
- debugio.info('done.')
+ logging.info('done.')
# do postprocessing (building site structure, etc)
- debugio.info('postprocessing....')
+ logging.info('postprocessing....')
crawler.postprocess()
- debugio.info('done.')
+ logging.info('done.')
# now we can write out the files
# start with the frame-description page
- debugio.info('generating reports...')
+ logging.info('generating reports...')
# for every plugin, generate a page
crawler.generate()
- debugio.info('done.')
+ logging.info('done.')
if __name__ == '__main__':
@@ -195,6 +200,8 @@
crawler = Crawler()
# parse command-line arguments
parse_args(crawler)
+ # configure logging
+ logging.basicConfig(format='webcheck: %(levelname)s: %(message)s',
+ level=LOGLEVEL)
# run the main program
if PROFILE:
fname = os.path.join(config.OUTPUT_DIR, 'webcheck.prof')
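The cmd.py hunk above keeps the existing -q and -d flags but maps them onto standard logging levels, and basicConfig() is only called after argument parsing so the chosen level takes effect. Reduced to the relevant control flow (option handling simplified here, names follow the patch):

    import logging
    import sys

    # default level; overridden by command-line flags as in cmd.py
    LOGLEVEL = logging.INFO

    for flag in sys.argv[1:]:
        if flag in ('-q', '--quiet', '--silent'):
            LOGLEVEL = logging.WARNING
        elif flag in ('-d', '--debug'):
            LOGLEVEL = logging.DEBUG

    # configure the root logger once the level is known
    logging.basicConfig(format='webcheck: %(levelname)s: %(message)s',
                        level=LOGLEVEL)
    logging.info('checking site....')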
Modified: webcheck/webcheck/crawler.py
==============================================================================
--- webcheck/webcheck/crawler.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/crawler.py Fri Oct 14 14:57:08 2011 (r457)
@@ -31,6 +31,7 @@
import cookielib
import datetime
import httplib
+import logging
import os
import re
import robotparser
@@ -38,7 +39,7 @@
import urllib2
import urlparse
-from webcheck import config, debugio
+from webcheck import config
from webcheck.db import Session, Base, Link, truncate_db
from webcheck.util import install_file
import webcheck.parsers
@@ -46,6 +47,9 @@
from sqlalchemy import create_engine
+logger = logging.getLogger(__name__)
+
+
class RedirectError(urllib2.HTTPError):
def __init__(self, url, code, msg, hdrs, fp, newurl):
@@ -189,14 +193,13 @@
netloc."""
# only some schemes have a meaningful robots.txt file
if scheme != 'http' and scheme != 'https':
- debugio.debug('crawler._get_robotparser() '
- 'called with unsupported scheme (%s)' % scheme)
+ logger.debug('called with unsupported scheme (%s)', scheme)
return None
# split out the key part of the url
location = urlparse.urlunsplit((scheme, netloc, '', '', ''))
# try to create a new robotparser if we don't already have one
if location not in self._robotparsers:
- debugio.info(' getting robots.txt for %s' % location)
+ logger.info('getting robots.txt for %s', location)
self._robotparsers[location] = None
try:
rp = robotparser.RobotFileParser()
@@ -298,17 +301,17 @@
session.commit()
# sleep between requests if configured
if config.WAIT_BETWEEN_REQUESTS > 0:
- debugio.debug('crawler.crawl(): sleeping %s seconds' %
- config.WAIT_BETWEEN_REQUESTS)
+ logger.debug('sleeping %s seconds',
+ config.WAIT_BETWEEN_REQUESTS)
time.sleep(config.WAIT_BETWEEN_REQUESTS)
- debugio.debug('crawler.crawl(): items left to check: %d' %
+ logger.debug('items left to check: %d' %
(remaining + len(tocheck)))
session.commit()
def fetch(self, link):
"""Attempt to fetch the url (if not yanked) and fill in link
attributes (based on is_internal)."""
- debugio.info(' %s' % link.url)
+ logger.info(link.url)
# mark the link as fetched to avoid loops
link.fetched = datetime.datetime.now()
# see if we can import the proper module for this scheme
@@ -331,37 +334,31 @@
return response
except RedirectError, e:
link.status = str(e.code)
- debugio.info(' ' + str(e))
+ logger.info(str(e))
if e.code == 301:
link.add_linkproblem(str(e))
link.add_redirect(e.newurl)
- return
except urllib2.HTTPError, e:
link.status = str(e.code)
- debugio.info(' ' + str(e))
+ logger.info(str(e))
link.add_linkproblem(str(e))
- return
except urllib2.URLError, e:
- debugio.info(' ' + str(e))
+ logger.info(str(e))
link.add_linkproblem(str(e))
- return
except KeyboardInterrupt:
# handle this in a higher-level exception handler
raise
except Exception, e:
# handle all other exceptions
- debugio.warn('unknown exception caught: ' + str(e))
+ logger.exception('unknown exception caught: ' + str(e))
link.add_linkproblem('error reading HTTP response: %s' % str(e))
- import traceback
- traceback.print_exc()
- return
def parse(self, link, response):
"""Parse the fetched response."""
# find a parser for the content-type
parsermodule = webcheck.parsers.get_parsermodule(link.mimetype)
if parsermodule is None:
- debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % link.mimetype)
+ logger.debug('unsupported content-type: %s', link.mimetype)
return
try:
# skip parsing of content if we were returned nothing
@@ -369,16 +366,14 @@
if content is None:
return
# parse the content
- debugio.debug('crawler.Link.fetch(): parsing using %s' % parsermodule.__name__)
+ logger.debug('parsing using %s', parsermodule.__name__)
parsermodule.parse(content, link)
except KeyboardInterrupt:
# handle this in a higher-level exception handler
raise
except Exception, e:
- import traceback
- traceback.print_exc()
- debugio.warn('problem parsing page: ' + str(e))
- link.add_pageproblem('problem parsing page: ' + str(e))
+ logger.exception('problem parsing page: %s', str(e))
+ link.add_pageproblem('problem parsing page: %s' % str(e))
def postprocess(self):
"""Do some basic post processing of the collected data, including
@@ -392,15 +387,15 @@
for url in self._internal_urls:
link = self.get_link(session, url).follow_link()
if not link:
- debugio.warn('base link %s redirects to nowhere' % url)
+ logger.warn('base link %s redirects to nowhere', url)
continue
# add the link to bases
- debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
+ logger.debug('adding %s to bases', link.url)
self.bases.append(link)
# if we got no bases, just use the first internal one
if not self.bases:
link = session.query(Link).filter(Link.is_internal == True).first()
- debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % link.url)
+ logger.debug('fallback to adding %s to bases', link.url)
self.bases.append(link)
# do a breadth first traversal of the website to determine depth
session.query(Link).update(dict(depth=None), synchronize_session=False)
@@ -411,7 +406,7 @@
link.depth = 0
session.commit()
while count > 0:
- debugio.debug('crawler.postprocess(): %d links at depth %d' % (count, depth))
+ logger.debug('%d links at depth %d', count, depth)
# update the depth of all links without a depth that have a
# parent with the previous depth
qry = session.query(Link).filter(Link.depth == None)
@@ -425,7 +420,7 @@
# import the plugin
pluginmod = __import__(plugin, globals(), locals(), [plugin])
if hasattr(pluginmod, 'postprocess'):
- debugio.info(' ' + plugin)
+ logger.info(plugin)
pluginmod.postprocess(self)
def generate(self):
@@ -437,7 +432,7 @@
# import the plugin
pluginmod = __import__(plugin, globals(), locals(), [plugin])
if hasattr(pluginmod, 'generate'):
- debugio.info(' ' + plugin)
+ logger.info(plugin)
pluginmod.generate(self)
# install theme files
install_file('webcheck.css', True)
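One behavioural detail in the crawler.py hunks above: the old code logged str(e) and then called traceback.print_exc() separately, while the new code uses logger.exception(), which records the message at ERROR level and appends the current traceback automatically (it must be called from an exception handler). A self-contained illustration with a made-up failing function:

    import logging

    logging.basicConfig(format='webcheck: %(levelname)s: %(message)s',
                        level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    def fetch(url):
        # stand-in for the real fetch logic
        raise ValueError('connection refused')

    try:
        fetch('http://example.com/')
    except Exception as e:
        # logs the message plus the full traceback, replacing the
        # separate traceback.print_exc() call from the old code
        logger.exception('unknown exception caught: %s', str(e))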
Modified: webcheck/webcheck/db.py
==============================================================================
--- webcheck/webcheck/db.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/db.py Fri Oct 14 14:57:08 2011 (r457)
@@ -20,6 +20,7 @@
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
+import logging
import urlparse
from sqlalchemy.ext.declarative import declarative_base
@@ -29,10 +30,14 @@
from sqlalchemy.orm.session import object_session
from sqlalchemy.sql.expression import union
-from webcheck import config, debugio
+from webcheck import config
from webcheck.myurllib import normalizeurl
+logger = logging.getLogger(__name__)
+
+
+
# provide session and schema classes
Session = sessionmaker()
Base = declarative_base()
@@ -116,12 +121,11 @@
the encoding is supported."""
if not self.encoding and encoding:
try:
- debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
+ logger.debug('crawler.Link.set_encoding(%r)', encoding)
unicode('just some random text', encoding, 'replace')
self.encoding = encoding
except Exception, e:
- import traceback
- traceback.print_exc()
+ logger.exception('unknown encoding: %s', encoding)
self.add_pageproblem('unknown encoding: %s' % encoding)
def add_redirect(self, url):
Modified: webcheck/webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/webcheck/parsers/html/__init__.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/parsers/html/__init__.py Fri Oct 14 14:57:08 2011 (r457)
@@ -25,9 +25,13 @@
back to loading the legacy HTMLParser parser."""
import htmlentitydefs
+import logging
import re
-from webcheck import debugio, config
+from webcheck import config
+
+
+logger = logging.getLogger(__name__)
# the list of mimetypes this module should be able to handle
@@ -95,12 +99,12 @@
try:
# try BeautifulSoup parser first
import webcheck.parsers.html.beautifulsoup
- debugio.debug('webcheck.parsers.html.parse(): the BeautifulSoup parser is ok')
+ logger.debug('the BeautifulSoup parser is ok')
_parsefunction = webcheck.parsers.html.beautifulsoup.parse
except ImportError:
# fall back to legacy HTMLParser parser
- debugio.warn('falling back to the legacy HTML parser, '
- 'consider installing BeautifulSoup')
+ logger.warn('falling back to the legacy HTML parser, '
+ 'consider installing BeautifulSoup')
import webcheck.parsers.html.htmlparser
_parsefunction = webcheck.parsers.html.htmlparser.parse
# call the actual parse function
@@ -116,9 +120,9 @@
if config.TIDY_OPTIONS:
try:
import webcheck.parsers.html.calltidy
- debugio.debug('webcheck.parsers.html.parse(): the Tidy parser is ok')
+ logger.debug('the Tidy parser is ok')
webcheck.parsers.html.calltidy.parse(content, link)
except ImportError:
- debugio.warn('tidy library (python-utidylib) is unavailable')
+ logger.warn('tidy library (python-utidylib) is unavailable')
# remove config to only try once
config.TIDY_OPTIONS = None
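The tidy-related hunk above keeps its existing try-once behaviour: if the optional tidy bindings (python-utidylib) cannot be imported, a warning now goes through the module logger and config.TIDY_OPTIONS is cleared so the import is not retried for every page. The general shape of that pattern, with a hypothetical options flag standing in for config.TIDY_OPTIONS:

    import logging

    logger = logging.getLogger(__name__)

    # hypothetical stand-in for config.TIDY_OPTIONS
    TIDY_OPTIONS = {'output_xhtml': 1}

    def check_with_tidy(content):
        global TIDY_OPTIONS
        if not TIDY_OPTIONS:
            return
        try:
            import tidy  # optional dependency (python-utidylib)
            tidy.parseString(content, **TIDY_OPTIONS)
        except ImportError:
            logger.warn('tidy library (python-utidylib) is unavailable')
            # clear the options so the import is only attempted once
            TIDY_OPTIONS = None

    check_with_tidy('<html><body>hello</body></html>')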
Modified: webcheck/webcheck/parsers/html/beautifulsoup.py
==============================================================================
--- webcheck/webcheck/parsers/html/beautifulsoup.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/parsers/html/beautifulsoup.py Fri Oct 14 14:57:08 2011 (r457)
@@ -24,6 +24,7 @@
BeautifulSoup HTML parser and is more flexible than the legacy HTMLParser
module."""
+import logging
import re
import urlparse
@@ -34,6 +35,9 @@
import webcheck.parsers.css
+logger = logging.getLogger(__name__)
+
+
# pattern for matching http-equiv and content part of
# <meta http-equiv="refresh" content="0;url=URL">
_refreshhttpequivpattern = re.compile('^refresh$', re.I)
@@ -41,9 +45,8 @@
# check BeautifulSoup find() function for bugs
if BeautifulSoup.BeautifulSoup('<foo>').find('foo', bar=True):
- from webcheck import debugio
- debugio.warn('using buggy version of BeautifulSoup (%s)' %
- BeautifulSoup.__version__)
+ logger.warn('using buggy version of BeautifulSoup (%s)',
+ BeautifulSoup.__version__)
def parse(content, link):
Modified: webcheck/webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/webcheck/parsers/html/htmlparser.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/parsers/html/htmlparser.py Fri Oct 14 14:57:08 2011 (r457)
@@ -26,15 +26,18 @@
will only handle properly formatted HTML."""
import HTMLParser
+import logging
import re
import urlparse
-from webcheck import debugio
from webcheck.myurllib import normalizeurl
from webcheck.parsers.html import htmlunescape
import webcheck.parsers.css
+logger = logging.getLogger(__name__)
+
+
# pattern for matching numeric html entities
_charentitypattern = re.compile('&#([0-9]{1,3});')
@@ -94,7 +97,7 @@
# construct error message
message += ', ' + self._location()
# store error message
- debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: ' + message)
+ logger.debug('problem parsing html: %s', message)
if self.errmsg is None:
self.errmsg = message
# increment error count
@@ -107,7 +110,7 @@
try:
return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
except AssertionError:
- debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.check_for_whole_start_tag(): caught assertion error')
+ logger.exception('caught assertion error')
return None
def handle_starttag(self, tag, attrs):
@@ -257,7 +260,7 @@
try:
return htmlunescape(unicode(txt, encoding, 'replace'))
except (LookupError, TypeError, ValueError), e:
- debugio.warn('page has unknown encoding: %s' % str(encoding))
+ logger.warn('page has unknown encoding: %s', str(encoding))
# fall back to locale's encoding
return htmlunescape(unicode(txt, errors='replace'))
@@ -272,13 +275,13 @@
parser.close()
except Exception, e:
# ignore (but log) all errors
- debugio.debug('webcheck.parsers.html.htmlparser.parse(): caught exception: ' + str(e))
+ logger.exception('caught exception: %s', str(e))
# check for parser errors
if parser.errmsg is not None:
- debugio.debug('webcheck.parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg)
+ logger.debug('problem parsing html: %s', parser.errmsg)
link.add_pageproblem('problem parsing html: %s' % parser.errmsg)
# dump encoding
- debugio.debug('webcheck.parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding))
+ logger.debug('html encoding: %s', str(link.encoding))
# flag that the link contains a valid page
link.is_page = True
# save the title
Modified: webcheck/webcheck/util.py
==============================================================================
--- webcheck/webcheck/util.py Fri Oct 14 10:14:28 2011 (r456)
+++ webcheck/webcheck/util.py Fri Oct 14 14:57:08 2011 (r457)
@@ -22,13 +22,17 @@
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
+import logging
import os
import shutil
import sys
import urllib
import urlparse
-from webcheck import config, debugio
+from webcheck import config
+
+
+logger = logging.getLogger(__name__)
def open_file(filename, istext=True, makebackup=False):
@@ -52,7 +56,7 @@
res = raw_input('webcheck: overwrite %s? [y]es, [a]ll, [q]uit: ' % fname)
except EOFError:
# bail out in case raw_input() failed
- debugio.error('error reading response')
+ logger.exception('error reading response')
res = 'q'
res = res.lower() + ' '
if res[0] == 'a':
@@ -95,7 +99,7 @@
# test if source and target are the same
source = os.path.realpath(source)
if source == os.path.realpath(target):
- debugio.warn('attempt to overwrite %(fname)s with itself' % {'fname': source})
+ logger.warn('attempt to overwrite %s with itself', source)
return
# open the input file
sfp = open(source, mode)