webcheck commit: r447 - in webcheck: . webcheck webcheck/parsers/html webcheck/plugins

Author: arthur
Date: Fri Oct  7 12:52:35 2011
New Revision: 447
URL: http://arthurdejong.org/viewvc/webcheck?revision=447&view=revision

Log:
move some more initialisation from cmd to crawler and make imports of config 
and debugio consistent
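
For reference, the import convention this commit standardises on is (an
illustrative sketch, not part of the diff):

    # new style, used consistently after this commit
    from webcheck import config, debugio

    # replaces the older mix of module-path imports:
    # import webcheck.config
    # import webcheck.debugio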

Modified:
   webcheck/cmd.py
   webcheck/webcheck/crawler.py
   webcheck/webcheck/db.py
   webcheck/webcheck/parsers/html/__init__.py
   webcheck/webcheck/parsers/html/beautifulsoup.py
   webcheck/webcheck/parsers/html/calltidy.py
   webcheck/webcheck/parsers/html/htmlparser.py
   webcheck/webcheck/plugins/__init__.py
   webcheck/webcheck/plugins/about.py
   webcheck/webcheck/plugins/new.py
   webcheck/webcheck/plugins/old.py
   webcheck/webcheck/plugins/sitemap.py
   webcheck/webcheck/plugins/size.py
   webcheck/webcheck/util.py

Modified: webcheck/cmd.py
==============================================================================
--- webcheck/cmd.py     Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/cmd.py     Fri Oct  7 12:52:35 2011        (r447)
@@ -32,12 +32,10 @@
 import urlparse
 
 import webcheck
-from webcheck import config
-from webcheck import debugio
-import webcheck.crawler
-import webcheck.db
 import webcheck.monkeypatch
-import webcheck.plugins
+from webcheck.crawler import Site
+from webcheck import config, debugio
+
 
 debugio.loglevel = debugio.INFO
 
@@ -157,13 +155,7 @@
         if not os.path.isdir(config.OUTPUT_DIR):
             os.mkdir(config.OUTPUT_DIR)
         # set up database connection
-        filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
-        from sqlalchemy import create_engine
-        engine = create_engine('sqlite:///' + filename)
-        webcheck.db.Session.configure(bind=engine)
-        # ensure that all tables are created
-        webcheck.db.Base.metadata.create_all(engine)
-        # TODO: schema migraton goes here
+        site.setup_database()
         # add configuration to site
         for pattern in internal_urls:
             site.add_internal_re(pattern)
@@ -189,7 +181,6 @@
     """Main program."""
     # crawl through the website
     debugio.info('checking site....')
-    webcheck.crawler.setup_urllib2()
     site.crawl()  # this will take a while
     debugio.info('done.')
     # do postprocessing (building site structure, etc)
@@ -201,14 +192,13 @@
     debugio.info('generating reports...')
     # for every plugin, generate a page
     site.generate()
-    # put extra files in the output directory
     debugio.info('done.')
 
 
 if __name__ == '__main__':
     try:
         # initialize site object
-        site = webcheck.crawler.Site()
+        site = Site()
         # parse command-line arguments
         parse_args(site)
         # run the main program
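
Taken together, the cmd.py hunks reduce the top-level flow to roughly the
following (a sketch based on the hunks above, not a verbatim copy of cmd.py):

    from webcheck.crawler import Site

    site = Site()            # was: site = webcheck.crawler.Site()
    site.setup_database()    # was: inline create_engine()/create_all() in cmd.py
    site.crawl()             # crawl() now calls _setup_urllib2() itself
    site.generate()          # one report page per plugin in config.PLUGINS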

Modified: webcheck/webcheck/crawler.py
==============================================================================
--- webcheck/webcheck/crawler.py        Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/crawler.py        Fri Oct  7 12:52:35 2011        (r447)
@@ -38,13 +38,14 @@
 import urllib2
 import urlparse
 
-from webcheck.db import Session, Link, LinkProblem, PageProblem, children, \
-                        embedded
-from webcheck import debugio
+from webcheck import config, debugio
+from webcheck.db import Session, Base, Link, LinkProblem, PageProblem, \
+        children, embedded
 from webcheck.util import install_file
-import webcheck.config
 import webcheck.parsers
 
+from sqlalchemy import create_engine
+
 
 class RedirectError(urllib2.HTTPError):
 
@@ -59,11 +60,11 @@
         raise RedirectError(req.get_full_url(), code, msg, headers, fp, newurl)
 
 
-def setup_urllib2():
+def _setup_urllib2():
     """Configure the urllib2 module to store cookies in the output
     directory."""
     import webcheck  # local import to avoid import loop
-    filename = os.path.join(webcheck.config.OUTPUT_DIR, 'cookies.lwp')
+    filename = os.path.join(config.OUTPUT_DIR, 'cookies.lwp')
     # set up our cookie jar
     cookiejar = cookielib.LWPCookieJar(filename)
     try:
@@ -77,7 +78,7 @@
     opener.addheaders = [
       ('User-agent', 'webcheck %s' % webcheck.__version__),
       ]
-    if webcheck.config.BYPASSHTTPCACHE:
+    if config.BYPASSHTTPCACHE:
         opener.addheaders.append(('Cache-control', 'no-cache'))
         opener.addheaders.append(('Pragma', 'no-cache'))
     urllib2.install_opener(opener)
@@ -115,6 +116,14 @@
         # list of base urls (these are the internal urls to start from)
         self.bases = []
 
+    def setup_database(self):
+        filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
+        engine = create_engine('sqlite:///' + filename)
+        Session.configure(bind=engine)
+        # ensure that all tables are created
+        Base.metadata.create_all(engine)
+        # TODO: schema migraton goes here
+
     def add_internal(self, url):
         """Add the given url and consider all urls below it to be internal.
         These links are all marked for checking with the crawl() function."""
@@ -147,7 +156,7 @@
                 return True
         res = False
         # check that the url starts with an internal url
-        if webcheck.config.BASE_URLS_ONLY:
+        if config.BASE_URLS_ONLY:
             # the url must start with one of the _internal_urls
             for i in self._internal_urls:
                 res |= (i == url[:len(i)])
@@ -203,10 +212,10 @@
                 return 'yanked'
         # check if we should avoid external links
         is_internal = self._is_internal(url)
-        if not is_internal and webcheck.config.AVOID_EXTERNAL_LINKS:
+        if not is_internal and config.AVOID_EXTERNAL_LINKS:
             return 'external avoided'
         # check if we should use robot parsers
-        if not webcheck.config.USE_ROBOTS:
+        if not config.USE_ROBOTS:
             return None
         (scheme, netloc) = urlparse.urlsplit(url)[0:2]
         # skip schemes not having robot.txt files
@@ -241,10 +250,12 @@
         add_internal(). If the serialization file pointer
         is specified the crawler writes out updated links to
         the file while crawling the site."""
+        # configure urllib2 to store cookies in the output directory
+        _setup_urllib2()
         # get a database session
         session = Session()
         # remove all links
-        if not webcheck.config.CONTINUE:
+        if not config.CONTINUE:
             session.query(LinkProblem).delete()
             session.commit()
             session.query(PageProblem).delete()
@@ -286,10 +297,10 @@
             # flush database changes
             session.commit()
             # sleep between requests if configured
-            if webcheck.config.WAIT_BETWEEN_REQUESTS > 0:
+            if config.WAIT_BETWEEN_REQUESTS > 0:
                 debugio.debug('crawler.crawl(): sleeping %s seconds' %
-                              webcheck.config.WAIT_BETWEEN_REQUESTS)
-                time.sleep(webcheck.config.WAIT_BETWEEN_REQUESTS)
+                              config.WAIT_BETWEEN_REQUESTS)
+                time.sleep(config.WAIT_BETWEEN_REQUESTS)
             debugio.debug('crawler.crawl(): items left to check: %d' %
                           (remaining + len(tocheck)))
         session.commit()
@@ -307,7 +318,7 @@
             parent = link.parents.first()
             if parent:
                 request.add_header('Referer', parent.url)
-            response = urllib2.urlopen(request, timeout=webcheck.config.IOTIMEOUT)
+            response = urllib2.urlopen(request, timeout=config.IOTIMEOUT)
             link.mimetype = response.info().gettype()
             link.set_encoding(response.headers.getparam('charset'))
             # FIXME: get result code and other stuff
@@ -406,7 +417,7 @@
             debugio.debug('crawler.postprocess(): %d links at depth %d' % (count, depth))
             # TODO: also handle embeds
         # see if any of the plugins want to do postprocessing
-        for plugin in webcheck.config.PLUGINS:
+        for plugin in config.PLUGINS:
             # import the plugin
             pluginmod = __import__(plugin, globals(), locals(), [plugin])
             if hasattr(pluginmod, 'postprocess'):
@@ -415,7 +426,7 @@
 
     def generate(self):
         """Generate pages for plugins."""
-        for plugin in webcheck.config.PLUGINS:
+        for plugin in config.PLUGINS:
             # import the plugin
             pluginmod = __import__(plugin, globals(), locals(), [plugin])
             if hasattr(pluginmod, 'generate'):
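
For readers unfamiliar with the SQLAlchemy pattern that the new
setup_database() method encapsulates, a minimal standalone equivalent looks
like this (a sketch assuming a sessionmaker-based Session and a declarative
Base, which is what webcheck.db appears to provide):

    import os
    from sqlalchemy import create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker

    Session = sessionmaker()          # stands in for webcheck.db.Session
    Base = declarative_base()         # stands in for webcheck.db.Base

    # 'webcheck-output' stands in for config.OUTPUT_DIR
    filename = os.path.join('webcheck-output', 'webcheck.sqlite')
    engine = create_engine('sqlite:///' + filename)
    Session.configure(bind=engine)    # bind all future sessions to this engine
    Base.metadata.create_all(engine)  # create any tables that do not exist yet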

Modified: webcheck/webcheck/db.py
==============================================================================
--- webcheck/webcheck/db.py     Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/db.py     Fri Oct  7 12:52:35 2011        (r447)
@@ -29,9 +29,8 @@
 from sqlalchemy.orm.session import object_session
 from sqlalchemy.sql.expression import union
 
+from webcheck import config, debugio
 from webcheck.myurllib import normalizeurl
-import webcheck.config
-import webcheck.debugio
 
 
 # provide session and schema classes
@@ -117,7 +116,7 @@
         the encoding is supported."""
         if not self.encoding and encoding:
             try:
-                webcheck.debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
+                debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
                 unicode('just some random text', encoding, 'replace')
                 self.encoding = encoding
             except Exception, e:
@@ -132,7 +131,7 @@
         self.redirectdepth = max([self.redirectdepth] +
                                  [x.redirectdepth for x in self.parents]) + 1
         # check depth
-        if self.redirectdepth >= webcheck.config.REDIRECT_DEPTH:
+        if self.redirectdepth >= config.REDIRECT_DEPTH:
             self.add_linkproblem('too many redirects (%d)' % self.redirectdepth)
             return
         # check for redirect to self

Modified: webcheck/webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/webcheck/parsers/html/__init__.py  Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/parsers/html/__init__.py  Fri Oct  7 12:52:35 2011        (r447)
@@ -117,7 +117,7 @@
         try:
             import webcheck.parsers.html.calltidy
             debugio.debug('webcheck.parsers.html.parse(): the Tidy parser is ok')
-            calltidy.parse(content, link)
+            webcheck.parsers.html.calltidy.parse(content, link)
         except ImportError:
             debugio.warn('tidy library (python-utidylib) is unavailable')
             # remove config to only try once
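
The one-line change above is a genuine bug fix, not just style:
"import webcheck.parsers.html.calltidy" binds only the top-level name
webcheck in the importing module, so the old bare calltidy.parse(...) call
would have raised NameError. Illustrated:

    import webcheck.parsers.html.calltidy
    # only the name 'webcheck' is bound here, so:
    #   calltidy.parse(content, link)
    # fails with NameError, while attribute access through the bound
    # package works:
    #   webcheck.parsers.html.calltidy.parse(content, link)

The beautifulsoup.py hunks below fix the same class of bug with the inline
"import webcheck.parsers.css".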

Modified: webcheck/webcheck/parsers/html/beautifulsoup.py
==============================================================================
--- webcheck/webcheck/parsers/html/beautifulsoup.py     Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/parsers/html/beautifulsoup.py     Fri Oct  7 12:52:35 2011        (r447)
@@ -31,6 +31,7 @@
 
 from webcheck.myurllib import normalizeurl
 from webcheck.parsers.html import htmlunescape
+import webcheck.parsers.css
 
 
 # pattern for matching http-equiv and content part of
@@ -171,13 +172,11 @@
     for style in soup.findAll('style'):
         if style.string:
             # delegate handling of inline css to css module
-            import webcheck.parsers.css
-            parsers.css.parse(htmlunescape(style.string), link, base)
+            webcheck.parsers.css.parse(htmlunescape(style.string), link, base)
     # <ANY style="CSS">
     for elem in soup.findAll(style=True):
         # delegate handling of inline css to css module
-        import webcheck.parsers.css
-        parsers.css.parse(elem['style'], link, base)
+        webcheck.parsers.css.parse(elem['style'], link, base)
     # <script src="url">
     for script in soup.findAll('script', src=True):
         embed = normalizeurl(htmlunescape(script['src']).strip())

Modified: webcheck/webcheck/parsers/html/calltidy.py
==============================================================================
--- webcheck/webcheck/parsers/html/calltidy.py  Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/parsers/html/calltidy.py  Fri Oct  7 12:52:35 2011        (r447)
@@ -22,8 +22,8 @@
 
 import tidy
 
-import webcheck.config
-import webcheck.parsers.html
+from webcheck import config
+from webcheck.parsers.html import htmlunescape
 
 
 def parse(content, link):
@@ -31,7 +31,7 @@
     link."""
     # only call tidy on internal pages
     if link.is_internal:
-        t = tidy.parseString(content, **webcheck.config.TIDY_OPTIONS)
+        t = tidy.parseString(content, **config.TIDY_OPTIONS)
         for err in t.errors:
             # error messages are escaped so we unescape them
-            link.add_pageproblem(webcheck.parsers.html.htmlunescape(unicode(err)))
+            link.add_pageproblem(htmlunescape(unicode(err)))

Modified: webcheck/webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/webcheck/parsers/html/htmlparser.py        Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/parsers/html/htmlparser.py        Fri Oct  7 12:52:35 2011        (r447)
@@ -32,7 +32,6 @@
 from webcheck import debugio
 from webcheck.myurllib import normalizeurl
 from webcheck.parsers.html import htmlunescape
-import webcheck.crawler
 
 
 # pattern for matching numeric html entities

Modified: webcheck/webcheck/plugins/__init__.py
==============================================================================
--- webcheck/webcheck/plugins/__init__.py       Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/plugins/__init__.py       Fri Oct  7 12:52:35 2011        (r447)
@@ -50,11 +50,10 @@
 from sqlalchemy.orm.session import object_session
 
 import webcheck
+from webcheck import config
 from webcheck.db import Link
 from webcheck.parsers.html import htmlescape
 from webcheck.util import open_file
-import webcheck.config
-import webcheck.debugio
 
 
 def _floatformat(f):
@@ -128,7 +127,7 @@
     is external, insert "class=external" in the <a> tag."""
     return '<a href="%(url)s" %(target)sclass="%(cssclass)s" title="%(info)s">%(title)s</a>' % \
             dict(url=htmlescape(link.url),
-                 target='target="_blank" ' if webcheck.config.REPORT_LINKS_IN_NEW_WINDOW else '',
+                 target='target="_blank" ' if config.REPORT_LINKS_IN_NEW_WINDOW else '',
                  cssclass='internal' if link.is_internal else 'external',
                  info=htmlescape(_get_info(link)).replace('\n', '&#10;'),
                  title=htmlescape(title or link.title or link.url))
@@ -141,7 +140,7 @@
     count = link.count_parents
     if not count:
         return
-    parents = link.parents.order_by(Link.title, Link.url).options(joinedload(Link.linkproblems))[:webcheck.config.PARENT_LISTLEN]
+    parents = link.parents.order_by(Link.title, Link.url).options(joinedload(Link.linkproblems))[:config.PARENT_LISTLEN]
     fp.write(
       indent + '<div class="parents">\n' +
       indent + ' referenced from:\n' +
@@ -164,7 +163,7 @@
 def _print_navbar(fp, selected):
     """Return an html fragement representing the navigation bar for a page."""
     fp.write('  <ul class="navbar">\n')
-    for plugin in webcheck.config.PLUGINS:
+    for plugin in config.PLUGINS:
         # import the plugin
         pluginmod = __import__(plugin, globals(), locals(), [plugin])
         # skip if no outputfile
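
The plugin loop above relies on a subtlety of __import__: called with just a
dotted name it returns the top-level package, but a non-empty fromlist makes
it return the submodule itself. Roughly, using the sitemap plugin as an
example:

    pkg = __import__('webcheck.plugins.sitemap')
    # pkg is the top-level 'webcheck' package

    pluginmod = __import__('webcheck.plugins.sitemap',
                           globals(), locals(), ['sitemap'])
    # pluginmod is webcheck.plugins.sitemap itself, so plugin hooks can be
    # looked up directly:
    if hasattr(pluginmod, 'generate'):
        pluginmod.generate(site)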

Modified: webcheck/webcheck/plugins/about.py
==============================================================================
--- webcheck/webcheck/plugins/about.py  Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/plugins/about.py  Fri Oct  7 12:52:35 2011        (r447)
@@ -31,8 +31,8 @@
 import time
 
 import webcheck
+from webcheck import config
 from webcheck.db import Session, Link
-import webcheck.config
 import webcheck.plugins
 
 
@@ -101,7 +101,7 @@
     fp.write(
       '   <h3>Plugins</h3>\n'
       '   <ul>\n')
-    for plugin in webcheck.config.PLUGINS:
+    for plugin in config.PLUGINS:
         pluginmod = __import__(plugin, globals(), locals(), [plugin])
         fp.write(
           '    <li>\n'

Modified: webcheck/webcheck/plugins/new.py
==============================================================================
--- webcheck/webcheck/plugins/new.py    Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/plugins/new.py    Fri Oct  7 12:52:35 2011        (r447)
@@ -30,8 +30,8 @@
 
 import time
 
+from webcheck import config
 from webcheck.db import Session, Link
-import webcheck.config
 import webcheck.plugins
 
 
@@ -42,7 +42,7 @@
     """Output the list of recently modified pages."""
     session = Session()
     # the time for which links are considered new
-    newtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSNEW_URL_AGE
+    newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE
     # get all internal pages that are new
     links = session.query(Link).filter_by(is_page=True, is_internal=True)
     links = links.filter(Link.mtime > newtime).order_by(Link.mtime.desc())
@@ -53,7 +53,7 @@
           '   <p class="description">\n'
          '    No pages were found that were modified within the last %(new)d days.\n'
           '   </p>\n'
-          % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE})
+          % {'new': config.REPORT_WHATSNEW_URL_AGE})
         webcheck.plugins.close_html(fp)
         return
     fp.write(
@@ -61,7 +61,7 @@
       '    These pages have been recently modified (within %(new)d days).\n'
       '   </p>\n'
       '   <ul>\n'
-      % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE})
+      % {'new': config.REPORT_WHATSNEW_URL_AGE})
     for link in links:
         age = (time.time() - link.mtime) / SECS_PER_DAY
         fp.write(

Modified: webcheck/webcheck/plugins/old.py
==============================================================================
--- webcheck/webcheck/plugins/old.py    Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/plugins/old.py    Fri Oct  7 12:52:35 2011        (r447)
@@ -31,7 +31,7 @@
 import time
 
 from webcheck.db import Session, Link
-import webcheck.config
+from webcheck import config
 import webcheck.plugins
 
 
@@ -42,7 +42,7 @@
     """Output the list of outdated pages to the specified file descriptor."""
     session = Session()
     # the time for which links are considered old
-    oldtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSOLD_URL_AGE
+    oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
     # get all internal pages that are old
     links = session.query(Link).filter_by(is_page=True, is_internal=True)
     links = links.filter(Link.mtime < oldtime).order_by(Link.mtime)
@@ -53,7 +53,7 @@
           '   <p class="description">\n'
           '    No pages were found that were older than %(old)d days old.\n'
           '   </p>\n'
-          % {'old': webcheck.config.REPORT_WHATSOLD_URL_AGE})
+          % {'old': config.REPORT_WHATSOLD_URL_AGE})
         webcheck.plugins.close_html(fp)
         return
     fp.write(
@@ -62,7 +62,7 @@
       '    days) and may be outdated.\n'
       '   </p>\n'
       '   <ul>\n'
-      % {'old': webcheck.config.REPORT_WHATSOLD_URL_AGE})
+      % {'old': config.REPORT_WHATSOLD_URL_AGE})
     for link in links:
         age = (time.time() - link.mtime) / SECS_PER_DAY
         fp.write(

Modified: webcheck/webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/webcheck/plugins/sitemap.py        Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/plugins/sitemap.py        Fri Oct  7 12:52:35 2011        (r447)
@@ -28,8 +28,8 @@
 __author__ = 'Arthur de Jong'
 __outputfile__ = 'index.html'
 
-from webcheck.db import Session, Link
-import webcheck.config
+from webcheck import config
+from webcheck.db import Link
 import webcheck.plugins
 
 
@@ -60,7 +60,7 @@
     fp.write(indent + '<li>\n')
     fp.write(indent + ' ' + webcheck.plugins.make_link(link) + '\n')
     # only check children if we are not too deep yet
-    if depth <= webcheck.config.REPORT_SITEMAP_LEVEL:
+    if depth <= config.REPORT_SITEMAP_LEVEL:
         # figure out the links to follow and ensure that they are only
         # explored from here
         children = set()
@@ -80,7 +80,6 @@
 
 def generate(site):
     """Output the sitemap."""
-    session = Session()
     fp = webcheck.plugins.open_html(webcheck.plugins.sitemap, site)
     # output the site structure using breadth first traversal
     fp.write(

Modified: webcheck/webcheck/plugins/size.py
==============================================================================
--- webcheck/webcheck/plugins/size.py   Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/plugins/size.py   Fri Oct  7 12:52:35 2011        (r447)
@@ -29,7 +29,7 @@
 __outputfile__ = 'size.html'
 
 from webcheck.db import Session, Link
-import webcheck.config
+from webcheck import config
 import webcheck.plugins
 
 
@@ -61,7 +61,7 @@
     # get all internal pages and get big links
     links = session.query(Link).filter_by(is_page=True, is_internal=True)
     links = [x for x in links
-             if _getsize(x) >= webcheck.config.REPORT_SLOW_URL_SIZE * 1024]
+             if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024]
     # sort links by size (biggest first)
     links.sort(lambda a, b: cmp(b.total_size, a.total_size))
     # present results
@@ -71,7 +71,7 @@
           '   <p class="description">\n'
           '    No pages over %(size)dK were found.\n'
           '   </p>\n'
-          % {'size': webcheck.config.REPORT_SLOW_URL_SIZE})
+          % {'size': config.REPORT_SLOW_URL_SIZE})
         webcheck.plugins.close_html(fp)
         return
     fp.write(
@@ -80,7 +80,7 @@
       '    slow to download.\n'
       '   </p>\n'
       '   <ul>\n'
-      % {'size': webcheck.config.REPORT_SLOW_URL_SIZE})
+      % {'size': config.REPORT_SLOW_URL_SIZE})
     for link in links:
         size = webcheck.plugins.get_size(link.total_size)
         fp.write(

Modified: webcheck/webcheck/util.py
==============================================================================
--- webcheck/webcheck/util.py   Fri Oct  7 10:37:26 2011        (r446)
+++ webcheck/webcheck/util.py   Fri Oct  7 12:52:35 2011        (r447)
@@ -34,7 +34,7 @@
 def open_file(filename, istext=True, makebackup=False):
     """This returns an open file object which can be used for writing. This
     file is created in the output directory. The output directory (stored in
-    webcheck.config.OUTPUT_DIR is created if it does not yet exist. If the second
+    config.OUTPUT_DIR is created if it does not yet exist. If the second
     parameter is True (default) the file is opened as an UTF-8 text file."""
     # check if output directory exists and create it if needed
     if not os.path.isdir(config.OUTPUT_DIR):