webcheck commit: r435 - in webcheck: . parsers plugins webcheck webcheck/parsers webcheck/parsers/html webcheck/plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r435 - in webcheck: . parsers plugins webcheck webcheck/parsers webcheck/parsers/html webcheck/plugins
- Date: Fri, 16 Sep 2011 15:36:40 +0200 (CEST)
Author: arthur
Date: Fri Sep 16 15:36:38 2011
New Revision: 435
URL: http://arthurdejong.org/viewvc/webcheck?revision=435&view=revision
Log:
move all the code except the command-line handling to the webcheck package and
reorganise imports accordingly
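As an illustration of the new layout (a minimal sketch assembled from the hunks below; the module names are real, the surrounding script is hypothetical):

    # before (r434): flat top-level modules next to webcheck.py
    import config
    import crawler
    site = crawler.Site()

    # after (r435): everything except the cmd.py front-end lives in the
    # webcheck package and is imported with package-qualified names
    from webcheck import config
    import webcheck.crawler
    site = webcheck.crawler.Site()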
Added:
webcheck/cmd.py
- copied, changed from r434, webcheck/webcheck.py
webcheck/webcheck/ (props changed)
webcheck/webcheck/__init__.py
webcheck/webcheck/config.py
- copied, changed from r434, webcheck/config.py
webcheck/webcheck/crawler.py
- copied, changed from r434, webcheck/crawler.py
webcheck/webcheck/db.py
- copied, changed from r434, webcheck/db.py
webcheck/webcheck/debugio.py
- copied, changed from r434, webcheck/debugio.py
webcheck/webcheck/monkeypatch.py
- copied, changed from r434, webcheck/monkeypatch.py
webcheck/webcheck/myurllib.py
- copied, changed from r434, webcheck/myurllib.py
webcheck/webcheck/parsers/
- copied from r434, webcheck/parsers/
webcheck/webcheck/plugins/
- copied from r434, webcheck/plugins/
Deleted:
webcheck/config.py
webcheck/crawler.py
webcheck/db.py
webcheck/debugio.py
webcheck/monkeypatch.py
webcheck/myurllib.py
webcheck/parsers/
webcheck/plugins/
webcheck/webcheck.py
Modified:
webcheck/webcheck/parsers/__init__.py
webcheck/webcheck/parsers/css.py
webcheck/webcheck/parsers/html/__init__.py
webcheck/webcheck/parsers/html/beautifulsoup.py
webcheck/webcheck/parsers/html/calltidy.py
webcheck/webcheck/parsers/html/htmlparser.py
webcheck/webcheck/plugins/__init__.py
webcheck/webcheck/plugins/about.py
webcheck/webcheck/plugins/anchors.py
webcheck/webcheck/plugins/badlinks.py
webcheck/webcheck/plugins/external.py
webcheck/webcheck/plugins/images.py
webcheck/webcheck/plugins/new.py
webcheck/webcheck/plugins/notchkd.py
webcheck/webcheck/plugins/notitles.py
webcheck/webcheck/plugins/old.py
webcheck/webcheck/plugins/problems.py
webcheck/webcheck/plugins/sitemap.py
webcheck/webcheck/plugins/size.py
webcheck/webcheck/plugins/urllist.py
Copied and modified: webcheck/cmd.py (from r434, webcheck/webcheck.py)
==============================================================================
--- webcheck/webcheck.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/cmd.py Fri Sep 16 15:36:38 2011 (r435)
@@ -1,6 +1,6 @@
#!/usr/bin/env python
-# webcheck.py - main module of webcheck doing command-line checking
+# cmd.py - command-line front-end for webcheck
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
@@ -28,22 +28,22 @@
__version__ = '1.10.4'
__homepage__ = 'http://arthurdejong.org/webcheck/'
-import sys
import os
import re
-import urlparse
+import sys
import urllib
+import urlparse
-import config
+from webcheck import config
# update some fields that currently are stored in config
config.VERSION = __version__
config.HOMEPAGE = __homepage__
-import crawler
-import plugins
-import debugio
-import monkeypatch
-import db
+from webcheck import debugio
+import webcheck.crawler
+import webcheck.db
+import webcheck.monkeypatch
+import webcheck.plugins
debugio.loglevel = debugio.INFO
@@ -166,9 +166,9 @@
filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
from sqlalchemy import create_engine
engine = create_engine('sqlite:///' + filename)
- db.Session.configure(bind=engine)
+ webcheck.db.Session.configure(bind=engine)
# ensure that all tables are created
- db.Base.metadata.create_all(engine)
+ webcheck.db.Base.metadata.create_all(engine)
# TODO: schema migraton goes here
# add configuration to site
for pattern in internal_urls:
@@ -235,7 +235,7 @@
'strerror': strerror})
sys.exit(1)
# create file in output directory (with overwrite question)
- tfp = plugins.open_file(os.path.basename(source))
+ tfp = webcheck.plugins.open_file(os.path.basename(source))
# copy contents
shutil.copyfileobj(sfp, tfp)
# close files
@@ -247,7 +247,7 @@
"""Main program."""
# crawl through the website
debugio.info('checking site....')
- crawler.setup_urllib2()
+ webcheck.crawler.setup_urllib2()
site.crawl() # this will take a while
debugio.info('done.')
# do postprocessing (building site structure, etc)
@@ -269,7 +269,7 @@
if __name__ == '__main__':
try:
# initialize site object
- site = crawler.Site()
+ site = webcheck.crawler.Site()
# parse command-line arguments
parse_args(site)
# run the main program
Added: webcheck/webcheck/__init__.py
==============================================================================
Copied and modified: webcheck/webcheck/config.py (from r434, webcheck/config.py)
==============================================================================
--- webcheck/config.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/config.py Fri Sep 16 15:36:38 2011 (r435)
@@ -29,6 +29,7 @@
import urllib
+
# Whether to consider any URL not starting with the base URL to be external.
# This is the state of the -b command line option.
BASE_URLS_ONLY = False
Copied and modified: webcheck/webcheck/crawler.py (from r434, webcheck/crawler.py)
==============================================================================
--- webcheck/crawler.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/crawler.py Fri Sep 16 15:36:38 2011 (r435)
@@ -40,10 +40,11 @@
import urllib2
import urlparse
-import config
-import db
-import debugio
-import parsers
+from webcheck.db import Session, Link, LinkProblem, PageProblem, children, \
+ embedded
+from webcheck import debugio
+import webcheck.config
+import webcheck.parsers
class RedirectError(urllib2.HTTPError):
@@ -61,7 +62,7 @@
def setup_urllib2():
"""Configure the urllib2 module to store cookies in the output
directory."""
- filename = os.path.join(config.OUTPUT_DIR, 'cookies.lwp')
+ filename = os.path.join(webcheck.config.OUTPUT_DIR, 'cookies.lwp')
# set up our cookie jar
cookiejar = cookielib.LWPCookieJar(filename)
try:
@@ -73,9 +74,9 @@
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar),
NoRedirectHandler())
opener.addheaders = [
- ('User-agent', 'webcheck %s' % config.VERSION),
+ ('User-agent', 'webcheck %s' % webcheck.config.VERSION),
]
- if config.BYPASSHTTPCACHE:
+ if webcheck.config.BYPASSHTTPCACHE:
opener.addheaders.append(('Cache-control', 'no-cache'))
opener.addheaders.append(('Pragma', 'no-cache'))
urllib2.install_opener(opener)
@@ -116,7 +117,7 @@
def add_internal(self, url):
"""Add the given url and consider all urls below it to be internal.
These links are all marked for checking with the crawl() function."""
- url = db.Link.clean_url(url)
+ url = Link.clean_url(url)
if url not in self._internal_urls:
self._internal_urls.add(url)
@@ -145,7 +146,7 @@
return True
res = False
# check that the url starts with an internal url
- if config.BASE_URLS_ONLY:
+ if webcheck.config.BASE_URLS_ONLY:
# the url must start with one of the _internal_urls
for i in self._internal_urls:
res |= (i == url[:len(i)])
@@ -201,10 +202,10 @@
return 'yanked'
# check if we should avoid external links
is_internal = self._is_internal(url)
- if not is_internal and config.AVOID_EXTERNAL_LINKS:
+ if not is_internal and webcheck.config.AVOID_EXTERNAL_LINKS:
return 'external avoided'
# check if we should use robot parsers
- if not config.USE_ROBOTS:
+ if not webcheck.config.USE_ROBOTS:
return None
(scheme, netloc) = urlparse.urlsplit(url)[0:2]
# skip schemes not having robot.txt files
@@ -223,16 +224,16 @@
def get_link(self, session, url):
# try to find the URL
- url = db.Link.clean_url(url)
- link = session.query(db.Link).filter_by(url=url).first()
+ url = Link.clean_url(url)
+ link = session.query(Link).filter_by(url=url).first()
if not link:
- link = db.Link(url=url)
+ link = Link(url=url)
session.add(link)
return link
def get_links_to_crawl(self, session):
- links = session.query(db.Link).filter(db.Link.fetched == None)
- return links.filter(db.Link.yanked == None)
+ links = session.query(Link).filter(Link.fetched == None)
+ return links.filter(Link.yanked == None)
def crawl(self):
"""Crawl the website based on the urls specified with
@@ -240,22 +241,22 @@
is specified the crawler writes out updated links to
the file while crawling the site."""
# get a database session
- session = db.Session()
+ session = Session()
# remove all links
- if not config.CONTINUE:
- session.query(db.LinkProblem).delete()
+ if not webcheck.config.CONTINUE:
+ session.query(LinkProblem).delete()
session.commit()
- session.query(db.PageProblem).delete()
+ session.query(PageProblem).delete()
session.commit()
- session.execute(db.children.delete())
+ session.execute(children.delete())
session.commit()
- session.execute(db.embedded.delete())
+ session.execute(embedded.delete())
session.commit()
- session.query(db.Link).delete()
+ session.query(Link).delete()
session.commit()
# add all internal urls to the database
for url in self._internal_urls:
- url = db.Link.clean_url(url)
+ url = Link.clean_url(url)
self.get_link(session, url)
# add some URLs from the database that haven't been fetched
tocheck = self.get_links_to_crawl(session)
@@ -284,10 +285,10 @@
# flush database changes
session.commit()
# sleep between requests if configured
- if config.WAIT_BETWEEN_REQUESTS > 0:
+ if webcheck.config.WAIT_BETWEEN_REQUESTS > 0:
debugio.debug('crawler.crawl(): sleeping %s seconds' %
- config.WAIT_BETWEEN_REQUESTS)
- time.sleep(config.WAIT_BETWEEN_REQUESTS)
+ webcheck.config.WAIT_BETWEEN_REQUESTS)
+ time.sleep(webcheck.config.WAIT_BETWEEN_REQUESTS)
debugio.debug('crawler.crawl(): items left to check: %d' %
(remaining + len(tocheck)))
session.commit()
@@ -346,7 +347,7 @@
def parse(self, link, response):
"""Parse the fetched response."""
# find a parser for the content-type
- parsermodule = parsers.get_parsermodule(link.mimetype)
+ parsermodule = webcheck.parsers.get_parsermodule(link.mimetype)
if parsermodule is None:
debugio.debug('crawler.Link.fetch(): unsupported content-type: %s'
% link.mimetype)
return
@@ -368,7 +369,7 @@
"""Do some basic post processing of the collected data, including
depth calculation of every link."""
# get a database session
- session = db.Session()
+ session = Session()
# build the list of urls that were set up with add_internal() that
# do not have a parent (they form the base for the site)
for url in self._internal_urls:
@@ -381,11 +382,11 @@
self.bases.append(link)
# if we got no bases, just use the first internal one
if not self.bases:
- link = session.query(db.Link).filter(db.Link.is_internal == True).first()
+ link = session.query(Link).filter(Link.is_internal == True).first()
debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % link.url)
self.bases.append(link)
# do a breadth first traversal of the website to determine depth
- session.query(db.Link).update(dict(depth=None), synchronize_session=False)
+ session.query(Link).update(dict(depth=None), synchronize_session=False)
session.commit()
depth = 0
count = len(self.bases)
@@ -396,15 +397,15 @@
while count > 0:
# update the depth of all links without a depth that have a
# parent with the previous depth
- qry = session.query(db.Link).filter(db.Link.depth == None)
- qry = qry.filter(db.Link.linked_from.any(db.Link.depth == depth))
+ qry = session.query(Link).filter(Link.depth == None)
+ qry = qry.filter(Link.linked_from.any(Link.depth == depth))
count = qry.update(dict(depth=depth + 1),
synchronize_session=False)
session.commit()
depth += 1
debugio.debug('crawler.postprocess(): %d links at depth %d' %
(count, depth))
# TODO: also handle embeds
# see if any of the plugins want to do postprocessing
- for p in config.PLUGINS:
+ for p in webcheck.config.PLUGINS:
# import the plugin
plugin = __import__('plugins.' + p, globals(), locals(), [p])
if hasattr(plugin, 'postprocess'):
@@ -413,7 +414,7 @@
def generate(self):
"""Generate pages for plugins."""
- for p in config.PLUGINS:
+ for p in webcheck.config.PLUGINS:
# import the plugin
plugin = __import__('plugins.' + p, globals(), locals(), [p])
if hasattr(plugin, 'generate'):
Copied and modified: webcheck/webcheck/db.py (from r434, webcheck/db.py)
==============================================================================
--- webcheck/db.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/db.py Fri Sep 16 15:36:38 2011 (r435)
@@ -29,9 +29,9 @@
from sqlalchemy.orm.session import object_session
from sqlalchemy.sql.expression import ClauseElement, union
-import config
-import debugio
-import myurllib
+from webcheck.myurllib import normalizeurl
+import webcheck.config
+import webcheck.debugio
# provide session and schema classes
@@ -93,15 +93,14 @@
@staticmethod
def clean_url(url):
# normalise the URL, removing the fragment from the URL
- url = myurllib.normalizeurl(url)
- return urlparse.urldefrag(myurllib.normalizeurl(url))[0]
+ return urlparse.urldefrag(normalizeurl(url))[0]
def _get_link(self, url):
"""Get a link object for the specified URL."""
# get the session
session = object_session(self)
# normalise the URL, removing the fragment from the URL
- url, fragment = urlparse.urldefrag(myurllib.normalizeurl(url))
+ url, fragment = urlparse.urldefrag(normalizeurl(url))
# try to find the link
instance = session.query(Link).filter_by(url=url).first()
if not instance:
@@ -118,7 +117,7 @@
the encoding is supported."""
if not self.encoding and encoding:
try:
- debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
+ webcheck.debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
unicode('just some random text', encoding, 'replace')
self.encoding = encoding
except Exception, e:
@@ -133,7 +132,7 @@
self.redirectdepth = max([self.redirectdepth] +
[x.redirectdepth for x in self.parents]) + 1
# check depth
- if self.redirectdepth >= config.REDIRECT_DEPTH:
+ if self.redirectdepth >= webcheck.config.REDIRECT_DEPTH:
self.add_linkproblem('too many redirects (%d)' %
self.redirectdepth)
return
# check for redirect to self
Copied and modified: webcheck/webcheck/debugio.py (from r434, webcheck/debugio.py)
==============================================================================
--- webcheck/debugio.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/debugio.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,6 +30,7 @@
import sys
+
# log levels that can be used
ERROR = 0
WARN = 1
Copied and modified: webcheck/webcheck/monkeypatch.py (from r434, webcheck/monkeypatch.py)
==============================================================================
--- webcheck/monkeypatch.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/monkeypatch.py Fri Sep 16 15:36:38 2011 (r435)
@@ -21,9 +21,9 @@
# under the copyright of the software, unless explicitly stated otherwise.
import re
-import urlparse
-import urllib
import sys
+import urllib
+import urlparse
__all__ = []
Copied and modified: webcheck/webcheck/myurllib.py (from r434, webcheck/myurllib.py)
==============================================================================
--- webcheck/myurllib.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/myurllib.py Fri Sep 16 15:36:38 2011 (r435)
@@ -20,9 +20,9 @@
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
-import urlparse
import re
import urllib
+import urlparse
# this is a workaround for Python 2.3
try:
Modified: webcheck/webcheck/parsers/__init__.py
==============================================================================
--- webcheck/parsers/__init__.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/__init__.py Fri Sep 16 15:36:38 2011 (r435)
@@ -40,7 +40,7 @@
# go throught all known modules to probe the content-types
# (do this only once)
for mod in _modules:
- parser = __import__('parsers.' + mod, globals(), locals(), [mod])
+ parser = __import__('webcheck.parsers.' + mod, globals(), locals(), [mod])
for mimetype in parser.mimetypes:
_parsermodules[mimetype] = parser
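As an aside on the __import__ idiom above: the trailing fromlist argument ([mod]) is what makes __import__ return the submodule itself rather than the top-level package, so the loader keeps working once the 'webcheck.' prefix is added. A hypothetical sketch, not part of the commit:

    mod = 'css'
    # with a fromlist, __import__ returns the webcheck.parsers.css module
    parser = __import__('webcheck.parsers.' + mod, globals(), locals(), [mod])
    # without a fromlist it would return the top-level webcheck package
    pkg = __import__('webcheck.parsers.' + mod)
    assert pkg.__name__ == 'webcheck'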
Modified: webcheck/webcheck/parsers/css.py
==============================================================================
--- webcheck/parsers/css.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/css.py Fri Sep 16 15:36:38 2011 (r435)
@@ -26,8 +26,9 @@
mimetypes = ('text/css',)
-import urlparse
import re
+import urlparse
+
# pattern for matching /* ... */ comments in css
_commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE | re.DOTALL)
Modified: webcheck/webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/parsers/html/__init__.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/html/__init__.py Fri Sep 16 15:36:38 2011 (r435)
@@ -24,10 +24,12 @@
module that tries to load the BeatifulSoup parser first and falls
back to loading the legacy HTMLParser parser."""
-import debugio
-import re
import htmlentitydefs
-import config
+import re
+
+from webcheck import debugio
+import webcheck.config
+
# the list of mimetypes this module should be able to handle
mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
@@ -93,15 +95,15 @@
global _parsefunction
try:
# try BeautifulSoup parser first
- import parsers.html.beautifulsoup
- debugio.debug('parsers.html.parse(): the BeautifulSoup parser is ok')
- _parsefunction = parsers.html.beautifulsoup.parse
+ import webcheck.parsers.html.beautifulsoup
+ debugio.debug('webcheck.parsers.html.parse(): the BeautifulSoup parser is ok')
+ _parsefunction = webcheck.parsers.html.beautifulsoup.parse
except ImportError:
# fall back to legacy HTMLParser parser
debugio.warn('falling back to the legacy HTML parser, '
'consider installing BeautifulSoup')
- import parsers.html.htmlparser
- _parsefunction = parsers.html.htmlparser.parse
+ import webcheck.parsers.html.htmlparser
+ _parsefunction = webcheck.parsers.html.htmlparser.parse
# call the actual parse function
_parsefunction(content, link)
@@ -112,12 +114,12 @@
# call the normal parse function
_parsefunction(content, link)
# call the tidy parse function
- if config.TIDY_OPTIONS:
+ if webcheck.config.TIDY_OPTIONS:
try:
import calltidy
- debugio.debug('parsers.html.parse(): the Tidy parser is ok')
+ debugio.debug('webcheck.parsers.html.parse(): the Tidy parser is ok')
calltidy.parse(content, link)
except ImportError:
debugio.warn('tidy library (python-utidylib) is unavailable')
# remove config to only try once
- config.TIDY_OPTIONS = None
+ webcheck.config.TIDY_OPTIONS = None
Modified: webcheck/webcheck/parsers/html/beautifulsoup.py
==============================================================================
--- webcheck/parsers/html/beautifulsoup.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/html/beautifulsoup.py Fri Sep 16 15:36:38 2011 (r435)
@@ -24,13 +24,16 @@
BeautifulSoup HTML parser and is more flexible than the legacy HTMLParser
module."""
-import urlparse
-import crawler
-import re
import htmlentitydefs
+import re
+import urlparse
+
import BeautifulSoup
-import myurllib
-from parsers.html import htmlunescape
+
+from webcheck.myurllib import normalizeurl
+from webcheck.parsers.html import htmlunescape
+import crawler
+
# pattern for matching http-equiv and content part of
# <meta http-equiv="refresh" content="0;url=URL">
@@ -57,21 +60,21 @@
if title and title.string:
link.title = htmlunescape(title.string).strip()
- # FIXME: using myurllib.normalizeurl is wrong below, we should probably use
+ # FIXME: using normalizeurl is wrong below, we should probably use
# something like link.urlunescape() to do the escaping and check
# and log at the same time
# <base href="URL">
base = soup.find('base', href=True)
if base:
- base = myurllib.normalizeurl(htmlunescape(base['href']).strip())
+ base = normalizeurl(htmlunescape(base['href']).strip())
else:
base = link.url
# <link rel="TYPE" href="URL">
for l in soup.findAll('link', rel=True, href=True):
if l['rel'].lower() in ('stylesheet', 'alternate stylesheet', 'icon',
'shortcut icon'):
- embed = myurllib.normalizeurl(htmlunescape(l['href']).strip())
+ embed = normalizeurl(htmlunescape(l['href']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <meta name="author" content="AUTHOR">
@@ -91,26 +94,26 @@
link.add_child(urlparse.urljoin(base, child))
# <img src="URL">
for img in soup.findAll('img', src=True):
- embed = myurllib.normalizeurl(htmlunescape(img['src']).strip())
+ embed = normalizeurl(htmlunescape(img['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <a href="URL">
for a in soup.findAll('a', href=True):
- child = myurllib.normalizeurl(htmlunescape(a['href']).strip())
+ child = normalizeurl(htmlunescape(a['href']).strip())
if child:
link.add_child(urlparse.urljoin(base, child))
# <a name="NAME">
# TODO: consistent url escaping?
for a in soup.findAll('a', attrs={'name': True}):
# get anchor name
- a_name = myurllib.normalizeurl(htmlunescape(a['name']).strip())
+ a_name = normalizeurl(htmlunescape(a['name']).strip())
# if both id and name are used they should be the same
if 'id' in a and \
- a_name != myurllib.normalizeurl(htmlunescape(a['id']).strip()):
+ a_name != normalizeurl(htmlunescape(a['id']).strip()):
link.add_pageproblem(
'anchors defined in name and id attributes do not match')
# add the id anchor anyway
- link.add_anchor(myurllib.normalizeurl(htmlunescape(a['id']).strip()))
+ link.add_anchor(normalizeurl(htmlunescape(a['id']).strip()))
# add the anchor
link.add_anchor(a_name)
# <ANY id="ID">
@@ -119,51 +122,51 @@
if elem.name == 'a' and 'name' in elem:
continue
# add the anchor
- link.add_anchor(myurllib.normalizeurl(htmlunescape(elem['id']).strip()))
+ link.add_anchor(normalizeurl(htmlunescape(elem['id']).strip()))
# <frameset><frame src="URL"...>...</frameset>
for frame in soup.findAll('frame', src=True):
- embed = myurllib.normalizeurl(htmlunescape(frame['src']).strip())
+ embed = normalizeurl(htmlunescape(frame['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <iframe src="URL"...>
for frame in soup.findAll('iframe', src=True):
- embed = myurllib.normalizeurl(htmlunescape(frame['src']).strip())
+ embed = normalizeurl(htmlunescape(frame['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <object data="URL"...>
for obj in soup.findAll('object', data=True):
- embed = myurllib.normalizeurl(htmlunescape(obj['data']).strip())
+ embed = normalizeurl(htmlunescape(obj['data']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <object><param name="movie" value="URL"...></object>
for para in soup.findAll('param', attrs={'name': 'movie', 'value': True}):
- embed = myurllib.normalizeurl(htmlunescape(para['value']).strip())
+ embed = normalizeurl(htmlunescape(para['value']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <map><area href="URL"...>...</map>
for area in soup.findAll('area', href=True):
- child = myurllib.normalizeurl(htmlunescape(area['href']).strip())
+ child = normalizeurl(htmlunescape(area['href']).strip())
if child:
link.add_child(urlparse.urljoin(base, child))
# <applet code="URL" [archive="URL"]...>
for applet in soup.findAll('applet', code=True):
# if applet has archive tag check that
if 'archive' in applet:
- embed = myurllib.normalizeurl(htmlunescape(applet['archive']).strip())
+ embed = normalizeurl(htmlunescape(applet['archive']).strip())
else:
- embed = myurllib.normalizeurl(htmlunescape(applet['code']).strip())
+ embed = normalizeurl(htmlunescape(applet['code']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <embed src="URL"...>
for embedd in soup.findAll('frame', src=True):
- embed = myurllib.normalizeurl(htmlunescape(embedd['src']).strip())
+ embed = normalizeurl(htmlunescape(embedd['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <embed><param name="movie" value="url"></embed>
for param in soup.findAll('param', attrs={
'name': re.compile("^movie$", re.I),
'value': True}):
- embed = myurllib.normalizeurl(htmlunescape(param['value']).strip())
+ embed = normalizeurl(htmlunescape(param['value']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <style>content</style>
@@ -179,12 +182,12 @@
parsers.css.parse(elem['style'], link, base)
# <script src="url">
for script in soup.findAll('script', src=True):
- embed = myurllib.normalizeurl(htmlunescape(script['src']).strip())
+ embed = normalizeurl(htmlunescape(script['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <body|table|td background="url">
for t in soup.findAll(('body', 'table', 'td'), background=True):
- embed = myurllib.normalizeurl(htmlunescape(t['background']).strip())
+ embed = normalizeurl(htmlunescape(t['background']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# flag that the link contains a valid page
Modified: webcheck/webcheck/parsers/html/calltidy.py
==============================================================================
--- webcheck/parsers/html/calltidy.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/html/calltidy.py Fri Sep 16 15:36:38 2011 (r435)
@@ -21,8 +21,9 @@
# under the copyright of the software, unless explicitly stated otherwise.
import tidy
-import config
-import parsers.html
+
+import webcheck.config
+import webcheck.parsers.html
def parse(content, link):
@@ -30,7 +31,7 @@
link."""
# only call tidy on internal pages
if link.is_internal:
- t = tidy.parseString(content, **config.TIDY_OPTIONS)
+ t = tidy.parseString(content, **webcheck.config.TIDY_OPTIONS)
for err in t.errors:
# error messages are escaped so we unescape them
- link.add_pageproblem(parsers.html.htmlunescape(unicode(err)))
+ link.add_pageproblem(webcheck.parsers.html.htmlunescape(unicode(err)))
Modified: webcheck/webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/parsers/html/htmlparser.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/html/htmlparser.py Fri Sep 16 15:36:38 2011 (r435)
@@ -25,13 +25,15 @@
is not available and can be considered depricated. This parser
will only handle properly formatted HTML."""
-import debugio
import HTMLParser
-import urlparse
import re
-import crawler
-import myurllib
-from parsers.html import htmlunescape
+import urlparse
+
+from webcheck import debugio
+from webcheck.myurllib import normalizeurl
+from webcheck.parsers.html import htmlunescape
+import webcheck.crawler
+
# pattern for matching numeric html entities
_charentitypattern = re.compile('&#([0-9]{1,3});')
@@ -79,20 +81,20 @@
def _cleanurl(self, url, what='link'):
"""Do some translations of url."""
# check for spaces in urls
- # (characters are escaped in myurllib.normalizeurl())
+ # (characters are escaped in normalizeurl())
if _spacepattern.search(url):
self.link.add_pageproblem(
what + ' contains unescaped spaces: ' + url + ', ' +
self._location())
# replace &#nnn; entity refs with proper characters
url = _charentitypattern.sub(lambda x: chr(int(x.group(1))), url)
- return myurllib.normalizeurl(url)
+ return normalizeurl(url)
def error(self, message):
"""Override superclass' error() method to ignore errors."""
# construct error message
message += ', ' + self._location()
# store error message
- debugio.debug('parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: ' + message)
+ debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: ' + message)
if self.errmsg is None:
self.errmsg = message
# increment error count
@@ -105,7 +107,7 @@
try:
return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
except AssertionError:
- debugio.debug('parsers.html.htmlparser._MyHTMLParser.check_for_whole_start_tag(): caught assertion error')
+ debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.check_for_whole_start_tag(): caught assertion error')
return None
def handle_starttag(self, tag, attrs):
@@ -210,8 +212,8 @@
# pick up any tags with a style attribute
if 'style' in attrs:
# delegate handling of inline css to css module
- import parsers.css
- parsers.css.parse(attrs['style'], self.link, self.base)
+ import webcheck.parsers.css
+ webcheck.parsers.css.parse(attrs['style'], self.link, self.base)
def handle_endtag(self, tag):
"""Handle end tags in html."""
@@ -220,8 +222,8 @@
self.collect = None
elif tag == 'style' and self.collect is not None:
# delegate handling of inline css to css module
- import parsers.css
- parsers.css.parse(self.collect, self.link, self.base)
+ import webcheck.parsers.css
+ webcheck.parsers.css.parse(self.collect, self.link, self.base)
def handle_data(self, data):
"""Collect data if we were collecting data."""
@@ -272,13 +274,13 @@
parser.close()
except Exception, e:
# ignore (but log) all errors
- debugio.debug('parsers.html.htmlparser.parse(): caught exception: ' + str(e))
+ debugio.debug('webcheck.parsers.html.htmlparser.parse(): caught exception: ' + str(e))
# check for parser errors
if parser.errmsg is not None:
- debugio.debug('parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg)
+ debugio.debug('webcheck.parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg)
link.add_pageproblem('problem parsing html: %s' % parser.errmsg)
# dump encoding
- debugio.debug('parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding))
+ debugio.debug('webcheck.parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding))
# flag that the link contains a valid page
link.is_page = True
# save the title
Modified: webcheck/webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/__init__.py Fri Sep 16 15:36:38 2011 (r435)
@@ -49,13 +49,10 @@
from sqlalchemy.orm import joinedload
from sqlalchemy.orm.session import object_session
-import config
-import db
-import debugio
-import parsers.html
-
-# reference function from html module
-htmlescape = parsers.html.htmlescape
+from webcheck.db import Link
+from webcheck.parsers.html import htmlescape
+import webcheck.config
+import webcheck.debugio
def _floatformat(f):
@@ -129,7 +126,7 @@
is external, insert "class=external" in the <a> tag."""
return '<a href="%(url)s" %(target)sclass="%(cssclass)s" title="%(info)s">%(title)s</a>' % \
dict(url=htmlescape(link.url),
- target='target="_blank" ' if config.REPORT_LINKS_IN_NEW_WINDOW else '',
+ target='target="_blank" ' if webcheck.config.REPORT_LINKS_IN_NEW_WINDOW else '',
cssclass='internal' if link.is_internal else 'external',
info=htmlescape(_get_info(link)).replace('\n', ' '),
title=htmlescape(title or link.title or link.url))
@@ -142,7 +139,7 @@
count = link.count_parents
if not count:
return
- parents = link.parents.order_by(db.Link.title, db.Link.url).options(joinedload(db.Link.linkproblems))[:config.PARENT_LISTLEN]
+ parents = link.parents.order_by(Link.title, Link.url).options(joinedload(Link.linkproblems))[:webcheck.config.PARENT_LISTLEN]
fp.write(
indent + '<div class="parents">\n' +
indent + ' referenced from:\n' +
@@ -165,26 +162,26 @@
def open_file(filename, istext=True, makebackup=False):
"""This returns an open file object which can be used for writing. This
file is created in the output directory. The output directory (stored in
- config.OUTPUT_DIR is created if it does not yet exist. If the second
+ webcheck.config.OUTPUT_DIR is created if it does not yet exist. If the second
parameter is True (default) the file is opened as an UTF-8 text file."""
import os
# check if output directory exists and create it if needed
- if not os.path.isdir(config.OUTPUT_DIR):
+ if not os.path.isdir(webcheck.config.OUTPUT_DIR):
try:
- os.mkdir(config.OUTPUT_DIR)
+ os.mkdir(webcheck.config.OUTPUT_DIR)
except OSError, (errno, strerror):
debugio.error('error creating directory %(dir)s: %(strerror)s' %
- {'dir': config.OUTPUT_DIR,
+ {'dir': webcheck.config.OUTPUT_DIR,
'strerror': strerror})
sys.exit(1)
# build the output file name
- fname = os.path.join(config.OUTPUT_DIR, filename)
+ fname = os.path.join(webcheck.config.OUTPUT_DIR, filename)
# check if file exists
if os.path.exists(fname):
if makebackup:
# create backup of original (overwriting previous backup)
os.rename(fname, fname + '~')
- elif not config.OVERWRITE_FILES:
+ elif not webcheck.config.OVERWRITE_FILES:
# ask to overwrite
try:
res = raw_input('webcheck: overwrite %s? [y]es, [a]ll, [q]uit: ' % fname)
@@ -194,7 +191,7 @@
res = 'q'
res = res.lower() + ' '
if res[0] == 'a':
- config.OVERWRITE_FILES = True
+ webcheck.config.OVERWRITE_FILES = True
elif res[0] != 'y':
print 'Aborted.'
sys.exit(1)
@@ -214,9 +211,9 @@
def _print_navbar(fp, plugin):
"""Return an html fragement representing the navigation bar for a page."""
fp.write(' <ul class="navbar">\n')
- for p in config.PLUGINS:
+ for p in webcheck.config.PLUGINS:
# import the plugin
- report = __import__('plugins.' + p, globals(), locals(), [p])
+ report = __import__('webcheck.plugins.' + p, globals(), locals(), [p])
# skip if no outputfile
if not hasattr(report, '__outputfile__'):
continue
@@ -258,7 +255,7 @@
% {'sitetitle': htmlescape(base.title or base.url),
'plugintitle': htmlescape(plugin.__title__),
'siteurl': base.url,
- 'version': config.VERSION})
+ 'version': webcheck.config.VERSION})
# write navigation bar
_print_navbar(fp, plugin)
# write plugin heading
@@ -279,6 +276,6 @@
' </body>\n'
'</html>\n'
% {'time': htmlescape(time.ctime(time.time())),
- 'homepage': config.HOMEPAGE,
- 'version': htmlescape(config.VERSION)})
+ 'homepage': webcheck.config.HOMEPAGE,
+ 'version': htmlescape(webcheck.config.VERSION)})
fp.close()
Modified: webcheck/webcheck/plugins/about.py
==============================================================================
--- webcheck/plugins/about.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/about.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,15 +30,15 @@
import time
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
def generate(site):
"""Output a list of modules, it's authors and the webcheck version."""
- fp = plugins.open_html(plugins.about, site)
- session = db.Session()
+ fp = webcheck.plugins.open_html(webcheck.plugins.about, site)
+ session = Session()
# TODO: xxx links were fetched, xxx pages were examined and a total of xxx notes and problems were found
# TODO: include some runtime information (e.g. supported schemes, user configuration, etc)
# output some general information about the report
@@ -56,10 +56,10 @@
' This report was generated on %(time)s, a total of %(numurls)d\n'
' links were found.\n'
' </p>\n\n'
- % {'version': plugins.htmlescape(config.VERSION),
- 'time': plugins.htmlescape(time.ctime(time.time())),
- 'numurls': session.query(db.Link).count(),
- 'homepage': config.HOMEPAGE})
+ % {'version': webcheck.plugins.htmlescape(webcheck.config.VERSION),
+ 'time': webcheck.plugins.htmlescape(time.ctime(time.time())),
+ 'numurls': session.query(Link).count(),
+ 'homepage': webcheck.config.HOMEPAGE})
# output copyright information
fp.write(
' <h3>Copyright</h3>\n'
@@ -100,15 +100,15 @@
fp.write(
' <h3>Plugins</h3>\n'
' <ul>\n')
- for plugin in config.PLUGINS:
- report = __import__('plugins.' + plugin, globals(), locals(), [plugin])
+ for plugin in webcheck.config.PLUGINS:
+ report = __import__('webcheck.plugins.' + plugin, globals(), locals(), [plugin])
fp.write(
' <li>\n'
' <strong>%s</strong><br />\n'
- % plugins.htmlescape(report.__title__))
+ % webcheck.plugins.htmlescape(report.__title__))
if hasattr(report, '__doc__'):
- fp.write(' %s<br />\n' % plugins.htmlescape(report.__doc__))
+ fp.write(' %s<br />\n' % webcheck.plugins.htmlescape(report.__doc__))
fp.write(' </li>\n')
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/anchors.py
==============================================================================
--- webcheck/plugins/anchors.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/anchors.py Fri Sep 16 15:36:38 2011 (r435)
@@ -27,22 +27,22 @@
__title__ = 'missing anchors'
__author__ = 'Arthur de Jong'
-import db
+from webcheck.db import Session, Link, Anchor
def postprocess(site):
"""Add all missing anchors as page problems to the referring page."""
- session = db.Session()
+ session = Session()
# find all fetched links with requested anchors
- links = session.query(db.Link).filter(db.Link.reqanchors.any())
- links = links.filter(db.Link.fetched != None)
+ links = session.query(Link).filter(Link.reqanchors.any())
+ links = links.filter(Link.fetched != None)
# go over list and find missing anchors
# TODO: we can probably make a nicer query for this
for link in links:
# check that all requested anchors exist
for anchor in link.reqanchors:
# if the anchor is not there there, report problem
- if not link.anchors.filter(db.Anchor.anchor == anchor.anchor).first():
+ if not link.anchors.filter(Anchor.anchor == anchor.anchor).first():
anchor.parent.add_pageproblem(
u'bad link: %(url)s#%(anchor)s: unknown anchor'
% {'url': link.url,
Modified: webcheck/webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/plugins/badlinks.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/badlinks.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,15 +30,15 @@
from sqlalchemy.orm import joinedload
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def postporcess(site):
"""Add all bad links as pageproblems on pages where they are linked."""
- session = db.Session()
+ session = Session()
# find all links with link problems
- links = session.query(db.Link).filter(db.Link.linkproblems.any()).options(joinedload(db.Link.linkproblems))
+ links = session.query(Link).filter(Link.linkproblems.any()).options(joinedload(Link.linkproblems))
# TODO: probably make it a nicer query over all linkproblems
for link in links:
# add a reference to the problem map
@@ -50,17 +50,17 @@
def generate(site):
"""Present the list of bad links."""
- session = db.Session()
+ session = Session()
# find all links with link problems
- links = session.query(db.Link).filter(db.Link.linkproblems.any()).order_by(db.Link.url).options(joinedload(db.Link.linkproblems))
+ links = session.query(Link).filter(Link.linkproblems.any()).order_by(Link.url).options(joinedload(Link.linkproblems))
# present results
- fp = plugins.open_html(plugins.badlinks, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.badlinks, site)
if not links:
fp.write(
' <p class="description">\n'
' There were no problems retrieving links from the website.\n'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -73,18 +73,18 @@
' <li>\n'
' %(badurl)s\n'
' <ul class="problems">\n'
- % {'badurl': plugins.make_link(link, link.url)})
+ % {'badurl': webcheck.plugins.make_link(link, link.url)})
# list the problems
for problem in link.linkproblems:
fp.write(
' <li>%(problem)s</li>\n'
- % {'problem': plugins.htmlescape(problem)})
+ % {'problem': webcheck.plugins.htmlescape(problem)})
fp.write(
' </ul>\n')
# present a list of parents
- plugins.print_parents(fp, link, ' ')
+ webcheck.plugins.print_parents(fp, link, ' ')
fp.write(
' </li>\n')
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/external.py
==============================================================================
--- webcheck/plugins/external.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/external.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,23 +30,23 @@
from sqlalchemy.orm import joinedload
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def generate(site):
"""Generate the list of external links."""
- session = db.Session()
+ session = Session()
# get all external links
- links = session.query(db.Link).filter(db.Link.is_internal != True).order_by(db.Link.url)
+ links = session.query(Link).filter(Link.is_internal != True).order_by(Link.url)
# present results
- fp = plugins.open_html(plugins.external, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.external, site)
if not links:
fp.write(
' <p class="description">'
' No external links were found on the website.'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">'
@@ -54,15 +54,15 @@
' examination of the website.'
' </p>\n'
' <ol>\n')
- for link in links.options(joinedload(db.Link.linkproblems)):
+ for link in links.options(joinedload(Link.linkproblems)):
fp.write(
' <li>\n'
' %(link)s\n'
- % {'link': plugins.make_link(link)})
+ % {'link': webcheck.plugins.make_link(link)})
# present a list of parents
- plugins.print_parents(fp, link, ' ')
+ webcheck.plugins.print_parents(fp, link, ' ')
fp.write(
' </li>\n')
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/images.py
==============================================================================
--- webcheck/plugins/images.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/images.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,27 +30,27 @@
import re
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def generate(site):
"""Generate a list of image URLs that were found."""
- session = db.Session()
+ session = Session()
# get non-page links that have an image/* mimetype
- links = session.query(db.Link)
- links = links.filter((db.Link.is_page != True) | (db.Link.is_page == None))
- links = links.filter(db.Link.mimetype.startswith('image/'))
- links = links.order_by(db.Link.url)
+ links = session.query(Link)
+ links = links.filter((Link.is_page != True) | (Link.is_page == None))
+ links = links.filter(Link.mimetype.startswith('image/'))
+ links = links.order_by(Link.url)
# present results
- fp = plugins.open_html(plugins.images, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.images, site)
if not links:
fp.write(
' <p class="description">\n'
' No images were linked on the website.\n'
' </p>\n'
' <ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -58,7 +58,7 @@
' </p>\n'
' <ol>\n')
for link in links:
- fp.write(' <li>%s</li>\n' % plugins.make_link(link, link.url))
+ fp.write(' <li>%s</li>\n' % webcheck.plugins.make_link(link, link.url))
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/new.py
==============================================================================
--- webcheck/plugins/new.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/new.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,9 +30,9 @@
import time
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
SECS_PER_DAY = 60 * 60 * 24
@@ -40,28 +40,28 @@
def generate(site):
"""Output the list of recently modified pages."""
- session = db.Session()
+ session = Session()
# the time for which links are considered new
- newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE
+ newtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSNEW_URL_AGE
# get all internal pages that are new
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
- links = links.filter(db.Link.mtime > newtime).order_by(db.Link.mtime.desc())
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter(Link.mtime > newtime).order_by(Link.mtime.desc())
# present results
- fp = plugins.open_html(plugins.new, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.new, site)
if not links.count():
fp.write(
' <p class="description">\n'
' No pages were found that were modified within the last %(new)d days.\n'
' </p>\n'
- % {'new': config.REPORT_WHATSNEW_URL_AGE})
- plugins.close_html(fp)
+ % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE})
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
' These pages have been recently modified (within %(new)d days).\n'
' </p>\n'
' <ul>\n'
- % {'new': config.REPORT_WHATSNEW_URL_AGE})
+ % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE})
for link in links:
age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
@@ -71,7 +71,7 @@
' <li>age: %(age)d days</li>\n'
' </ul>\n'
' </li>\n'
- % {'link': plugins.make_link(link),
+ % {'link': webcheck.plugins.make_link(link),
'age': age})
fp.write(' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/plugins/notchkd.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/notchkd.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,23 +30,23 @@
from sqlalchemy.orm import joinedload
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def generate(site):
"""Output the list of not checked pages."""
- session = db.Session()
+ session = Session()
# get all yanked urls
- links = session.query(db.Link).filter(db.Link.yanked != None).order_by(db.Link.url)
+ links = session.query(Link).filter(Link.yanked != None).order_by(Link.url)
# present results
- fp = plugins.open_html(plugins.notchkd, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.notchkd, site)
if not links.count():
fp.write(
' <p class="description">\n'
' All links have been checked.\n'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -54,15 +54,15 @@
' at all during the examination of the website.\n'
' </p>\n'
' <ol>\n')
- for link in links.options(joinedload(db.Link.linkproblems)):
+ for link in links.options(joinedload(Link.linkproblems)):
fp.write(
' <li>\n'
' %(link)s\n'
- % {'link': plugins.make_link(link, link.url)})
+ % {'link': webcheck.plugins.make_link(link, link.url)})
# present a list of parents
- plugins.print_parents(fp, link, ' ')
+ webcheck.plugins.print_parents(fp, link, ' ')
fp.write(
' </li>\n')
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/notitles.py
==============================================================================
--- webcheck/plugins/notitles.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/notitles.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,17 +30,17 @@
from sqlalchemy.sql.functions import char_length
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def postprocess(site):
"""Add page problems for all pages without a title."""
- session = db.Session()
+ session = Session()
# get all internal pages without a title
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
- links = links.filter((char_length(db.Link.title) == 0) |
- (db.Link.title == None))
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter((char_length(Link.title) == 0) |
+ (Link.title == None))
for link in links:
link.add_pageproblem('missing title')
session.commit()
@@ -48,19 +48,19 @@
def generate(site):
"""Output the list of pages without a title."""
- session = db.Session()
+ session = Session()
# get all internal pages without a title
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
- links = links.filter((char_length(db.Link.title) == 0) |
- (db.Link.title == None)).order_by(db.Link.url)
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter((char_length(Link.title) == 0) |
+ (Link.title == None)).order_by(Link.url)
# present results
- fp = plugins.open_html(plugins.notitles, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.notitles, site)
if not links.count():
fp.write(
' <p class="description">\n'
' All pages had a title specified.\n'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -71,7 +71,7 @@
for link in links:
fp.write(
' <li>%(link)s</li>\n'
- % {'link': plugins.make_link(link, link.url)})
+ % {'link': webcheck.plugins.make_link(link, link.url)})
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/old.py
==============================================================================
--- webcheck/plugins/old.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/old.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,9 +30,9 @@
import time
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
SECS_PER_DAY = 60 * 60 * 24
@@ -40,21 +40,21 @@
def generate(site):
"""Output the list of outdated pages to the specified file descriptor."""
- session = db.Session()
+ session = Session()
# the time for which links are considered old
- oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
+ oldtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSOLD_URL_AGE
# get all internal pages that are old
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
- links = links.filter(db.Link.mtime < oldtime).order_by(db.Link.mtime)
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter(Link.mtime < oldtime).order_by(Link.mtime)
# present results
- fp = plugins.open_html(plugins.old, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.old, site)
if not links.count():
fp.write(
' <p class="description">\n'
' No pages were found that were older than %(old)d days old.\n'
' </p>\n'
- % {'old': config.REPORT_WHATSOLD_URL_AGE})
- plugins.close_html(fp)
+ % {'old': webcheck.config.REPORT_WHATSOLD_URL_AGE})
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -62,7 +62,7 @@
' days) and may be outdated.\n'
' </p>\n'
' <ul>\n'
- % {'old': config.REPORT_WHATSOLD_URL_AGE})
+ % {'old': webcheck.config.REPORT_WHATSOLD_URL_AGE})
for link in links:
age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
@@ -72,8 +72,8 @@
' <li>age: %(age)d days</li>\n'
' </ul>\n'
' </li>\n'
- % {'link': plugins.make_link(link),
+ % {'link': webcheck.plugins.make_link(link),
'age': age})
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/problems.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,8 +30,8 @@
import urllib
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def _mk_id(name):
@@ -50,12 +50,12 @@
def generate(site):
"""Output the overview of problems per author."""
- session = db.Session()
+ session = Session()
# make a list of problems per author
problem_db = {}
# get internal links with page problems
- links = session.query(db.Link).filter_by(is_internal=True)
- links = links.filter(db.Link.pageproblems.any()).order_by(db.Link.url)
+ links = session.query(Link).filter_by(is_internal=True)
+ links = links.filter(Link.pageproblems.any()).order_by(Link.url)
for link in links:
# make a normal name for the author
if link.author:
@@ -67,13 +67,13 @@
problem_db[author].append(link)
else:
problem_db[author] = [link]
- fp = plugins.open_html(plugins.problems, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.problems, site)
if not problem_db:
fp.write(
' <p class="description">\n'
' No problems were found on this site, hurray.\n'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
# print description
fp.write(
@@ -90,8 +90,8 @@
for author in authors:
fp.write(
' <li><a href="#author_%(authorref)s">Author: %(author)s</a></li>\n'
- % {'authorref': plugins.htmlescape(_mk_id(author)),
- 'author': plugins.htmlescape(author)})
+ % {'authorref': webcheck.plugins.htmlescape(_mk_id(author)),
+ 'author': webcheck.plugins.htmlescape(author)})
fp.write(' </ul>\n')
# generate problem report
fp.write(' <ul>\n')
@@ -100,8 +100,8 @@
' <li id="author_%(authorref)s">\n'
' Author: %(author)s\n'
' <ul>\n'
- % {'authorref': plugins.htmlescape(_mk_id(author)),
- 'author': plugins.htmlescape(author)})
+ % {'authorref': webcheck.plugins.htmlescape(_mk_id(author)),
+ 'author': webcheck.plugins.htmlescape(author)})
# sort pages by url
problem_db[author].sort(lambda a, b: cmp(a.url, b.url))
# list problems for this author
@@ -111,12 +111,12 @@
' <li>\n'
' %(link)s\n'
' <ul class="problems">\n'
- % {'link': plugins.make_link(link)})
+ % {'link': webcheck.plugins.make_link(link)})
# list the problems
for problem in link.pageproblems:
fp.write(
' <li>%(problem)s</li>\n'
- % {'problem': plugins.htmlescape(problem)})
+ % {'problem': webcheck.plugins.htmlescape(problem)})
# end the list item
fp.write(
' </ul>\n'
@@ -126,4 +126,4 @@
' </li>\n')
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/plugins/sitemap.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/sitemap.py Fri Sep 16 15:36:38 2011 (r435)
@@ -28,25 +28,25 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'index.html'
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
def add_pagechildren(link, children, explored):
"""Determine the page children of this link, combining the children of
embedded items and following redirects."""
# get all internal children
- qry = link.children.filter(db.Link.is_internal == True)
+ qry = link.children.filter(Link.is_internal == True)
if link.depth:
- qry = qry.filter((db.Link.depth > link.depth) | (db.Link.depth == None))
+ qry = qry.filter((Link.depth > link.depth) | (Link.depth == None))
# follow redirects
children.update(y
for y in (x.follow_link() for x in qry)
if y and y.is_page and y.is_internal and y.id not in explored)
explored.update(x.id for x in children)
# add embedded element's pagechildren (think frames)
- for embed in link.embedded.filter(db.Link.is_internal == True).filter(db.Link.is_page == True):
+ for embed in link.embedded.filter(Link.is_internal == True).filter(Link.is_page == True):
# TODO: put this in a query
if embed.id not in explored and \
(embed.depth == None or embed.depth > link.depth):
@@ -58,9 +58,9 @@
site. Prints the html results to the file descriptor."""
# output this link
fp.write(indent + '<li>\n')
- fp.write(indent + ' ' + plugins.make_link(link) + '\n')
+ fp.write(indent + ' ' + webcheck.plugins.make_link(link) + '\n')
# only check children if we are not too deep yet
- if depth <= config.REPORT_SITEMAP_LEVEL:
+ if depth <= webcheck.config.REPORT_SITEMAP_LEVEL:
# figure out the links to follow and ensure that they are only
# explored from here
children = set()
@@ -80,8 +80,8 @@
def generate(site):
"""Output the sitemap."""
- session = db.Session()
- fp = plugins.open_html(plugins.sitemap, site)
+ session = Session()
+ fp = webcheck.plugins.open_html(webcheck.plugins.sitemap, site)
# output the site structure using breadth first traversal
fp.write(
' <p class="description">\n'
@@ -93,4 +93,4 @@
_explore(fp, l, explored)
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/size.py
==============================================================================
--- webcheck/plugins/size.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/size.py Fri Sep 16 15:36:38 2011 (r435)
@@ -28,9 +28,9 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'size.html'
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
def _getsize(link, done=None):
@@ -57,22 +57,22 @@
def generate(site):
"""Output the list of large pages."""
- session = db.Session()
+ session = Session()
# get all internal pages and get big links
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
links = [x for x in links
- if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024]
+ if _getsize(x) >= webcheck.config.REPORT_SLOW_URL_SIZE * 1024]
# sort links by size (biggest first)
links.sort(lambda a, b: cmp(b.total_size, a.total_size))
# present results
- fp = plugins.open_html(plugins.size, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.size, site)
if not links:
fp.write(
' <p class="description">\n'
' No pages over %(size)dK were found.\n'
' </p>\n'
- % {'size': config.REPORT_SLOW_URL_SIZE})
- plugins.close_html(fp)
+ % {'size': webcheck.config.REPORT_SLOW_URL_SIZE})
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -80,9 +80,9 @@
' slow to download.\n'
' </p>\n'
' <ul>\n'
- % {'size': config.REPORT_SLOW_URL_SIZE})
+ % {'size': webcheck.config.REPORT_SLOW_URL_SIZE})
for link in links:
- size = plugins.get_size(link.total_size)
+ size = webcheck.plugins.get_size(link.total_size)
fp.write(
' <li>\n'
' %(link)s\n'
@@ -90,8 +90,8 @@
' <li>size: %(size)s</li>\n'
' </ul>\n'
' </li>\n'
- % {'link': plugins.make_link(link),
+ % {'link': webcheck.plugins.make_link(link),
'size': size})
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/urllist.py
==============================================================================
--- webcheck/plugins/urllist.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/urllist.py Fri Sep 16 15:36:38 2011 (r435)
@@ -26,14 +26,14 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'urllist.html'
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def generate(site):
"""Output a sorted list of URLs."""
- session = db.Session()
- fp = plugins.open_html(plugins.urllist, site)
+ session = Session()
+ fp = webcheck.plugins.open_html(webcheck.plugins.urllist, site)
fp.write(
' <p class="description">\n'
' This is the list of all urls encountered during the examination of\n'
@@ -41,9 +41,9 @@
' non-examined urls.\n'
' </p>\n'
' <ol>\n')
- links = session.query(db.Link).order_by(db.Link.url)
+ links = session.query(Link).order_by(Link.url)
for link in links:
- fp.write(' <li>' + plugins.make_link(link, link.url) + '</li>\n')
+ fp.write(' <li>' + webcheck.plugins.make_link(link, link.url) + '</li>\n')
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)