webcheck commit: r421 - in webcheck: . parsers parsers/html plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r421 - in webcheck: . parsers parsers/html plugins
- Date: Thu, 4 Aug 2011 21:46:27 +0200 (CEST)
Author: arthur
Date: Thu Aug 4 21:46:26 2011
New Revision: 421
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=421
Log:
use SQLAlchemy to store crawled website data to improve scalability
Added:
webcheck/db.py
Deleted:
webcheck/serialize.py
Modified:
webcheck/config.py
webcheck/crawler.py
webcheck/parsers/css.py
webcheck/parsers/html/__init__.py
webcheck/parsers/html/beautifulsoup.py
webcheck/parsers/html/calltidy.py
webcheck/parsers/html/htmlparser.py
webcheck/plugins/__init__.py
webcheck/plugins/about.py
webcheck/plugins/badlinks.py
webcheck/plugins/external.py
webcheck/plugins/images.py
webcheck/plugins/new.py
webcheck/plugins/notchkd.py
webcheck/plugins/notitles.py
webcheck/plugins/old.py
webcheck/plugins/problems.py
webcheck/plugins/sitemap.py
webcheck/plugins/size.py
webcheck/plugins/urllist.py
webcheck/webcheck.py
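
The core of this change is replacing the in-memory linkMap with SQLAlchemy-backed storage (see the new webcheck/db.py below). The binding of the Session factory to an actual database is not shown in this excerpt (presumably it happens in webcheck.py); a minimal sketch of how it might be wired up, assuming a per-run SQLite file whose name here is hypothetical:

from sqlalchemy import create_engine

import db

# bind the session factory to a database and create the schema
engine = create_engine('sqlite:///webcheck.sqlite')
db.Session.configure(bind=engine)
db.Base.metadata.create_all(engine)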
Modified: webcheck/config.py
==============================================================================
--- webcheck/config.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/config.py Thu Aug 4 21:46:26 2011 (r421)
@@ -64,7 +64,7 @@
REDIRECT_DEPTH = 5
# The list of plugins that will be used to generate the report.
-PLUGINS = [ 'anchors',
+PLUGINS = [ #'anchors',
'sitemap',
'urllist',
'images',
Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/crawler.py Thu Aug 4 21:46:26 2011 (r421)
@@ -27,21 +27,24 @@
manipulate the crawling of the website. This module also contains the Link
class that holds all the link related properties."""
-import config
-import debugio
-import urlparse
-import urllib
-import robotparser
-import parsers
-import re
-import time
-import myurllib
-import urllib2
-import httplib
-import socket
import atexit
import cookielib
+import datetime
+import httplib
import os
+import re
+import robotparser
+import socket
+import time
+import urllib
+import urllib2
+import urlparse
+
+import config
+import db
+import debugio
+import parsers
+
# set up our cookie jar
cookiejar = cookielib.LWPCookieJar('cookies.lwp')
@@ -80,12 +83,13 @@
# pattern to match anchor part of a url
_anchorpattern = re.compile('#([^#]+)$')
-class Site:
+
+# TODO: rename Site to Crawler
+class Site(object):
"""Class to represent gathered data of a site.
The available properties of this class are:
- linkMap - a map of urls to link objects
bases - a list of base link object
"""
@@ -102,15 +106,13 @@
self._yanked_res = {}
# map of scheme+netloc to robot handleds
self._robotparsers = {}
- # a map of urls to Link objects
- self.linkMap = {}
# list of base urls (these are the internal urls to start from)
self.bases = []
def add_internal(self, url):
"""Add the given url and consider all urls below it to be internal.
These links are all marked for checking with the crawl() function."""
- url = myurllib.normalizeurl(url)
+ url = db.Link.clean_url(url)
if url not in self._internal_urls:
self._internal_urls.add(url)
@@ -129,53 +131,53 @@
will not be checked at all."""
self._yanked_res[exp] = re.compile(exp, re.IGNORECASE)
- def _is_internal(self, link):
+ def _is_internal(self, url):
"""Check whether the specified url is external or internal.
This uses the urls marked with add_internal() and the regular
expressions passed with add_external_re()."""
# check if it is internal through the regexps
for regexp in self._internal_res.values():
- if regexp.search(link.url) is not None:
+ if regexp.search(url) is not None:
return True
res = False
# check that the url starts with an internal url
if config.BASE_URLS_ONLY:
# the url must start with one of the _internal_urls
for i in self._internal_urls:
- res |= (i==link.url[:len(i)])
+ res |= (i==url[:len(i)])
else:
# the netloc must match a netloc of an _internal_url
+ netloc = urlparse.urlsplit(url)[1]
for i in self._internal_urls:
- res |= (urlparse.urlsplit(i)[1]==link.netloc)
+ res |= (urlparse.urlsplit(i)[1] == netloc)
# if it is not internal now, it never will be
if not res:
return False
# check if it is external through the regexps
for x in self._external_res.values():
# if the url matches it is external and we can stop
- if x.search(link.url) is not None:
+ if x.search(url):
return False
return True
- def _get_robotparser(self, link):
+ def _get_robotparser(self, scheme, netloc):
"""Return the proper robots parser for the given url or None if one
cannot be constructed. Robot parsers are cached per scheme and
netloc."""
# only some schemes have a meaningful robots.txt file
- if link.scheme != 'http' and link.scheme != 'https':
- debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % link.scheme)
+ if scheme != 'http' and scheme != 'https':
+ debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % scheme)
return None
# split out the key part of the url
- location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
+ location = urlparse.urlunsplit((scheme, netloc, '', '', ''))
# try to create a new robotparser if we don't already have one
if not self._robotparsers.has_key(location):
- import httplib
debugio.info(' getting robots.txt for %s' % location)
self._robotparsers[location] = None
try:
rp = robotparser.RobotFileParser()
rp.set_url(urlparse.urlunsplit(
- (link.scheme, link.netloc, '/robots.txt', '', '') ))
+ (scheme, netloc, '/robots.txt', '', '') ))
rp.read()
self._robotparsers[location] = rp
except (TypeError, IOError, httplib.HTTPException):
@@ -183,425 +185,155 @@
pass
return self._robotparsers[location]
- def _is_yanked(self, link):
+ def _is_yanked(self, url):
"""Check whether the specified url should not be checked at all.
This uses the regualr expressions passed with add_yanked_re() and the
robots information present."""
# check if it is yanked through the regexps
for regexp in self._yanked_res.values():
# if the url matches it is yanked and we can stop
- if regexp.search(link.url) is not None:
+ if regexp.search(url):
return 'yanked'
# check if we should avoid external links
- if not link.isinternal and config.AVOID_EXTERNAL_LINKS:
+ is_internal = self._is_internal(url)
+ if not is_internal and config.AVOID_EXTERNAL_LINKS:
return 'external avoided'
# check if we should use robot parsers
if not config.USE_ROBOTS:
- return False
- # skip schemes not haveing robot.txt files
- if link.scheme != 'http' and link.scheme != 'https':
- return False
+ return None
+ (scheme, netloc) = urlparse.urlsplit(url)[0:2]
+ # skip schemes not having robot.txt files
+ if scheme not in ('http', 'https'):
+ return None
# skip robot checks for external urls
# TODO: make this configurable
- if not link.isinternal:
- return False
+ if not is_internal:
+ return None
# check robots for remaining links
- rp = self._get_robotparser(link)
- if rp is not None and not rp.can_fetch('webcheck', link.url):
+ rp = self._get_robotparser(scheme, netloc)
+ if rp and not rp.can_fetch('webcheck', url):
return 'robot restriced'
# fall back to allowing the url
- return False
+ return None
+
+ def get_link(self, session, url):
+ # try to find the URL
+ url = db.Link.clean_url(url)
+ link = session.query(db.Link).filter_by(url=url).first()
+ if not link:
+ link = db.Link(url=url)
+ session.add(link)
+ return link
- def get_link(self, url):
- """Return a link object for the given url.
- This function checks the map of cached link objects for an
- instance."""
- # clean the url
- url = myurllib.normalizeurl(url)
- # check if we have an object ready
- if self.linkMap.has_key(url):
- return self.linkMap[url]
- # create a new instance
- return Link(self, url)
+ def get_links_to_crawl(self, session):
+ links = session.query(db.Link).filter(db.Link.fetched == None)
+ return links.filter(db.Link.yanked == None)[:100]
- def crawl(self, serfp=None):
+ def crawl(self):
"""Crawl the website based on the urls specified with
add_internal(). If the serialization file pointer
is specified the crawler writes out updated links to
the file while crawling the site."""
- # TODO: have some different scheme to crawl a site (e.g. separate
- # internal and external queues, threading, etc)
- tocheck = set()
- # add all unfetched site urls
- for link in self.linkMap.values():
- if not link.isyanked and not link.isfetched:
- tocheck.add(link)
- # add all internal urls
+ # get a database session
+ session = db.Session()
+ # remove all links
+ if not config.CONTINUE:
+ session.query(db.LinkProblem).delete()
+ session.commit()
+ session.query(db.PageProblem).delete()
+ session.commit()
+ session.execute(db.children.delete())
+ session.commit()
+ session.execute(db.embedded.delete())
+ session.commit()
+ session.query(db.Link).delete()
+ session.commit()
+ # add all internal urls to the database
for url in self._internal_urls:
- tocheck.add(self.get_link(url))
+ url = db.Link.clean_url(url)
+ self.get_link(session, url)
+ # add some URLs from the database that haven't been fetched
+ tocheck = self.get_links_to_crawl(session)
# repeat until we have nothing more to check
- fetchedlinks = 0
- while len(tocheck) > 0:
+ while tocheck:
debugio.debug('crawler.crawl(): items left to check: %d' % len(tocheck))
# choose a link from the tocheck list
link = tocheck.pop()
+ link.is_internal = self._is_internal(link.url)
+ link.yanked = self._is_yanked(link.url)
+ # see if there are any more links to check
+ if not tocheck:
+ tocheck = self.get_links_to_crawl(session)
# skip link it there is nothing to check
- if link.isyanked or link.isfetched:
+ if link.yanked or link.fetched:
continue
# fetch the link's contents
- link.fetch()
- # add children to tocheck
- for child in link.children:
- if not child.isyanked and not child.isfetched:
- tocheck.add(child)
- # add embedded content
- for embed in link.embedded:
- if not embed.isyanked and not embed.isfetched:
- tocheck.add(embed)
- # serialize all as of yet unserialized links
- fetchedlinks += 1
- # TODO: make this configurable
- if serfp and fetchedlinks >= 5:
- fetchedlinks = 0
- import serialize
- for link in self.linkMap.values():
- if link._ischanged:
- serialize.serialize_link(serfp, link)
- link._ischanged = False
- serfp.flush()
+ response = self.fetch(link)
+ if response:
+ self.parse(link, response)
+ # flush database changes
+ session.commit()
# sleep between requests if configured
if config.WAIT_BETWEEN_REQUESTS > 0:
debugio.debug('crawler.crawl(): sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS)
time.sleep(config.WAIT_BETWEEN_REQUESTS)
- # serialize remaining changed links
- if serfp:
- import serialize
- for link in self.linkMap.values():
- if link._ischanged:
- serialize.serialize_link(serfp, link)
- link._ischanged = False
- serfp.flush()
-
- def postprocess(self):
- """Do some basic post processing of the collected data, including
- depth calculation of every link."""
- # build the list of urls that were set up with add_internal() that
- # do not have a parent (they form the base for the site)
- for url in self._internal_urls:
- link = self.linkMap[url].follow_link()
- if link == None:
- debugio.warn('base link %s redirects to nowhere' % url)
- continue
- # add the link to bases
- debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
- self.bases.append(link)
- # if we got no bases, just use the first internal one
- if len(self.bases) == 0:
- debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % self._internal_urls[0])
- self.bases.append(self.linkMap[self._internal_urls[0]])
- # do a breadth first traversal of the website to determin depth and
- # figure out page children
- tocheck = set()
- for link in self.bases:
- link.depth = 0
- tocheck.add(link)
- # repeat until we have nothing more to check
- while len(tocheck) > 0:
- debugio.debug('crawler.postprocess(): items left to examine: %d' % len(tocheck))
- # choose a link from the tocheck list
- link = tocheck.pop()
- # figure out page children
- for child in link._pagechildren():
- # skip children with the wrong depth
- if child.depth != link.depth+1:
- continue
- tocheck.add(child)
-
-class Link:
- """This is a basic class representing a url.
-
- Some basic information about a url is stored in instances of this
- class:
-
- url - the url this link represents
- scheme - the scheme part of the url
- netloc - the netloc part of the url
- path - the path part of the url
- query - the query part of the url
- parents - list of parent links (all the Links that link to this
- page)
- children - list of child links (the Links that this page links to)
- pagechildren - list of child pages, including children of embedded
- elements
- embedded - list of links to embeded content
- anchors - list of anchors defined on the page
- reqanchors - list of anchors requesten for this page anchor->link*
- depth - the number of clicks from the base urls this page to
- find
- isinternal - whether the link is considered to be internal
- isyanked - whether the link should be checked at all
- isfetched - whether the lis is fetched already
- ispage - whether the link represents a page
- mtime - modification time (in seconds since the Epoch)
- size - the size of this document
- mimetype - the content-type of the document
- encoding - the character set used in the document
- title - the title of this document (unicode)
- author - the author of this document (unicode)
- status - the result of retreiving the document
- linkproblems - list of problems with retrieving the link
- pageproblems - list of problems in the parsed page
- redirectdepth - the number of this redirect (=0 not a redirect)
-
- Instances of this class should be made through a site instance
- by adding internal urls and calling crawl().
- """
-
- def __init__(self, site, url):
- """Creates an instance of the Link class and initializes the
- documented properties to some sensible value."""
- # store a reference to the site
- self.site = site
- # split the url in useful parts and store the parts
- (self.scheme, self.netloc, self.path, self.query) = \
- urlparse.urlsplit(url)[0:4]
- # store the url (without the fragment)
- url = urlparse.urlunsplit(
- (self.scheme, self.netloc, self.path, self.query, '') )
- self.url = url
- # ensure that we are not creating something that already exists
- assert not self.site.linkMap.has_key(url)
- # store the Link object in the linkMap
- self.site.linkMap[url] = self
- # deternmin the kind of url (internal or external)
- self.isinternal = self.site._is_internal(self)
- # check if the url is yanked
- self.isyanked = self.site._is_yanked(self)
- # initialize some properties
- self.parents = set()
- self.children = set()
- self.pagechildren = None
- self.embedded = set()
- self.anchors = set()
- self.reqanchors = {}
- self.depth = None
- self.isfetched = False
- self.ispage = False
- self.mtime = None
- self.size = None
- self.mimetype = None
- self.encoding = None
- self.title = None
- self.author = None
- self.status = None
- self.linkproblems = []
- self.pageproblems = []
- self.redirectdepth = 0
- self.redirectlist = None
- self._ischanged = False
-
- def __checkurl(self, url):
- """Check to see if the url is formatted properly, correct formatting
- if possible and log an error in the formatting to the current page."""
- # search for spaces in the url
- if _spacepattern.search(url):
- self.add_pageproblem('link contains unescaped spaces: %s' % url)
- # replace spaces by %20
- url = _spacepattern.sub('%20', url)
- # find anchor part
- try:
- # get the anchor
- anchor = _anchorpattern.search(url).group(1)
- # get link for url we link to
- child = self.site.get_link(url)
- # store anchor
- child.add_reqanchor(self, anchor)
- except AttributeError:
- # ignore problems lookup up anchor
- pass
- return url
-
- def __tolink(self, link):
- """Convert the link to a link object, either it is already a link,
- a link object is returned from the database or a new link is
- created. This returns None for empty strings."""
- # ignore if child is empty string
- if link == '' or link == u'':
- return None
- if type(link) is unicode and self.encoding:
- # convert url to binary if passed as unicode
- link = link.encode(self.encoding)
- # convert the url to a link object if we were called with a url
- if type(link) is unicode or type(link) is str:
- link = self.site.get_link(self.__checkurl(link))
- # re're done
- return link
- def add_child(self, child):
- """Add a link object to the child relation of this link.
- The reverse relation is also made."""
- # ignore children for external links
- if not self.isinternal:
- return
- # convert to link object
- child = self.__tolink(child)
- if child is None:
- return
- # add to children
- if child not in self.children:
- self.children.add(child)
- self._ischanged = True
- # add self to parents of child
- if self not in child.parents:
- child.parents.add(self)
-
- def add_embed(self, link):
- """Mark the given link object as used as an image on this page."""
- # ignore embeds for external links
- if not self.isinternal:
- return
- # convert to link object
- link = self.__tolink(link)
- if link is None:
- return
- # add to embedded
- if link not in self.embedded:
- self.embedded.add(link)
- self._ischanged = True
- # add self to parents of embed
- if self not in link.parents:
- link.parents.add(self)
-
- def add_anchor(self, anchor):
- """Indicate that this page contains the specified anchor."""
- # lowercase anchor
- anchor = anchor.lower()
- # add anchor
- if anchor in self.anchors:
- self.add_pageproblem(
- 'anchor/id "%(anchor)s" defined multiple times'
- % { 'anchor': anchor })
- else:
- self.anchors.add(anchor)
- self._ischanged = True
-
- def add_reqanchor(self, parent, anchor):
- """Indicate that the specified link contains a reference to the
- specified anchor. This can be checked later."""
- # lowercase anchor
- anchor = anchor.lower()
- # convert the url to a link object if we were called with a url
- parent = self.__tolink(parent)
- # add anchor
- if anchor in self.reqanchors:
- if parent not in self.reqanchors[anchor]:
- self.reqanchors[anchor].add(parent)
- self._ischanged = True
- else:
- self.reqanchors[anchor] = set([parent])
- self._ischanged = True
-
- def redirect(self, url):
- """Indicate that this link redirects to the specified url. Maximum
- redirect counting is done as well as loop detection."""
- # figure out depth and urls that have been visited in this
- # redirect list
- redirectdepth = 0
- redirectlist = set()
- for parent in self.parents:
- if parent.redirectdepth > redirectdepth:
- redirectdepth = parent.redirectdepth
- redirectlist = parent.redirectlist
- self.redirectdepth = redirectdepth + 1
- self.redirectlist = redirectlist
- self.redirectlist.add(self.url)
- # check depth
- if self.redirectdepth >= config.REDIRECT_DEPTH:
- self.add_linkproblem('too many redirects (%d)' % self.redirectdepth)
- return None
- # check for redirect to self
- url = self.__checkurl(url)
- if url == self.url:
- self.add_linkproblem('redirect same as source: %s' % url)
- return None
- # check for redirect loop
- if url in self.redirectlist:
- self.add_linkproblem('redirect loop %s' % url)
- # add child
- self.add_child(url)
-
- def add_linkproblem(self, problem):
- """Indicate that something went wrong while retreiving this link."""
- self.linkproblems.append(problem)
- self._ischanged = True
-
- def add_pageproblem(self, problem):
- """Indicate that something went wrong with parsing the document."""
- # only think about problems on internal pages
- if not self.isinternal:
- return
- # only include a single problem once (e.g. multiple anchors)
- if problem not in self.pageproblems:
- self.pageproblems.append(problem)
- self._ischanged = True
-
- def fetch(self):
- """Attempt to fetch the url (if isyanked is not True) and fill in link
- attributes (based on isinternal)."""
- debugio.info(' %s' % self.url)
- # fully ignore links that should not be feteched
- if self.isyanked:
- debugio.info(' ' + self.isyanked)
- return
+ def fetch(self, link):
+ """Attempt to fetch the url (if not yanked) and fill in link
+ attributes (based on is_internal)."""
+ debugio.info(' %s' % link.url)
+ # mark the link as fetched to avoid loops
+ link.fetched = datetime.datetime.now()
# see if we can import the proper module for this scheme
try:
# FIXME: if an URI has a username:passwd add the uri, username and password to the HTTPPasswordMgr
- request = urllib2.Request(self.url)
- if self.parents:
- request.add_header('Referer', iter(self.parents).next().url)
+ request = urllib2.Request(link.url)
+ if link.parents:
+ request.add_header('Referer', iter(link.parents).next().url)
response = urllib2.urlopen(request)
- self.mimetype = response.info().gettype()
- self.set_encoding(response.info().getencoding())
+ link.mimetype = response.info().gettype()
+ link.set_encoding(response.headers.getparam('charset'))
# FIXME: get result code and other stuff
- self.status = str(response.code)
+ link.status = str(response.code)
# link.size = int(response.getheader('Content-length'))
# link.mtime = time.mktime(response.msg.getdate('Last-Modified'))
# if response.status == 301: link.add_linkproblem(str(response.status)+': '+response.reason)
# elif response.status != 200: link.add_linkproblem(str(response.status)+': '+response.reason)
# TODO: add checking for size
+ return response
except RedirectError, e:
- self.status = str(e.code)
+ link.status = str(e.code)
debugio.info(' ' + str(e))
if e.code == 301:
- self.add_linkproblem(str(e))
- self.redirect(e.newurl)
+ link.add_linkproblem(str(e))
+ link.add_redirect(e.newurl)
return
except urllib2.HTTPError, e:
- self.status = str(e.code)
+ link.status = str(e.code)
debugio.info(' ' + str(e))
- self.add_linkproblem(str(e))
+ link.add_linkproblem(str(e))
return
except urllib2.URLError, e:
debugio.info(' ' + str(e))
- self.add_linkproblem(str(e))
+ link.add_linkproblem(str(e))
return
except KeyboardInterrupt:
# handle this in a higher-level exception handler
raise
except Exception, e:
# handle all other exceptions
- debugio.warn('unknown exception caught: '+str(e))
- self.add_linkproblem('error reading HTTP response: '+str(e))
+ debugio.warn('unknown exception caught: ' + str(e))
+ link.add_linkproblem('error reading HTTP response: %s' % str(e))
import traceback
traceback.print_exc()
return
- finally:
- self.isfetched = True
- self._ischanged = True
+
+ def parse(self, link, response):
+ """Parse the fetched response."""
# find a parser for the content-type
- parsermodule = parsers.get_parsermodule(self.mimetype)
+ parsermodule = parsers.get_parsermodule(link.mimetype)
if parsermodule is None:
- debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % self.mimetype)
+ debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % link.mimetype)
return
# skip parsing of content if we were returned nothing
content = response.read()
@@ -610,69 +342,52 @@
# parse the content
debugio.debug('crawler.Link.fetch(): parsing using %s' % parsermodule.__name__)
try:
- parsermodule.parse(content, self)
+ parsermodule.parse(content, link)
except Exception, e:
- self.add_pageproblem('problem parsing page: ' + str(e))
- debugio.warn('problem parsing page: ' + str(e))
import traceback
traceback.print_exc()
+ debugio.warn('problem parsing page: ' + str(e))
+ link.add_pageproblem('problem parsing page: ' + str(e))
- def follow_link(self, visited=set()):
- """If this link represents a redirect return the redirect target,
- otherwise return self. If this redirect does not find a referenced
- link None is returned."""
- # if this is not a redirect just return
- if self.redirectdepth == 0:
- return self
- # if we don't know where this redirects, return None
- if len(self.children) == 0:
- return None
- # the first (and only) child is the redirect target
- visited.add(self)
- # check for loops
- child = self.children.copy().pop()
- if child in visited:
- return None
- # check where we redirect to
- return child.follow_link(visited)
-
- def _pagechildren(self):
- """Determin the page children of this link, combining the children of
- embedded items and following redirects."""
- # if we already have pagechildren defined we're done
- if self.pagechildren is not None:
- return self.pagechildren
- self.pagechildren = set()
- # add my own children, following redirects
- for child in self.children:
- # follow redirects
- child = child.follow_link()
- # skip children we already have
- if child is None:
+ def postprocess(self):
+ """Do some basic post processing of the collected data, including
+ depth calculation of every link."""
+ # get a database session
+ session = db.Session()
+ # build the list of urls that were set up with add_internal() that
+ # do not have a parent (they form the base for the site)
+ for url in self._internal_urls:
+ link = self.get_link(session, url).follow_link()
+ if not link:
+ debugio.warn('base link %s redirects to nowhere' % url)
continue
- # set depth of child if it is not already set
- if child.depth is None:
- child.depth = self.depth+1
- # add child pages to out pagechildren
- if child.ispage:
- self.pagechildren.add(child)
- # add my embedded element's children
- for embed in self.embedded:
- # set depth of embed if it is not already set
- if embed.depth is None:
- embed.depth = self.depth
- # merge in children of embeds
- self.pagechildren.update(embed._pagechildren())
- # return the results
- return self.pagechildren
-
- def set_encoding(self, encoding):
- """Set the encoding of the link doing some basic checks
- to see if the encoding is supported."""
- if self.encoding is None and encoding is not None:
- try:
- debugio.debug('crawler.Link.set_encoding("'+str(encoding)+'")')
- unicode('just some random text', encoding, 'replace')
- self.encoding = encoding
- except Exception:
- self.add_pageproblem('unknown encoding: ' + str(encoding))
+ # add the link to bases
+ debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
+ self.bases.append(link)
+ # if we got no bases, just use the first internal one
+ if not self.bases:
+ link = session.query(db.Link).filter(db.Link.is_internal == True).first()
+ debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % link.url)
+ self.bases.append(link)
+ # do a breadth first traversal of the website to determine depth and
+ # figure out page children
+ session.query(db.Link).update(dict(depth=None), synchronize_session=False)
+ session.commit()
+ depth = 0
+ count = len(self.bases)
+ for link in self.bases:
+ link.depth = 0
+ session.commit()
+ debugio.debug('crawler.postprocess(): %d links at depth 0' % count)
+ while count > 0:
+ # update the depth of all links without a depth that have a
+ # parent with the previous depth
+ qry = session.query(db.Link).filter(db.Link.depth == None)
+ qry = qry.filter(db.Link.linked_from.any(db.Link.depth == depth))
+ count = qry.update(dict(depth=depth + 1), synchronize_session=False)
+ session.commit()
+ depth += 1
+ debugio.debug('crawler.postprocess(): %d links at depth %d' % (count, depth))
+ # TODO: also handle embeds
+ # make the list of links (and session) available to the plugins
+ self.links = session.query(db.Link)
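
Note that postprocess() now leaves a live SQLAlchemy query on the crawler object (self.links), so report plugins can filter and sort in the database instead of walking a Python dict. A usage sketch (hypothetical report snippet, assuming crawl() and postprocess() have already run on a Site instance called site):

# list a few internal pages, ordered by URL
internal_pages = site.links.filter_by(is_internal=True, is_page=True)
for link in internal_pages.order_by('url')[:10]:
    print link.url, link.title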
Added: webcheck/db.py
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ webcheck/db.py Thu Aug 4 21:46:26 2011 (r421)
@@ -0,0 +1,252 @@
+
+# db.py - database access layer for webcheck
+#
+# Copyright (C) 2011 Arthur de Jong
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# The files produced as output from the software do not automatically fall
+# under the copyright of the software, unless explicitly stated otherwise.
+
+import urlparse
+
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, ForeignKey
+from sqlalchemy.orm import relationship, backref, sessionmaker
+from sqlalchemy.orm.session import object_session
+from sqlalchemy.sql.expression import ClauseElement
+
+import config
+import debugio
+import myurllib
+
+
+# provide session and schema classes
+Session = sessionmaker()
+Base = declarative_base()
+
+
+children = Table(
+ 'children', Base.metadata,
+ Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE')),
+ Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ )
+
+
+embedded = Table(
+ 'embedded', Base.metadata,
+ Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE')),
+ Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ )
+
+
+class Link(Base):
+
+ __tablename__ = 'links'
+
+ id = Column(Integer, primary_key=True)
+ url = Column(String, index=True, nullable=False, unique=True)
+ fetched = Column(DateTime, index=True)
+ is_internal = Column(Boolean, index=True)
+ yanked = Column(String, index=True)
+ depth = Column(Integer)
+
+ # information about the retrieved link
+ status = Column(String)
+ mimetype = Column(String)
+ mimetype = Column(String)
+ encoding = Column(String)
+ size = Column(Integer)
+ mtime = Column(DateTime)
+ is_page = Column(Boolean, index=True)
+ title = Column(String)
+ author = Column(String)
+
+ # relationships between links
+ children = relationship('Link', secondary=children,
+ backref=backref('linked_from', collection_class=set),
+ primaryjoin=(id == children.c.parent_id),
+ secondaryjoin=(id == children.c.child_id),
+ collection_class=set)
+ embedded = relationship('Link', secondary=embedded,
+ backref=backref('embedded_in', collection_class=set),
+ primaryjoin=(id == embedded.c.parent_id),
+ secondaryjoin=(id == embedded.c.child_id),
+ collection_class=set)
+
+ # crawling information
+ redirectdepth = Column(Integer, default=0)
+
+ @staticmethod
+ def clean_url(url):
+ # normalise the URL, removing the fragment from the URL
+ url = myurllib.normalizeurl(url)
+ (scheme, netloc, path, query) = urlparse.urlsplit(url)[0:4]
+ return urlparse.urlunsplit((scheme, netloc, path, query, ''))
+
+ def _get_link(self, url):
+ """Get a link object for the specified URL."""
+ # get the session
+ session = object_session(self)
+ # try to find the URL
+ url = self.clean_url(url)
+ instance = session.query(Link).filter_by(url=url).first()
+ if not instance:
+ instance = Link(url=url)
+ session.add(instance)
+ return instance
+
+ def set_encoding(self, encoding):
+ """Set the encoding of the link doing some basic checks to see if
+ the encoding is supported."""
+ if not self.encoding and encoding:
+ try:
+ debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
+ unicode('just some random text', encoding, 'replace')
+ self.encoding = encoding
+ except Exception, e:
+ import traceback
+ traceback.print_exc()
+ self.add_pageproblem('unknown encoding: %s' % encoding)
+
+ def add_redirect(self, url):
+ """Indicate that this link redirects to the specified url."""
+ url = self.clean_url(url)
+ # figure out depth
+ self.redirectdepth = max([self.redirectdepth] +
+ [x.redirectdepth for x in self.parents]) + 1
+ # check depth
+ if self.redirectdepth >= config.REDIRECT_DEPTH:
+ self.add_linkproblem('too many redirects (%d)' % self.redirectdepth)
+ return
+ # check for redirect to self
+ if url == self.url:
+ self.add_linkproblem('redirect same as source: %s' % url)
+ return
+ # add child
+ self.add_child(url)
+
+ def add_linkproblem(self, message):
+ """Indicate that something went wrong while retrieving this link."""
+ self.linkproblems.append(LinkProblem(message=message))
+
+ def add_pageproblem(self, message):
+ """Indicate that something went wrong with parsing the document."""
+ # only think about problems on internal pages
+ if not self.is_internal:
+ return
+ # TODO: only include a single problem once (e.g. multiple anchors)
+ self.pageproblems.append(PageProblem(message=message))
+
+ def add_child(self, url):
+ """Add the specified URL as a child of this link."""
+ # ignore children for external links
+ if not self.is_internal:
+ return
+ # add to children
+ self.children.add(self._get_link(url))
+
+ def add_embed(self, url):
+ """Mark the given URL as used as an image on this page."""
+ # ignore embeds for external links
+ if not self.is_internal:
+ return
+ # add to embedded
+ self.embedded.add(self._get_link(url))
+
+ def add_anchor(self, anchor):
+ """Indicate that this page contains the specified anchor."""
+ return # FIXME: implement/update
+ # lowercase anchor
+ anchor = anchor.lower()
+ # add anchor
+ if anchor in self.anchors:
+ self.add_pageproblem(
+ 'anchor/id "%(anchor)s" defined multiple times'
+ % { 'anchor': anchor })
+ else:
+ self.anchors.add(anchor)
+
+ def add_reqanchor(self, parent, anchor):
+ """Indicate that the specified link contains a reference to the
+ specified anchor. This can be checked later."""
+ return # FIXME: implement/update
+ # lowercase anchor
+ anchor = anchor.lower()
+ # convert the url to a link object if we were called with a url
+ parent = self.__tolink(parent)
+ # add anchor
+ if anchor in self.reqanchors:
+ if parent not in self.reqanchors[anchor]:
+ self.reqanchors[anchor].add(parent)
+ else:
+ self.reqanchors[anchor] = set([parent])
+
+ def follow_link(self, visited=None):
+ """If this link represents a redirect return the redirect target,
+ otherwise return self. If this redirect does not find a referenced
+ link None is returned."""
+ # if this is not a redirect just return
+ if not self.redirectdepth:
+ return self
+ # if we don't know where this redirects, return None
+ if not self.children:
+ return None
+ # avoid loops
+ if not visited:
+ visited = set()
+ visited.add(self.url)
+ # the first (and only) child is the redirect target
+ child = list(self.children)[0]
+ if child.url in visited:
+ return None
+ # check where we redirect to
+ return child.follow_link(visited)
+
+ @property
+ def parents(self):
+ return set(self.linked_from).union(self.embedded_in)
+
+
+class LinkProblem(Base):
+ """Storage of problems in the URL itself (e.g. problem downloading the
+ associated resource)."""
+
+ __tablename__ = 'linkproblems'
+
+ id = Column(Integer, primary_key=True)
+ link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ link = relationship(Link, backref=backref('linkproblems', order_by=id,
+ cascade='all,delete,delete-orphan'))
+ message = Column(String)
+
+ def __unicode__(self):
+ return self.message
+
+
+class PageProblem(Base):
+ """Storage of problems in the information from the retrieved URL (e.g.
+ invalid HTML)."""
+
+ __tablename__ = 'pageproblems'
+
+ id = Column(Integer, primary_key=True)
+ link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ link = relationship(Link, backref=backref('pageproblems', order_by=id,
+ cascade='all,delete,delete-orphan'))
+ message = Column(String)
+
+ def __unicode__(self):
+ return self.message
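
A short usage sketch of the new model, assuming the Session has been bound to an engine as in the sketch near the top; it exercises clean_url() and the children/linked_from relationship that backs the parents property (the URLs are made up for illustration):

session = db.Session()
home = db.Link(url=db.Link.clean_url('http://example.org/#top'))  # fragment is stripped
about = db.Link(url='http://example.org/about')
home.children.add(about)
session.add(home)  # the child link is cascaded into the session
session.commit()
# the linked_from backref (together with embedded_in) feeds the parents property
assert home in about.parents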
Modified: webcheck/parsers/css.py
==============================================================================
--- webcheck/parsers/css.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/css.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# css.py - parser functions for css content
#
-# Copyright (C) 2005, 2006, 2009 Arthur de Jong
+# Copyright (C) 2005, 2006, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -39,16 +39,16 @@
# pattern for matching url(...) in css
_urlpattern = re.compile('url\(["\']?(.*?)["\']?\)')
-def parse(content, link, baseurl=None):
+def parse(content, link, base=None):
"""Parse the specified content and extract information for crawling the
site further."""
- # if no baseurl is specified, get it from the link
- baseurl = link.url
+ # if no base is specified, get it from the link
+ base = base or link.url
# strip out comments from the content
content = _commentpattern.sub('', content)
- # handler @imports
- for i in _importpattern.findall(content):
- link.add_embed(urlparse.urljoin(baseurl, i))
+ # handle @imports
+ for embed in _importpattern.findall(content):
+ link.add_embed(urlparse.urljoin(base, embed))
# handle url()s
- for i in _urlpattern.findall(content):
- link.add_embed(urlparse.urljoin(baseurl, i))
+ for embed in _urlpattern.findall(content):
+ link.add_embed(urlparse.urljoin(base, embed))
Modified: webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/parsers/html/__init__.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/html/__init__.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# html.py - parser functions for html content
#
-# Copyright (C) 2005, 2006, 2007, 2008 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -44,7 +44,7 @@
return u''
# convert to unicode object
if not isinstance(txt, unicode):
- txt = unicode(txt, errors='replace')
+ txt = unicode(txt)
# the output string
out = ''
# loop over the characters of the string
Modified: webcheck/parsers/html/beautifulsoup.py
==============================================================================
--- webcheck/parsers/html/beautifulsoup.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/html/beautifulsoup.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# beautifulsoup.py - parser functions for html content
#
-# Copyright (C) 2007, 2008, 2009 Arthur de Jong
+# Copyright (C) 2007, 2008, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -80,10 +80,10 @@
if refresh and refresh['content']:
try:
child = _refershcontentpattern.search(refresh['content']).group(1)
- link.add_child(urlparse.urljoin(base, child))
except AttributeError:
- # ignore cases where refresh header parsing causes problems
- pass
+ pass # ignore cases where refresh header parsing causes problems
+ else:
+ link.add_child(urlparse.urljoin(base, child))
# <img src="URL">
for img in soup.findAll('img', src=True):
embed = myurllib.normalizeurl(htmlunescape(img['src']).strip())
@@ -180,4 +180,4 @@
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# flag that the link contains a valid page
- link.ispage = True
+ link.is_page = True
Modified: webcheck/parsers/html/calltidy.py
==============================================================================
--- webcheck/parsers/html/calltidy.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/html/calltidy.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# calltidy.py - parser functions for html content
#
-# Copyright (C) 2008 Arthur de Jong
+# Copyright (C) 2008, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,7 +28,7 @@
"""Parse the specified content with tidy and add any errors to the
link."""
# only call tidy on internal pages
- if link.isinternal:
+ if link.is_internal:
t = tidy.parseString(content, **config.TIDY_OPTIONS)
for err in t.errors:
# error messages are escaped so we unescape them
Modified: webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/parsers/html/htmlparser.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/html/htmlparser.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# html.py - parser functions for html content
#
-# Copyright (C) 2005, 2006, 2007, 2009 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -269,15 +269,15 @@
parser.close()
except Exception, e:
# ignore (but log) all errors
- debugio.debug('parsers.html.htmlparser.parse(): caught exception: '+str(e))
+ debugio.debug('parsers.html.htmlparser.parse(): caught exception: ' + str(e))
# check for parser errors
if parser.errmsg is not None:
- debugio.debug('parsers.html.htmlparser.parse(): problem parsing html: '+parser.errmsg)
+ debugio.debug('parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg)
link.add_pageproblem('problem parsing html: %s' % parser.errmsg)
# dump encoding
debugio.debug('parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding))
# flag that the link contains a valid page
- link.ispage = True
+ link.is_page = True
# save the title
if parser.title is not None:
link.title = _maketxt(parser.title, link.encoding).strip()
Modified: webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/__init__.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2009 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -84,7 +84,7 @@
def _mk_unicode(txt):
"""Returns a unicode instance of the string."""
if not isinstance(txt, unicode):
- txt = unicode(txt, errors='replace')
+ txt = unicode(txt)
return txt
def get_info(link):
@@ -96,15 +96,15 @@
info += 'title: %s\n' % link.title.strip()
if link.author:
info += 'author: %s\n' % link.author.strip()
- if link.isinternal:
+ if link.is_internal:
info += 'internal link'
else:
info += 'external link'
- if link.isyanked:
- if isinstance(link.isyanked, unicode):
- info += ', not checked (%s)\n' % link.isyanked
- if isinstance(link.isyanked, str):
- info += ', not checked (%s)\n' % _mk_unicode(link.isyanked)
+ if link.yanked:
+ if isinstance(link.yanked, unicode):
+ info += ', not checked (%s)\n' % link.yanked
+ if isinstance(link.yanked, str):
+ info += ', not checked (%s)\n' % _mk_unicode(link.yanked)
else:
info += ', not checked\n'
else:
@@ -135,7 +135,7 @@
"""Return an <a>nchor to a url with title. If url is in the Linklist and
is external, insert "class=external" in the <a> tag."""
# try to fetch the link object for this url
- if link.isinternal:
+ if link.is_internal:
cssclass = 'internal'
else:
cssclass = 'external'
@@ -152,7 +152,7 @@
The output is indeted with the specified indent."""
parents = list(link.parents)
# if there are no parents print nothing
- if len(parents) == 0:
+ if not parents:
return
parents.sort(lambda a, b: cmp(a.title, b.title) or cmp(a.url, b.url))
fp.write(
@@ -160,7 +160,7 @@
indent+' referenced from:\n'+
indent+' <ul>\n' )
more = 0
- if len(parents) > config.PARENT_LISTLEN+1:
+ if len(parents) > config.PARENT_LISTLEN + 1:
more = len(parents) - config.PARENT_LISTLEN
parents = parents[:config.PARENT_LISTLEN]
for parent in parents:
Modified: webcheck/plugins/about.py
==============================================================================
--- webcheck/plugins/about.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/about.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,9 +28,11 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'about.html'
+import time
+
import config
import plugins
-import time
+
def generate(site):
"""Output a list of modules, it's authors and it's version to the
@@ -55,7 +57,7 @@
' </p>\n\n'
% { 'version': plugins.htmlescape(config.VERSION),
'time': plugins.htmlescape(time.ctime(time.time())),
- 'numurls': len(site.linkMap),
+ 'numurls': site.links.count(),
'homepage': config.HOMEPAGE } )
# output copyright information
fp.write(
@@ -74,7 +76,7 @@
' particular purpose. See the source for further details.\n'
' </p>\n'
' <p>\n'
- ' Copyright © 1998, 1999, 2002, 2005, 2006, 2007 Albert Hopkins (marduk),\n'
+ ' Copyright © 1998-2011 Albert Hopkins (marduk),\n'
' Mike W. Meyer and Arthur de Jong\n'
' </p>\n'
' <p>\n'
@@ -89,7 +91,7 @@
' notices (see <tt>fancytooltips.js</tt> for details):\n'
' </p>\n'
' <p>\n'
- ' Copyright © 2003, 2005 Stuart Langridge, Paul McLanahan,\n'
+ ' Copyright © 2003-2005 Stuart Langridge, Paul McLanahan,\n'
' Peter Janes, Brad Choate, Dunstan Orchard, Ethan Marcotte,\n'
' Mark Wubben and Victor Kulinski\n'
' </p>\n\n' )
@@ -98,7 +100,7 @@
' <h3>Plugins</h3>\n'
' <ul>\n')
for plugin in config.PLUGINS:
- report = __import__('plugins.'+plugin, globals(), locals(), [plugin])
+ report = __import__('plugins.' + plugin, globals(), locals(), [plugin])
fp.write(
' <li>\n'
' <strong>%s</strong><br />\n'
Modified: webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/plugins/badlinks.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/badlinks.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,16 +28,14 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'badlinks.html'
+import db
import plugins
+
def generate(site):
"""Present the list of bad links to the given file descriptor."""
# find all links with link problems
- links = [ x
- for x in site.linkMap.values()
- if len(x.linkproblems)>0 ]
- # sort list
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter(db.Link.linkproblems.any()).order_by('url')
# present results
fp = plugins.open_html(plugins.badlinks, site)
if not links:
@@ -71,7 +69,7 @@
# add a reference to the problem map
for problem in link.linkproblems:
for parent in link.parents:
- parent.add_pageproblem('bad link: ' + link.url + ': ' + problem)
+ parent.add_pageproblem('bad link: %s: %s' % (link.url, problem))
fp.write(
' </li>\n')
fp.write(
Modified: webcheck/plugins/external.py
==============================================================================
--- webcheck/plugins/external.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/external.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2009 Arthur de Jong
+# Copyright (C) 2005, 2006, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,17 +28,14 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'external.html'
+import db
import plugins
+
def generate(site):
"""Generate the list of external links to the given file descriptor."""
# get all external links
- links = [ x
- for x in site.linkMap.values()
- if not x.isinternal ]
- # sort list
- # FIXME: use sort(key=....) (adds dependency on python>=2.4)
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter(db.Link.is_internal != True).order_by('url')
# present results
fp = plugins.open_html(plugins.external, site)
if not links:
Modified: webcheck/plugins/images.py
==============================================================================
--- webcheck/plugins/images.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/images.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,21 +28,19 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'images.html'
-import plugins
import re
+from sqlalchemy.sql.expression import or_
+
+import db
+import plugins
+
def generate(site):
"""Output a list of images to the given file descriptor."""
- # this finds all links with a reasonable image-like content-type
- matcher = re.compile('^image/.*$')
# get non-page images that have an image/* mimetype
- links = [ x
- for x in site.linkMap.values()
- if not x.ispage and
- x.mimetype is not None and
- matcher.search(x.mimetype) ]
- # sort list
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter(or_(db.Link.is_page != True, db.Link.is_page == None))
+ links = links.filter(db.Link.mimetype.startswith('image/'))
+ links = links.order_by('url')
# present results
fp = plugins.open_html(plugins.images, site)
if not links:
Modified: webcheck/plugins/new.py
==============================================================================
--- webcheck/plugins/new.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/new.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,25 +28,22 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'new.html'
+import time
+
import config
+import db
import plugins
-import time
-SECS_PER_DAY = 60*60*24
+
+SECS_PER_DAY = 60 * 60 * 24
def generate(site):
"""Output the list of recently modified pages to the specified file
descriptor."""
# the time for which links are considered new
- newtime = time.time()-SECS_PER_DAY*config.REPORT_WHATSNEW_URL_AGE
+ newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE
# get all internal pages that are new
- links = [ x
- for x in site.linkMap.values()
- if x.ispage and
- x.isinternal and
- x.mtime is not None and
- x.mtime > newtime ]
- # sort links
- links.sort(lambda a, b: cmp(b.mtime, a.mtime))
+ links = site.links.filter_by(is_page=True, is_internal=True)
+ links = links.filter(db.Link.mtime > newtime).order_by('-mtime')
# present results
fp = plugins.open_html(plugins.new, site)
if not links:
Modified: webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/plugins/notchkd.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/notchkd.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,15 +28,14 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'notchkd.html'
+import db
import plugins
+
def generate(site):
"""Output the list of not checked pages to the given file descriptor."""
# get all yanked urls
- links = [ x
- for x in site.linkMap.values()
- if x.isyanked ]
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter(db.Link.yanked != None).order_by('url')
# present results
fp = plugins.open_html(plugins.notchkd, site)
if not links:
Modified: webcheck/plugins/notitles.py
==============================================================================
--- webcheck/plugins/notitles.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/notitles.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,17 +28,19 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'notitles.html'
+from sqlalchemy.sql.functions import char_length
+from sqlalchemy.sql.expression import or_
+
+import db
import plugins
+
def generate(site):
"""Output the list of pages without a title to the given file
descriptor."""
# get all internal pages without a title
- links = [ x
- for x in site.linkMap.values()
- if x.ispage and
- x.isinternal and
- (x.title is None or x.title == '') ]
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter_by(is_page=True, is_internal=True)
+ links = links.filter(or_(char_length(db.Link.title) == 0,
+ db.Link.title ==None)).order_by('url')
# present results
fp = plugins.open_html(plugins.notitles, site)
if not links:
Modified: webcheck/plugins/old.py
==============================================================================
--- webcheck/plugins/old.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/old.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,25 +28,22 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'old.html'
+import time
+
import config
+import db
import plugins
-import time
-SECS_PER_DAY = 60*60*24
+
+SECS_PER_DAY = 60 * 60 * 24
def generate(site):
"""Output the list of outdated pages to the specified file descriptor."""
# the time for which links are considered old
- oldtime = time.time()-SECS_PER_DAY*config.REPORT_WHATSOLD_URL_AGE
+ oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
# get all internal pages that are old
- links = [ x
- for x in site.linkMap.values()
- if x.ispage and
- x.isinternal and
- x.mtime is not None and
- x.mtime < oldtime ]
- # sort links
- links.sort(lambda a, b: cmp(a.mtime, b.mtime))
+ links = site.links.filter_by(is_page=True, is_internal=True)
+ links = links.filter(db.Link.mtime < oldtime).order_by('mtime').all()
# present results
fp = plugins.open_html(plugins.old, site)
if not links:
@@ -65,7 +62,7 @@
' <ul>\n'
% {'old': config.REPORT_WHATSOLD_URL_AGE })
for link in links:
- age = (time.time()-link.mtime)/SECS_PER_DAY
+ age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
' <li>\n'
' %(link)s\n'
Modified: webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/problems.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,9 +28,12 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'problems.html'
-import plugins
import urllib
+import db
+import plugins
+
+
def _mk_id(name):
"""Convert the name to a string that may be used inside an
ID attribute."""
@@ -48,10 +51,10 @@
"""Output the overview of problems to the given file descriptor."""
# make a list of problems per author
problem_db = {}
- for link in site.linkMap.values():
- # skip external pages
- if not link.isinternal or len(link.pageproblems) == 0:
- continue
+ # get internal links with page problems
+ links = site.links.filter_by(is_internal=True)
+ links = links.filter(db.Link.pageproblems.any()).order_by('url')
+ for link in links:
# make a normal name for the author
if link.author:
author = link.author.strip()
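
The pageproblems.any() filter above relies on a one-to-many relationship and is
rendered as an SQL EXISTS subquery, replacing the old in-memory test on
len(link.pageproblems). A hedged sketch with hypothetical table and column
names (webcheck's real mapping in db.py is not shown in this mail):

  from sqlalchemy import (Boolean, Column, ForeignKey, Integer, String,
                          create_engine)
  from sqlalchemy.ext.declarative import declarative_base
  from sqlalchemy.orm import relationship, sessionmaker

  Base = declarative_base()

  class Link(Base):
      __tablename__ = 'links'            # names are assumptions
      id = Column(Integer, primary_key=True)
      url = Column(String)
      author = Column(String)
      is_internal = Column(Boolean)
      pageproblems = relationship('PageProblem', backref='link')

  class PageProblem(Base):
      __tablename__ = 'pageproblems'
      id = Column(Integer, primary_key=True)
      link_id = Column(Integer, ForeignKey('links.id'))
      message = Column(String)

  engine = create_engine('sqlite:///:memory:')
  Base.metadata.create_all(engine)
  session = sessionmaker(bind=engine)()

  # only internal links with at least one related problem row are returned
  problem_links = (session.query(Link)
                   .filter_by(is_internal=True)
                   .filter(Link.pageproblems.any())
                   .order_by(Link.url))
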
Modified: webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/plugins/sitemap.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/sitemap.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,42 +28,60 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'index.html'
+from sqlalchemy.sql.expression import or_
+from sqlalchemy.orm.session import object_session
+
import config
+import db
import plugins
-# this is a workaround for Python 2.3
-try:
- set
-except NameError:
- from sets import Set as set
+
+def add_pagechildren(link, children, explored):
+ """Determine the page children of this link, combining the children of
+ embedded items and following redirects."""
+ links = object_session(link).query(db.Link)
+ # get all internal children
+ qry = links.filter(db.Link.linked_from.contains(link))
+ qry = qry.filter(db.Link.is_internal == True)
+ if link.depth:
+        qry = qry.filter(or_(db.Link.depth > link.depth, db.Link.depth == None))
+ #qry = qry.filter(~db.Link.id.in_(explored))
+ # follow redirects
+    children.update(y
+                    for y in (x.follow_link() for x in qry)
+                    if y and y.is_page and y.is_internal and y.id not in explored)
+ explored.update(x.id for x in children)
+ # add embedded element's pagechildren (think frames)
+ for embed in link.embedded:
+ # TODO: put this in a query
+ if embed.is_internal and embed.is_page and \
+ embed.id not in explored and \
+ (embed.depth == None or embed.depth > link.depth):
+ add_pagechildren(embed, children, explored)
def _explore(fp, link, explored, depth=0, indent=' '):
"""Recursively do a breadth first traversal of the graph of links on the
site. Prints the html results to the file descriptor."""
# output this link
- fp.write(indent+'<li>\n')
- fp.write(indent+' '+plugins.make_link(link)+'\n')
+ fp.write(indent + '<li>\n')
+ fp.write(indent + ' ' + plugins.make_link(link) + '\n')
# only check children if we are not too deep yet
if depth <= config.REPORT_SITEMAP_LEVEL:
# figure out the links to follow and ensure that they are only
# explored from here
- children = []
- for child in link.pagechildren:
- # skip pages that have the wrong depth, are not internal or have
- # already been visited
-        if child.depth != depth+1 or not child.isinternal or child in explored:
- continue
- # set child as explored and add to to explore list
- explored.add(child)
- children.append(child)
- # go over the children and present them as a list
- if len(children) > 0:
- fp.write(indent+' <ul>\n')
+ children = set()
+ add_pagechildren(link, children, explored)
+ # remove None which could be there as a result of follow_link()
+ children.discard(None)
+ if children:
+ children = list(children)
+ # present children as a list
+ fp.write(indent + ' <ul>\n')
children.sort(lambda a, b: cmp(a.url, b.url))
for child in children:
- _explore(fp, child, explored, depth+1, indent+' ')
- fp.write(indent+' </ul>\n')
- fp.write(indent+'</li>\n')
+ _explore(fp, child, explored, depth + 1, indent + ' ')
+ fp.write(indent + ' </ul>\n')
+ fp.write(indent + '</li>\n')
def generate(site):
"""Output the sitemap to the specified file descriptor."""
@@ -74,7 +92,7 @@
      '   This is an overview of the crawled site.\n'
' </p>\n'
' <ul>\n' )
- explored = set(site.bases)
+ explored = set(x.id for x in site.bases)
for l in site.bases:
_explore(fp, l, explored)
fp.write(
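
add_pagechildren() above starts new queries from a mapped Link instance via
object_session(), which recovers the Session that loaded the object, so the
helper does not need the session passed in explicitly. A small sketch of that
pattern, assuming a trivial stand-in model rather than webcheck's real one:

  from sqlalchemy import Column, Integer, String, create_engine
  from sqlalchemy.ext.declarative import declarative_base
  from sqlalchemy.orm import sessionmaker
  from sqlalchemy.orm.session import object_session

  Base = declarative_base()

  class Link(Base):
      __tablename__ = 'links'            # hypothetical minimal model
      id = Column(Integer, primary_key=True)
      url = Column(String)

  engine = create_engine('sqlite:///:memory:')
  Base.metadata.create_all(engine)
  session = sessionmaker(bind=engine)()
  session.add(Link(url='http://example.com/'))
  session.commit()

  link = session.query(Link).first()
  # object_session() returns the Session the instance belongs to
  assert object_session(link) is session
  children = (object_session(link).query(Link)
              .filter(Link.url.like('http://example.com/%')))
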
Modified: webcheck/plugins/size.py
==============================================================================
--- webcheck/plugins/size.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/size.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -31,6 +31,7 @@
import config
import plugins
+
def _getsize(link, done=None):
"""Return the size of the link and all its embedded links, counting each
link only once."""
@@ -40,7 +41,7 @@
# add this link to the list
done.append(link)
# if we don't known about our total size yet, calculate
- if not hasattr(link, 'totalSize'):
+ if not hasattr(link, 'total_size'):
size = 0
# add our size
if link.size is not None:
@@ -49,19 +50,17 @@
for embed in link.embedded:
if embed not in done:
size += _getsize(embed, done)
- link.totalSize = size
- return link.totalSize
+ link.total_size = size
+ return link.total_size
def generate(site):
"""Output the list of large pages to the given file descriptor."""
# get all internal pages and get big links
- links = [ x
- for x in site.linkMap.values()
- if x.ispage and
- x.isinternal and
- _getsize(x) >= config.REPORT_SLOW_URL_SIZE*1024 ]
+ links = site.links.filter_by(is_page=True, is_internal=True)
+ links = [ x for x in links
+ if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024 ]
# sort links by size (biggest first)
- links.sort(lambda a, b: cmp(b.totalSize, a.totalSize))
+ links.sort(lambda a, b: cmp(b.total_size, a.total_size))
# present results
fp = plugins.open_html(plugins.size, site)
if not links:
@@ -80,7 +79,7 @@
' <ul>\n'
% { 'size': config.REPORT_SLOW_URL_SIZE })
for link in links:
- size = plugins.get_size(link.totalSize)
+ size = plugins.get_size(link.total_size)
fp.write(
' <li>\n'
' %(link)s\n'
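
_getsize() above memoizes the computed total as an extra total_size attribute
on the link object; the attribute is not a mapped column, so it only lives for
the duration of the report run. A stand-alone illustration of that memoization
pattern with plain objects (no ORM involved):

  class Node(object):
      # stand-in for a link with a size and embedded children
      def __init__(self, size, embedded=None):
          self.size = size
          self.embedded = embedded or []

  def total_size(node, done=None):
      done = done if done is not None else []
      done.append(node)
      # cache the result on the instance so repeated lookups do not
      # walk the embedded objects again
      if not hasattr(node, '_total_size'):
          total = node.size or 0
          for embed in node.embedded:
              if embed not in done:
                  total += total_size(embed, done)
          node._total_size = total
      return node._total_size

  image = Node(2048)
  page = Node(1024, [image, image])
  assert total_size(page) == 3072
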
Modified: webcheck/plugins/urllist.py
==============================================================================
--- webcheck/plugins/urllist.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/urllist.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# urllist.py - plugin to generate a list of visited urls
#
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,6 +28,7 @@
import plugins
+
def generate(site):
"""Output a sorted list of urls to the specified file descriptor."""
fp = plugins.open_html(plugins.urllist, site)
@@ -38,10 +39,9 @@
' non-examined urls.\n'
' </p>\n'
' <ol>\n' )
- urls = site.linkMap.keys()
- urls.sort()
- for url in urls:
-        fp.write('   <li>'+plugins.make_link(site.linkMap[url], url)+'</li>\n')
+ links = site.links.order_by('url')
+ for link in links:
+ fp.write(' <li>' + plugins.make_link(link, link.url) + '</li>\n')
fp.write(
' </ol>\n' )
plugins.close_html(fp)
Modified: webcheck/webcheck.py
==============================================================================
--- webcheck/webcheck.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/webcheck.py Thu Aug 4 21:46:26 2011 (r421)
@@ -42,8 +42,8 @@
import crawler
import plugins
import debugio
-import serialize
import monkeypatch
+import db
debugio.loglevel = debugio.INFO
@@ -108,13 +108,16 @@
'ignore-robots',
'quiet', 'silent', 'debug', 'profile', 'output=', 'continue',
'force', 'redirects=', 'userpass=', 'wait=', 'version', 'help'))
+ internal_urls = []
+ external_urls = []
+ yank_urls = []
for flag, arg in optlist:
if flag in ('-i', '--internal'):
- site.add_internal_re(arg)
+ internal_urls.append(arg)
elif flag in ('-x', '--external'):
- site.add_external_re(arg)
+ external_urls.append(arg)
elif flag in ('-y', '--yank'):
- site.add_yanked_re(arg)
+ yank_urls.append(arg)
elif flag in ('-b', '--base-only'):
config.BASE_URLS_ONLY = True
elif flag in ('-a', '--avoid-external'):
@@ -152,6 +155,24 @@
print_usage()
print_tryhelp()
sys.exit(1)
+ # ensure output directory exists
+ if not os.path.isdir(config.OUTPUT_DIR):
+ os.mkdir(config.OUTPUT_DIR)
+ # set up database connection
+ filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
+ from sqlalchemy import create_engine
+ engine = create_engine('sqlite:///' + filename)
+ db.Session.configure(bind=engine)
+ # ensure that all tables are created
+ db.Base.metadata.create_all(engine)
+    # TODO: schema migration goes here
+ # add configuration to site
+ for pattern in internal_urls:
+ site.add_internal_re(pattern)
+ for pattern in external_urls:
+ site.add_external_re(pattern)
+ for pattern in yank_urls:
+ site.add_yanked_re(pattern)
for arg in args:
# if it does not look like a url it is probably a local file
if urlparse.urlsplit(arg)[0] == '':
@@ -218,33 +239,10 @@
def main(site):
"""Main program."""
- # read serialized file
- if config.CONTINUE:
- fname = os.path.join(config.OUTPUT_DIR, 'webcheck.dat')
- debugio.info('reading stored crawler data....')
- try:
- fp = open(fname, 'r')
- site = serialize.deserialize(fp)
- fp.close()
- except IOError, (errno, strerror):
- debugio.error('%(fname)s: %(strerror)s' %
- { 'fname': fname,
- 'strerror': strerror })
- sys.exit(1)
- debugio.info('done.')
- # create seriazlized file
- fp = plugins.open_file('webcheck.dat', makebackup=True)
- serialize.serialize_site(fp, site)
# crawl through the website
debugio.info('checking site....')
- site.crawl(fp) # this will take a while
+ site.crawl() # this will take a while
debugio.info('done.')
- fp.close()
- # serialize the final state again
- fp = plugins.open_file('webcheck.dat', makebackup=True)
- serialize.serialize_site(fp, site)
- serialize.serialize_links(fp, site)
- fp.close()
# do postprocessing (building site structure, etc)
debugio.info('postprocessing....')
site.postprocess()
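
The startup code above binds db.Session to a SQLite engine at run time and
creates the schema with Base.metadata.create_all(). The new db.py itself is not
included in this mail; the calls suggest roughly the following shape, which
should be read as an assumption rather than the actual module:

  # presumed scaffolding of db.py (hypothetical)
  from sqlalchemy import create_engine
  from sqlalchemy.ext.declarative import declarative_base
  from sqlalchemy.orm import scoped_session, sessionmaker

  Base = declarative_base()
  Session = scoped_session(sessionmaker())   # left unbound at import time

  # ... mapped classes such as Link would derive from Base here ...

  if __name__ == '__main__':
      # what webcheck.py does once the output directory is known
      engine = create_engine('sqlite:///webcheck.sqlite')
      Session.configure(bind=engine)        # bind the factory to the engine
      Base.metadata.create_all(engine)      # create any missing tables
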
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits