lists.arthurdejong.org
RSS feed

webcheck commit: r431 - in webcheck: . plugins

[Date Prev][Date Next] [Thread Prev][Thread Next]

webcheck commit: r431 - in webcheck: . plugins



Author: arthur
Date: Sat Aug 20 15:06:00 2011
New Revision: 431
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=431

Log:
make plugins get their own session and split postprocessing and report
generation

Modified:
   webcheck/crawler.py
   webcheck/plugins/__init__.py
   webcheck/plugins/about.py
   webcheck/plugins/anchors.py
   webcheck/plugins/badlinks.py
   webcheck/plugins/external.py
   webcheck/plugins/images.py
   webcheck/plugins/new.py
   webcheck/plugins/notchkd.py
   webcheck/plugins/notitles.py
   webcheck/plugins/old.py
   webcheck/plugins/problems.py
   webcheck/plugins/sitemap.py
   webcheck/plugins/size.py
   webcheck/plugins/urllist.py
   webcheck/webcheck.py

Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/crawler.py Sat Aug 20 15:06:00 2011        (r431)
@@ -396,5 +396,19 @@
             depth += 1
             debugio.debug('crawler.postprocess(): %d links at depth %d' % 
(count, depth))
             # TODO: also handle embeds
-        # make the list of links (and session) available to the plugins
-        self.links = session.query(db.Link)
+        # see if any of the plugins want to do postprocessing
+        for p in config.PLUGINS:
+            # import the plugin
+            plugin = __import__('plugins.' + p, globals(), locals(), [p])
+            if hasattr(plugin, 'postprocess'):
+                debugio.info('  ' + p)
+                plugin.postprocess(self)
+
+    def generate(self):
+        """Generate pages for plugins."""
+        for p in config.PLUGINS:
+            # import the plugin
+            plugin = __import__('plugins.' + p, globals(), locals(), [p])
+            if hasattr(plugin, 'generate'):
+                debugio.info('  ' + p)
+                plugin.generate(self)

Modified: webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py        Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/__init__.py        Sat Aug 20 15:06:00 2011        (r431)
@@ -281,14 +281,3 @@
          'homepage': config.HOMEPAGE,
          'version':  htmlescape(config.VERSION)})
     fp.close()
-
-
-def generate(site):
-    """Generate pages for plugins."""
-    for p in config.PLUGINS:
-        debugio.info('  ' + p)
-        # import the plugin
-        plugin = __import__('plugins.' + p, globals(), locals(), [p])
-        # run the plugin
-        plugin.generate(site)
-        object_session(site.links[0]).commit()

Modified: webcheck/plugins/about.py
==============================================================================
--- webcheck/plugins/about.py   Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/about.py   Sat Aug 20 15:06:00 2011        (r431)
@@ -31,13 +31,14 @@
 import time
 
 import config
+import db
 import plugins
 
 
 def generate(site):
-    """Output a list of modules, it's authors and it's version to the
-    file descriptor."""
+    """Output a list of modules, its authors and the webcheck version."""
     fp = plugins.open_html(plugins.about, site)
+    session = db.Session()
     # TODO: xxx links were fetched, xxx pages were examined and a total of xxx 
notes and problems were found
     # TODO: include some runtime information (e.g. supported schemes, user 
configuration, etc)
     # output some general information about the report
@@ -57,7 +58,7 @@
       '   </p>\n\n'
       % {'version':  plugins.htmlescape(config.VERSION),
          'time':     plugins.htmlescape(time.ctime(time.time())),
-         'numurls':  site.links.count(),
+         'numurls':  session.query(db.Link).count(),
          'homepage': config.HOMEPAGE})
     # output copyright information
     fp.write(

Modified: webcheck/plugins/anchors.py
==============================================================================
--- webcheck/plugins/anchors.py Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/anchors.py Sat Aug 20 15:06:00 2011        (r431)
@@ -27,16 +27,17 @@
 __title__ = 'missing anchors'
 __author__ = 'Arthur de Jong'
 
-from sqlalchemy.orm.session import object_session
-
 import db
 
 
-def generate(site):
-    """Present the list of bad links to the given file descriptor."""
+def postprocess(site):
+    """Add all missing anchors as page problems to the referring page."""
+    session = db.Session()
     # find all fetched links with requested anchors
-    links = site.links.filter(db.Link.reqanchors.any()).filter(db.Link.fetched 
!= None)
+    links = session.query(db.Link).filter(db.Link.reqanchors.any())
+    links = links.filter(db.Link.fetched != None)
     # go over list and find missing anchors
+    # TODO: we can probably make a nicer query for this
     for link in links:
         # check that all requested anchors exist
         for anchor in link.reqanchors:
@@ -46,4 +47,5 @@
                   u'bad link: %(url)s#%(anchor)s: unknown anchor'
                   % {'url': link.url,
                      'anchor': anchor})
-    # FIXME: commit changes in session
+    # commit changes in session
+    session.commit()

Modified: webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/plugins/badlinks.py        Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/badlinks.py        Sat Aug 20 15:06:00 2011        (r431)
@@ -34,10 +34,25 @@
 import plugins
 
 
+def postprocess(site):
+    """Add all bad links as pageproblems on pages where they are linked."""
+    session = db.Session()
+    # find all links with link problems
+    links = 
session.query(db.Link).filter(db.Link.linkproblems.any()).options(joinedload(db.Link.linkproblems))
+    # TODO: probably make it a nicer query over all linkproblems
+    for link in links:
+        # add a reference to the problem map
+        for problem in link.linkproblems:
+            for parent in link.parents:
+                parent.add_pageproblem('bad link: %s: %s' % (link.url, 
problem))
+    session.commit()
+
+
 def generate(site):
-    """Present the list of bad links to the given file descriptor."""
+    """Present the list of bad links."""
+    session = db.Session()
     # find all links with link problems
-    links = 
site.links.filter(db.Link.linkproblems.any()).order_by(db.Link.url).options(joinedload(db.Link.linkproblems))
+    links = 
session.query(db.Link).filter(db.Link.linkproblems.any()).order_by(db.Link.url).options(joinedload(db.Link.linkproblems))
     # present results
     fp = plugins.open_html(plugins.badlinks, site)
     if not links:
@@ -68,10 +83,6 @@
           '     </ul>\n')
         # present a list of parents
         plugins.print_parents(fp, link, '     ')
-        # add a reference to the problem map
-        for problem in link.linkproblems:
-            for parent in link.parents:
-                parent.add_pageproblem('bad link: %s: %s' % (link.url, 
problem))
         fp.write(
           '    </li>\n')
     fp.write(

Modified: webcheck/plugins/external.py
==============================================================================
--- webcheck/plugins/external.py        Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/external.py        Sat Aug 20 15:06:00 2011        (r431)
@@ -35,9 +35,10 @@
 
 
 def generate(site):
-    """Generate the list of external links to the given file descriptor."""
+    """Generate the list of external links."""
+    session = db.Session()
     # get all external links
-    links = site.links.filter(db.Link.is_internal != 
True).order_by(db.Link.url)
+    links = session.query(db.Link).filter(db.Link.is_internal != 
True).order_by(db.Link.url)
     # present results
     fp = plugins.open_html(plugins.external, site)
     if not links:

Modified: webcheck/plugins/images.py
==============================================================================
--- webcheck/plugins/images.py  Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/images.py  Sat Aug 20 15:06:00 2011        (r431)
@@ -35,9 +35,11 @@
 
 
 def generate(site):
-    """Output a list of images to the given file descriptor."""
-    # get non-page images that have an image/* mimetype
-    links = site.links.filter((db.Link.is_page != True) | (db.Link.is_page == 
None))
+    """Generate a list of image URLs that were found."""
+    session = db.Session()
+    # get non-page links that have an image/* mimetype
+    links = session.query(db.Link)
+    links = links.filter((db.Link.is_page != True) | (db.Link.is_page == None))
     links = links.filter(db.Link.mimetype.startswith('image/'))
     links = links.order_by(db.Link.url)
     # present results

Modified: webcheck/plugins/new.py
==============================================================================
--- webcheck/plugins/new.py     Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/new.py     Sat Aug 20 15:06:00 2011        (r431)
@@ -39,11 +39,12 @@
 
 
 def generate(site):
-    """Output the list of recently modified pages to the specified file 
descriptor."""
+    """Output the list of recently modified pages."""
+    session = db.Session()
     # the time for which links are considered new
     newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE
     # get all internal pages that are new
-    links = site.links.filter_by(is_page=True, is_internal=True)
+    links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
     links = links.filter(db.Link.mtime > 
newtime).order_by(db.Link.mtime.desc())
     # present results
     fp = plugins.open_html(plugins.new, site)

Modified: webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/plugins/notchkd.py Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/notchkd.py Sat Aug 20 15:06:00 2011        (r431)
@@ -35,9 +35,10 @@
 
 
 def generate(site):
-    """Output the list of not checked pages to the given file descriptor."""
+    """Output the list of not checked pages."""
+    session = db.Session()
     # get all yanked urls
-    links = site.links.filter(db.Link.yanked != None).order_by(db.Link.url)
+    links = session.query(db.Link).filter(db.Link.yanked != 
None).order_by(db.Link.url)
     # present results
     fp = plugins.open_html(plugins.notchkd, site)
     if not links.count():

Modified: webcheck/plugins/notitles.py
==============================================================================
--- webcheck/plugins/notitles.py        Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/notitles.py        Sat Aug 20 15:06:00 2011        (r431)
@@ -29,21 +29,33 @@
 __outputfile__ = 'notitles.html'
 
 from sqlalchemy.sql.functions import char_length
-from sqlalchemy.sql.expression import or_
 
 import db
 import plugins
 
 
+def postprocess(site):
+    """Add page problems for all pages without a title."""
+    session = db.Session()
+    # get all internal pages without a title
+    links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
+    links = links.filter((char_length(db.Link.title) == 0) |
+                         (db.Link.title == None))
+    for link in links:
+        link.add_pageproblem('missing title')
+    session.commit()
+
+
 def generate(site):
-    """Output the list of pages without a title to the given file 
descriptor."""
+    """Output the list of pages without a title."""
+    session = db.Session()
     # get all internal pages without a title
-    links = site.links.filter_by(is_page=True, is_internal=True)
-    links = links.filter(or_(char_length(db.Link.title) == 0,
-                             db.Link.title == None)).order_by(db.Link.url)
+    links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
+    links = links.filter((char_length(db.Link.title) == 0) |
+                         (db.Link.title == None)).order_by(db.Link.url)
     # present results
     fp = plugins.open_html(plugins.notitles, site)
-    if not links:
+    if not links.count():
         fp.write(
           '   <p class="description">\n'
           '    All pages had a title specified.\n'
@@ -60,7 +72,6 @@
         fp.write(
           '    <li>%(link)s</li>\n'
           % {'link': plugins.make_link(link, link.url)})
-        link.add_pageproblem('missing title')
     fp.write(
       '   </ol>\n')
     plugins.close_html(fp)

Modified: webcheck/plugins/old.py
==============================================================================
--- webcheck/plugins/old.py     Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/old.py     Sat Aug 20 15:06:00 2011        (r431)
@@ -40,10 +40,11 @@
 
 def generate(site):
     """Output the list of outdated pages to the specified file descriptor."""
+    session = db.Session()
     # the time for which links are considered old
     oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
     # get all internal pages that are old
-    links = site.links.filter_by(is_page=True, is_internal=True)
+    links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
     links = links.filter(db.Link.mtime < oldtime).order_by(db.Link.mtime)
     # present results
     fp = plugins.open_html(plugins.old, site)
@@ -73,8 +74,6 @@
           '    </li>\n'
           % {'link': plugins.make_link(link),
              'age':  age})
-        # add link to problem database
-        link.add_pageproblem('this page is %d days old' % age)
     fp.write(
       '   </ul>\n')
     plugins.close_html(fp)

Modified: webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py        Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/problems.py        Sat Aug 20 15:06:00 2011        (r431)
@@ -49,11 +49,12 @@
 
 
 def generate(site):
-    """Output the overview of problems to the given file descriptor."""
+    """Output the overview of problems per author."""
+    session = db.Session()
     # make a list of problems per author
     problem_db = {}
     # get internal links with page problems
-    links = site.links.filter_by(is_internal=True)
+    links = session.query(db.Link).filter_by(is_internal=True)
     links = links.filter(db.Link.pageproblems.any()).order_by(db.Link.url)
     for link in links:
         # make a normal name for the author

Modified: webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/plugins/sitemap.py Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/sitemap.py Sat Aug 20 15:06:00 2011        (r431)
@@ -79,7 +79,8 @@
 
 
 def generate(site):
-    """Output the sitemap to the specified file descriptor."""
+    """Output the sitemap."""
+    session = db.Session()
     fp = plugins.open_html(plugins.sitemap, site)
     # output the site structure using breadth first traversal
     fp.write(

Modified: webcheck/plugins/size.py
==============================================================================
--- webcheck/plugins/size.py    Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/size.py    Sat Aug 20 15:06:00 2011        (r431)
@@ -29,6 +29,7 @@
 __outputfile__ = 'size.html'
 
 import config
+import db
 import plugins
 
 
@@ -55,9 +56,10 @@
 
 
 def generate(site):
-    """Output the list of large pages to the given file descriptor."""
+    """Output the list of large pages."""
+    session = db.Session()
     # get all internal pages and get big links
-    links = site.links.filter_by(is_page=True, is_internal=True)
+    links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
     links = [x for x in links
              if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024]
     # sort links by size (biggest first)
@@ -90,9 +92,6 @@
           '    </li>\n'
           % {'link': plugins.make_link(link),
              'size': size})
-        link.add_pageproblem(
-          'this page and its components is %(size)s'
-          % {'size': size})
     fp.write(
       '   </ul>\n')
     plugins.close_html(fp)

Modified: webcheck/plugins/urllist.py
==============================================================================
--- webcheck/plugins/urllist.py Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/plugins/urllist.py Sat Aug 20 15:06:00 2011        (r431)
@@ -31,7 +31,8 @@
 
 
 def generate(site):
-    """Output a sorted list of urls to the specified file descriptor."""
+    """Output a sorted list of URLs."""
+    session = db.Session()
     fp = plugins.open_html(plugins.urllist, site)
     fp.write(
       '   <p class="description">\n'
@@ -40,7 +41,7 @@
       '    non-examined urls.\n'
       '   </p>\n'
       '   <ol>\n')
-    links = site.links.order_by(db.Link.url)
+    links = session.query(db.Link).order_by(db.Link.url)
     for link in links:
         fp.write('    <li>' + plugins.make_link(link, link.url) + '</li>\n')
     fp.write(

Modified: webcheck/webcheck.py
==============================================================================
--- webcheck/webcheck.py        Fri Aug 19 21:44:51 2011        (r430)
+++ webcheck/webcheck.py        Sat Aug 20 15:06:00 2011        (r431)
@@ -258,7 +258,7 @@
     # start with the frame-description page
     debugio.info('generating reports...')
     # for every plugin, generate a page
-    plugins.generate(site)
+    site.generate()
     # put extra files in the output directory
     install_file('webcheck.css', True)
     install_file('fancytooltips/fancytooltips.js', True)
-- 
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits