webcheck commit: r431 - in webcheck: . plugins
[
Date Prev][
Date Next]
[
Thread Prev][
Thread Next]
webcheck commit: r431 - in webcheck: . plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r431 - in webcheck: . plugins
- Date: Sat, 20 Aug 2011 15:06:02 +0200 (CEST)
Author: arthur
Date: Sat Aug 20 15:06:00 2011
New Revision: 431
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=431
Log:
make plugins get their own session and split postprocessing and report
generation
Modified:
webcheck/crawler.py
webcheck/plugins/__init__.py
webcheck/plugins/about.py
webcheck/plugins/anchors.py
webcheck/plugins/badlinks.py
webcheck/plugins/external.py
webcheck/plugins/images.py
webcheck/plugins/new.py
webcheck/plugins/notchkd.py
webcheck/plugins/notitles.py
webcheck/plugins/old.py
webcheck/plugins/problems.py
webcheck/plugins/sitemap.py
webcheck/plugins/size.py
webcheck/plugins/urllist.py
webcheck/webcheck.py
Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/crawler.py Sat Aug 20 15:06:00 2011 (r431)
@@ -396,5 +396,19 @@
depth += 1
debugio.debug('crawler.postprocess(): %d links at depth %d' %
(count, depth))
# TODO: also handle embeds
- # make the list of links (and session) available to the plugins
- self.links = session.query(db.Link)
+ # see if any of the plugins want to do postprocessing
+ for p in config.PLUGINS:
+ # import the plugin
+ plugin = __import__('plugins.' + p, globals(), locals(), [p])
+ if hasattr(plugin, 'postprocess'):
+ debugio.info(' ' + p)
+ plugin.postprocess(self)
+
+ def generate(self):
+ """Generate pages for plugins."""
+ for p in config.PLUGINS:
+ # import the plugin
+ plugin = __import__('plugins.' + p, globals(), locals(), [p])
+ if hasattr(plugin, 'generate'):
+ debugio.info(' ' + p)
+ plugin.generate(self)
Modified: webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/__init__.py Sat Aug 20 15:06:00 2011 (r431)
@@ -281,14 +281,3 @@
'homepage': config.HOMEPAGE,
'version': htmlescape(config.VERSION)})
fp.close()
-
-
-def generate(site):
- """Generate pages for plugins."""
- for p in config.PLUGINS:
- debugio.info(' ' + p)
- # import the plugin
- plugin = __import__('plugins.' + p, globals(), locals(), [p])
- # run the plugin
- plugin.generate(site)
- object_session(site.links[0]).commit()
Modified: webcheck/plugins/about.py
==============================================================================
--- webcheck/plugins/about.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/about.py Sat Aug 20 15:06:00 2011 (r431)
@@ -31,13 +31,14 @@
import time
import config
+import db
import plugins
def generate(site):
- """Output a list of modules, it's authors and it's version to the
- file descriptor."""
+ """Output a list of modules, it's authors and the webcheck version."""
fp = plugins.open_html(plugins.about, site)
+ session = db.Session()
# TODO: xxx links were fetched, xxx pages were examined and a total of xxx
notes and problems were found
# TODO: include some runtime information (e.g. supported schemes, user
configuration, etc)
# output some general information about the report
@@ -57,7 +58,7 @@
' </p>\n\n'
% {'version': plugins.htmlescape(config.VERSION),
'time': plugins.htmlescape(time.ctime(time.time())),
- 'numurls': site.links.count(),
+ 'numurls': session.query(db.Link).count(),
'homepage': config.HOMEPAGE})
# output copyright information
fp.write(
Modified: webcheck/plugins/anchors.py
==============================================================================
--- webcheck/plugins/anchors.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/anchors.py Sat Aug 20 15:06:00 2011 (r431)
@@ -27,16 +27,17 @@
__title__ = 'missing anchors'
__author__ = 'Arthur de Jong'
-from sqlalchemy.orm.session import object_session
-
import db
-def generate(site):
- """Present the list of bad links to the given file descriptor."""
+def postprocess(site):
+ """Add all missing anchors as page problems to the referring page."""
+ session = db.Session()
# find all fetched links with requested anchors
- links = site.links.filter(db.Link.reqanchors.any()).filter(db.Link.fetched
!= None)
+ links = session.query(db.Link).filter(db.Link.reqanchors.any())
+ links = links.filter(db.Link.fetched != None)
# go over list and find missing anchors
+ # TODO: we can probably make a nicer query for this
for link in links:
# check that all requested anchors exist
for anchor in link.reqanchors:
@@ -46,4 +47,5 @@
u'bad link: %(url)s#%(anchor)s: unknown anchor'
% {'url': link.url,
'anchor': anchor})
- # FIXME: commit changes in session
+ # commit changes in session
+ session.commit()
Modified: webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/plugins/badlinks.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/badlinks.py Sat Aug 20 15:06:00 2011 (r431)
@@ -34,10 +34,25 @@
import plugins
+def postprocess(site):
+ """Add all bad links as pageproblems on pages where they are linked."""
+ session = db.Session()
+ # find all links with link problems
+ links =
session.query(db.Link).filter(db.Link.linkproblems.any()).options(joinedload(db.Link.linkproblems))
+ # TODO: probably make it a nicer query over all linkproblems
+ for link in links:
+ # add a reference to the problem map
+ for problem in link.linkproblems:
+ for parent in link.parents:
+ parent.add_pageproblem('bad link: %s: %s' % (link.url,
problem))
+ session.commit()
+
+
def generate(site):
- """Present the list of bad links to the given file descriptor."""
+ """Present the list of bad links."""
+ session = db.Session()
# find all links with link problems
- links =
site.links.filter(db.Link.linkproblems.any()).order_by(db.Link.url).options(joinedload(db.Link.linkproblems))
+ links =
session.query(db.Link).filter(db.Link.linkproblems.any()).order_by(db.Link.url).options(joinedload(db.Link.linkproblems))
# present results
fp = plugins.open_html(plugins.badlinks, site)
if not links:
@@ -68,10 +83,6 @@
' </ul>\n')
# present a list of parents
plugins.print_parents(fp, link, ' ')
- # add a reference to the problem map
- for problem in link.linkproblems:
- for parent in link.parents:
- parent.add_pageproblem('bad link: %s: %s' % (link.url,
problem))
fp.write(
' </li>\n')
fp.write(
Modified: webcheck/plugins/external.py
==============================================================================
--- webcheck/plugins/external.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/external.py Sat Aug 20 15:06:00 2011 (r431)
@@ -35,9 +35,10 @@
def generate(site):
- """Generate the list of external links to the given file descriptor."""
+ """Generate the list of external links."""
+ session = db.Session()
# get all external links
- links = site.links.filter(db.Link.is_internal !=
True).order_by(db.Link.url)
+ links = session.query(db.Link).filter(db.Link.is_internal !=
True).order_by(db.Link.url)
# present results
fp = plugins.open_html(plugins.external, site)
if not links:
Modified: webcheck/plugins/images.py
==============================================================================
--- webcheck/plugins/images.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/images.py Sat Aug 20 15:06:00 2011 (r431)
@@ -35,9 +35,11 @@
def generate(site):
- """Output a list of images to the given file descriptor."""
- # get non-page images that have an image/* mimetype
- links = site.links.filter((db.Link.is_page != True) | (db.Link.is_page ==
None))
+ """Generate a list of image URLs that were found."""
+ session = db.Session()
+ # get non-page links that have an image/* mimetype
+ links = session.query(db.Link)
+ links = links.filter((db.Link.is_page != True) | (db.Link.is_page == None))
links = links.filter(db.Link.mimetype.startswith('image/'))
links = links.order_by(db.Link.url)
# present results
Modified: webcheck/plugins/new.py
==============================================================================
--- webcheck/plugins/new.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/new.py Sat Aug 20 15:06:00 2011 (r431)
@@ -39,11 +39,12 @@
def generate(site):
- """Output the list of recently modified pages to the specified file
descriptor."""
+ """Output the list of recently modified pages."""
+ session = db.Session()
# the time for which links are considered new
newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE
# get all internal pages that are new
- links = site.links.filter_by(is_page=True, is_internal=True)
+ links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
links = links.filter(db.Link.mtime >
newtime).order_by(db.Link.mtime.desc())
# present results
fp = plugins.open_html(plugins.new, site)
Modified: webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/plugins/notchkd.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/notchkd.py Sat Aug 20 15:06:00 2011 (r431)
@@ -35,9 +35,10 @@
def generate(site):
- """Output the list of not checked pages to the given file descriptor."""
+ """Output the list of not checked pages."""
+ session = db.Session()
# get all yanked urls
- links = site.links.filter(db.Link.yanked != None).order_by(db.Link.url)
+ links = session.query(db.Link).filter(db.Link.yanked !=
None).order_by(db.Link.url)
# present results
fp = plugins.open_html(plugins.notchkd, site)
if not links.count():
Modified: webcheck/plugins/notitles.py
==============================================================================
--- webcheck/plugins/notitles.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/notitles.py Sat Aug 20 15:06:00 2011 (r431)
@@ -29,21 +29,33 @@
__outputfile__ = 'notitles.html'
from sqlalchemy.sql.functions import char_length
-from sqlalchemy.sql.expression import or_
import db
import plugins
+def postprocess(site):
+ """Add page problems for all pages without a title."""
+ session = db.Session()
+ # get all internal pages without a title
+ links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter((char_length(db.Link.title) == 0) |
+ (db.Link.title == None))
+ for link in links:
+ link.add_pageproblem('missing title')
+ session.commit()
+
+
def generate(site):
- """Output the list of pages without a title to the given file
descriptor."""
+ """Output the list of pages without a title."""
+ session = db.Session()
# get all internal pages without a title
- links = site.links.filter_by(is_page=True, is_internal=True)
- links = links.filter(or_(char_length(db.Link.title) == 0,
- db.Link.title == None)).order_by(db.Link.url)
+ links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter((char_length(db.Link.title) == 0) |
+ (db.Link.title == None)).order_by(db.Link.url)
# present results
fp = plugins.open_html(plugins.notitles, site)
- if not links:
+ if not links.count():
fp.write(
' <p class="description">\n'
' All pages had a title specified.\n'
@@ -60,7 +72,6 @@
fp.write(
' <li>%(link)s</li>\n'
% {'link': plugins.make_link(link, link.url)})
- link.add_pageproblem('missing title')
fp.write(
' </ol>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/old.py
==============================================================================
--- webcheck/plugins/old.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/old.py Sat Aug 20 15:06:00 2011 (r431)
@@ -40,10 +40,11 @@
def generate(site):
"""Output the list of outdated pages to the specified file descriptor."""
+ session = db.Session()
# the time for which links are considered old
oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
# get all internal pages that are old
- links = site.links.filter_by(is_page=True, is_internal=True)
+ links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
links = links.filter(db.Link.mtime < oldtime).order_by(db.Link.mtime)
# present results
fp = plugins.open_html(plugins.old, site)
@@ -73,8 +74,6 @@
' </li>\n'
% {'link': plugins.make_link(link),
'age': age})
- # add link to problem database
- link.add_pageproblem('this page is %d days old' % age)
fp.write(
' </ul>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/problems.py Sat Aug 20 15:06:00 2011 (r431)
@@ -49,11 +49,12 @@
def generate(site):
- """Output the overview of problems to the given file descriptor."""
+ """Output the overview of problems per author."""
+ session = db.Session()
# make a list of problems per author
problem_db = {}
# get internal links with page problems
- links = site.links.filter_by(is_internal=True)
+ links = session.query(db.Link).filter_by(is_internal=True)
links = links.filter(db.Link.pageproblems.any()).order_by(db.Link.url)
for link in links:
# make a normal name for the author
Modified: webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/plugins/sitemap.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/sitemap.py Sat Aug 20 15:06:00 2011 (r431)
@@ -79,7 +79,8 @@
def generate(site):
- """Output the sitemap to the specified file descriptor."""
+ """Output the sitemap."""
+ session = db.Session()
fp = plugins.open_html(plugins.sitemap, site)
# output the site structure using breadth first traversal
fp.write(
Modified: webcheck/plugins/size.py
==============================================================================
--- webcheck/plugins/size.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/size.py Sat Aug 20 15:06:00 2011 (r431)
@@ -29,6 +29,7 @@
__outputfile__ = 'size.html'
import config
+import db
import plugins
@@ -55,9 +56,10 @@
def generate(site):
- """Output the list of large pages to the given file descriptor."""
+ """Output the list of large pages."""
+ session = db.Session()
# get all internal pages and get big links
- links = site.links.filter_by(is_page=True, is_internal=True)
+ links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
links = [x for x in links
if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024]
# sort links by size (biggest first)
@@ -90,9 +92,6 @@
' </li>\n'
% {'link': plugins.make_link(link),
'size': size})
- link.add_pageproblem(
- 'this page and its components is %(size)s'
- % {'size': size})
fp.write(
' </ul>\n')
plugins.close_html(fp)
Modified: webcheck/plugins/urllist.py
==============================================================================
--- webcheck/plugins/urllist.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/plugins/urllist.py Sat Aug 20 15:06:00 2011 (r431)
@@ -31,7 +31,8 @@
def generate(site):
- """Output a sorted list of urls to the specified file descriptor."""
+ """Output a sorted list of URLs."""
+ session = db.Session()
fp = plugins.open_html(plugins.urllist, site)
fp.write(
' <p class="description">\n'
@@ -40,7 +41,7 @@
' non-examined urls.\n'
' </p>\n'
' <ol>\n')
- links = site.links.order_by(db.Link.url)
+ links = session.query(db.Link).order_by(db.Link.url)
for link in links:
fp.write(' <li>' + plugins.make_link(link, link.url) + '</li>\n')
fp.write(
Modified: webcheck/webcheck.py
==============================================================================
--- webcheck/webcheck.py Fri Aug 19 21:44:51 2011 (r430)
+++ webcheck/webcheck.py Sat Aug 20 15:06:00 2011 (r431)
@@ -258,7 +258,7 @@
# start with the frame-description page
debugio.info('generating reports...')
# for every plugin, generate a page
- plugins.generate(site)
+ site.generate()
# put extra files in the output directory
install_file('webcheck.css', True)
install_file('fancytooltips/fancytooltips.js', True)
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits
- webcheck commit: r431 - in webcheck: . plugins,
Commits of the webcheck project