webcheck commit: r435 - in webcheck: . parsers plugins webcheck webcheck/parsers webcheck/parsers/html webcheck/plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r435 - in webcheck: . parsers plugins webcheck webcheck/parsers webcheck/parsers/html webcheck/plugins
- Date: Fri, 16 Sep 2011 15:36:40 +0200 (CEST)
Author: arthur
Date: Fri Sep 16 15:36:38 2011
New Revision: 435
URL: http://arthurdejong.org/viewvc/webcheck?revision=435&view=revision
Log:
move all the code except the command-line handling to the webcheck package and
reorganise imports accordingly
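As an illustration of the new layout (a minimal sketch assembled from the hunks below; the module names are real, the surrounding script is hypothetical):

    # before (r434): flat top-level modules next to webcheck.py
    import config
    import crawler
    site = crawler.Site()

    # after (r435): everything except the cmd.py front-end lives in the
    # webcheck package and is imported with package-qualified names
    from webcheck import config
    import webcheck.crawler
    site = webcheck.crawler.Site()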
Added:
webcheck/cmd.py
- copied, changed from r434, webcheck/webcheck.py
webcheck/webcheck/ (props changed)
webcheck/webcheck/__init__.py
webcheck/webcheck/config.py
- copied, changed from r434, webcheck/config.py
webcheck/webcheck/crawler.py
- copied, changed from r434, webcheck/crawler.py
webcheck/webcheck/db.py
- copied, changed from r434, webcheck/db.py
webcheck/webcheck/debugio.py
- copied, changed from r434, webcheck/debugio.py
webcheck/webcheck/monkeypatch.py
- copied, changed from r434, webcheck/monkeypatch.py
webcheck/webcheck/myurllib.py
- copied, changed from r434, webcheck/myurllib.py
webcheck/webcheck/parsers/
- copied from r434, webcheck/parsers/
webcheck/webcheck/plugins/
- copied from r434, webcheck/plugins/
Deleted:
webcheck/config.py
webcheck/crawler.py
webcheck/db.py
webcheck/debugio.py
webcheck/monkeypatch.py
webcheck/myurllib.py
webcheck/parsers/
webcheck/plugins/
webcheck/webcheck.py
Modified:
webcheck/webcheck/parsers/__init__.py
webcheck/webcheck/parsers/css.py
webcheck/webcheck/parsers/html/__init__.py
webcheck/webcheck/parsers/html/beautifulsoup.py
webcheck/webcheck/parsers/html/calltidy.py
webcheck/webcheck/parsers/html/htmlparser.py
webcheck/webcheck/plugins/__init__.py
webcheck/webcheck/plugins/about.py
webcheck/webcheck/plugins/anchors.py
webcheck/webcheck/plugins/badlinks.py
webcheck/webcheck/plugins/external.py
webcheck/webcheck/plugins/images.py
webcheck/webcheck/plugins/new.py
webcheck/webcheck/plugins/notchkd.py
webcheck/webcheck/plugins/notitles.py
webcheck/webcheck/plugins/old.py
webcheck/webcheck/plugins/problems.py
webcheck/webcheck/plugins/sitemap.py
webcheck/webcheck/plugins/size.py
webcheck/webcheck/plugins/urllist.py
Copied and modified: webcheck/cmd.py (from r434, webcheck/webcheck.py)
==============================================================================
--- webcheck/webcheck.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/cmd.py Fri Sep 16 15:36:38 2011 (r435)
@@ -1,6 +1,6 @@
#!/usr/bin/env python
-# webcheck.py - main module of webcheck doing command-line checking
+# cmd.py - command-line front-end for webcheck
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
@@ -28,22 +28,22 @@
__version__ = '1.10.4'
__homepage__ = 'http://arthurdejong.org/webcheck/'
-import sys
import os
import re
-import urlparse
+import sys
import urllib
+import urlparse
-import config
+from webcheck import config
# update some fields that currently are stored in config
config.VERSION = __version__
config.HOMEPAGE = __homepage__
-import crawler
-import plugins
-import debugio
-import monkeypatch
-import db
+from webcheck import debugio
+import webcheck.crawler
+import webcheck.db
+import webcheck.monkeypatch
+import webcheck.plugins
debugio.loglevel = debugio.INFO
@@ -166,9 +166,9 @@
filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
from sqlalchemy import create_engine
engine = create_engine('sqlite:///' + filename)
- db.Session.configure(bind=engine)
+ webcheck.db.Session.configure(bind=engine)
# ensure that all tables are created
- db.Base.metadata.create_all(engine)
+ webcheck.db.Base.metadata.create_all(engine)
# TODO: schema migraton goes here
# add configuration to site
for pattern in internal_urls:
@@ -235,7 +235,7 @@
'strerror': strerror})
sys.exit(1)
# create file in output directory (with overwrite question)
- tfp = plugins.open_file(os.path.basename(source))
+ tfp = webcheck.plugins.open_file(os.path.basename(source))
# copy contents
shutil.copyfileobj(sfp, tfp)
# close files
@@ -247,7 +247,7 @@
"""Main program."""
# crawl through the website
debugio.info('checking site....')
- crawler.setup_urllib2()
+ webcheck.crawler.setup_urllib2()
site.crawl() # this will take a while
debugio.info('done.')
# do postprocessing (building site structure, etc)
@@ -269,7 +269,7 @@
if __name__ == '__main__':
try:
# initialize site object
- site = crawler.Site()
+ site = webcheck.crawler.Site()
# parse command-line arguments
parse_args(site)
# run the main program
Added: webcheck/webcheck/__init__.py
==============================================================================
Copied and modified: webcheck/webcheck/config.py (from r434, webcheck/config.py)
==============================================================================
--- webcheck/config.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/config.py Fri Sep 16 15:36:38 2011 (r435)
@@ -29,6 +29,7 @@
import urllib
+
# Whether to consider any URL not starting with the base URL to be external.
# This is the state of the -b command line option.
BASE_URLS_ONLY = False
Copied and modified: webcheck/webcheck/crawler.py (from r434, webcheck/crawler.py)
==============================================================================
--- webcheck/crawler.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/crawler.py Fri Sep 16 15:36:38 2011 (r435)
@@ -40,10 +40,11 @@
import urllib2
import urlparse
-import config
-import db
-import debugio
-import parsers
+from webcheck.db import Session, Link, LinkProblem, PageProblem, children, \
+ embedded
+from webcheck import debugio
+import webcheck.config
+import webcheck.parsers
class RedirectError(urllib2.HTTPError):
@@ -61,7 +62,7 @@
def setup_urllib2():
"""Configure the urllib2 module to store cookies in the output
directory."""
- filename = os.path.join(config.OUTPUT_DIR, 'cookies.lwp')
+ filename = os.path.join(webcheck.config.OUTPUT_DIR, 'cookies.lwp')
# set up our cookie jar
cookiejar = cookielib.LWPCookieJar(filename)
try:
@@ -73,9 +74,9 @@
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar),
NoRedirectHandler())
opener.addheaders = [
- ('User-agent', 'webcheck %s' % config.VERSION),
+ ('User-agent', 'webcheck %s' % webcheck.config.VERSION),
]
- if config.BYPASSHTTPCACHE:
+ if webcheck.config.BYPASSHTTPCACHE:
opener.addheaders.append(('Cache-control', 'no-cache'))
opener.addheaders.append(('Pragma', 'no-cache'))
urllib2.install_opener(opener)
@@ -116,7 +117,7 @@
def add_internal(self, url):
"""Add the given url and consider all urls below it to be internal.
These links are all marked for checking with the crawl() function."""
- url = db.Link.clean_url(url)
+ url = Link.clean_url(url)
if url not in self._internal_urls:
self._internal_urls.add(url)
@@ -145,7 +146,7 @@
return True
res = False
# check that the url starts with an internal url
- if config.BASE_URLS_ONLY:
+ if webcheck.config.BASE_URLS_ONLY:
# the url must start with one of the _internal_urls
for i in self._internal_urls:
res |= (i == url[:len(i)])
@@ -201,10 +202,10 @@
return 'yanked'
# check if we should avoid external links
is_internal = self._is_internal(url)
- if not is_internal and config.AVOID_EXTERNAL_LINKS:
+ if not is_internal and webcheck.config.AVOID_EXTERNAL_LINKS:
return 'external avoided'
# check if we should use robot parsers
- if not config.USE_ROBOTS:
+ if not webcheck.config.USE_ROBOTS:
return None
(scheme, netloc) = urlparse.urlsplit(url)[0:2]
# skip schemes not having robot.txt files
@@ -223,16 +224,16 @@
def get_link(self, session, url):
# try to find the URL
- url = db.Link.clean_url(url)
- link = session.query(db.Link).filter_by(url=url).first()
+ url = Link.clean_url(url)
+ link = session.query(Link).filter_by(url=url).first()
if not link:
- link = db.Link(url=url)
+ link = Link(url=url)
session.add(link)
return link
def get_links_to_crawl(self, session):
- links = session.query(db.Link).filter(db.Link.fetched == None)
- return links.filter(db.Link.yanked == None)
+ links = session.query(Link).filter(Link.fetched == None)
+ return links.filter(Link.yanked == None)
def crawl(self):
"""Crawl the website based on the urls specified with
@@ -240,22 +241,22 @@
is specified the crawler writes out updated links to
the file while crawling the site."""
# get a database session
- session = db.Session()
+ session = Session()
# remove all links
- if not config.CONTINUE:
- session.query(db.LinkProblem).delete()
+ if not webcheck.config.CONTINUE:
+ session.query(LinkProblem).delete()
session.commit()
- session.query(db.PageProblem).delete()
+ session.query(PageProblem).delete()
session.commit()
- session.execute(db.children.delete())
+ session.execute(children.delete())
session.commit()
- session.execute(db.embedded.delete())
+ session.execute(embedded.delete())
session.commit()
- session.query(db.Link).delete()
+ session.query(Link).delete()
session.commit()
# add all internal urls to the database
for url in self._internal_urls:
- url = db.Link.clean_url(url)
+ url = Link.clean_url(url)
self.get_link(session, url)
# add some URLs from the database that haven't been fetched
tocheck = self.get_links_to_crawl(session)
@@ -284,10 +285,10 @@
# flush database changes
session.commit()
# sleep between requests if configured
- if config.WAIT_BETWEEN_REQUESTS > 0:
+ if webcheck.config.WAIT_BETWEEN_REQUESTS > 0:
debugio.debug('crawler.crawl(): sleeping %s seconds' %
- config.WAIT_BETWEEN_REQUESTS)
- time.sleep(config.WAIT_BETWEEN_REQUESTS)
+ webcheck.config.WAIT_BETWEEN_REQUESTS)
+ time.sleep(webcheck.config.WAIT_BETWEEN_REQUESTS)
debugio.debug('crawler.crawl(): items left to check: %d' %
(remaining + len(tocheck)))
session.commit()
@@ -346,7 +347,7 @@
def parse(self, link, response):
"""Parse the fetched response."""
# find a parser for the content-type
- parsermodule = parsers.get_parsermodule(link.mimetype)
+ parsermodule = webcheck.parsers.get_parsermodule(link.mimetype)
if parsermodule is None:
debugio.debug('crawler.Link.fetch(): unsupported content-type: %s'
% link.mimetype)
return
@@ -368,7 +369,7 @@
"""Do some basic post processing of the collected data, including
depth calculation of every link."""
# get a database session
- session = db.Session()
+ session = Session()
# build the list of urls that were set up with add_internal() that
# do not have a parent (they form the base for the site)
for url in self._internal_urls:
@@ -381,11 +382,11 @@
self.bases.append(link)
# if we got no bases, just use the first internal one
if not self.bases:
- link = session.query(db.Link).filter(db.Link.is_internal == True).first()
+ link = session.query(Link).filter(Link.is_internal == True).first()
debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % link.url)
self.bases.append(link)
# do a breadth first traversal of the website to determine depth
- session.query(db.Link).update(dict(depth=None), synchronize_session=False)
+ session.query(Link).update(dict(depth=None), synchronize_session=False)
session.commit()
depth = 0
count = len(self.bases)
@@ -396,15 +397,15 @@
while count > 0:
# update the depth of all links without a depth that have a
# parent with the previous depth
- qry = session.query(db.Link).filter(db.Link.depth == None)
- qry = qry.filter(db.Link.linked_from.any(db.Link.depth == depth))
+ qry = session.query(Link).filter(Link.depth == None)
+ qry = qry.filter(Link.linked_from.any(Link.depth == depth))
count = qry.update(dict(depth=depth + 1),
synchronize_session=False)
session.commit()
depth += 1
debugio.debug('crawler.postprocess(): %d links at depth %d' %
(count, depth))
# TODO: also handle embeds
# see if any of the plugins want to do postprocessing
- for p in config.PLUGINS:
+ for p in webcheck.config.PLUGINS:
# import the plugin
plugin = __import__('plugins.' + p, globals(), locals(), [p])
if hasattr(plugin, 'postprocess'):
@@ -413,7 +414,7 @@
def generate(self):
"""Generate pages for plugins."""
- for p in config.PLUGINS:
+ for p in webcheck.config.PLUGINS:
# import the plugin
plugin = __import__('plugins.' + p, globals(), locals(), [p])
if hasattr(plugin, 'generate'):
Copied and modified: webcheck/webcheck/db.py (from r434, webcheck/db.py)
==============================================================================
--- webcheck/db.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/db.py Fri Sep 16 15:36:38 2011 (r435)
@@ -29,9 +29,9 @@
from sqlalchemy.orm.session import object_session
from sqlalchemy.sql.expression import ClauseElement, union
-import config
-import debugio
-import myurllib
+from webcheck.myurllib import normalizeurl
+import webcheck.config
+import webcheck.debugio
# provide session and schema classes
@@ -93,15 +93,14 @@
@staticmethod
def clean_url(url):
# normalise the URL, removing the fragment from the URL
- url = myurllib.normalizeurl(url)
- return urlparse.urldefrag(myurllib.normalizeurl(url))[0]
+ return urlparse.urldefrag(normalizeurl(url))[0]
def _get_link(self, url):
"""Get a link object for the specified URL."""
# get the session
session = object_session(self)
# normalise the URL, removing the fragment from the URL
- url, fragment = urlparse.urldefrag(myurllib.normalizeurl(url))
+ url, fragment = urlparse.urldefrag(normalizeurl(url))
# try to find the link
instance = session.query(Link).filter_by(url=url).first()
if not instance:
@@ -118,7 +117,7 @@
the encoding is supported."""
if not self.encoding and encoding:
try:
- debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
+ webcheck.debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
unicode('just some random text', encoding, 'replace')
self.encoding = encoding
except Exception, e:
@@ -133,7 +132,7 @@
self.redirectdepth = max([self.redirectdepth] +
[x.redirectdepth for x in self.parents]) + 1
# check depth
- if self.redirectdepth >= config.REDIRECT_DEPTH:
+ if self.redirectdepth >= webcheck.config.REDIRECT_DEPTH:
self.add_linkproblem('too many redirects (%d)' %
self.redirectdepth)
return
# check for redirect to self
Copied and modified: webcheck/webcheck/debugio.py (from r434, webcheck/debugio.py)
==============================================================================
--- webcheck/debugio.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/debugio.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,6 +30,7 @@
import sys
+
# log levels that can be used
ERROR = 0
WARN = 1
Copied and modified: webcheck/webcheck/monkeypatch.py (from r434, webcheck/monkeypatch.py)
==============================================================================
--- webcheck/monkeypatch.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/monkeypatch.py Fri Sep 16 15:36:38 2011 (r435)
@@ -21,9 +21,9 @@
# under the copyright of the software, unless explicitly stated otherwise.
import re
-import urlparse
-import urllib
import sys
+import urllib
+import urlparse
__all__ = []
Copied and modified: webcheck/webcheck/myurllib.py (from r434, webcheck/myurllib.py)
==============================================================================
--- webcheck/myurllib.py Sun Sep 11 17:33:55 2011 (r434, copy source)
+++ webcheck/webcheck/myurllib.py Fri Sep 16 15:36:38 2011 (r435)
@@ -20,9 +20,9 @@
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
-import urlparse
import re
import urllib
+import urlparse
# this is a workaround for Python 2.3
try:
Modified: webcheck/webcheck/parsers/__init__.py
==============================================================================
--- webcheck/parsers/__init__.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/__init__.py Fri Sep 16 15:36:38 2011 (r435)
@@ -40,7 +40,7 @@
# go throught all known modules to probe the content-types
# (do this only once)
for mod in _modules:
- parser = __import__('parsers.' + mod, globals(), locals(), [mod])
+ parser = __import__('webcheck.parsers.' + mod, globals(), locals(), [mod])
for mimetype in parser.mimetypes:
_parsermodules[mimetype] = parser
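As an aside on the __import__ idiom above: the trailing fromlist argument ([mod]) is what makes __import__ return the submodule itself rather than the top-level package, so the loader keeps working once the 'webcheck.' prefix is added. A hypothetical sketch, not part of the commit:

    mod = 'css'
    # with a fromlist, __import__ returns the webcheck.parsers.css module
    parser = __import__('webcheck.parsers.' + mod, globals(), locals(), [mod])
    # without a fromlist it would return the top-level webcheck package
    pkg = __import__('webcheck.parsers.' + mod)
    assert pkg.__name__ == 'webcheck'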
Modified: webcheck/webcheck/parsers/css.py
==============================================================================
--- webcheck/parsers/css.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/css.py Fri Sep 16 15:36:38 2011 (r435)
@@ -26,8 +26,9 @@
mimetypes = ('text/css',)
-import urlparse
import re
+import urlparse
+
# pattern for matching /* ... */ comments in css
_commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE | re.DOTALL)
Modified: webcheck/webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/parsers/html/__init__.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/html/__init__.py Fri Sep 16 15:36:38 2011 (r435)
@@ -24,10 +24,12 @@
module that tries to load the BeatifulSoup parser first and falls
back to loading the legacy HTMLParser parser."""
-import debugio
-import re
import htmlentitydefs
-import config
+import re
+
+from webcheck import debugio
+import webcheck.config
+
# the list of mimetypes this module should be able to handle
mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
@@ -93,15 +95,15 @@
global _parsefunction
try:
# try BeautifulSoup parser first
- import parsers.html.beautifulsoup
- debugio.debug('parsers.html.parse(): the BeautifulSoup parser is ok')
- _parsefunction = parsers.html.beautifulsoup.parse
+ import webcheck.parsers.html.beautifulsoup
+ debugio.debug('webcheck.parsers.html.parse(): the BeautifulSoup parser is ok')
+ _parsefunction = webcheck.parsers.html.beautifulsoup.parse
except ImportError:
# fall back to legacy HTMLParser parser
debugio.warn('falling back to the legacy HTML parser, '
'consider installing BeautifulSoup')
- import parsers.html.htmlparser
- _parsefunction = parsers.html.htmlparser.parse
+ import webcheck.parsers.html.htmlparser
+ _parsefunction = webcheck.parsers.html.htmlparser.parse
# call the actual parse function
_parsefunction(content, link)
@@ -112,12 +114,12 @@
# call the normal parse function
_parsefunction(content, link)
# call the tidy parse function
- if config.TIDY_OPTIONS:
+ if webcheck.config.TIDY_OPTIONS:
try:
import calltidy
- debugio.debug('parsers.html.parse(): the Tidy parser is ok')
+ debugio.debug('webcheck.parsers.html.parse(): the Tidy parser is ok')
calltidy.parse(content, link)
except ImportError:
debugio.warn('tidy library (python-utidylib) is unavailable')
# remove config to only try once
- config.TIDY_OPTIONS = None
+ webcheck.config.TIDY_OPTIONS = None
Modified: webcheck/webcheck/parsers/html/beautifulsoup.py
==============================================================================
--- webcheck/parsers/html/beautifulsoup.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/html/beautifulsoup.py Fri Sep 16 15:36:38 2011 (r435)
@@ -24,13 +24,16 @@
BeautifulSoup HTML parser and is more flexible than the legacy HTMLParser
module."""
-import urlparse
-import crawler
-import re
import htmlentitydefs
+import re
+import urlparse
+
import BeautifulSoup
-import myurllib
-from parsers.html import htmlunescape
+
+from webcheck.myurllib import normalizeurl
+from webcheck.parsers.html import htmlunescape
+import crawler
+
# pattern for matching http-equiv and content part of
# <meta http-equiv="refresh" content="0;url=URL">
@@ -57,21 +60,21 @@
if title and title.string:
link.title = htmlunescape(title.string).strip()
- # FIXME: using myurllib.normalizeurl is wrong below, we should probably use
+ # FIXME: using normalizeurl is wrong below, we should probably use
# something like link.urlunescape() to do the escaping and check
# and log at the same time
# <base href="URL">
base = soup.find('base', href=True)
if base:
- base = myurllib.normalizeurl(htmlunescape(base['href']).strip())
+ base = normalizeurl(htmlunescape(base['href']).strip())
else:
base = link.url
# <link rel="TYPE" href="URL">
for l in soup.findAll('link', rel=True, href=True):
if l['rel'].lower() in ('stylesheet', 'alternate stylesheet', 'icon',
'shortcut icon'):
- embed = myurllib.normalizeurl(htmlunescape(l['href']).strip())
+ embed = normalizeurl(htmlunescape(l['href']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <meta name="author" content="AUTHOR">
@@ -91,26 +94,26 @@
link.add_child(urlparse.urljoin(base, child))
# <img src="URL">
for img in soup.findAll('img', src=True):
- embed = myurllib.normalizeurl(htmlunescape(img['src']).strip())
+ embed = normalizeurl(htmlunescape(img['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <a href="URL">
for a in soup.findAll('a', href=True):
- child = myurllib.normalizeurl(htmlunescape(a['href']).strip())
+ child = normalizeurl(htmlunescape(a['href']).strip())
if child:
link.add_child(urlparse.urljoin(base, child))
# <a name="NAME">
# TODO: consistent url escaping?
for a in soup.findAll('a', attrs={'name': True}):
# get anchor name
- a_name = myurllib.normalizeurl(htmlunescape(a['name']).strip())
+ a_name = normalizeurl(htmlunescape(a['name']).strip())
# if both id and name are used they should be the same
if 'id' in a and \
- a_name != myurllib.normalizeurl(htmlunescape(a['id']).strip()):
+ a_name != normalizeurl(htmlunescape(a['id']).strip()):
link.add_pageproblem(
'anchors defined in name and id attributes do not match')
# add the id anchor anyway
- link.add_anchor(myurllib.normalizeurl(htmlunescape(a['id']).strip()))
+ link.add_anchor(normalizeurl(htmlunescape(a['id']).strip()))
# add the anchor
link.add_anchor(a_name)
# <ANY id="ID">
@@ -119,51 +122,51 @@
if elem.name == 'a' and 'name' in elem:
continue
# add the anchor
- link.add_anchor(myurllib.normalizeurl(htmlunescape(elem['id']).strip()))
+ link.add_anchor(normalizeurl(htmlunescape(elem['id']).strip()))
# <frameset><frame src="URL"...>...</frameset>
for frame in soup.findAll('frame', src=True):
- embed = myurllib.normalizeurl(htmlunescape(frame['src']).strip())
+ embed = normalizeurl(htmlunescape(frame['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <iframe src="URL"...>
for frame in soup.findAll('iframe', src=True):
- embed = myurllib.normalizeurl(htmlunescape(frame['src']).strip())
+ embed = normalizeurl(htmlunescape(frame['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <object data="URL"...>
for obj in soup.findAll('object', data=True):
- embed = myurllib.normalizeurl(htmlunescape(obj['data']).strip())
+ embed = normalizeurl(htmlunescape(obj['data']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <object><param name="movie" value="URL"...></object>
for para in soup.findAll('param', attrs={'name': 'movie', 'value': True}):
- embed = myurllib.normalizeurl(htmlunescape(para['value']).strip())
+ embed = normalizeurl(htmlunescape(para['value']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <map><area href="URL"...>...</map>
for area in soup.findAll('area', href=True):
- child = myurllib.normalizeurl(htmlunescape(area['href']).strip())
+ child = normalizeurl(htmlunescape(area['href']).strip())
if child:
link.add_child(urlparse.urljoin(base, child))
# <applet code="URL" [archive="URL"]...>
for applet in soup.findAll('applet', code=True):
# if applet has archive tag check that
if 'archive' in applet:
- embed = myurllib.normalizeurl(htmlunescape(applet['archive']).strip())
+ embed = normalizeurl(htmlunescape(applet['archive']).strip())
else:
- embed = myurllib.normalizeurl(htmlunescape(applet['code']).strip())
+ embed = normalizeurl(htmlunescape(applet['code']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <embed src="URL"...>
for embedd in soup.findAll('frame', src=True):
- embed = myurllib.normalizeurl(htmlunescape(embedd['src']).strip())
+ embed = normalizeurl(htmlunescape(embedd['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <embed><param name="movie" value="url"></embed>
for param in soup.findAll('param', attrs={
'name': re.compile("^movie$", re.I),
'value': True}):
- embed = myurllib.normalizeurl(htmlunescape(param['value']).strip())
+ embed = normalizeurl(htmlunescape(param['value']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <style>content</style>
@@ -179,12 +182,12 @@
parsers.css.parse(elem['style'], link, base)
# <script src="url">
for script in soup.findAll('script', src=True):
- embed = myurllib.normalizeurl(htmlunescape(script['src']).strip())
+ embed = normalizeurl(htmlunescape(script['src']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# <body|table|td background="url">
for t in soup.findAll(('body', 'table', 'td'), background=True):
- embed = myurllib.normalizeurl(htmlunescape(t['background']).strip())
+ embed = normalizeurl(htmlunescape(t['background']).strip())
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# flag that the link contains a valid page
Modified: webcheck/webcheck/parsers/html/calltidy.py
==============================================================================
--- webcheck/parsers/html/calltidy.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/html/calltidy.py Fri Sep 16 15:36:38 2011 (r435)
@@ -21,8 +21,9 @@
# under the copyright of the software, unless explicitly stated otherwise.
import tidy
-import config
-import parsers.html
+
+import webcheck.config
+import webcheck.parsers.html
def parse(content, link):
@@ -30,7 +31,7 @@
link."""
# only call tidy on internal pages
if link.is_internal:
- t = tidy.parseString(content, **config.TIDY_OPTIONS)
+ t = tidy.parseString(content, **webcheck.config.TIDY_OPTIONS)
for err in t.errors:
# error messages are escaped so we unescape them
- link.add_pageproblem(parsers.html.htmlunescape(unicode(err)))
+ link.add_pageproblem(webcheck.parsers.html.htmlunescape(unicode(err)))
Modified: webcheck/webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/parsers/html/htmlparser.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/parsers/html/htmlparser.py Fri Sep 16 15:36:38 2011 (r435)
@@ -25,13 +25,15 @@
is not available and can be considered depricated. This parser
will only handle properly formatted HTML."""
-import debugio
import HTMLParser
-import urlparse
import re
-import crawler
-import myurllib
-from parsers.html import htmlunescape
+import urlparse
+
+from webcheck import debugio
+from webcheck.myurllib import normalizeurl
+from webcheck.parsers.html import htmlunescape
+import webcheck.crawler
+
# pattern for matching numeric html entities
_charentitypattern = re.compile('&#([0-9]{1,3});')
@@ -79,20 +81,20 @@
def _cleanurl(self, url, what='link'):
"""Do some translations of url."""
# check for spaces in urls
- # (characters are escaped in myurllib.normalizeurl())
+ # (characters are escaped in normalizeurl())
if _spacepattern.search(url):
self.link.add_pageproblem(
what + ' contains unescaped spaces: ' + url + ', ' +
self._location())
# replace &#nnn; entity refs with proper characters
url = _charentitypattern.sub(lambda x: chr(int(x.group(1))), url)
- return myurllib.normalizeurl(url)
+ return normalizeurl(url)
def error(self, message):
"""Override superclass' error() method to ignore errors."""
# construct error message
message += ', ' + self._location()
# store error message
- debugio.debug('parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: ' + message)
+ debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: ' + message)
if self.errmsg is None:
self.errmsg = message
# increment error count
@@ -105,7 +107,7 @@
try:
return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
except AssertionError:
- debugio.debug('parsers.html.htmlparser._MyHTMLParser.check_for_whole_start_tag(): caught assertion error')
+ debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.check_for_whole_start_tag(): caught assertion error')
return None
def handle_starttag(self, tag, attrs):
@@ -210,8 +212,8 @@
# pick up any tags with a style attribute
if 'style' in attrs:
# delegate handling of inline css to css module
- import parsers.css
- parsers.css.parse(attrs['style'], self.link, self.base)
+ import webcheck.parsers.css
+ webcheck.parsers.css.parse(attrs['style'], self.link, self.base)
def handle_endtag(self, tag):
"""Handle end tags in html."""
@@ -220,8 +222,8 @@
self.collect = None
elif tag == 'style' and self.collect is not None:
# delegate handling of inline css to css module
- import parsers.css
- parsers.css.parse(self.collect, self.link, self.base)
+ import webcheck.parsers.css
+ webcheck.parsers.css.parse(self.collect, self.link, self.base)
def handle_data(self, data):
"""Collect data if we were collecting data."""
@@ -272,13 +274,13 @@
parser.close()
except Exception, e:
# ignore (but log) all errors
- debugio.debug('parsers.html.htmlparser.parse(): caught exception: ' + str(e))
+ debugio.debug('webcheck.parsers.html.htmlparser.parse(): caught exception: ' + str(e))
# check for parser errors
if parser.errmsg is not None:
- debugio.debug('parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg)
+ debugio.debug('webcheck.parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg)
link.add_pageproblem('problem parsing html: %s' % parser.errmsg)
# dump encoding
- debugio.debug('parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding))
+ debugio.debug('webcheck.parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding))
# flag that the link contains a valid page
link.is_page = True
# save the title
Modified: webcheck/webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/__init__.py Fri Sep 16 15:36:38 2011 (r435)
@@ -49,13 +49,10 @@
from sqlalchemy.orm import joinedload
from sqlalchemy.orm.session import object_session
-import config
-import db
-import debugio
-import parsers.html
-
-# reference function from html module
-htmlescape = parsers.html.htmlescape
+from webcheck.db import Link
+from webcheck.parsers.html import htmlescape
+import webcheck.config
+import webcheck.debugio
def _floatformat(f):
@@ -129,7 +126,7 @@
is external, insert "class=external" in the <a> tag."""
return '<a href="%(url)s" %(target)sclass="%(cssclass)s" title="%(info)s">%(title)s</a>' % \
dict(url=htmlescape(link.url),
- target='target="_blank" ' if config.REPORT_LINKS_IN_NEW_WINDOW else '',
+ target='target="_blank" ' if webcheck.config.REPORT_LINKS_IN_NEW_WINDOW else '',
cssclass='internal' if link.is_internal else 'external',
info=htmlescape(_get_info(link)).replace('\n', ' '),
title=htmlescape(title or link.title or link.url))
@@ -142,7 +139,7 @@
count = link.count_parents
if not count:
return
- parents = link.parents.order_by(db.Link.title, db.Link.url).options(joinedload(db.Link.linkproblems))[:config.PARENT_LISTLEN]
+ parents = link.parents.order_by(Link.title, Link.url).options(joinedload(Link.linkproblems))[:webcheck.config.PARENT_LISTLEN]
fp.write(
indent + '<div class="parents">\n' +
indent + ' referenced from:\n' +
@@ -165,26 +162,26 @@
def open_file(filename, istext=True, makebackup=False):
"""This returns an open file object which can be used for writing. This
file is created in the output directory. The output directory (stored in
- config.OUTPUT_DIR is created if it does not yet exist. If the second
+ webcheck.config.OUTPUT_DIR is created if it does not yet exist. If the second
parameter is True (default) the file is opened as an UTF-8 text file."""
import os
# check if output directory exists and create it if needed
- if not os.path.isdir(config.OUTPUT_DIR):
+ if not os.path.isdir(webcheck.config.OUTPUT_DIR):
try:
- os.mkdir(config.OUTPUT_DIR)
+ os.mkdir(webcheck.config.OUTPUT_DIR)
except OSError, (errno, strerror):
debugio.error('error creating directory %(dir)s: %(strerror)s' %
- {'dir': config.OUTPUT_DIR,
+ {'dir': webcheck.config.OUTPUT_DIR,
'strerror': strerror})
sys.exit(1)
# build the output file name
- fname = os.path.join(config.OUTPUT_DIR, filename)
+ fname = os.path.join(webcheck.config.OUTPUT_DIR, filename)
# check if file exists
if os.path.exists(fname):
if makebackup:
# create backup of original (overwriting previous backup)
os.rename(fname, fname + '~')
- elif not config.OVERWRITE_FILES:
+ elif not webcheck.config.OVERWRITE_FILES:
# ask to overwrite
try:
res = raw_input('webcheck: overwrite %s? [y]es, [a]ll, [q]uit: ' % fname)
@@ -194,7 +191,7 @@
res = 'q'
res = res.lower() + ' '
if res[0] == 'a':
- config.OVERWRITE_FILES = True
+ webcheck.config.OVERWRITE_FILES = True
elif res[0] != 'y':
print 'Aborted.'
sys.exit(1)
@@ -214,9 +211,9 @@
def _print_navbar(fp, plugin):
"""Return an html fragement representing the navigation bar for a page."""
fp.write(' <ul class="navbar">\n')
- for p in config.PLUGINS:
+ for p in webcheck.config.PLUGINS:
# import the plugin
- report = __import__('plugins.' + p, globals(), locals(), [p])
+ report = __import__('webcheck.plugins.' + p, globals(), locals(), [p])
# skip if no outputfile
if not hasattr(report, '__outputfile__'):
continue
@@ -258,7 +255,7 @@
% {'sitetitle': htmlescape(base.title or base.url),
'plugintitle': htmlescape(plugin.__title__),
'siteurl': base.url,
- 'version': config.VERSION})
+ 'version': webcheck.config.VERSION})
# write navigation bar
_print_navbar(fp, plugin)
# write plugin heading
@@ -279,6 +276,6 @@
' </body>\n'
'</html>\n'
% {'time': htmlescape(time.ctime(time.time())),
- 'homepage': config.HOMEPAGE,
- 'version': htmlescape(config.VERSION)})
+ 'homepage': webcheck.config.HOMEPAGE,
+ 'version': htmlescape(webcheck.config.VERSION)})
fp.close()
Modified: webcheck/webcheck/plugins/about.py
==============================================================================
--- webcheck/plugins/about.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/about.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,15 +30,15 @@
import time
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
def generate(site):
"""Output a list of modules, it's authors and the webcheck version."""
- fp = plugins.open_html(plugins.about, site)
- session = db.Session()
+ fp = webcheck.plugins.open_html(webcheck.plugins.about, site)
+ session = Session()
# TODO: xxx links were fetched, xxx pages were examined and a total of xxx notes and problems were found
# TODO: include some runtime information (e.g. supported schemes, user configuration, etc)
# output some general information about the report
@@ -56,10 +56,10 @@
' This report was generated on %(time)s, a total of %(numurls)d\n'
' links were found.\n'
' </p>\n\n'
- % {'version': plugins.htmlescape(config.VERSION),
- 'time': plugins.htmlescape(time.ctime(time.time())),
- 'numurls': session.query(db.Link).count(),
- 'homepage': config.HOMEPAGE})
+ % {'version': webcheck.plugins.htmlescape(webcheck.config.VERSION),
+ 'time': webcheck.plugins.htmlescape(time.ctime(time.time())),
+ 'numurls': session.query(Link).count(),
+ 'homepage': webcheck.config.HOMEPAGE})
# output copyright information
fp.write(
' <h3>Copyright</h3>\n'
@@ -100,15 +100,15 @@
fp.write(
' <h3>Plugins</h3>\n'
' <ul>\n')
- for plugin in config.PLUGINS:
- report = __import__('plugins.' + plugin, globals(), locals(), [plugin])
+ for plugin in webcheck.config.PLUGINS:
+ report = __import__('webcheck.plugins.' + plugin, globals(), locals(), [plugin])
fp.write(
' <li>\n'
' <strong>%s</strong><br />\n'
- % plugins.htmlescape(report.__title__))
+ % webcheck.plugins.htmlescape(report.__title__))
if hasattr(report, '__doc__'):
- fp.write(' %s<br />\n' % plugins.htmlescape(report.__doc__))
+ fp.write(' %s<br />\n' % webcheck.plugins.htmlescape(report.__doc__))
fp.write(' </li>\n')
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/anchors.py
==============================================================================
--- webcheck/plugins/anchors.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/anchors.py Fri Sep 16 15:36:38 2011 (r435)
@@ -27,22 +27,22 @@
__title__ = 'missing anchors'
__author__ = 'Arthur de Jong'
-import db
+from webcheck.db import Session, Link, Anchor
def postprocess(site):
"""Add all missing anchors as page problems to the referring page."""
- session = db.Session()
+ session = Session()
# find all fetched links with requested anchors
- links = session.query(db.Link).filter(db.Link.reqanchors.any())
- links = links.filter(db.Link.fetched != None)
+ links = session.query(Link).filter(Link.reqanchors.any())
+ links = links.filter(Link.fetched != None)
# go over list and find missing anchors
# TODO: we can probably make a nicer query for this
for link in links:
# check that all requested anchors exist
for anchor in link.reqanchors:
# if the anchor is not there there, report problem
- if not link.anchors.filter(db.Anchor.anchor == anchor.anchor).first():
+ if not link.anchors.filter(Anchor.anchor == anchor.anchor).first():
anchor.parent.add_pageproblem(
u'bad link: %(url)s#%(anchor)s: unknown anchor'
% {'url': link.url,
Modified: webcheck/webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/plugins/badlinks.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/badlinks.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,15 +30,15 @@
from sqlalchemy.orm import joinedload
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def postporcess(site):
"""Add all bad links as pageproblems on pages where they are linked."""
- session = db.Session()
+ session = Session()
# find all links with link problems
- links = session.query(db.Link).filter(db.Link.linkproblems.any()).options(joinedload(db.Link.linkproblems))
+ links = session.query(Link).filter(Link.linkproblems.any()).options(joinedload(Link.linkproblems))
# TODO: probably make it a nicer query over all linkproblems
for link in links:
# add a reference to the problem map
@@ -50,17 +50,17 @@
def generate(site):
"""Present the list of bad links."""
- session = db.Session()
+ session = Session()
# find all links with link problems
- links = session.query(db.Link).filter(db.Link.linkproblems.any()).order_by(db.Link.url).options(joinedload(db.Link.linkproblems))
+ links = session.query(Link).filter(Link.linkproblems.any()).order_by(Link.url).options(joinedload(Link.linkproblems))
# present results
- fp = plugins.open_html(plugins.badlinks, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.badlinks, site)
if not links:
fp.write(
' <p class="description">\n'
' There were no problems retrieving links from the website.\n'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -73,18 +73,18 @@
' <li>\n'
' %(badurl)s\n'
' <ul class="problems">\n'
- % {'badurl': plugins.make_link(link, link.url)})
+ % {'badurl': webcheck.plugins.make_link(link, link.url)})
# list the problems
for problem in link.linkproblems:
fp.write(
' <li>%(problem)s</li>\n'
- % {'problem': plugins.htmlescape(problem)})
+ % {'problem': webcheck.plugins.htmlescape(problem)})
fp.write(
' </ul>\n')
# present a list of parents
- plugins.print_parents(fp, link, ' ')
+ webcheck.plugins.print_parents(fp, link, ' ')
fp.write(
' </li>\n')
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/external.py
==============================================================================
--- webcheck/plugins/external.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/external.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,23 +30,23 @@
from sqlalchemy.orm import joinedload
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def generate(site):
"""Generate the list of external links."""
- session = db.Session()
+ session = Session()
# get all external links
- links = session.query(db.Link).filter(db.Link.is_internal != True).order_by(db.Link.url)
+ links = session.query(Link).filter(Link.is_internal != True).order_by(Link.url)
# present results
- fp = plugins.open_html(plugins.external, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.external, site)
if not links:
fp.write(
' <p class="description">'
' No external links were found on the website.'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">'
@@ -54,15 +54,15 @@
' examination of the website.'
' </p>\n'
' <ol>\n')
- for link in links.options(joinedload(db.Link.linkproblems)):
+ for link in links.options(joinedload(Link.linkproblems)):
fp.write(
' <li>\n'
' %(link)s\n'
- % {'link': plugins.make_link(link)})
+ % {'link': webcheck.plugins.make_link(link)})
# present a list of parents
- plugins.print_parents(fp, link, ' ')
+ webcheck.plugins.print_parents(fp, link, ' ')
fp.write(
' </li>\n')
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/images.py
==============================================================================
--- webcheck/plugins/images.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/images.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,27 +30,27 @@
import re
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def generate(site):
"""Generate a list of image URLs that were found."""
- session = db.Session()
+ session = Session()
# get non-page links that have an image/* mimetype
- links = session.query(db.Link)
- links = links.filter((db.Link.is_page != True) | (db.Link.is_page == None))
- links = links.filter(db.Link.mimetype.startswith('image/'))
- links = links.order_by(db.Link.url)
+ links = session.query(Link)
+ links = links.filter((Link.is_page != True) | (Link.is_page == None))
+ links = links.filter(Link.mimetype.startswith('image/'))
+ links = links.order_by(Link.url)
# present results
- fp = plugins.open_html(plugins.images, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.images, site)
if not links:
fp.write(
' <p class="description">\n'
' No images were linked on the website.\n'
' </p>\n'
' <ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -58,7 +58,7 @@
' </p>\n'
' <ol>\n')
for link in links:
- fp.write(' <li>%s</li>\n' % plugins.make_link(link, link.url))
+ fp.write(' <li>%s</li>\n' % webcheck.plugins.make_link(link, link.url))
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/new.py
==============================================================================
--- webcheck/plugins/new.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/new.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,9 +30,9 @@
import time
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
SECS_PER_DAY = 60 * 60 * 24
@@ -40,28 +40,28 @@
def generate(site):
"""Output the list of recently modified pages."""
- session = db.Session()
+ session = Session()
# the time for which links are considered new
- newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE
+ newtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSNEW_URL_AGE
# get all internal pages that are new
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
- links = links.filter(db.Link.mtime > newtime).order_by(db.Link.mtime.desc())
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter(Link.mtime > newtime).order_by(Link.mtime.desc())
# present results
- fp = plugins.open_html(plugins.new, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.new, site)
if not links.count():
fp.write(
' <p class="description">\n'
' No pages were found that were modified within the last %(new)d days.\n'
' </p>\n'
- % {'new': config.REPORT_WHATSNEW_URL_AGE})
- plugins.close_html(fp)
+ % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE})
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
' These pages have been recently modified (within %(new)d days).\n'
' </p>\n'
' <ul>\n'
- % {'new': config.REPORT_WHATSNEW_URL_AGE})
+ % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE})
for link in links:
age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
@@ -71,7 +71,7 @@
' <li>age: %(age)d days</li>\n'
' </ul>\n'
' </li>\n'
- % {'link': plugins.make_link(link),
+ % {'link': webcheck.plugins.make_link(link),
'age': age})
fp.write(' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/plugins/notchkd.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/notchkd.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,23 +30,23 @@
from sqlalchemy.orm import joinedload
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def generate(site):
"""Output the list of not checked pages."""
- session = db.Session()
+ session = Session()
# get all yanked urls
- links = session.query(db.Link).filter(db.Link.yanked != None).order_by(db.Link.url)
+ links = session.query(Link).filter(Link.yanked != None).order_by(Link.url)
# present results
- fp = plugins.open_html(plugins.notchkd, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.notchkd, site)
if not links.count():
fp.write(
' <p class="description">\n'
' All links have been checked.\n'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -54,15 +54,15 @@
' at all during the examination of the website.\n'
' </p>\n'
' <ol>\n')
- for link in links.options(joinedload(db.Link.linkproblems)):
+ for link in links.options(joinedload(Link.linkproblems)):
fp.write(
' <li>\n'
' %(link)s\n'
- % {'link': plugins.make_link(link, link.url)})
+ % {'link': webcheck.plugins.make_link(link, link.url)})
# present a list of parents
- plugins.print_parents(fp, link, ' ')
+ webcheck.plugins.print_parents(fp, link, ' ')
fp.write(
' </li>\n')
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/notitles.py
==============================================================================
--- webcheck/plugins/notitles.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/notitles.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,17 +30,17 @@
from sqlalchemy.sql.functions import char_length
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def postprocess(site):
"""Add page problems for all pages without a title."""
- session = db.Session()
+ session = Session()
# get all internal pages without a title
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
- links = links.filter((char_length(db.Link.title) == 0) |
- (db.Link.title == None))
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter((char_length(Link.title) == 0) |
+ (Link.title == None))
for link in links:
link.add_pageproblem('missing title')
session.commit()
@@ -48,19 +48,19 @@
def generate(site):
"""Output the list of pages without a title."""
- session = db.Session()
+ session = Session()
# get all internal pages without a title
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
- links = links.filter((char_length(db.Link.title) == 0) |
- (db.Link.title == None)).order_by(db.Link.url)
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter((char_length(Link.title) == 0) |
+ (Link.title == None)).order_by(Link.url)
# present results
- fp = plugins.open_html(plugins.notitles, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.notitles, site)
if not links.count():
fp.write(
' <p class="description">\n'
' All pages had a title specified.\n'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -71,7 +71,7 @@
for link in links:
fp.write(
' <li>%(link)s</li>\n'
- % {'link': plugins.make_link(link, link.url)})
+ % {'link': webcheck.plugins.make_link(link, link.url)})
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/old.py
==============================================================================
--- webcheck/plugins/old.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/old.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,9 +30,9 @@
import time
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
SECS_PER_DAY = 60 * 60 * 24
@@ -40,21 +40,21 @@
def generate(site):
"""Output the list of outdated pages to the specified file descriptor."""
- session = db.Session()
+ session = Session()
# the time for which links are considered old
- oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
+ oldtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSOLD_URL_AGE
# get all internal pages that are old
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
- links = links.filter(db.Link.mtime < oldtime).order_by(db.Link.mtime)
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
+ links = links.filter(Link.mtime < oldtime).order_by(Link.mtime)
# present results
- fp = plugins.open_html(plugins.old, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.old, site)
if not links.count():
fp.write(
' <p class="description">\n'
' No pages were found that were older than %(old)d days old.\n'
' </p>\n'
- % {'old': config.REPORT_WHATSOLD_URL_AGE})
- plugins.close_html(fp)
+ % {'old': webcheck.config.REPORT_WHATSOLD_URL_AGE})
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -62,7 +62,7 @@
' days) and may be outdated.\n'
' </p>\n'
' <ul>\n'
- % {'old': config.REPORT_WHATSOLD_URL_AGE})
+ % {'old': webcheck.config.REPORT_WHATSOLD_URL_AGE})
for link in links:
age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
@@ -72,8 +72,8 @@
' <li>age: %(age)d days</li>\n'
' </ul>\n'
' </li>\n'
- % {'link': plugins.make_link(link),
+ % {'link': webcheck.plugins.make_link(link),
'age': age})
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/problems.py Fri Sep 16 15:36:38 2011 (r435)
@@ -30,8 +30,8 @@
import urllib
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def _mk_id(name):
@@ -50,12 +50,12 @@
def generate(site):
"""Output the overview of problems per author."""
- session = db.Session()
+ session = Session()
# make a list of problems per author
problem_db = {}
# get internal links with page problems
- links = session.query(db.Link).filter_by(is_internal=True)
- links = links.filter(db.Link.pageproblems.any()).order_by(db.Link.url)
+ links = session.query(Link).filter_by(is_internal=True)
+ links = links.filter(Link.pageproblems.any()).order_by(Link.url)
for link in links:
# make a normal name for the author
if link.author:
@@ -67,13 +67,13 @@
problem_db[author].append(link)
else:
problem_db[author] = [link]
- fp = plugins.open_html(plugins.problems, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.problems, site)
if not problem_db:
fp.write(
' <p class="description">\n'
' No problems were found on this site, hurray.\n'
' </p>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
return
# print description
fp.write(
@@ -90,8 +90,8 @@
for author in authors:
fp.write(
' <li><a href="#author_%(authorref)s">Author: %(author)s</a></li>\n'
- % {'authorref': plugins.htmlescape(_mk_id(author)),
- 'author': plugins.htmlescape(author)})
+ % {'authorref': webcheck.plugins.htmlescape(_mk_id(author)),
+ 'author': webcheck.plugins.htmlescape(author)})
fp.write(' </ul>\n')
# generate problem report
fp.write(' <ul>\n')
@@ -100,8 +100,8 @@
' <li id="author_%(authorref)s">\n'
' Author: %(author)s\n'
' <ul>\n'
- % {'authorref': plugins.htmlescape(_mk_id(author)),
- 'author': plugins.htmlescape(author)})
+ % {'authorref': webcheck.plugins.htmlescape(_mk_id(author)),
+ 'author': webcheck.plugins.htmlescape(author)})
# sort pages by url
problem_db[author].sort(lambda a, b: cmp(a.url, b.url))
# list problems for this author
@@ -111,12 +111,12 @@
' <li>\n'
' %(link)s\n'
' <ul class="problems">\n'
- % {'link': plugins.make_link(link)})
+ % {'link': webcheck.plugins.make_link(link)})
# list the problems
for problem in link.pageproblems:
fp.write(
' <li>%(problem)s</li>\n'
- % {'problem': plugins.htmlescape(problem)})
+ % {'problem': webcheck.plugins.htmlescape(problem)})
# end the list item
fp.write(
' </ul>\n'
@@ -126,4 +126,4 @@
' </li>\n')
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/plugins/sitemap.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/sitemap.py Fri Sep 16 15:36:38 2011 (r435)
@@ -28,25 +28,25 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'index.html'
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
def add_pagechildren(link, children, explored):
"""Determine the page children of this link, combining the children of
embedded items and following redirects."""
# get all internal children
- qry = link.children.filter(db.Link.is_internal == True)
+ qry = link.children.filter(Link.is_internal == True)
if link.depth:
- qry = qry.filter((db.Link.depth > link.depth) | (db.Link.depth == None))
+ qry = qry.filter((Link.depth > link.depth) | (Link.depth == None))
# follow redirects
children.update(y
for y in (x.follow_link() for x in qry)
if y and y.is_page and y.is_internal and y.id not in explored)
explored.update(x.id for x in children)
# add embedded element's pagechildren (think frames)
- for embed in link.embedded.filter(db.Link.is_internal == True).filter(db.Link.is_page == True):
+ for embed in link.embedded.filter(Link.is_internal == True).filter(Link.is_page == True):
# TODO: put this in a query
if embed.id not in explored and \
(embed.depth == None or embed.depth > link.depth):
@@ -58,9 +58,9 @@
site. Prints the html results to the file descriptor."""
# output this link
fp.write(indent + '<li>\n')
- fp.write(indent + ' ' + plugins.make_link(link) + '\n')
+ fp.write(indent + ' ' + webcheck.plugins.make_link(link) + '\n')
# only check children if we are not too deep yet
- if depth <= config.REPORT_SITEMAP_LEVEL:
+ if depth <= webcheck.config.REPORT_SITEMAP_LEVEL:
# figure out the links to follow and ensure that they are only
# explored from here
children = set()
@@ -80,8 +80,8 @@
def generate(site):
"""Output the sitemap."""
- session = db.Session()
- fp = plugins.open_html(plugins.sitemap, site)
+ session = Session()
+ fp = webcheck.plugins.open_html(webcheck.plugins.sitemap, site)
# output the site structure using breadth first traversal
fp.write(
' <p class="description">\n'
@@ -93,4 +93,4 @@
_explore(fp, l, explored)
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/size.py
==============================================================================
--- webcheck/plugins/size.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/size.py Fri Sep 16 15:36:38 2011 (r435)
@@ -28,9 +28,9 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'size.html'
-import config
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.config
+import webcheck.plugins
def _getsize(link, done=None):
@@ -57,22 +57,22 @@
def generate(site):
"""Output the list of large pages."""
- session = db.Session()
+ session = Session()
# get all internal pages and get big links
- links = session.query(db.Link).filter_by(is_page=True, is_internal=True)
+ links = session.query(Link).filter_by(is_page=True, is_internal=True)
links = [x for x in links
- if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024]
+ if _getsize(x) >= webcheck.config.REPORT_SLOW_URL_SIZE * 1024]
# sort links by size (biggest first)
links.sort(lambda a, b: cmp(b.total_size, a.total_size))
# present results
- fp = plugins.open_html(plugins.size, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.size, site)
if not links:
fp.write(
' <p class="description">\n'
' No pages over %(size)dK were found.\n'
' </p>\n'
- % {'size': config.REPORT_SLOW_URL_SIZE})
- plugins.close_html(fp)
+ % {'size': webcheck.config.REPORT_SLOW_URL_SIZE})
+ webcheck.plugins.close_html(fp)
return
fp.write(
' <p class="description">\n'
@@ -80,9 +80,9 @@
' slow to download.\n'
' </p>\n'
' <ul>\n'
- % {'size': config.REPORT_SLOW_URL_SIZE})
+ % {'size': webcheck.config.REPORT_SLOW_URL_SIZE})
for link in links:
- size = plugins.get_size(link.total_size)
+ size = webcheck.plugins.get_size(link.total_size)
fp.write(
' <li>\n'
' %(link)s\n'
@@ -90,8 +90,8 @@
' <li>size: %(size)s</li>\n'
' </ul>\n'
' </li>\n'
- % {'link': plugins.make_link(link),
+ % {'link': webcheck.plugins.make_link(link),
'size': size})
fp.write(
' </ul>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)
Modified: webcheck/webcheck/plugins/urllist.py
==============================================================================
--- webcheck/plugins/urllist.py Sun Sep 11 17:33:55 2011 (r434)
+++ webcheck/webcheck/plugins/urllist.py Fri Sep 16 15:36:38 2011 (r435)
@@ -26,14 +26,14 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'urllist.html'
-import db
-import plugins
+from webcheck.db import Session, Link
+import webcheck.plugins
def generate(site):
"""Output a sorted list of URLs."""
- session = db.Session()
- fp = plugins.open_html(plugins.urllist, site)
+ session = Session()
+ fp = webcheck.plugins.open_html(webcheck.plugins.urllist, site)
fp.write(
' <p class="description">\n'
' This is the list of all urls encountered during the examination of\n'
@@ -41,9 +41,9 @@
' non-examined urls.\n'
' </p>\n'
' <ol>\n')
- links = session.query(db.Link).order_by(db.Link.url)
+ links = session.query(Link).order_by(Link.url)
for link in links:
- fp.write(' <li>' + plugins.make_link(link, link.url) + '</li>\n')
+ fp.write(' <li>' + webcheck.plugins.make_link(link, link.url) + '</li>\n')
fp.write(
' </ol>\n')
- plugins.close_html(fp)
+ webcheck.plugins.close_html(fp)