webcheck commit: r447 - in webcheck: . webcheck webcheck/parsers/html webcheck/plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r447 - in webcheck: . webcheck webcheck/parsers/html webcheck/plugins
- Date: Fri, 7 Oct 2011 12:52:36 +0200 (CEST)
Author: arthur
Date: Fri Oct 7 12:52:35 2011
New Revision: 447
URL: http://arthurdejong.org/viewvc/webcheck?revision=447&view=revision
Log:
move some more initialisation from cmd to crawler and make imports of config
and debugio consistent
Modified:
webcheck/cmd.py
webcheck/webcheck/crawler.py
webcheck/webcheck/db.py
webcheck/webcheck/parsers/html/__init__.py
webcheck/webcheck/parsers/html/beautifulsoup.py
webcheck/webcheck/parsers/html/calltidy.py
webcheck/webcheck/parsers/html/htmlparser.py
webcheck/webcheck/plugins/__init__.py
webcheck/webcheck/plugins/about.py
webcheck/webcheck/plugins/new.py
webcheck/webcheck/plugins/old.py
webcheck/webcheck/plugins/sitemap.py
webcheck/webcheck/plugins/size.py
webcheck/webcheck/util.py
Modified: webcheck/cmd.py
==============================================================================
--- webcheck/cmd.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/cmd.py Fri Oct 7 12:52:35 2011 (r447)
@@ -32,12 +32,10 @@
import urlparse
import webcheck
-from webcheck import config
-from webcheck import debugio
-import webcheck.crawler
-import webcheck.db
import webcheck.monkeypatch
-import webcheck.plugins
+from webcheck.crawler import Site
+from webcheck import config, debugio
+
debugio.loglevel = debugio.INFO
@@ -157,13 +155,7 @@
if not os.path.isdir(config.OUTPUT_DIR):
os.mkdir(config.OUTPUT_DIR)
# set up database connection
- filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
- from sqlalchemy import create_engine
- engine = create_engine('sqlite:///' + filename)
- webcheck.db.Session.configure(bind=engine)
- # ensure that all tables are created
- webcheck.db.Base.metadata.create_all(engine)
- # TODO: schema migration goes here
+ site.setup_database()
# add configuration to site
for pattern in internal_urls:
site.add_internal_re(pattern)
@@ -189,7 +181,6 @@
"""Main program."""
# crawl through the website
debugio.info('checking site....')
- webcheck.crawler.setup_urllib2()
site.crawl() # this will take a while
debugio.info('done.')
# do postprocessing (building site structure, etc)
@@ -201,14 +192,13 @@
debugio.info('generating reports...')
# for every plugin, generate a page
site.generate()
- # put extra files in the output directory
debugio.info('done.')
if __name__ == '__main__':
try:
# initialize site object
- site = webcheck.crawler.Site()
+ site = Site()
# parse command-line arguments
parse_args(site)
# run the main program
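
With this change cmd.py no longer sets up SQLAlchemy itself; the startup
path reduces to roughly the following (a simplified sketch of the
post-r447 flow, with argument handling elided):

    from webcheck.crawler import Site
    from webcheck import config, debugio

    site = Site()              # initialise the crawler object
    parse_args(site)           # apply command-line options to site and config
    site.setup_database()      # bind the sqlite database (moved from cmd.py)
    site.crawl()               # _setup_urllib2() now happens in here
    site.postprocess()
    site.generate()            # let each plugin write its report
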
Modified: webcheck/webcheck/crawler.py
==============================================================================
--- webcheck/webcheck/crawler.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/crawler.py Fri Oct 7 12:52:35 2011 (r447)
@@ -38,13 +38,14 @@
import urllib2
import urlparse
-from webcheck.db import Session, Link, LinkProblem, PageProblem, children, \
- embedded
-from webcheck import debugio
+from webcheck import config, debugio
+from webcheck.db import Session, Base, Link, LinkProblem, PageProblem, \
+ children, embedded
from webcheck.util import install_file
-import webcheck.config
import webcheck.parsers
+from sqlalchemy import create_engine
+
class RedirectError(urllib2.HTTPError):
@@ -59,11 +60,11 @@
raise RedirectError(req.get_full_url(), code, msg, headers, fp, newurl)
-def setup_urllib2():
+def _setup_urllib2():
"""Configure the urllib2 module to store cookies in the output
directory."""
import webcheck # local import to avoid import loop
- filename = os.path.join(webcheck.config.OUTPUT_DIR, 'cookies.lwp')
+ filename = os.path.join(config.OUTPUT_DIR, 'cookies.lwp')
# set up our cookie jar
cookiejar = cookielib.LWPCookieJar(filename)
try:
@@ -77,7 +78,7 @@
opener.addheaders = [
('User-agent', 'webcheck %s' % webcheck.__version__),
]
- if webcheck.config.BYPASSHTTPCACHE:
+ if config.BYPASSHTTPCACHE:
opener.addheaders.append(('Cache-control', 'no-cache'))
opener.addheaders.append(('Pragma', 'no-cache'))
urllib2.install_opener(opener)
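
The cookie handling that the renamed _setup_urllib2() performs is the
standard urllib2/cookielib recipe; a minimal standalone sketch (the
filename and User-agent value are illustrative):

    import cookielib
    import urllib2

    cookiejar = cookielib.LWPCookieJar('cookies.lwp')
    try:
        cookiejar.load(ignore_discard=False, ignore_expires=False)
    except IOError:
        pass  # no cookie file yet; start with an empty jar
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
    opener.addheaders = [('User-agent', 'webcheck')]
    urllib2.install_opener(opener)  # later urlopen() calls use this opener
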
@@ -115,6 +116,14 @@
# list of base urls (these are the internal urls to start from)
self.bases = []
+ def setup_database(self):
+ filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
+ engine = create_engine('sqlite:///' + filename)
+ Session.configure(bind=engine)
+ # ensure that all tables are created
+ Base.metadata.create_all(engine)
+ # TODO: schema migration goes here
+
def add_internal(self, url):
"""Add the given url and consider all urls below it to be internal.
These links are all marked for checking with the crawl() function."""
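
The new setup_database() method is the usual SQLAlchemy bootstrap: build
an engine, bind the session factory to it, and create any missing tables.
Outside webcheck the same idea looks like this (a sketch; webcheck's own
Session and Base live in webcheck.db):

    from sqlalchemy import create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker

    Base = declarative_base()
    Session = sessionmaker()

    engine = create_engine('sqlite:///webcheck.sqlite')
    Session.configure(bind=engine)    # bind the session factory to the engine
    Base.metadata.create_all(engine)  # create tables that do not exist yet
    session = Session()               # sessions now hit the sqlite file
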
@@ -147,7 +156,7 @@
return True
res = False
# check that the url starts with an internal url
- if webcheck.config.BASE_URLS_ONLY:
+ if config.BASE_URLS_ONLY:
# the url must start with one of the _internal_urls
for i in self._internal_urls:
res |= (i == url[:len(i)])
@@ -203,10 +212,10 @@
return 'yanked'
# check if we should avoid external links
is_internal = self._is_internal(url)
- if not is_internal and webcheck.config.AVOID_EXTERNAL_LINKS:
+ if not is_internal and config.AVOID_EXTERNAL_LINKS:
return 'external avoided'
# check if we should use robot parsers
- if not webcheck.config.USE_ROBOTS:
+ if not config.USE_ROBOTS:
return None
(scheme, netloc) = urlparse.urlsplit(url)[0:2]
# skip schemes not having robots.txt files
@@ -241,10 +250,12 @@
add_internal(). If the serialization file pointer
is specified the crawler writes out updated links to
the file while crawling the site."""
+ # configure urllib2 to store cookies in the output directory
+ _setup_urllib2()
# get a database session
session = Session()
# remove all links
- if not webcheck.config.CONTINUE:
+ if not config.CONTINUE:
session.query(LinkProblem).delete()
session.commit()
session.query(PageProblem).delete()
@@ -286,10 +297,10 @@
# flush database changes
session.commit()
# sleep between requests if configured
- if webcheck.config.WAIT_BETWEEN_REQUESTS > 0:
+ if config.WAIT_BETWEEN_REQUESTS > 0:
debugio.debug('crawler.crawl(): sleeping %s seconds' %
- webcheck.config.WAIT_BETWEEN_REQUESTS)
- time.sleep(webcheck.config.WAIT_BETWEEN_REQUESTS)
+ config.WAIT_BETWEEN_REQUESTS)
+ time.sleep(config.WAIT_BETWEEN_REQUESTS)
debugio.debug('crawler.crawl(): items left to check: %d' %
(remaining + len(tocheck)))
session.commit()
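
The sleep above is a simple politeness delay between HTTP requests; the
surrounding loop has roughly this shape (a sketch, not the full crawl()
body; fetch_and_parse is a hypothetical stand-in for the per-link work):

    import time

    while tocheck:
        link = tocheck.pop()
        fetch_and_parse(link)  # hypothetical: fetch one url, parse, queue children
        session.commit()       # flush database changes after each link
        if config.WAIT_BETWEEN_REQUESTS > 0:
            time.sleep(config.WAIT_BETWEEN_REQUESTS)
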
@@ -307,7 +318,7 @@
parent = link.parents.first()
if parent:
request.add_header('Referer', parent.url)
- response = urllib2.urlopen(request, timeout=webcheck.config.IOTIMEOUT)
+ response = urllib2.urlopen(request, timeout=config.IOTIMEOUT)
link.mimetype = response.info().gettype()
link.set_encoding(response.headers.getparam('charset'))
# FIXME: get result code and other stuff
@@ -406,7 +417,7 @@
debugio.debug('crawler.postprocess(): %d links at depth %d' %
(count, depth))
# TODO: also handle embeds
# see if any of the plugins want to do postprocessing
- for plugin in webcheck.config.PLUGINS:
+ for plugin in config.PLUGINS:
# import the plugin
pluginmod = __import__(plugin, globals(), locals(), [plugin])
if hasattr(pluginmod, 'postprocess'):
@@ -415,7 +426,7 @@
def generate(self):
"""Generate pages for plugins."""
- for plugin in webcheck.config.PLUGINS:
+ for plugin in config.PLUGINS:
# import the plugin
pluginmod = __import__(plugin, globals(), locals(), [plugin])
if hasattr(pluginmod, 'generate'):
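
postprocess() and generate() share the same dynamic-dispatch idiom:
import each module named in config.PLUGINS and call a hook only if the
module defines it. In isolation (the plugin name is illustrative):

    plugin = 'webcheck.plugins.sitemap'
    pluginmod = __import__(plugin, globals(), locals(), [plugin])
    if hasattr(pluginmod, 'generate'):
        pluginmod.generate(site)  # modules without the hook are skipped
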
Modified: webcheck/webcheck/db.py
==============================================================================
--- webcheck/webcheck/db.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/db.py Fri Oct 7 12:52:35 2011 (r447)
@@ -29,9 +29,8 @@
from sqlalchemy.orm.session import object_session
from sqlalchemy.sql.expression import union
+from webcheck import config, debugio
from webcheck.myurllib import normalizeurl
-import webcheck.config
-import webcheck.debugio
# provide session and schema classes
@@ -117,7 +116,7 @@
the encoding is supported."""
if not self.encoding and encoding:
try:
- webcheck.debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
+ debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
unicode('just some random text', encoding, 'replace')
self.encoding = encoding
except Exception, e:
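
The try/except around set_encoding() doubles as a codec check: decoding a
throwaway string raises LookupError for encodings Python does not know.
As a standalone helper the same test would look like (a sketch):

    def is_known_encoding(name):
        # decoding a dummy string raises LookupError for unknown codecs
        try:
            unicode('just some random text', name, 'replace')
            return True
        except Exception:
            return False
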
@@ -132,7 +131,7 @@
self.redirectdepth = max([self.redirectdepth] +
[x.redirectdepth for x in self.parents]) + 1
# check depth
- if self.redirectdepth >= webcheck.config.REDIRECT_DEPTH:
+ if self.redirectdepth >= config.REDIRECT_DEPTH:
self.add_linkproblem('too many redirects (%d)' %
self.redirectdepth)
return
# check for redirect to self
Modified: webcheck/webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/webcheck/parsers/html/__init__.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/parsers/html/__init__.py Fri Oct 7 12:52:35 2011 (r447)
@@ -117,7 +117,7 @@
try:
import webcheck.parsers.html.calltidy
debugio.debug('webcheck.parsers.html.parse(): the Tidy parser is ok')
- calltidy.parse(content, link)
+ webcheck.parsers.html.calltidy.parse(content, link)
except ImportError:
debugio.warn('tidy library (python-utidylib) is unavailable')
# remove config to only try once
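
The calltidy fix above is subtle: `import webcheck.parsers.html.calltidy`
binds only the top-level name webcheck in the importing scope, so the bare
name calltidy worked before only through the side effect that importing a
submodule also sets it as an attribute of its parent package (which,
inside this very __init__.py, is the module's own namespace). The fully
qualified call makes the dependency explicit; the usual alternatives are:

    import webcheck.parsers.html.calltidy       # binds 'webcheck' only
    webcheck.parsers.html.calltidy.parse(content, link)

    from webcheck.parsers.html import calltidy  # binds 'calltidy' directly
    calltidy.parse(content, link)
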
Modified: webcheck/webcheck/parsers/html/beautifulsoup.py
==============================================================================
--- webcheck/webcheck/parsers/html/beautifulsoup.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/parsers/html/beautifulsoup.py Fri Oct 7 12:52:35 2011 (r447)
@@ -31,6 +31,7 @@
from webcheck.myurllib import normalizeurl
from webcheck.parsers.html import htmlunescape
+import webcheck.parsers.css
# pattern for matching http-equiv and content part of
@@ -171,13 +172,11 @@
for style in soup.findAll('style'):
if style.string:
# delegate handling of inline css to css module
- import webcheck.parsers.css
- parsers.css.parse(htmlunescape(style.string), link, base)
+ webcheck.parsers.css.parse(htmlunescape(style.string), link, base)
# <ANY style="CSS">
for elem in soup.findAll(style=True):
# delegate handling of inline css to css module
- import webcheck.parsers.css
- parsers.css.parse(elem['style'], link, base)
+ webcheck.parsers.css.parse(elem['style'], link, base)
# <script src="url">
for script in soup.findAll('script', src=True):
embed = normalizeurl(htmlunescape(script['src']).strip())
Modified: webcheck/webcheck/parsers/html/calltidy.py
==============================================================================
--- webcheck/webcheck/parsers/html/calltidy.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/parsers/html/calltidy.py Fri Oct 7 12:52:35 2011 (r447)
@@ -22,8 +22,8 @@
import tidy
-import webcheck.config
-import webcheck.parsers.html
+from webcheck import config
+from webcheck.parsers.html import htmlunescape
def parse(content, link):
@@ -31,7 +31,7 @@
link."""
# only call tidy on internal pages
if link.is_internal:
- t = tidy.parseString(content, **webcheck.config.TIDY_OPTIONS)
+ t = tidy.parseString(content, **config.TIDY_OPTIONS)
for err in t.errors:
# error messages are escaped so we unescape them
- link.add_pageproblem(webcheck.parsers.html.htmlunescape(unicode(err)))
+ link.add_pageproblem(htmlunescape(unicode(err)))
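
For reference, the python-utidylib API used here: tidy.parseString()
takes the document plus keyword options and returns a parsed document
whose errors attribute lists tidy's diagnostics. A minimal sketch (the
option shown is illustrative):

    import tidy

    t = tidy.parseString('<html><body><p>unclosed', show_warnings=1)
    for err in t.errors:
        print unicode(err)  # e.g. line/column plus the warning text
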
Modified: webcheck/webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/webcheck/parsers/html/htmlparser.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/parsers/html/htmlparser.py Fri Oct 7 12:52:35 2011 (r447)
@@ -32,7 +32,6 @@
from webcheck import debugio
from webcheck.myurllib import normalizeurl
from webcheck.parsers.html import htmlunescape
-import webcheck.crawler
# pattern for matching numeric html entities
Modified: webcheck/webcheck/plugins/__init__.py
==============================================================================
--- webcheck/webcheck/plugins/__init__.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/plugins/__init__.py Fri Oct 7 12:52:35 2011 (r447)
@@ -50,11 +50,10 @@
from sqlalchemy.orm.session import object_session
import webcheck
+from webcheck import config
from webcheck.db import Link
from webcheck.parsers.html import htmlescape
from webcheck.util import open_file
-import webcheck.config
-import webcheck.debugio
def _floatformat(f):
@@ -128,7 +127,7 @@
is external, insert "class=external" in the <a> tag."""
return '<a href="%(url)s" %(target)sclass="%(cssclass)s" title="%(info)s">%(title)s</a>' % \
dict(url=htmlescape(link.url),
- target='target="_blank" ' if webcheck.config.REPORT_LINKS_IN_NEW_WINDOW else '',
+ target='target="_blank" ' if config.REPORT_LINKS_IN_NEW_WINDOW else '',
cssclass='internal' if link.is_internal else 'external',
info=htmlescape(_get_info(link)).replace('\n', ' '),
title=htmlescape(title or link.title or link.url))
@@ -141,7 +140,7 @@
count = link.count_parents
if not count:
return
- parents = link.parents.order_by(Link.title, Link.url).options(joinedload(Link.linkproblems))[:webcheck.config.PARENT_LISTLEN]
+ parents = link.parents.order_by(Link.title, Link.url).options(joinedload(Link.linkproblems))[:config.PARENT_LISTLEN]
fp.write(
indent + '<div class="parents">\n' +
indent + ' referenced from:\n' +
@@ -164,7 +163,7 @@
def _print_navbar(fp, selected):
"""Return an html fragement representing the navigation bar for a page."""
fp.write(' <ul class="navbar">\n')
- for plugin in webcheck.config.PLUGINS:
+ for plugin in config.PLUGINS:
# import the plugin
pluginmod = __import__(plugin, globals(), locals(), [plugin])
# skip if no outputfile
Modified: webcheck/webcheck/plugins/about.py
==============================================================================
--- webcheck/webcheck/plugins/about.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/plugins/about.py Fri Oct 7 12:52:35 2011 (r447)
@@ -31,8 +31,8 @@
import time
import webcheck
+from webcheck import config
from webcheck.db import Session, Link
-import webcheck.config
import webcheck.plugins
@@ -101,7 +101,7 @@
fp.write(
' <h3>Plugins</h3>\n'
' <ul>\n')
- for plugin in webcheck.config.PLUGINS:
+ for plugin in config.PLUGINS:
pluginmod = __import__(plugin, globals(), locals(), [plugin])
fp.write(
' <li>\n'
Modified: webcheck/webcheck/plugins/new.py
==============================================================================
--- webcheck/webcheck/plugins/new.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/plugins/new.py Fri Oct 7 12:52:35 2011 (r447)
@@ -30,8 +30,8 @@
import time
+from webcheck import config
from webcheck.db import Session, Link
-import webcheck.config
import webcheck.plugins
@@ -42,7 +42,7 @@
"""Output the list of recently modified pages."""
session = Session()
# the time for which links are considered new
- newtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSNEW_URL_AGE
+ newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE
# get all internal pages that are new
links = session.query(Link).filter_by(is_page=True, is_internal=True)
links = links.filter(Link.mtime > newtime).order_by(Link.mtime.desc())
@@ -53,7 +53,7 @@
' <p class="description">\n'
' No pages were found that were modified within the last %(new)d days.\n'
' </p>\n'
- % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE})
+ % {'new': config.REPORT_WHATSNEW_URL_AGE})
webcheck.plugins.close_html(fp)
return
fp.write(
@@ -61,7 +61,7 @@
' These pages have been recently modified (within %(new)d days).\n'
' </p>\n'
' <ul>\n'
- % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE})
+ % {'new': config.REPORT_WHATSNEW_URL_AGE})
for link in links:
age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
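
The what's-new report is a plain SQLAlchemy query: compute a cutoff
timestamp, filter internal pages on mtime, newest first. Schematically
(a sketch using the names from the diff; SECS_PER_DAY is defined near the
top of the plugin):

    import time

    newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE
    links = session.query(Link).filter_by(is_page=True, is_internal=True)
    links = links.filter(Link.mtime > newtime).order_by(Link.mtime.desc())
    for link in links:
        age = (time.time() - link.mtime) / SECS_PER_DAY  # age in days
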
Modified: webcheck/webcheck/plugins/old.py
==============================================================================
--- webcheck/webcheck/plugins/old.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/plugins/old.py Fri Oct 7 12:52:35 2011 (r447)
@@ -31,7 +31,7 @@
import time
from webcheck.db import Session, Link
-import webcheck.config
+from webcheck import config
import webcheck.plugins
@@ -42,7 +42,7 @@
"""Output the list of outdated pages to the specified file descriptor."""
session = Session()
# the time for which links are considered old
- oldtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSOLD_URL_AGE
+ oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
# get all internal pages that are old
links = session.query(Link).filter_by(is_page=True, is_internal=True)
links = links.filter(Link.mtime < oldtime).order_by(Link.mtime)
@@ -53,7 +53,7 @@
' <p class="description">\n'
' No pages were found that were older than %(old)d days old.\n'
' </p>\n'
- % {'old': webcheck.config.REPORT_WHATSOLD_URL_AGE})
+ % {'old': config.REPORT_WHATSOLD_URL_AGE})
webcheck.plugins.close_html(fp)
return
fp.write(
@@ -62,7 +62,7 @@
' days) and may be outdated.\n'
' </p>\n'
' <ul>\n'
- % {'old': webcheck.config.REPORT_WHATSOLD_URL_AGE})
+ % {'old': config.REPORT_WHATSOLD_URL_AGE})
for link in links:
age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
Modified: webcheck/webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/webcheck/plugins/sitemap.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/plugins/sitemap.py Fri Oct 7 12:52:35 2011 (r447)
@@ -28,8 +28,8 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'index.html'
-from webcheck.db import Session, Link
-import webcheck.config
+from webcheck import config
+from webcheck.db import Link
import webcheck.plugins
@@ -60,7 +60,7 @@
fp.write(indent + '<li>\n')
fp.write(indent + ' ' + webcheck.plugins.make_link(link) + '\n')
# only check children if we are not too deep yet
- if depth <= webcheck.config.REPORT_SITEMAP_LEVEL:
+ if depth <= config.REPORT_SITEMAP_LEVEL:
# figure out the links to follow and ensure that they are only
# explored from here
children = set()
@@ -80,7 +80,6 @@
def generate(site):
"""Output the sitemap."""
- session = Session()
fp = webcheck.plugins.open_html(webcheck.plugins.sitemap, site)
# output the site structure using breadth first traversal
fp.write(
Modified: webcheck/webcheck/plugins/size.py
==============================================================================
--- webcheck/webcheck/plugins/size.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/plugins/size.py Fri Oct 7 12:52:35 2011 (r447)
@@ -29,7 +29,7 @@
__outputfile__ = 'size.html'
from webcheck.db import Session, Link
-import webcheck.config
+from webcheck import config
import webcheck.plugins
@@ -61,7 +61,7 @@
# get all internal pages and get big links
links = session.query(Link).filter_by(is_page=True, is_internal=True)
links = [x for x in links
- if _getsize(x) >= webcheck.config.REPORT_SLOW_URL_SIZE * 1024]
+ if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024]
# sort links by size (biggest first)
links.sort(lambda a, b: cmp(b.total_size, a.total_size))
# present results
@@ -71,7 +71,7 @@
' <p class="description">\n'
' No pages over %(size)dK were found.\n'
' </p>\n'
- % {'size': webcheck.config.REPORT_SLOW_URL_SIZE})
+ % {'size': config.REPORT_SLOW_URL_SIZE})
webcheck.plugins.close_html(fp)
return
fp.write(
@@ -80,7 +80,7 @@
' slow to download.\n'
' </p>\n'
' <ul>\n'
- % {'size': webcheck.config.REPORT_SLOW_URL_SIZE})
+ % {'size': config.REPORT_SLOW_URL_SIZE})
for link in links:
size = webcheck.plugins.get_size(link.total_size)
fp.write(
Modified: webcheck/webcheck/util.py
==============================================================================
--- webcheck/webcheck/util.py Fri Oct 7 10:37:26 2011 (r446)
+++ webcheck/webcheck/util.py Fri Oct 7 12:52:35 2011 (r447)
@@ -34,7 +34,7 @@
def open_file(filename, istext=True, makebackup=False):
"""This returns an open file object which can be used for writing. This
file is created in the output directory. The output directory (stored in
- webcheck.config.OUTPUT_DIR is created if it does not yet exist. If the second
+ config.OUTPUT_DIR is created if it does not yet exist. If the second
parameter is True (default) the file is opened as an UTF-8 text file."""
# check if output directory exists and create it if needed
if not os.path.isdir(config.OUTPUT_DIR):