webcheck commit: r448 - in webcheck: . webcheck webcheck/parsers/html webcheck/plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r448 - in webcheck: . webcheck webcheck/parsers/html webcheck/plugins
- Date: Fri, 7 Oct 2011 13:19:33 +0200 (CEST)
Author: arthur
Date: Fri Oct 7 13:19:31 2011
New Revision: 448
URL: http://arthurdejong.org/viewvc/webcheck?revision=448&view=revision
Log:
rename Site to Crawler
Modified:
webcheck/cmd.py
webcheck/webcheck/__init__.py
webcheck/webcheck/crawler.py
webcheck/webcheck/parsers/html/htmlparser.py
webcheck/webcheck/plugins/__init__.py
webcheck/webcheck/plugins/about.py
webcheck/webcheck/plugins/anchors.py
webcheck/webcheck/plugins/badlinks.py
webcheck/webcheck/plugins/external.py
webcheck/webcheck/plugins/images.py
webcheck/webcheck/plugins/new.py
webcheck/webcheck/plugins/notchkd.py
webcheck/webcheck/plugins/notitles.py
webcheck/webcheck/plugins/old.py
webcheck/webcheck/plugins/problems.py
webcheck/webcheck/plugins/sitemap.py
webcheck/webcheck/plugins/size.py
webcheck/webcheck/plugins/urllist.py
Modified: webcheck/cmd.py
==============================================================================
--- webcheck/cmd.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/cmd.py Fri Oct 7 13:19:31 2011 (r448)
@@ -33,8 +33,7 @@
import webcheck
import webcheck.monkeypatch
-from webcheck.crawler import Site
-from webcheck import config, debugio
+from webcheck import config, debugio, Crawler
debugio.loglevel = debugio.INFO
@@ -97,7 +96,7 @@
% {'redirects': config.REDIRECT_DEPTH})
-def parse_args(site):
+def parse_args(crawler):
"""Parse command-line arguments."""
import getopt
try:
@@ -155,19 +154,19 @@
if not os.path.isdir(config.OUTPUT_DIR):
os.mkdir(config.OUTPUT_DIR)
# set up database connection
- site.setup_database()
+ crawler.setup_database()
# add configuration to site
for pattern in internal_urls:
- site.add_internal_re(pattern)
+ crawler.add_internal_re(pattern)
for pattern in external_urls:
- site.add_external_re(pattern)
+ crawler.add_external_re(pattern)
for pattern in yank_urls:
- site.add_yanked_re(pattern)
+ crawler.add_yanked_re(pattern)
for arg in args:
# if it does not look like a url it is probably a local file
if urlparse.urlsplit(arg)[0] == '':
arg = 'file://' + urllib.pathname2url(os.path.abspath(arg))
- site.add_internal(arg)
+ crawler.add_internal(arg)
except getopt.error, reason:
sys.stderr.write('webcheck: %s\n' % reason)
print_tryhelp()
@@ -177,30 +176,30 @@
sys.exit(1)
-def main(site):
+def main(crawler):
"""Main program."""
# crawl through the website
debugio.info('checking site....')
- site.crawl() # this will take a while
+ crawler.crawl() # this will take a while
debugio.info('done.')
# do postprocessing (building site structure, etc)
debugio.info('postprocessing....')
- site.postprocess()
+ crawler.postprocess()
debugio.info('done.')
# now we can write out the files
# start with the frame-description page
debugio.info('generating reports...')
# for every plugin, generate a page
- site.generate()
+ crawler.generate()
debugio.info('done.')
if __name__ == '__main__':
try:
- # initialize site object
- site = Site()
+ # initialize crawler object
+ crawler = Crawler()
# parse command-line arguments
- parse_args(site)
+ parse_args(crawler)
# run the main program
if PROFILE:
fname = os.path.join(config.OUTPUT_DIR, 'webcheck.prof')
@@ -213,12 +212,12 @@
sqltap.start()
except ImportError:
pass
- cProfile.run('main(site)', fname)
+ cProfile.run('main(crawler)', fname)
if 'sqltap' in locals():
statistics = sqltap.collect()
sqltap.report(statistics, os.path.join(config.OUTPUT_DIR,
'sqltap.html'))
else:
- main(site)
+ main(crawler)
except KeyboardInterrupt:
sys.stderr.write('Interrupted\n')
sys.exit(1)
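
Taken together, the cmd.py hunks above show the renamed driver flow. A rough usage sketch, simplified from the diff (the example URL is a placeholder, not part of the commit):

# Minimal sketch of the renamed driver flow, based on the cmd.py diff above.
from webcheck import Crawler

crawler = Crawler()
crawler.setup_database()                     # set up database connection
crawler.add_internal('http://example.org/')  # placeholder starting URL
crawler.crawl()                              # crawl through the website (this will take a while)
crawler.postprocess()                        # build site structure, collect problems
crawler.generate()                           # let every plugin write its report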
Modified: webcheck/webcheck/__init__.py
==============================================================================
--- webcheck/webcheck/__init__.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/__init__.py Fri Oct 7 13:19:31 2011 (r448)
@@ -23,3 +23,5 @@
__version__ = '1.10.4'
__homepage__ = 'http://arthurdejong.org/webcheck/'
+
+from webcheck.crawler import Crawler
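
The package-level re-export added here is what makes the shorter import in cmd.py possible; after this commit the two forms below should refer to the same class:

from webcheck.crawler import Crawler   # direct module path
from webcheck import Crawler           # via the re-export added in webcheck/__init__.py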
Modified: webcheck/webcheck/crawler.py
==============================================================================
--- webcheck/webcheck/crawler.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/crawler.py Fri Oct 7 13:19:31 2011 (r448)
@@ -22,7 +22,7 @@
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
-"""General module to do site-checking. This module contains the Site class
+"""General module to do site-checking. This module contains the Crawler class
containing the state for the crawled site and some functions to access and
manipulate the crawling of the website. This module also contains the Link
class that holds all the link related properties."""
@@ -91,8 +91,7 @@
_anchorpattern = re.compile('#([^#]+)$')
-# TODO: rename Site to Crawler
-class Site(object):
+class Crawler(object):
"""Class to represent gathered data of a site.
The available properties of this class are:
@@ -101,7 +100,7 @@
"""
def __init__(self):
- """Creates an instance of the Site class and initializes the
+ """Creates an instance of the Crawler class and initializes the
state of the site."""
# list of internal urls
self._internal_urls = set()
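
As the docstring notes, the Crawler object holds the state of the crawled site and exposes helpers to classify URLs. Going by the cmd.py hunk earlier in this commit, configuring that classification looks roughly like the sketch below (the regex patterns are placeholders):

# Hedged sketch of crawl-scope configuration; the patterns are examples only.
crawler = Crawler()
crawler.add_internal_re(r'^http://example\.org/docs/')    # treat matching URLs as internal
crawler.add_external_re(r'^http://other\.example\.com/')  # treat matching URLs as external
crawler.add_yanked_re(r'\.pdf$')                          # do not check matching URLs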
Modified: webcheck/webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/webcheck/parsers/html/htmlparser.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/parsers/html/htmlparser.py Fri Oct 7 13:19:31 2011 (r448)
@@ -32,6 +32,7 @@
from webcheck import debugio
from webcheck.myurllib import normalizeurl
from webcheck.parsers.html import htmlunescape
+import webcheck.parsers.css
# pattern for matching numeric html entities
@@ -211,7 +212,6 @@
# pick up any tags with a style attribute
if 'style' in attrs:
# delegate handling of inline css to css module
- import webcheck.parsers.css
webcheck.parsers.css.parse(attrs['style'], self.link, self.base)
def handle_endtag(self, tag):
@@ -221,7 +221,6 @@
self.collect = None
elif tag == 'style' and self.collect is not None:
# delegate handling of inline css to css module
- import webcheck.parsers.css
webcheck.parsers.css.parse(self.collect, self.link, self.base)
def handle_data(self, data):
Modified: webcheck/webcheck/plugins/__init__.py
==============================================================================
--- webcheck/webcheck/plugins/__init__.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/__init__.py Fri Oct 7 13:19:31 2011 (r448)
@@ -28,7 +28,7 @@
the generate() function. Each plugin should export the following
fields:
- generate(site)
+ generate(crawler)
Based on the site generate all the output files as needed.
__title__
A short description of the plugin that is used when linking
@@ -182,12 +182,12 @@
fp.write(' </ul>\n')
-def open_html(plugin, site):
+def open_html(plugin, crawler):
"""Print an html fragment for the start of an html page."""
# open the file
fp = open_file(plugin.__outputfile__)
# get the first base url
- base = site.bases[0]
+ base = crawler.bases[0]
# write basic html head
fp.write(
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
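
The docstring change in this hunk updates the plugin contract to generate(crawler). A hypothetical minimal plugin following that contract might look like the sketch below; the module name, title and output file are made up, and only the generate(crawler) signature and the open_html() helper come from the diff above:

# Hypothetical plugin webcheck/plugins/example.py; all names are illustrative.
import webcheck.plugins

__title__ = 'example report'
__outputfile__ = 'example.html'

def generate(crawler):
    """Write a trivial report page using the shared HTML helper."""
    fp = webcheck.plugins.open_html(webcheck.plugins.example, crawler)
    fp.write('   <p class="description">Example report body.</p>\n')
    # a real plugin would also emit the page footer before closing
    # (that helper is not shown in this diff)
    fp.close()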
Modified: webcheck/webcheck/plugins/about.py
==============================================================================
--- webcheck/webcheck/plugins/about.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/about.py Fri Oct 7 13:19:31 2011 (r448)
@@ -36,9 +36,9 @@
import webcheck.plugins
-def generate(site):
+def generate(crawler):
"""Output a list of modules, it's authors and the webcheck version."""
- fp = webcheck.plugins.open_html(webcheck.plugins.about, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.about, crawler)
session = Session()
# TODO: xxx links were fetched, xxx pages were examined and a total of xxx notes and problems were found
# TODO: include some runtime information (e.g. supported schemes, user configuration, etc)
Modified: webcheck/webcheck/plugins/anchors.py
==============================================================================
--- webcheck/webcheck/plugins/anchors.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/anchors.py Fri Oct 7 13:19:31 2011 (r448)
@@ -30,7 +30,7 @@
from webcheck.db import Session, Link, Anchor
-def postprocess(site):
+def postprocess(crawler):
"""Add all missing anchors as page problems to the referring page."""
session = Session()
# find all fetched links with requested anchors
Modified: webcheck/webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/webcheck/plugins/badlinks.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/badlinks.py Fri Oct 7 13:19:31 2011 (r448)
@@ -34,7 +34,7 @@
import webcheck.plugins
-def postporcess(site):
+def postporcess(crawler):
"""Add all bad links as pageproblems on pages where they are linked."""
session = Session()
# find all links with link problems
@@ -48,13 +48,13 @@
session.commit()
-def generate(site):
+def generate(crawler):
"""Present the list of bad links."""
session = Session()
# find all links with link problems
links = session.query(Link).filter(Link.linkproblems.any()).order_by(Link.url).options(joinedload(Link.linkproblems))
# present results
- fp = webcheck.plugins.open_html(webcheck.plugins.badlinks, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.badlinks, crawler)
if not links:
fp.write(
' <p class="description">\n'
Modified: webcheck/webcheck/plugins/external.py
==============================================================================
--- webcheck/webcheck/plugins/external.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/external.py Fri Oct 7 13:19:31 2011 (r448)
@@ -34,13 +34,13 @@
import webcheck.plugins
-def generate(site):
+def generate(crawler):
"""Generate the list of external links."""
session = Session()
# get all external links
links = session.query(Link).filter(Link.is_internal != True).order_by(Link.url)
# present results
- fp = webcheck.plugins.open_html(webcheck.plugins.external, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.external, crawler)
if not links:
fp.write(
' <p class="description">'
Modified: webcheck/webcheck/plugins/images.py
==============================================================================
--- webcheck/webcheck/plugins/images.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/images.py Fri Oct 7 13:19:31 2011 (r448)
@@ -32,7 +32,7 @@
import webcheck.plugins
-def generate(site):
+def generate(crawler):
"""Generate a list of image URLs that were found."""
session = Session()
# get non-page links that have an image/* mimetype
@@ -41,7 +41,7 @@
links = links.filter(Link.mimetype.startswith('image/'))
links = links.order_by(Link.url)
# present results
- fp = webcheck.plugins.open_html(webcheck.plugins.images, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.images, crawler)
if not links:
fp.write(
' <p class="description">\n'
Modified: webcheck/webcheck/plugins/new.py
==============================================================================
--- webcheck/webcheck/plugins/new.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/new.py Fri Oct 7 13:19:31 2011 (r448)
@@ -38,7 +38,7 @@
SECS_PER_DAY = 60 * 60 * 24
-def generate(site):
+def generate(crawler):
"""Output the list of recently modified pages."""
session = Session()
# the time for which links are considered new
@@ -47,7 +47,7 @@
links = session.query(Link).filter_by(is_page=True, is_internal=True)
links = links.filter(Link.mtime > newtime).order_by(Link.mtime.desc())
# present results
- fp = webcheck.plugins.open_html(webcheck.plugins.new, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.new, crawler)
if not links.count():
fp.write(
' <p class="description">\n'
Modified: webcheck/webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/webcheck/plugins/notchkd.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/notchkd.py Fri Oct 7 13:19:31 2011 (r448)
@@ -34,13 +34,13 @@
import webcheck.plugins
-def generate(site):
+def generate(crawler):
"""Output the list of not checked pages."""
session = Session()
# get all yanked urls
links = session.query(Link).filter(Link.yanked != None).order_by(Link.url)
# present results
- fp = webcheck.plugins.open_html(webcheck.plugins.notchkd, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.notchkd, crawler)
if not links.count():
fp.write(
' <p class="description">\n'
Modified: webcheck/webcheck/plugins/notitles.py
==============================================================================
--- webcheck/webcheck/plugins/notitles.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/notitles.py Fri Oct 7 13:19:31 2011 (r448)
@@ -34,7 +34,7 @@
import webcheck.plugins
-def postprocess(site):
+def postprocess(crawler):
"""Add page problems for all pages without a title."""
session = Session()
# get all internal pages without a title
@@ -46,7 +46,7 @@
session.commit()
-def generate(site):
+def generate(crawler):
"""Output the list of pages without a title."""
session = Session()
# get all internal pages without a title
@@ -54,7 +54,7 @@
links = links.filter((char_length(Link.title) == 0) |
(Link.title == None)).order_by(Link.url)
# present results
- fp = webcheck.plugins.open_html(webcheck.plugins.notitles, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.notitles, crawler)
if not links.count():
fp.write(
' <p class="description">\n'
Modified: webcheck/webcheck/plugins/old.py
==============================================================================
--- webcheck/webcheck/plugins/old.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/old.py Fri Oct 7 13:19:31 2011 (r448)
@@ -38,7 +38,7 @@
SECS_PER_DAY = 60 * 60 * 24
-def generate(site):
+def generate(crawler):
"""Output the list of outdated pages to the specified file descriptor."""
session = Session()
# the time for which links are considered old
@@ -47,7 +47,7 @@
links = session.query(Link).filter_by(is_page=True, is_internal=True)
links = links.filter(Link.mtime < oldtime).order_by(Link.mtime)
# present results
- fp = webcheck.plugins.open_html(webcheck.plugins.old, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.old, crawler)
if not links.count():
fp.write(
' <p class="description">\n'
Modified: webcheck/webcheck/plugins/problems.py
==============================================================================
--- webcheck/webcheck/plugins/problems.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/problems.py Fri Oct 7 13:19:31 2011 (r448)
@@ -46,7 +46,7 @@
return name
-def generate(site):
+def generate(crawler):
"""Output the overview of problems per author."""
session = Session()
# make a list of problems per author
@@ -65,7 +65,7 @@
problem_db[author].append(link)
else:
problem_db[author] = [link]
- fp = webcheck.plugins.open_html(webcheck.plugins.problems, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.problems, crawler)
if not problem_db:
fp.write(
' <p class="description">\n'
Modified: webcheck/webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/webcheck/plugins/sitemap.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/sitemap.py Fri Oct 7 13:19:31 2011 (r448)
@@ -78,17 +78,17 @@
fp.write(indent + '</li>\n')
-def generate(site):
+def generate(crawler):
"""Output the sitemap."""
- fp = webcheck.plugins.open_html(webcheck.plugins.sitemap, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.sitemap, crawler)
# output the site structure using breadth first traversal
fp.write(
' <p class="description">\n'
' This an overview of the crawled site.\n'
' </p>\n'
' <ul>\n')
- explored = set(x.id for x in site.bases)
- for l in site.bases:
+ explored = set(x.id for x in crawler.bases)
+ for l in crawler.bases:
_explore(fp, l, explored)
fp.write(
' </ul>\n')
Modified: webcheck/webcheck/plugins/size.py
==============================================================================
--- webcheck/webcheck/plugins/size.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/size.py Fri Oct 7 13:19:31 2011 (r448)
@@ -55,7 +55,7 @@
return link.total_size
-def generate(site):
+def generate(crawler):
"""Output the list of large pages."""
session = Session()
# get all internal pages and get big links
@@ -65,7 +65,7 @@
# sort links by size (biggest first)
links.sort(lambda a, b: cmp(b.total_size, a.total_size))
# present results
- fp = webcheck.plugins.open_html(webcheck.plugins.size, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.size, crawler)
if not links:
fp.write(
' <p class="description">\n'
Modified: webcheck/webcheck/plugins/urllist.py
==============================================================================
--- webcheck/webcheck/plugins/urllist.py Fri Oct 7 12:52:35 2011 (r447)
+++ webcheck/webcheck/plugins/urllist.py Fri Oct 7 13:19:31 2011 (r448)
@@ -30,10 +30,10 @@
import webcheck.plugins
-def generate(site):
+def generate(crawler):
"""Output a sorted list of URLs."""
session = Session()
- fp = webcheck.plugins.open_html(webcheck.plugins.urllist, site)
+ fp = webcheck.plugins.open_html(webcheck.plugins.urllist, crawler)
fp.write(
' <p class="description">\n'
' This is the list of all urls encountered during the examination of\n'