webcheck commit: r421 - in webcheck: . parsers parsers/html plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r421 - in webcheck: . parsers parsers/html plugins
- Date: Thu, 4 Aug 2011 21:46:27 +0200 (CEST)
Author: arthur
Date: Thu Aug 4 21:46:26 2011
New Revision: 421
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=421
Log:
use SQLAlchemy to store crawled website data to improve scalability
Added:
webcheck/db.py
Deleted:
webcheck/serialize.py
Modified:
webcheck/config.py
webcheck/crawler.py
webcheck/parsers/css.py
webcheck/parsers/html/__init__.py
webcheck/parsers/html/beautifulsoup.py
webcheck/parsers/html/calltidy.py
webcheck/parsers/html/htmlparser.py
webcheck/plugins/__init__.py
webcheck/plugins/about.py
webcheck/plugins/badlinks.py
webcheck/plugins/external.py
webcheck/plugins/images.py
webcheck/plugins/new.py
webcheck/plugins/notchkd.py
webcheck/plugins/notitles.py
webcheck/plugins/old.py
webcheck/plugins/problems.py
webcheck/plugins/sitemap.py
webcheck/plugins/size.py
webcheck/plugins/urllist.py
webcheck/webcheck.py
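
The core of this change is replacing the in-memory linkMap with SQLAlchemy-backed storage (see the new webcheck/db.py below). The binding of the Session factory to an actual database is not shown in this excerpt (presumably it happens in webcheck.py); a minimal sketch of how it might be wired up, assuming a per-run SQLite file whose name here is hypothetical:

from sqlalchemy import create_engine

import db

# bind the session factory to a database and create the schema
engine = create_engine('sqlite:///webcheck.sqlite')
db.Session.configure(bind=engine)
db.Base.metadata.create_all(engine)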
Modified: webcheck/config.py
==============================================================================
--- webcheck/config.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/config.py Thu Aug 4 21:46:26 2011 (r421)
@@ -64,7 +64,7 @@
REDIRECT_DEPTH = 5
# The list of plugins that will be used to generate the report.
-PLUGINS = [ 'anchors',
+PLUGINS = [ #'anchors',
'sitemap',
'urllist',
'images',
Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/crawler.py Thu Aug 4 21:46:26 2011 (r421)
@@ -27,21 +27,24 @@
manipulate the crawling of the website. This module also contains the Link
class that holds all the link related properties."""
-import config
-import debugio
-import urlparse
-import urllib
-import robotparser
-import parsers
-import re
-import time
-import myurllib
-import urllib2
-import httplib
-import socket
import atexit
import cookielib
+import datetime
+import httplib
import os
+import re
+import robotparser
+import socket
+import time
+import urllib
+import urllib2
+import urlparse
+
+import config
+import db
+import debugio
+import parsers
+
# set up our cookie jar
cookiejar = cookielib.LWPCookieJar('cookies.lwp')
@@ -80,12 +83,13 @@
# pattern to match anchor part of a url
_anchorpattern = re.compile('#([^#]+)$')
-class Site:
+
+# TODO: rename Site to Crawler
+class Site(object):
"""Class to represent gathered data of a site.
The available properties of this class are:
- linkMap - a map of urls to link objects
bases - a list of base link object
"""
@@ -102,15 +106,13 @@
self._yanked_res = {}
# map of scheme+netloc to robot handleds
self._robotparsers = {}
- # a map of urls to Link objects
- self.linkMap = {}
# list of base urls (these are the internal urls to start from)
self.bases = []
def add_internal(self, url):
"""Add the given url and consider all urls below it to be internal.
These links are all marked for checking with the crawl() function."""
- url = myurllib.normalizeurl(url)
+ url = db.Link.clean_url(url)
if url not in self._internal_urls:
self._internal_urls.add(url)
@@ -129,53 +131,53 @@
will not be checked at all."""
self._yanked_res[exp] = re.compile(exp, re.IGNORECASE)
- def _is_internal(self, link):
+ def _is_internal(self, url):
"""Check whether the specified url is external or internal.
This uses the urls marked with add_internal() and the regular
expressions passed with add_external_re()."""
# check if it is internal through the regexps
for regexp in self._internal_res.values():
- if regexp.search(link.url) is not None:
+ if regexp.search(url) is not None:
return True
res = False
# check that the url starts with an internal url
if config.BASE_URLS_ONLY:
# the url must start with one of the _internal_urls
for i in self._internal_urls:
- res |= (i==link.url[:len(i)])
+ res |= (i==url[:len(i)])
else:
# the netloc must match a netloc of an _internal_url
+ netloc = urlparse.urlsplit(url)[1]
for i in self._internal_urls:
- res |= (urlparse.urlsplit(i)[1]==link.netloc)
+ res |= (urlparse.urlsplit(i)[1] == netloc)
# if it is not internal now, it never will be
if not res:
return False
# check if it is external through the regexps
for x in self._external_res.values():
# if the url matches it is external and we can stop
- if x.search(link.url) is not None:
+ if x.search(url):
return False
return True
- def _get_robotparser(self, link):
+ def _get_robotparser(self, scheme, netloc):
"""Return the proper robots parser for the given url or None if one
cannot be constructed. Robot parsers are cached per scheme and
netloc."""
# only some schemes have a meaningful robots.txt file
- if link.scheme != 'http' and link.scheme != 'https':
- debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % link.scheme)
+ if scheme != 'http' and scheme != 'https':
+ debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % scheme)
return None
# split out the key part of the url
- location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
+ location = urlparse.urlunsplit((scheme, netloc, '', '', ''))
# try to create a new robotparser if we don't already have one
if not self._robotparsers.has_key(location):
- import httplib
debugio.info(' getting robots.txt for %s' % location)
self._robotparsers[location] = None
try:
rp = robotparser.RobotFileParser()
rp.set_url(urlparse.urlunsplit(
- (link.scheme, link.netloc, '/robots.txt', '', '') ))
+ (scheme, netloc, '/robots.txt', '', '') ))
rp.read()
self._robotparsers[location] = rp
except (TypeError, IOError, httplib.HTTPException):
@@ -183,425 +185,155 @@
pass
return self._robotparsers[location]
- def _is_yanked(self, link):
+ def _is_yanked(self, url):
"""Check whether the specified url should not be checked at all.
This uses the regualr expressions passed with add_yanked_re() and the
robots information present."""
# check if it is yanked through the regexps
for regexp in self._yanked_res.values():
# if the url matches it is yanked and we can stop
- if regexp.search(link.url) is not None:
+ if regexp.search(url):
return 'yanked'
# check if we should avoid external links
- if not link.isinternal and config.AVOID_EXTERNAL_LINKS:
+ is_internal = self._is_internal(url)
+ if not is_internal and config.AVOID_EXTERNAL_LINKS:
return 'external avoided'
# check if we should use robot parsers
if not config.USE_ROBOTS:
- return False
- # skip schemes not haveing robot.txt files
- if link.scheme != 'http' and link.scheme != 'https':
- return False
+ return None
+ (scheme, netloc) = urlparse.urlsplit(url)[0:2]
+ # skip schemes not having robot.txt files
+ if scheme not in ('http', 'https'):
+ return None
# skip robot checks for external urls
# TODO: make this configurable
- if not link.isinternal:
- return False
+ if not is_internal:
+ return None
# check robots for remaining links
- rp = self._get_robotparser(link)
- if rp is not None and not rp.can_fetch('webcheck', link.url):
+ rp = self._get_robotparser(scheme, netloc)
+ if rp and not rp.can_fetch('webcheck', url):
return 'robot restriced'
# fall back to allowing the url
- return False
+ return None
+
+ def get_link(self, session, url):
+ # try to find the URL
+ url = db.Link.clean_url(url)
+ link = session.query(db.Link).filter_by(url=url).first()
+ if not link:
+ link = db.Link(url=url)
+ session.add(link)
+ return link
- def get_link(self, url):
- """Return a link object for the given url.
- This function checks the map of cached link objects for an
- instance."""
- # clean the url
- url = myurllib.normalizeurl(url)
- # check if we have an object ready
- if self.linkMap.has_key(url):
- return self.linkMap[url]
- # create a new instance
- return Link(self, url)
+ def get_links_to_crawl(self, session):
+ links = session.query(db.Link).filter(db.Link.fetched == None)
+ return links.filter(db.Link.yanked == None)[:100]
- def crawl(self, serfp=None):
+ def crawl(self):
"""Crawl the website based on the urls specified with
add_internal(). If the serialization file pointer
is specified the crawler writes out updated links to
the file while crawling the site."""
- # TODO: have some different scheme to crawl a site (e.g. separate
- # internal and external queues, threading, etc)
- tocheck = set()
- # add all unfetched site urls
- for link in self.linkMap.values():
- if not link.isyanked and not link.isfetched:
- tocheck.add(link)
- # add all internal urls
+ # get a database session
+ session = db.Session()
+ # remove all links
+ if not config.CONTINUE:
+ session.query(db.LinkProblem).delete()
+ session.commit()
+ session.query(db.PageProblem).delete()
+ session.commit()
+ session.execute(db.children.delete())
+ session.commit()
+ session.execute(db.embedded.delete())
+ session.commit()
+ session.query(db.Link).delete()
+ session.commit()
+ # add all internal urls to the database
for url in self._internal_urls:
- tocheck.add(self.get_link(url))
+ url = db.Link.clean_url(url)
+ self.get_link(session, url)
+ # add some URLs from the database that haven't been fetched
+ tocheck = self.get_links_to_crawl(session)
# repeat until we have nothing more to check
- fetchedlinks = 0
- while len(tocheck) > 0:
+ while tocheck:
debugio.debug('crawler.crawl(): items left to check: %d' % len(tocheck))
# choose a link from the tocheck list
link = tocheck.pop()
+ link.is_internal = self._is_internal(link.url)
+ link.yanked = self._is_yanked(link.url)
+ # see if there are any more links to check
+ if not tocheck:
+ tocheck = self.get_links_to_crawl(session)
# skip link it there is nothing to check
- if link.isyanked or link.isfetched:
+ if link.yanked or link.fetched:
continue
# fetch the link's contents
- link.fetch()
- # add children to tocheck
- for child in link.children:
- if not child.isyanked and not child.isfetched:
- tocheck.add(child)
- # add embedded content
- for embed in link.embedded:
- if not embed.isyanked and not embed.isfetched:
- tocheck.add(embed)
- # serialize all as of yet unserialized links
- fetchedlinks += 1
- # TODO: make this configurable
- if serfp and fetchedlinks >= 5:
- fetchedlinks = 0
- import serialize
- for link in self.linkMap.values():
- if link._ischanged:
- serialize.serialize_link(serfp, link)
- link._ischanged = False
- serfp.flush()
+ response = self.fetch(link)
+ if response:
+ self.parse(link, response)
+ # flush database changes
+ session.commit()
# sleep between requests if configured
if config.WAIT_BETWEEN_REQUESTS > 0:
debugio.debug('crawler.crawl(): sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS)
time.sleep(config.WAIT_BETWEEN_REQUESTS)
- # serialize remaining changed links
- if serfp:
- import serialize
- for link in self.linkMap.values():
- if link._ischanged:
- serialize.serialize_link(serfp, link)
- link._ischanged = False
- serfp.flush()
-
- def postprocess(self):
- """Do some basic post processing of the collected data, including
- depth calculation of every link."""
- # build the list of urls that were set up with add_internal() that
- # do not have a parent (they form the base for the site)
- for url in self._internal_urls:
- link = self.linkMap[url].follow_link()
- if link == None:
- debugio.warn('base link %s redirects to nowhere' % url)
- continue
- # add the link to bases
- debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
- self.bases.append(link)
- # if we got no bases, just use the first internal one
- if len(self.bases) == 0:
- debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % self._internal_urls[0])
- self.bases.append(self.linkMap[self._internal_urls[0]])
- # do a breadth first traversal of the website to determin depth and
- # figure out page children
- tocheck = set()
- for link in self.bases:
- link.depth = 0
- tocheck.add(link)
- # repeat until we have nothing more to check
- while len(tocheck) > 0:
- debugio.debug('crawler.postprocess(): items left to examine: %d' % len(tocheck))
- # choose a link from the tocheck list
- link = tocheck.pop()
- # figure out page children
- for child in link._pagechildren():
- # skip children with the wrong depth
- if child.depth != link.depth+1:
- continue
- tocheck.add(child)
-
-class Link:
- """This is a basic class representing a url.
-
- Some basic information about a url is stored in instances of this
- class:
-
- url - the url this link represents
- scheme - the scheme part of the url
- netloc - the netloc part of the url
- path - the path part of the url
- query - the query part of the url
- parents - list of parent links (all the Links that link to this
- page)
- children - list of child links (the Links that this page links to)
- pagechildren - list of child pages, including children of embedded
- elements
- embedded - list of links to embeded content
- anchors - list of anchors defined on the page
- reqanchors - list of anchors requesten for this page anchor->link*
- depth - the number of clicks from the base urls this page to
- find
- isinternal - whether the link is considered to be internal
- isyanked - whether the link should be checked at all
- isfetched - whether the lis is fetched already
- ispage - whether the link represents a page
- mtime - modification time (in seconds since the Epoch)
- size - the size of this document
- mimetype - the content-type of the document
- encoding - the character set used in the document
- title - the title of this document (unicode)
- author - the author of this document (unicode)
- status - the result of retreiving the document
- linkproblems - list of problems with retrieving the link
- pageproblems - list of problems in the parsed page
- redirectdepth - the number of this redirect (=0 not a redirect)
-
- Instances of this class should be made through a site instance
- by adding internal urls and calling crawl().
- """
-
- def __init__(self, site, url):
- """Creates an instance of the Link class and initializes the
- documented properties to some sensible value."""
- # store a reference to the site
- self.site = site
- # split the url in useful parts and store the parts
- (self.scheme, self.netloc, self.path, self.query) = \
- urlparse.urlsplit(url)[0:4]
- # store the url (without the fragment)
- url = urlparse.urlunsplit(
- (self.scheme, self.netloc, self.path, self.query, '') )
- self.url = url
- # ensure that we are not creating something that already exists
- assert not self.site.linkMap.has_key(url)
- # store the Link object in the linkMap
- self.site.linkMap[url] = self
- # deternmin the kind of url (internal or external)
- self.isinternal = self.site._is_internal(self)
- # check if the url is yanked
- self.isyanked = self.site._is_yanked(self)
- # initialize some properties
- self.parents = set()
- self.children = set()
- self.pagechildren = None
- self.embedded = set()
- self.anchors = set()
- self.reqanchors = {}
- self.depth = None
- self.isfetched = False
- self.ispage = False
- self.mtime = None
- self.size = None
- self.mimetype = None
- self.encoding = None
- self.title = None
- self.author = None
- self.status = None
- self.linkproblems = []
- self.pageproblems = []
- self.redirectdepth = 0
- self.redirectlist = None
- self._ischanged = False
-
- def __checkurl(self, url):
- """Check to see if the url is formatted properly, correct formatting
- if possible and log an error in the formatting to the current page."""
- # search for spaces in the url
- if _spacepattern.search(url):
- self.add_pageproblem('link contains unescaped spaces: %s' % url)
- # replace spaces by %20
- url = _spacepattern.sub('%20', url)
- # find anchor part
- try:
- # get the anchor
- anchor = _anchorpattern.search(url).group(1)
- # get link for url we link to
- child = self.site.get_link(url)
- # store anchor
- child.add_reqanchor(self, anchor)
- except AttributeError:
- # ignore problems lookup up anchor
- pass
- return url
-
- def __tolink(self, link):
- """Convert the link to a link object, either it is already a link,
- a link object is returned from the database or a new link is
- created. This returns None for empty strings."""
- # ignore if child is empty string
- if link == '' or link == u'':
- return None
- if type(link) is unicode and self.encoding:
- # convert url to binary if passed as unicode
- link = link.encode(self.encoding)
- # convert the url to a link object if we were called with a url
- if type(link) is unicode or type(link) is str:
- link = self.site.get_link(self.__checkurl(link))
- # re're done
- return link
- def add_child(self, child):
- """Add a link object to the child relation of this link.
- The reverse relation is also made."""
- # ignore children for external links
- if not self.isinternal:
- return
- # convert to link object
- child = self.__tolink(child)
- if child is None:
- return
- # add to children
- if child not in self.children:
- self.children.add(child)
- self._ischanged = True
- # add self to parents of child
- if self not in child.parents:
- child.parents.add(self)
-
- def add_embed(self, link):
- """Mark the given link object as used as an image on this page."""
- # ignore embeds for external links
- if not self.isinternal:
- return
- # convert to link object
- link = self.__tolink(link)
- if link is None:
- return
- # add to embedded
- if link not in self.embedded:
- self.embedded.add(link)
- self._ischanged = True
- # add self to parents of embed
- if self not in link.parents:
- link.parents.add(self)
-
- def add_anchor(self, anchor):
- """Indicate that this page contains the specified anchor."""
- # lowercase anchor
- anchor = anchor.lower()
- # add anchor
- if anchor in self.anchors:
- self.add_pageproblem(
- 'anchor/id "%(anchor)s" defined multiple times'
- % { 'anchor': anchor })
- else:
- self.anchors.add(anchor)
- self._ischanged = True
-
- def add_reqanchor(self, parent, anchor):
- """Indicate that the specified link contains a reference to the
- specified anchor. This can be checked later."""
- # lowercase anchor
- anchor = anchor.lower()
- # convert the url to a link object if we were called with a url
- parent = self.__tolink(parent)
- # add anchor
- if anchor in self.reqanchors:
- if parent not in self.reqanchors[anchor]:
- self.reqanchors[anchor].add(parent)
- self._ischanged = True
- else:
- self.reqanchors[anchor] = set([parent])
- self._ischanged = True
-
- def redirect(self, url):
- """Indicate that this link redirects to the specified url. Maximum
- redirect counting is done as well as loop detection."""
- # figure out depth and urls that have been visited in this
- # redirect list
- redirectdepth = 0
- redirectlist = set()
- for parent in self.parents:
- if parent.redirectdepth > redirectdepth:
- redirectdepth = parent.redirectdepth
- redirectlist = parent.redirectlist
- self.redirectdepth = redirectdepth + 1
- self.redirectlist = redirectlist
- self.redirectlist.add(self.url)
- # check depth
- if self.redirectdepth >= config.REDIRECT_DEPTH:
- self.add_linkproblem('too many redirects (%d)' % self.redirectdepth)
- return None
- # check for redirect to self
- url = self.__checkurl(url)
- if url == self.url:
- self.add_linkproblem('redirect same as source: %s' % url)
- return None
- # check for redirect loop
- if url in self.redirectlist:
- self.add_linkproblem('redirect loop %s' % url)
- # add child
- self.add_child(url)
-
- def add_linkproblem(self, problem):
- """Indicate that something went wrong while retreiving this link."""
- self.linkproblems.append(problem)
- self._ischanged = True
-
- def add_pageproblem(self, problem):
- """Indicate that something went wrong with parsing the document."""
- # only think about problems on internal pages
- if not self.isinternal:
- return
- # only include a single problem once (e.g. multiple anchors)
- if problem not in self.pageproblems:
- self.pageproblems.append(problem)
- self._ischanged = True
-
- def fetch(self):
- """Attempt to fetch the url (if isyanked is not True) and fill in link
- attributes (based on isinternal)."""
- debugio.info(' %s' % self.url)
- # fully ignore links that should not be feteched
- if self.isyanked:
- debugio.info(' ' + self.isyanked)
- return
+ def fetch(self, link):
+ """Attempt to fetch the url (if not yanked) and fill in link
+ attributes (based on is_internal)."""
+ debugio.info(' %s' % link.url)
+ # mark the link as fetched to avoid loops
+ link.fetched = datetime.datetime.now()
# see if we can import the proper module for this scheme
try:
# FIXME: if an URI has a username:passwd add the uri, username and password to the HTTPPasswordMgr
- request = urllib2.Request(self.url)
- if self.parents:
- request.add_header('Referer', iter(self.parents).next().url)
+ request = urllib2.Request(link.url)
+ if link.parents:
+ request.add_header('Referer', iter(link.parents).next().url)
response = urllib2.urlopen(request)
- self.mimetype = response.info().gettype()
- self.set_encoding(response.info().getencoding())
+ link.mimetype = response.info().gettype()
+ link.set_encoding(response.headers.getparam('charset'))
# FIXME: get result code and other stuff
- self.status = str(response.code)
+ link.status = str(response.code)
# link.size = int(response.getheader('Content-length'))
# link.mtime = time.mktime(response.msg.getdate('Last-Modified'))
# if response.status == 301: link.add_linkproblem(str(response.status)+': '+response.reason)
# elif response.status != 200: link.add_linkproblem(str(response.status)+': '+response.reason)
# TODO: add checking for size
+ return response
except RedirectError, e:
- self.status = str(e.code)
+ link.status = str(e.code)
debugio.info(' ' + str(e))
if e.code == 301:
- self.add_linkproblem(str(e))
- self.redirect(e.newurl)
+ link.add_linkproblem(str(e))
+ link.add_redirect(e.newurl)
return
except urllib2.HTTPError, e:
- self.status = str(e.code)
+ link.status = str(e.code)
debugio.info(' ' + str(e))
- self.add_linkproblem(str(e))
+ link.add_linkproblem(str(e))
return
except urllib2.URLError, e:
debugio.info(' ' + str(e))
- self.add_linkproblem(str(e))
+ link.add_linkproblem(str(e))
return
except KeyboardInterrupt:
# handle this in a higher-level exception handler
raise
except Exception, e:
# handle all other exceptions
- debugio.warn('unknown exception caught: '+str(e))
- self.add_linkproblem('error reading HTTP response: '+str(e))
+ debugio.warn('unknown exception caught: ' + str(e))
+ link.add_linkproblem('error reading HTTP response: %s' % str(e))
import traceback
traceback.print_exc()
return
- finally:
- self.isfetched = True
- self._ischanged = True
+
+ def parse(self, link, response):
+ """Parse the fetched response."""
# find a parser for the content-type
- parsermodule = parsers.get_parsermodule(self.mimetype)
+ parsermodule = parsers.get_parsermodule(link.mimetype)
if parsermodule is None:
- debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % self.mimetype)
+ debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % link.mimetype)
return
# skip parsing of content if we were returned nothing
content = response.read()
@@ -610,69 +342,52 @@
# parse the content
debugio.debug('crawler.Link.fetch(): parsing using %s' % parsermodule.__name__)
try:
- parsermodule.parse(content, self)
+ parsermodule.parse(content, link)
except Exception, e:
- self.add_pageproblem('problem parsing page: ' + str(e))
- debugio.warn('problem parsing page: ' + str(e))
import traceback
traceback.print_exc()
+ debugio.warn('problem parsing page: ' + str(e))
+ link.add_pageproblem('problem parsing page: ' + str(e))
- def follow_link(self, visited=set()):
- """If this link represents a redirect return the redirect target,
- otherwise return self. If this redirect does not find a referenced
- link None is returned."""
- # if this is not a redirect just return
- if self.redirectdepth == 0:
- return self
- # if we don't know where this redirects, return None
- if len(self.children) == 0:
- return None
- # the first (and only) child is the redirect target
- visited.add(self)
- # check for loops
- child = self.children.copy().pop()
- if child in visited:
- return None
- # check where we redirect to
- return child.follow_link(visited)
-
- def _pagechildren(self):
- """Determin the page children of this link, combining the children of
- embedded items and following redirects."""
- # if we already have pagechildren defined we're done
- if self.pagechildren is not None:
- return self.pagechildren
- self.pagechildren = set()
- # add my own children, following redirects
- for child in self.children:
- # follow redirects
- child = child.follow_link()
- # skip children we already have
- if child is None:
+ def postprocess(self):
+ """Do some basic post processing of the collected data, including
+ depth calculation of every link."""
+ # get a database session
+ session = db.Session()
+ # build the list of urls that were set up with add_internal() that
+ # do not have a parent (they form the base for the site)
+ for url in self._internal_urls:
+ link = self.get_link(session, url).follow_link()
+ if not link:
+ debugio.warn('base link %s redirects to nowhere' % url)
continue
- # set depth of child if it is not already set
- if child.depth is None:
- child.depth = self.depth+1
- # add child pages to out pagechildren
- if child.ispage:
- self.pagechildren.add(child)
- # add my embedded element's children
- for embed in self.embedded:
- # set depth of embed if it is not already set
- if embed.depth is None:
- embed.depth = self.depth
- # merge in children of embeds
- self.pagechildren.update(embed._pagechildren())
- # return the results
- return self.pagechildren
-
- def set_encoding(self, encoding):
- """Set the encoding of the link doing some basic checks
- to see if the encoding is supported."""
- if self.encoding is None and encoding is not None:
- try:
- debugio.debug('crawler.Link.set_encoding("'+str(encoding)+'")')
- unicode('just some random text', encoding, 'replace')
- self.encoding = encoding
- except Exception:
- self.add_pageproblem('unknown encoding: ' + str(encoding))
+ # add the link to bases
+ debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
+ self.bases.append(link)
+ # if we got no bases, just use the first internal one
+ if not self.bases:
+ link = session.query(db.Link).filter(db.Link.is_internal == True).first()
+ debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % link.url)
+ self.bases.append(link)
+ # do a breadth first traversal of the website to determine depth and
+ # figure out page children
+ session.query(db.Link).update(dict(depth=None), synchronize_session=False)
+ session.commit()
+ depth = 0
+ count = len(self.bases)
+ for link in self.bases:
+ link.depth = 0
+ session.commit()
+ debugio.debug('crawler.postprocess(): %d links at depth 0' % count)
+ while count > 0:
+ # update the depth of all links without a depth that have a
+ # parent with the previous depth
+ qry = session.query(db.Link).filter(db.Link.depth == None)
+ qry = qry.filter(db.Link.linked_from.any(db.Link.depth == depth))
+ count = qry.update(dict(depth=depth + 1), synchronize_session=False)
+ session.commit()
+ depth += 1
+ debugio.debug('crawler.postprocess(): %d links at depth %d' % (count, depth))
+ # TODO: also handle embeds
+ # make the list of links (and session) available to the plugins
+ self.links = session.query(db.Link)
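
Note that postprocess() now leaves a live SQLAlchemy query on the crawler object (self.links), so report plugins can filter and sort in the database instead of walking a Python dict. A usage sketch (hypothetical report snippet, assuming crawl() and postprocess() have already run on a Site instance called site):

# list a few internal pages, ordered by URL
internal_pages = site.links.filter_by(is_internal=True, is_page=True)
for link in internal_pages.order_by('url')[:10]:
    print link.url, link.title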
Added: webcheck/db.py
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ webcheck/db.py Thu Aug 4 21:46:26 2011 (r421)
@@ -0,0 +1,252 @@
+
+# db.py - database access layer for webcheck
+#
+# Copyright (C) 2011 Arthur de Jong
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# The files produced as output from the software do not automatically fall
+# under the copyright of the software, unless explicitly stated otherwise.
+
+import urlparse
+
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, ForeignKey
+from sqlalchemy.orm import relationship, backref, sessionmaker
+from sqlalchemy.orm.session import object_session
+from sqlalchemy.sql.expression import ClauseElement
+
+import config
+import debugio
+import myurllib
+
+
+# provide session and schema classes
+Session = sessionmaker()
+Base = declarative_base()
+
+
+children = Table(
+ 'children', Base.metadata,
+ Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE')),
+ Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ )
+
+
+embedded = Table(
+ 'embedded', Base.metadata,
+ Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE')),
+ Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ )
+
+
+class Link(Base):
+
+ __tablename__ = 'links'
+
+ id = Column(Integer, primary_key=True)
+ url = Column(String, index=True, nullable=False, unique=True)
+ fetched = Column(DateTime, index=True)
+ is_internal = Column(Boolean, index=True)
+ yanked = Column(String, index=True)
+ depth = Column(Integer)
+
+ # information about the retrieved link
+ status = Column(String)
+ mimetype = Column(String)
+ mimetype = Column(String)
+ encoding = Column(String)
+ size = Column(Integer)
+ mtime = Column(DateTime)
+ is_page = Column(Boolean, index=True)
+ title = Column(String)
+ author = Column(String)
+
+ # relationships between links
+ children = relationship('Link', secondary=children,
+ backref=backref('linked_from', collection_class=set),
+ primaryjoin=(id == children.c.parent_id),
+ secondaryjoin=(id == children.c.child_id),
+ collection_class=set)
+ embedded = relationship('Link', secondary=embedded,
+ backref=backref('embedded_in', collection_class=set),
+ primaryjoin=(id == embedded.c.parent_id),
+ secondaryjoin=(id == embedded.c.child_id),
+ collection_class=set)
+
+ # crawling information
+ redirectdepth = Column(Integer, default=0)
+
+ @staticmethod
+ def clean_url(url):
+ # normalise the URL, removing the fragment from the URL
+ url = myurllib.normalizeurl(url)
+ (scheme, netloc, path, query) = urlparse.urlsplit(url)[0:4]
+ return urlparse.urlunsplit((scheme, netloc, path, query, ''))
+
+ def _get_link(self, url):
+ """Get a link object for the specified URL."""
+ # get the session
+ session = object_session(self)
+ # try to find the URL
+ url = self.clean_url(url)
+ instance = session.query(Link).filter_by(url=url).first()
+ if not instance:
+ instance = Link(url=url)
+ session.add(instance)
+ return instance
+
+ def set_encoding(self, encoding):
+ """Set the encoding of the link doing some basic checks to see if
+ the encoding is supported."""
+ if not self.encoding and encoding:
+ try:
+ debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
+ unicode('just some random text', encoding, 'replace')
+ self.encoding = encoding
+ except Exception, e:
+ import traceback
+ traceback.print_exc()
+ self.add_pageproblem('unknown encoding: %s' % encoding)
+
+ def add_redirect(self, url):
+ """Indicate that this link redirects to the specified url."""
+ url = self.clean_url(url)
+ # figure out depth
+ self.redirectdepth = max([self.redirectdepth] +
+ [x.redirectdepth for x in self.parents]) + 1
+ # check depth
+ if self.redirectdepth >= config.REDIRECT_DEPTH:
+ self.add_linkproblem('too many redirects (%d)' % self.redirectdepth)
+ return
+ # check for redirect to self
+ if url == self.url:
+ self.add_linkproblem('redirect same as source: %s' % url)
+ return
+ # add child
+ self.add_child(url)
+
+ def add_linkproblem(self, message):
+ """Indicate that something went wrong while retrieving this link."""
+ self.linkproblems.append(LinkProblem(message=message))
+
+ def add_pageproblem(self, message):
+ """Indicate that something went wrong with parsing the document."""
+ # only think about problems on internal pages
+ if not self.is_internal:
+ return
+ # TODO: only include a single problem once (e.g. multiple anchors)
+ self.pageproblems.append(PageProblem(message=message))
+
+ def add_child(self, url):
+ """Add the specified URL as a child of this link."""
+ # ignore children for external links
+ if not self.is_internal:
+ return
+ # add to children
+ self.children.add(self._get_link(url))
+
+ def add_embed(self, url):
+ """Mark the given URL as used as an image on this page."""
+ # ignore embeds for external links
+ if not self.is_internal:
+ return
+ # add to embedded
+ self.embedded.add(self._get_link(url))
+
+ def add_anchor(self, anchor):
+ """Indicate that this page contains the specified anchor."""
+ return # FIXME: implement/update
+ # lowercase anchor
+ anchor = anchor.lower()
+ # add anchor
+ if anchor in self.anchors:
+ self.add_pageproblem(
+ 'anchor/id "%(anchor)s" defined multiple times'
+ % { 'anchor': anchor })
+ else:
+ self.anchors.add(anchor)
+
+ def add_reqanchor(self, parent, anchor):
+ """Indicate that the specified link contains a reference to the
+ specified anchor. This can be checked later."""
+ return # FIXME: implement/update
+ # lowercase anchor
+ anchor = anchor.lower()
+ # convert the url to a link object if we were called with a url
+ parent = self.__tolink(parent)
+ # add anchor
+ if anchor in self.reqanchors:
+ if parent not in self.reqanchors[anchor]:
+ self.reqanchors[anchor].add(parent)
+ else:
+ self.reqanchors[anchor] = set([parent])
+
+ def follow_link(self, visited=None):
+ """If this link represents a redirect return the redirect target,
+ otherwise return self. If this redirect does not find a referenced
+ link None is returned."""
+ # if this is not a redirect just return
+ if not self.redirectdepth:
+ return self
+ # if we don't know where this redirects, return None
+ if not self.children:
+ return None
+ # avoid loops
+ if not visited:
+ visited = set()
+ visited.add(self.url)
+ # the first (and only) child is the redirect target
+ child = list(self.children)[0]
+ if child.url in visited:
+ return None
+ # check where we redirect to
+ return child.follow_link(visited)
+
+ @property
+ def parents(self):
+ return set(self.linked_from).union(self.embedded_in)
+
+
+class LinkProblem(Base):
+ """Storage of problems in the URL itself (e.g. problem downloading the
+ associated resource)."""
+
+ __tablename__ = 'linkproblems'
+
+ id = Column(Integer, primary_key=True)
+ link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ link = relationship(Link, backref=backref('linkproblems', order_by=id,
+ cascade='all,delete,delete-orphan'))
+ message = Column(String)
+
+ def __unicode__(self):
+ return self.message
+
+
+class PageProblem(Base):
+ """Storage of problems in the information from the retrieved URL (e.g.
+ invalid HTML)."""
+
+ __tablename__ = 'pageproblems'
+
+ id = Column(Integer, primary_key=True)
+ link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ link = relationship(Link, backref=backref('pageproblems', order_by=id,
+ cascade='all,delete,delete-orphan'))
+ message = Column(String)
+
+ def __unicode__(self):
+ return self.message
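
A short usage sketch of the new model, assuming the Session has been bound to an engine as in the sketch near the top; it exercises clean_url() and the children/linked_from relationship that backs the parents property (the URLs are made up for illustration):

session = db.Session()
home = db.Link(url=db.Link.clean_url('http://example.org/#top'))  # fragment is stripped
about = db.Link(url='http://example.org/about')
home.children.add(about)
session.add(home)  # the child link is cascaded into the session
session.commit()
# the linked_from backref (together with embedded_in) feeds the parents property
assert home in about.parents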
Modified: webcheck/parsers/css.py
==============================================================================
--- webcheck/parsers/css.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/css.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# css.py - parser functions for css content
#
-# Copyright (C) 2005, 2006, 2009 Arthur de Jong
+# Copyright (C) 2005, 2006, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -39,16 +39,16 @@
# pattern for matching url(...) in css
_urlpattern = re.compile('url\(["\']?(.*?)["\']?\)')
-def parse(content, link, baseurl=None):
+def parse(content, link, base=None):
"""Parse the specified content and extract information for crawling the
site further."""
- # if no baseurl is specified, get it from the link
- baseurl = link.url
+ # if no base is specified, get it from the link
+ base = base or link.url
# strip out comments from the content
content = _commentpattern.sub('', content)
- # handler @imports
- for i in _importpattern.findall(content):
- link.add_embed(urlparse.urljoin(baseurl, i))
+ # handle @imports
+ for embed in _importpattern.findall(content):
+ link.add_embed(urlparse.urljoin(base, embed))
# handle url()s
- for i in _urlpattern.findall(content):
- link.add_embed(urlparse.urljoin(baseurl, i))
+ for embed in _urlpattern.findall(content):
+ link.add_embed(urlparse.urljoin(base, embed))
Modified: webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/parsers/html/__init__.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/html/__init__.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# html.py - parser functions for html content
#
-# Copyright (C) 2005, 2006, 2007, 2008 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -44,7 +44,7 @@
return u''
# convert to unicode object
if not isinstance(txt, unicode):
- txt = unicode(txt, errors='replace')
+ txt = unicode(txt)
# the output string
out = ''
# loop over the characters of the string
Modified: webcheck/parsers/html/beautifulsoup.py
==============================================================================
--- webcheck/parsers/html/beautifulsoup.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/html/beautifulsoup.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# beautifulsoup.py - parser functions for html content
#
-# Copyright (C) 2007, 2008, 2009 Arthur de Jong
+# Copyright (C) 2007, 2008, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -80,10 +80,10 @@
if refresh and refresh['content']:
try:
child = _refershcontentpattern.search(refresh['content']).group(1)
- link.add_child(urlparse.urljoin(base, child))
except AttributeError:
- # ignore cases where refresh header parsing causes problems
- pass
+ pass # ignore cases where refresh header parsing causes problems
+ else:
+ link.add_child(urlparse.urljoin(base, child))
# <img src="URL">
for img in soup.findAll('img', src=True):
embed = myurllib.normalizeurl(htmlunescape(img['src']).strip())
@@ -180,4 +180,4 @@
if embed:
link.add_embed(urlparse.urljoin(base, embed))
# flag that the link contains a valid page
- link.ispage = True
+ link.is_page = True
Modified: webcheck/parsers/html/calltidy.py
==============================================================================
--- webcheck/parsers/html/calltidy.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/html/calltidy.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# calltidy.py - parser functions for html content
#
-# Copyright (C) 2008 Arthur de Jong
+# Copyright (C) 2008, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,7 +28,7 @@
"""Parse the specified content with tidy and add any errors to the
link."""
# only call tidy on internal pages
- if link.isinternal:
+ if link.is_internal:
t = tidy.parseString(content, **config.TIDY_OPTIONS)
for err in t.errors:
# error messages are escaped so we unescape them
Modified: webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/parsers/html/htmlparser.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/parsers/html/htmlparser.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# html.py - parser functions for html content
#
-# Copyright (C) 2005, 2006, 2007, 2009 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -269,15 +269,15 @@
parser.close()
except Exception, e:
# ignore (but log) all errors
- debugio.debug('parsers.html.htmlparser.parse(): caught exception: '+str(e))
+ debugio.debug('parsers.html.htmlparser.parse(): caught exception: ' + str(e))
# check for parser errors
if parser.errmsg is not None:
- debugio.debug('parsers.html.htmlparser.parse(): problem parsing html: '+parser.errmsg)
+ debugio.debug('parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg)
link.add_pageproblem('problem parsing html: %s' % parser.errmsg)
# dump encoding
debugio.debug('parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding))
# flag that the link contains a valid page
- link.ispage = True
+ link.is_page = True
# save the title
if parser.title is not None:
link.title = _maketxt(parser.title, link.encoding).strip()
Modified: webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/__init__.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2009 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -84,7 +84,7 @@
def _mk_unicode(txt):
"""Returns a unicode instance of the string."""
if not isinstance(txt, unicode):
- txt = unicode(txt, errors='replace')
+ txt = unicode(txt)
return txt
def get_info(link):
@@ -96,15 +96,15 @@
info += 'title: %s\n' % link.title.strip()
if link.author:
info += 'author: %s\n' % link.author.strip()
- if link.isinternal:
+ if link.is_internal:
info += 'internal link'
else:
info += 'external link'
- if link.isyanked:
- if isinstance(link.isyanked, unicode):
- info += ', not checked (%s)\n' % link.isyanked
- if isinstance(link.isyanked, str):
- info += ', not checked (%s)\n' % _mk_unicode(link.isyanked)
+ if link.yanked:
+ if isinstance(link.yanked, unicode):
+ info += ', not checked (%s)\n' % link.yanked
+ if isinstance(link.yanked, str):
+ info += ', not checked (%s)\n' % _mk_unicode(link.yanked)
else:
info += ', not checked\n'
else:
@@ -135,7 +135,7 @@
"""Return an <a>nchor to a url with title. If url is in the Linklist and
is external, insert "class=external" in the <a> tag."""
# try to fetch the link object for this url
- if link.isinternal:
+ if link.is_internal:
cssclass = 'internal'
else:
cssclass = 'external'
@@ -152,7 +152,7 @@
The output is indeted with the specified indent."""
parents = list(link.parents)
# if there are no parents print nothing
- if len(parents) == 0:
+ if not parents:
return
parents.sort(lambda a, b: cmp(a.title, b.title) or cmp(a.url, b.url))
fp.write(
@@ -160,7 +160,7 @@
indent+' referenced from:\n'+
indent+' <ul>\n' )
more = 0
- if len(parents) > config.PARENT_LISTLEN+1:
+ if len(parents) > config.PARENT_LISTLEN + 1:
more = len(parents) - config.PARENT_LISTLEN
parents = parents[:config.PARENT_LISTLEN]
for parent in parents:
Modified: webcheck/plugins/about.py
==============================================================================
--- webcheck/plugins/about.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/about.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,9 +28,11 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'about.html'
+import time
+
import config
import plugins
-import time
+
def generate(site):
"""Output a list of modules, it's authors and it's version to the
@@ -55,7 +57,7 @@
' </p>\n\n'
% { 'version': plugins.htmlescape(config.VERSION),
'time': plugins.htmlescape(time.ctime(time.time())),
- 'numurls': len(site.linkMap),
+ 'numurls': site.links.count(),
'homepage': config.HOMEPAGE } )
# output copyright information
fp.write(
@@ -74,7 +76,7 @@
' particular purpose. See the source for further details.\n'
' </p>\n'
' <p>\n'
- ' Copyright © 1998, 1999, 2002, 2005, 2006, 2007 Albert Hopkins (marduk),\n'
+ ' Copyright © 1998-2011 Albert Hopkins (marduk),\n'
' Mike W. Meyer and Arthur de Jong\n'
' </p>\n'
' <p>\n'
@@ -89,7 +91,7 @@
' notices (see <tt>fancytooltips.js</tt> for details):\n'
' </p>\n'
' <p>\n'
- ' Copyright © 2003, 2005 Stuart Langridge, Paul McLanahan,\n'
+ ' Copyright © 2003-2005 Stuart Langridge, Paul McLanahan,\n'
' Peter Janes, Brad Choate, Dunstan Orchard, Ethan Marcotte,\n'
' Mark Wubben and Victor Kulinski\n'
' </p>\n\n' )
@@ -98,7 +100,7 @@
' <h3>Plugins</h3>\n'
' <ul>\n')
for plugin in config.PLUGINS:
- report = __import__('plugins.'+plugin, globals(), locals(), [plugin])
+ report = __import__('plugins.' + plugin, globals(), locals(), [plugin])
fp.write(
' <li>\n'
' <strong>%s</strong><br />\n'
Modified: webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/plugins/badlinks.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/badlinks.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,16 +28,14 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'badlinks.html'
+import db
import plugins
+
def generate(site):
"""Present the list of bad links to the given file descriptor."""
# find all links with link problems
- links = [ x
- for x in site.linkMap.values()
- if len(x.linkproblems)>0 ]
- # sort list
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter(db.Link.linkproblems.any()).order_by('url')
# present results
fp = plugins.open_html(plugins.badlinks, site)
if not links:
@@ -71,7 +69,7 @@
# add a reference to the problem map
for problem in link.linkproblems:
for parent in link.parents:
- parent.add_pageproblem('bad link: ' + link.url + ': ' + problem)
+ parent.add_pageproblem('bad link: %s: %s' % (link.url, problem))
fp.write(
' </li>\n')
fp.write(
Modified: webcheck/plugins/external.py
==============================================================================
--- webcheck/plugins/external.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/external.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2009 Arthur de Jong
+# Copyright (C) 2005, 2006, 2009, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,17 +28,14 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'external.html'
+import db
import plugins
+
def generate(site):
"""Generate the list of external links to the given file descriptor."""
# get all external links
- links = [ x
- for x in site.linkMap.values()
- if not x.isinternal ]
- # sort list
- # FIXME: use sort(key=....) (adds dependency on python>=2.4)
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter(db.Link.is_internal != True).order_by('url')
# present results
fp = plugins.open_html(plugins.external, site)
if not links:
Modified: webcheck/plugins/images.py
==============================================================================
--- webcheck/plugins/images.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/images.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,21 +28,19 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'images.html'
-import plugins
import re
+from sqlalchemy.sql.expression import or_
+
+import db
+import plugins
+
def generate(site):
"""Output a list of images to the given file descriptor."""
- # this finds all links with a reasonable image-like content-type
- matcher = re.compile('^image/.*$')
# get non-page images that have an image/* mimetype
- links = [ x
- for x in site.linkMap.values()
- if not x.ispage and
- x.mimetype is not None and
- matcher.search(x.mimetype) ]
- # sort list
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter(or_(db.Link.is_page != True, db.Link.is_page == None))
+ links = links.filter(db.Link.mimetype.startswith('image/'))
+ links = links.order_by('url')
# present results
fp = plugins.open_html(plugins.images, site)
if not links:
Modified: webcheck/plugins/new.py
==============================================================================
--- webcheck/plugins/new.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/new.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,25 +28,22 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'new.html'
+import time
+
import config
+import db
import plugins
-import time
-SECS_PER_DAY = 60*60*24
+
+SECS_PER_DAY = 60 * 60 * 24
def generate(site):
"""Output the list of recently modified pages to the specified file
descriptor."""
# the time for which links are considered new
- newtime = time.time()-SECS_PER_DAY*config.REPORT_WHATSNEW_URL_AGE
+ newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE
# get all internal pages that are new
- links = [ x
- for x in site.linkMap.values()
- if x.ispage and
- x.isinternal and
- x.mtime is not None and
- x.mtime > newtime ]
- # sort links
- links.sort(lambda a, b: cmp(b.mtime, a.mtime))
+ links = site.links.filter_by(is_page=True, is_internal=True)
+ links = links.filter(db.Link.mtime > newtime).order_by('-mtime')
# present results
fp = plugins.open_html(plugins.new, site)
if not links:
Modified: webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/plugins/notchkd.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/notchkd.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,15 +28,14 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'notchkd.html'
+import db
import plugins
+
def generate(site):
"""Output the list of not checked pages to the given file descriptor."""
# get all yanked urls
- links = [ x
- for x in site.linkMap.values()
- if x.isyanked ]
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter(db.Link.yanked != None).order_by('url')
# present results
fp = plugins.open_html(plugins.notchkd, site)
if not links:
Modified: webcheck/plugins/notitles.py
==============================================================================
--- webcheck/plugins/notitles.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/notitles.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,17 +28,19 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'notitles.html'
+from sqlalchemy.sql.functions import char_length
+from sqlalchemy.sql.expression import or_
+
+import db
import plugins
+
def generate(site):
"""Output the list of pages without a title to the given file
descriptor."""
# get all internal pages without a title
- links = [ x
- for x in site.linkMap.values()
- if x.ispage and
- x.isinternal and
- (x.title is None or x.title == '') ]
- links.sort(lambda a, b: cmp(a.url, b.url))
+ links = site.links.filter_by(is_page=True, is_internal=True)
+ links = links.filter(or_(char_length(db.Link.title) == 0,
+ db.Link.title ==None)).order_by('url')
# present results
fp = plugins.open_html(plugins.notitles, site)
if not links:
Modified: webcheck/plugins/old.py
==============================================================================
--- webcheck/plugins/old.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/old.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,25 +28,22 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'old.html'
+import time
+
import config
+import db
import plugins
-import time
-SECS_PER_DAY = 60*60*24
+
+SECS_PER_DAY = 60 * 60 * 24
def generate(site):
"""Output the list of outdated pages to the specified file descriptor."""
# the time for which links are considered old
- oldtime = time.time()-SECS_PER_DAY*config.REPORT_WHATSOLD_URL_AGE
+ oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
# get all internal pages that are old
- links = [ x
- for x in site.linkMap.values()
- if x.ispage and
- x.isinternal and
- x.mtime is not None and
- x.mtime < oldtime ]
- # sort links
- links.sort(lambda a, b: cmp(a.mtime, b.mtime))
+ links = site.links.filter_by(is_page=True, is_internal=True)
+ links = links.filter(db.Link.mtime < oldtime).order_by('mtime').all()
# present results
fp = plugins.open_html(plugins.old, site)
if not links:
@@ -65,7 +62,7 @@
' <ul>\n'
% {'old': config.REPORT_WHATSOLD_URL_AGE })
for link in links:
- age = (time.time()-link.mtime)/SECS_PER_DAY
+ age = (time.time() - link.mtime) / SECS_PER_DAY
fp.write(
' <li>\n'
' %(link)s\n'
Modified: webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/problems.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,9 +28,12 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'problems.html'
-import plugins
import urllib
+import db
+import plugins
+
+
def _mk_id(name):
"""Convert the name to a string that may be used inside an
ID attribute."""
@@ -48,10 +51,10 @@
"""Output the overview of problems to the given file descriptor."""
# make a list of problems per author
problem_db = {}
- for link in site.linkMap.values():
- # skip external pages
- if not link.isinternal or len(link.pageproblems) == 0:
- continue
+ # get internal links with page problems
+ links = site.links.filter_by(is_internal=True)
+ links = links.filter(db.Link.pageproblems.any()).order_by('url')
+ for link in links:
# make a normal name for the author
if link.author:
author = link.author.strip()
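
The pageproblems.any() filter above relies on a one-to-many relationship and is
rendered as an SQL EXISTS subquery, replacing the old in-memory test on
len(link.pageproblems). A hedged sketch with hypothetical table and column
names (webcheck's real mapping in db.py is not shown in this mail):

  from sqlalchemy import (Boolean, Column, ForeignKey, Integer, String,
                          create_engine)
  from sqlalchemy.ext.declarative import declarative_base
  from sqlalchemy.orm import relationship, sessionmaker

  Base = declarative_base()

  class Link(Base):
      __tablename__ = 'links'            # names are assumptions
      id = Column(Integer, primary_key=True)
      url = Column(String)
      author = Column(String)
      is_internal = Column(Boolean)
      pageproblems = relationship('PageProblem', backref='link')

  class PageProblem(Base):
      __tablename__ = 'pageproblems'
      id = Column(Integer, primary_key=True)
      link_id = Column(Integer, ForeignKey('links.id'))
      message = Column(String)

  engine = create_engine('sqlite:///:memory:')
  Base.metadata.create_all(engine)
  session = sessionmaker(bind=engine)()

  # only internal links with at least one related problem row are returned
  problem_links = (session.query(Link)
                   .filter_by(is_internal=True)
                   .filter(Link.pageproblems.any())
                   .order_by(Link.url))
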
Modified: webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/plugins/sitemap.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/sitemap.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,42 +28,60 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'index.html'
+from sqlalchemy.sql.expression import or_
+from sqlalchemy.orm.session import object_session
+
import config
+import db
import plugins
-# this is a workaround for Python 2.3
-try:
- set
-except NameError:
- from sets import Set as set
+
+def add_pagechildren(link, children, explored):
+ """Determine the page children of this link, combining the children of
+ embedded items and following redirects."""
+ links = object_session(link).query(db.Link)
+ # get all internal children
+ qry = links.filter(db.Link.linked_from.contains(link))
+ qry = qry.filter(db.Link.is_internal == True)
+ if link.depth:
+        qry = qry.filter(or_(db.Link.depth > link.depth, db.Link.depth == None))
+ #qry = qry.filter(~db.Link.id.in_(explored))
+ # follow redirects
+    children.update(y
+                    for y in (x.follow_link() for x in qry)
+                    if y and y.is_page and y.is_internal and y.id not in explored)
+ explored.update(x.id for x in children)
+ # add embedded element's pagechildren (think frames)
+ for embed in link.embedded:
+ # TODO: put this in a query
+ if embed.is_internal and embed.is_page and \
+ embed.id not in explored and \
+ (embed.depth == None or embed.depth > link.depth):
+ add_pagechildren(embed, children, explored)
def _explore(fp, link, explored, depth=0, indent=' '):
"""Recursively do a breadth first traversal of the graph of links on the
site. Prints the html results to the file descriptor."""
# output this link
- fp.write(indent+'<li>\n')
- fp.write(indent+' '+plugins.make_link(link)+'\n')
+ fp.write(indent + '<li>\n')
+ fp.write(indent + ' ' + plugins.make_link(link) + '\n')
# only check children if we are not too deep yet
if depth <= config.REPORT_SITEMAP_LEVEL:
# figure out the links to follow and ensure that they are only
# explored from here
- children = []
- for child in link.pagechildren:
- # skip pages that have the wrong depth, are not internal or have
- # already been visited
-        if child.depth != depth+1 or not child.isinternal or child in explored:
- continue
- # set child as explored and add to to explore list
- explored.add(child)
- children.append(child)
- # go over the children and present them as a list
- if len(children) > 0:
- fp.write(indent+' <ul>\n')
+ children = set()
+ add_pagechildren(link, children, explored)
+ # remove None which could be there as a result of follow_link()
+ children.discard(None)
+ if children:
+ children = list(children)
+ # present children as a list
+ fp.write(indent + ' <ul>\n')
children.sort(lambda a, b: cmp(a.url, b.url))
for child in children:
- _explore(fp, child, explored, depth+1, indent+' ')
- fp.write(indent+' </ul>\n')
- fp.write(indent+'</li>\n')
+ _explore(fp, child, explored, depth + 1, indent + ' ')
+ fp.write(indent + ' </ul>\n')
+ fp.write(indent + '</li>\n')
def generate(site):
"""Output the sitemap to the specified file descriptor."""
@@ -74,7 +92,7 @@
      '   This is an overview of the crawled site.\n'
' </p>\n'
' <ul>\n' )
- explored = set(site.bases)
+ explored = set(x.id for x in site.bases)
for l in site.bases:
_explore(fp, l, explored)
fp.write(
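
add_pagechildren() above starts new queries from a mapped Link instance via
object_session(), which recovers the Session that loaded the object, so the
helper does not need the session passed in explicitly. A small sketch of that
pattern, assuming a trivial stand-in model rather than webcheck's real one:

  from sqlalchemy import Column, Integer, String, create_engine
  from sqlalchemy.ext.declarative import declarative_base
  from sqlalchemy.orm import sessionmaker
  from sqlalchemy.orm.session import object_session

  Base = declarative_base()

  class Link(Base):
      __tablename__ = 'links'            # hypothetical minimal model
      id = Column(Integer, primary_key=True)
      url = Column(String)

  engine = create_engine('sqlite:///:memory:')
  Base.metadata.create_all(engine)
  session = sessionmaker(bind=engine)()
  session.add(Link(url='http://example.com/'))
  session.commit()

  link = session.query(Link).first()
  # object_session() returns the Session the instance belongs to
  assert object_session(link) is session
  children = (object_session(link).query(Link)
              .filter(Link.url.like('http://example.com/%')))
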
Modified: webcheck/plugins/size.py
==============================================================================
--- webcheck/plugins/size.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/size.py Thu Aug 4 21:46:26 2011 (r421)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -31,6 +31,7 @@
import config
import plugins
+
def _getsize(link, done=None):
"""Return the size of the link and all its embedded links, counting each
link only once."""
@@ -40,7 +41,7 @@
# add this link to the list
done.append(link)
# if we don't known about our total size yet, calculate
- if not hasattr(link, 'totalSize'):
+ if not hasattr(link, 'total_size'):
size = 0
# add our size
if link.size is not None:
@@ -49,19 +50,17 @@
for embed in link.embedded:
if embed not in done:
size += _getsize(embed, done)
- link.totalSize = size
- return link.totalSize
+ link.total_size = size
+ return link.total_size
def generate(site):
"""Output the list of large pages to the given file descriptor."""
# get all internal pages and get big links
- links = [ x
- for x in site.linkMap.values()
- if x.ispage and
- x.isinternal and
- _getsize(x) >= config.REPORT_SLOW_URL_SIZE*1024 ]
+ links = site.links.filter_by(is_page=True, is_internal=True)
+ links = [ x for x in links
+ if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024 ]
# sort links by size (biggest first)
- links.sort(lambda a, b: cmp(b.totalSize, a.totalSize))
+ links.sort(lambda a, b: cmp(b.total_size, a.total_size))
# present results
fp = plugins.open_html(plugins.size, site)
if not links:
@@ -80,7 +79,7 @@
' <ul>\n'
% { 'size': config.REPORT_SLOW_URL_SIZE })
for link in links:
- size = plugins.get_size(link.totalSize)
+ size = plugins.get_size(link.total_size)
fp.write(
' <li>\n'
' %(link)s\n'
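
_getsize() above memoizes the computed total as an extra total_size attribute
on the link object; the attribute is not a mapped column, so it only lives for
the duration of the report run. A stand-alone illustration of that memoization
pattern with plain objects (no ORM involved):

  class Node(object):
      # stand-in for a link with a size and embedded children
      def __init__(self, size, embedded=None):
          self.size = size
          self.embedded = embedded or []

  def total_size(node, done=None):
      done = done if done is not None else []
      done.append(node)
      # cache the result on the instance so repeated lookups do not
      # walk the embedded objects again
      if not hasattr(node, '_total_size'):
          total = node.size or 0
          for embed in node.embedded:
              if embed not in done:
                  total += total_size(embed, done)
          node._total_size = total
      return node._total_size

  image = Node(2048)
  page = Node(1024, [image, image])
  assert total_size(page) == 3072
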
Modified: webcheck/plugins/urllist.py
==============================================================================
--- webcheck/plugins/urllist.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/plugins/urllist.py Thu Aug 4 21:46:26 2011 (r421)
@@ -1,7 +1,7 @@
# urllist.py - plugin to generate a list of visited urls
#
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -28,6 +28,7 @@
import plugins
+
def generate(site):
"""Output a sorted list of urls to the specified file descriptor."""
fp = plugins.open_html(plugins.urllist, site)
@@ -38,10 +39,9 @@
' non-examined urls.\n'
' </p>\n'
' <ol>\n' )
- urls = site.linkMap.keys()
- urls.sort()
- for url in urls:
-        fp.write('   <li>'+plugins.make_link(site.linkMap[url], url)+'</li>\n')
+ links = site.links.order_by('url')
+ for link in links:
+ fp.write(' <li>' + plugins.make_link(link, link.url) + '</li>\n')
fp.write(
' </ol>\n' )
plugins.close_html(fp)
Modified: webcheck/webcheck.py
==============================================================================
--- webcheck/webcheck.py Sat Jun 18 23:26:22 2011 (r420)
+++ webcheck/webcheck.py Thu Aug 4 21:46:26 2011 (r421)
@@ -42,8 +42,8 @@
import crawler
import plugins
import debugio
-import serialize
import monkeypatch
+import db
debugio.loglevel = debugio.INFO
@@ -108,13 +108,16 @@
'ignore-robots',
'quiet', 'silent', 'debug', 'profile', 'output=', 'continue',
'force', 'redirects=', 'userpass=', 'wait=', 'version', 'help'))
+ internal_urls = []
+ external_urls = []
+ yank_urls = []
for flag, arg in optlist:
if flag in ('-i', '--internal'):
- site.add_internal_re(arg)
+ internal_urls.append(arg)
elif flag in ('-x', '--external'):
- site.add_external_re(arg)
+ external_urls.append(arg)
elif flag in ('-y', '--yank'):
- site.add_yanked_re(arg)
+ yank_urls.append(arg)
elif flag in ('-b', '--base-only'):
config.BASE_URLS_ONLY = True
elif flag in ('-a', '--avoid-external'):
@@ -152,6 +155,24 @@
print_usage()
print_tryhelp()
sys.exit(1)
+ # ensure output directory exists
+ if not os.path.isdir(config.OUTPUT_DIR):
+ os.mkdir(config.OUTPUT_DIR)
+ # set up database connection
+ filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
+ from sqlalchemy import create_engine
+ engine = create_engine('sqlite:///' + filename)
+ db.Session.configure(bind=engine)
+ # ensure that all tables are created
+ db.Base.metadata.create_all(engine)
+    # TODO: schema migration goes here
+ # add configuration to site
+ for pattern in internal_urls:
+ site.add_internal_re(pattern)
+ for pattern in external_urls:
+ site.add_external_re(pattern)
+ for pattern in yank_urls:
+ site.add_yanked_re(pattern)
for arg in args:
# if it does not look like a url it is probably a local file
if urlparse.urlsplit(arg)[0] == '':
@@ -218,33 +239,10 @@
def main(site):
"""Main program."""
- # read serialized file
- if config.CONTINUE:
- fname = os.path.join(config.OUTPUT_DIR, 'webcheck.dat')
- debugio.info('reading stored crawler data....')
- try:
- fp = open(fname, 'r')
- site = serialize.deserialize(fp)
- fp.close()
- except IOError, (errno, strerror):
- debugio.error('%(fname)s: %(strerror)s' %
- { 'fname': fname,
- 'strerror': strerror })
- sys.exit(1)
- debugio.info('done.')
- # create seriazlized file
- fp = plugins.open_file('webcheck.dat', makebackup=True)
- serialize.serialize_site(fp, site)
# crawl through the website
debugio.info('checking site....')
- site.crawl(fp) # this will take a while
+ site.crawl() # this will take a while
debugio.info('done.')
- fp.close()
- # serialize the final state again
- fp = plugins.open_file('webcheck.dat', makebackup=True)
- serialize.serialize_site(fp, site)
- serialize.serialize_links(fp, site)
- fp.close()
# do postprocessing (building site structure, etc)
debugio.info('postprocessing....')
site.postprocess()
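
The startup code above binds db.Session to a SQLite engine at run time and
creates the schema with Base.metadata.create_all(). The new db.py itself is not
included in this mail; the calls suggest roughly the following shape, which
should be read as an assumption rather than the actual module:

  # presumed scaffolding of db.py (hypothetical)
  from sqlalchemy import create_engine
  from sqlalchemy.ext.declarative import declarative_base
  from sqlalchemy.orm import scoped_session, sessionmaker

  Base = declarative_base()
  Session = scoped_session(sessionmaker())   # left unbound at import time

  # ... mapped classes such as Link would derive from Base here ...

  if __name__ == '__main__':
      # what webcheck.py does once the output directory is known
      engine = create_engine('sqlite:///webcheck.sqlite')
      Session.configure(bind=engine)        # bind the factory to the engine
      Base.metadata.create_all(engine)      # create any missing tables
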
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits