webcheck commit: r430 - in webcheck: . parsers/html plugins
[Date Prev][Date Next][Thread Prev][Thread Next]
webcheck commit: r430 - in webcheck: . parsers/html plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r430 - in webcheck: . parsers/html plugins
- Date: Fri, 19 Aug 2011 21:44:53 +0200 (CEST)
Author: arthur
Date: Fri Aug 19 21:44:51 2011
New Revision: 430
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=430
Log:
do some performance tuning to ensure that the reports are generated a little
faster
Modified:
webcheck/crawler.py
webcheck/db.py
webcheck/parsers/html/__init__.py
webcheck/plugins/__init__.py
webcheck/plugins/badlinks.py
webcheck/plugins/external.py
webcheck/plugins/new.py
webcheck/plugins/notchkd.py
webcheck/plugins/old.py
webcheck/plugins/problems.py
Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/crawler.py Fri Aug 19 21:44:51 2011 (r430)
@@ -295,8 +295,9 @@
try:
# FIXME: if an URI has a username:passwd add the uri, username and
password to the HTTPPasswordMgr
request = urllib2.Request(link.url)
- if link.parents:
- request.add_header('Referer', iter(link.parents).next().url)
+ parent = link.parents.first()
+ if parent:
+ request.add_header('Referer', parent.url)
response = urllib2.urlopen(request)
link.mimetype = response.info().gettype()
link.set_encoding(response.headers.getparam('charset'))
Modified: webcheck/db.py
==============================================================================
--- webcheck/db.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/db.py Fri Aug 19 21:44:51 2011 (r430)
@@ -23,10 +23,11 @@
import urlparse
from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import distinct, func
from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime,
ForeignKey
from sqlalchemy.orm import relationship, backref, sessionmaker
from sqlalchemy.orm.session import object_session
-from sqlalchemy.sql.expression import ClauseElement
+from sqlalchemy.sql.expression import ClauseElement, union
import config
import debugio
@@ -40,15 +41,15 @@
children = Table(
'children', Base.metadata,
- Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE')),
- Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE'),
index=True),
+ Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'),
index=True)
)
embedded = Table(
'embedded', Base.metadata,
- Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE')),
- Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE'),
index=True),
+ Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'),
index=True)
)
@@ -68,9 +69,9 @@
mimetype = Column(String)
encoding = Column(String)
size = Column(Integer)
- mtime = Column(DateTime)
+ mtime = Column(DateTime, index=True)
is_page = Column(Boolean, index=True)
- title = Column(String)
+ title = Column(String, index=True)
author = Column(String)
# relationships between links
@@ -212,8 +213,24 @@
return child.follow_link(visited)
@property
+ def count_parents(self):
+ session = object_session(self)
+ p1 =
session.query(func.count(distinct(children.c.parent_id))).filter(children.c.child_id
== self.id)
+ p2 =
session.query(func.count(distinct(embedded.c.parent_id))).filter(embedded.c.child_id
== self.id)
+ return p1.scalar() + p2.scalar()
+
+ @property
def parents(self):
- return set(self.linked_from).union(self.embedded_in)
+ session = object_session(self)
+ #links = object_session(self).query(Link)
+ #links = links.join(children, Link.id == children.c.parent_id)
+ #links = links.join(embedded, Link.id == embedded.c.parent_id)
+ #return links.filter((children.c.child_id == self.id) |
+ # (embedded.c.child_id == self.id)).distinct()
+ parent_ids =
union(session.query(children.c.parent_id).filter(children.c.child_id ==
self.id),
+
session.query(embedded.c.parent_id).filter(embedded.c.child_id == self.id))
+
+ return session.query(Link).filter(Link.id ==
parent_ids.c.children_parent_id).distinct()
class LinkProblem(Base):
@@ -223,10 +240,10 @@
__tablename__ = 'linkproblems'
id = Column(Integer, primary_key=True)
- link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
- link = relationship(Link, backref=backref('linkproblems',
- cascade='all,delete,delete-orphan', lazy='dynamic'))
- message = Column(String)
+ link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'),
index=True)
+ message = Column(String, index=True)
+ link = relationship(Link, backref=backref('linkproblems', order_by=message,
+ cascade='all,delete,delete-orphan'))
def __unicode__(self):
return self.message
@@ -239,10 +256,10 @@
__tablename__ = 'pageproblems'
id = Column(Integer, primary_key=True)
- link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
- link = relationship(Link, backref=backref('pageproblems',
- cascade='all,delete,delete-orphan', lazy='dynamic'))
- message = Column(String)
+ link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'),
index=True)
+ message = Column(String, index=True)
+ link = relationship(Link, backref=backref('pageproblems', order_by=message,
+ cascade='all,delete,delete-orphan'))
def __unicode__(self):
return self.message
@@ -254,7 +271,7 @@
__tablename__ = 'anchors'
id = Column(Integer, primary_key=True)
- link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'),
index=True)
link = relationship(Link, backref=backref('anchors',
lazy='dynamic',
cascade='all,delete,delete-orphan'))
@@ -270,12 +287,12 @@
__tablename__ = 'reqanchors'
id = Column(Integer, primary_key=True)
- link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'),
index=True)
link = relationship(Link, backref=backref('reqanchors',
lazy='dynamic',
cascade='all,delete,delete-orphan',
), primaryjoin='Link.id == RequestedAnchor.link_id')
- parent_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+ parent_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'),
index=True)
parent = relationship(Link, primaryjoin='Link.id ==
RequestedAnchor.parent_id')
anchor = Column(String)
Modified: webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/parsers/html/__init__.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/parsers/html/__init__.py Fri Aug 19 21:44:51 2011 (r430)
@@ -36,10 +36,9 @@
_entitypattern = re.compile('&(#[0-9]{1,6}|[a-zA-Z]{2,10});')
-def htmlescape(txt, inattr=False):
+def htmlescape(txt):
"""HTML escape the given string and return an ASCII clean string with
- known entities and character entities for the other values.
- If the inattr parameter is set quotes and newlines will also be escaped."""
+ known entities and character entities for the other values."""
# check for empty string
if not txt:
return u''
@@ -50,17 +49,10 @@
out = ''
# loop over the characters of the string
for c in txt:
- if c == '"':
- if inattr:
- out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
- else:
- out += '"'
- elif ord(c) in htmlentitydefs.codepoint2name:
+ if ord(c) in htmlentitydefs.codepoint2name:
out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
elif ord(c) > 126:
out += '&#%d;' % ord(c)
- elif inattr and c == u'\n':
- out += ' '
else:
out += c.encode('utf-8')
return out
Modified: webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/plugins/__init__.py Fri Aug 19 21:44:51 2011 (r430)
@@ -43,25 +43,21 @@
Pluings can use the functions exported by this module."""
+import sys
+import time
+
+from sqlalchemy.orm import joinedload
from sqlalchemy.orm.session import object_session
-import sys
-import debugio
import config
-import time
+import db
+import debugio
import parsers.html
# reference function from html module
htmlescape = parsers.html.htmlescape
-def get_title(link):
- """Returns the title of a link if it is set otherwise returns url."""
- if link.title is None or link.title == '':
- return link.url
- return link.title
-
-
def _floatformat(f):
"""Return a float as a string while trying to keep it within three
characters."""
@@ -87,54 +83,43 @@
return '%d' % i
-def _mk_unicode(txt):
- """Returns a unicode instance of the string."""
- if not isinstance(txt, unicode):
- txt = unicode(txt)
- return txt
-
-
-def get_info(link):
+def _get_info(link):
"""Return a string with a summary of the information in the link."""
- info = u'url: %s\n' % _mk_unicode(link.url)
+ info = u'url: %s\n' % link.url
if link.status:
- info += '%s\n' % _mk_unicode(link.status)
+ info += u'%s\n' % link.status
if link.title:
- info += 'title: %s\n' % link.title.strip()
+ info += u'title: %s\n' % link.title.strip()
if link.author:
- info += 'author: %s\n' % link.author.strip()
+ info += u'author: %s\n' % link.author.strip()
if link.is_internal:
- info += 'internal link'
+ info += u'internal link'
else:
- info += 'external link'
+ info += u'external link'
if link.yanked:
- if isinstance(link.yanked, unicode):
- info += ', not checked (%s)\n' % link.yanked
- if isinstance(link.yanked, str):
- info += ', not checked (%s)\n' % _mk_unicode(link.yanked)
- else:
- info += ', not checked\n'
+ info += u', not checked (%s)\n' % link.yanked
else:
- info += '\n'
- if link.redirectdepth > 0:
+ info += u'\n'
+ if link.redirectdepth:
if link.children.count() > 0:
- info += 'redirect: %s\n' % _mk_unicode(link.children.first().url)
+ info += u'redirect: %s\n' % link.children.first().url
else:
- info += 'redirect (not followed)\n'
- if len(link.parents) == 1:
- info += 'linked from 1 page\n'
- elif len(link.parents) > 1:
- info += 'linked from %d pages\n' % len(link.parents)
+ info += u'redirect (not followed)\n'
+ count = link.count_parents
+ if count == 1:
+ info += u'linked from 1 page\n'
+ elif count > 1:
+ info += u'linked from %d pages\n' % count
if link.mtime:
- info += 'last modified: %s\n' % time.ctime(link.mtime)
+ info += u'last modified: %s\n' % time.ctime(link.mtime)
if link.size:
- info += 'size: %s\n' % get_size(link.size)
+ info += u'size: %s\n' % get_size(link.size)
if link.mimetype:
- info += 'mime-type: %s\n' % _mk_unicode(link.mimetype)
+ info += u'mime-type: %s\n' % link.mimetype
if link.encoding:
- info += 'encoding: %s\n' % _mk_unicode(link.encoding)
+ info += u'encoding: %s\n' % link.encoding
for problem in link.linkproblems:
- info += 'problem: %s\n' % _mk_unicode(problem)
+ info += u'problem: %s\n' % problem.message
# trim trailing newline
return info.strip()
@@ -142,41 +127,27 @@
def make_link(link, title=None):
"""Return an <a>nchor to a url with title. If url is in the Linklist and
is external, insert "class=external" in the <a> tag."""
- # try to fetch the link object for this url
- if link.is_internal:
- cssclass = 'internal'
- else:
- cssclass = 'external'
- if title is None:
- title = get_title(link)
- target = ''
- if config.REPORT_LINKS_IN_NEW_WINDOW:
- target = 'target="_blank" '
- # gather some information about the link to report
return '<a href="%(url)s" %(target)sclass="%(cssclass)s"
title="%(info)s">%(title)s</a>' % \
- dict(url=htmlescape(link.url, True),
- target=target,
- cssclass=cssclass,
- info=htmlescape(get_info(link), True),
- title=htmlescape(title))
+ dict(url=htmlescape(link.url),
+ target='target="_blank" ' if
config.REPORT_LINKS_IN_NEW_WINDOW else '',
+ cssclass='internal' if link.is_internal else 'external',
+ info=htmlescape(_get_info(link)).replace('\n', ' '),
+ title=htmlescape(title or link.title or link.url))
def print_parents(fp, link, indent=' '):
"""Write a list of parents to the output file descriptor.
The output is indeted with the specified indent."""
- parents = list(link.parents)
# if there are no parents print nothing
- if not parents:
+ count = link.count_parents
+ if not count:
return
- parents.sort(lambda a, b: cmp(a.title, b.title) or cmp(a.url, b.url))
+ parents = link.parents.order_by(db.Link.title,
db.Link.url).options(joinedload(db.Link.linkproblems))[:config.PARENT_LISTLEN]
fp.write(
indent + '<div class="parents">\n' +
indent + ' referenced from:\n' +
indent + ' <ul>\n')
more = 0
- if len(parents) > config.PARENT_LISTLEN + 1:
- more = len(parents) - config.PARENT_LISTLEN
- parents = parents[:config.PARENT_LISTLEN]
for parent in parents:
fp.write(
indent + ' <li>%(parent)s</li>\n'
@@ -283,7 +254,7 @@
' </head>\n'
' <body>\n'
' <h1 class="basename">Webcheck report for <a
href="%(siteurl)s">%(sitetitle)s</a></h1>\n'
- % {'sitetitle': htmlescape(get_title(base)),
+ % {'sitetitle': htmlescape(base.title or base.url),
'plugintitle': htmlescape(plugin.__title__),
'siteurl': base.url,
'version': config.VERSION})
Modified: webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/plugins/badlinks.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/plugins/badlinks.py Fri Aug 19 21:44:51 2011 (r430)
@@ -28,6 +28,8 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'badlinks.html'
+from sqlalchemy.orm import joinedload
+
import db
import plugins
@@ -35,7 +37,7 @@
def generate(site):
"""Present the list of bad links to the given file descriptor."""
# find all links with link problems
- links = site.links.filter(db.Link.linkproblems.any()).order_by(db.Link.url)
+ links =
site.links.filter(db.Link.linkproblems.any()).order_by(db.Link.url).options(joinedload(db.Link.linkproblems))
# present results
fp = plugins.open_html(plugins.badlinks, site)
if not links:
Modified: webcheck/plugins/external.py
==============================================================================
--- webcheck/plugins/external.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/plugins/external.py Fri Aug 19 21:44:51 2011 (r430)
@@ -28,6 +28,8 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'external.html'
+from sqlalchemy.orm import joinedload
+
import db
import plugins
@@ -51,7 +53,7 @@
' examination of the website.'
' </p>\n'
' <ol>\n')
- for link in links:
+ for link in links.options(joinedload(db.Link.linkproblems)):
fp.write(
' <li>\n'
' %(link)s\n'
Modified: webcheck/plugins/new.py
==============================================================================
--- webcheck/plugins/new.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/plugins/new.py Fri Aug 19 21:44:51 2011 (r430)
@@ -47,7 +47,7 @@
links = links.filter(db.Link.mtime >
newtime).order_by(db.Link.mtime.desc())
# present results
fp = plugins.open_html(plugins.new, site)
- if not links:
+ if not links.count():
fp.write(
' <p class="description">\n'
' No pages were found that were modified within the last %(new)d
days.\n'
Modified: webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/plugins/notchkd.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/plugins/notchkd.py Fri Aug 19 21:44:51 2011 (r430)
@@ -28,6 +28,8 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'notchkd.html'
+from sqlalchemy.orm import joinedload
+
import db
import plugins
@@ -38,7 +40,7 @@
links = site.links.filter(db.Link.yanked != None).order_by(db.Link.url)
# present results
fp = plugins.open_html(plugins.notchkd, site)
- if not links:
+ if not links.count():
fp.write(
' <p class="description">\n'
' All links have been checked.\n'
@@ -51,7 +53,7 @@
' at all during the examination of the website.\n'
' </p>\n'
' <ol>\n')
- for link in links:
+ for link in links.options(joinedload(db.Link.linkproblems)):
fp.write(
' <li>\n'
' %(link)s\n'
Modified: webcheck/plugins/old.py
==============================================================================
--- webcheck/plugins/old.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/plugins/old.py Fri Aug 19 21:44:51 2011 (r430)
@@ -44,10 +44,10 @@
oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
# get all internal pages that are old
links = site.links.filter_by(is_page=True, is_internal=True)
- links = links.filter(db.Link.mtime < oldtime).order_by(db.Link.mtime).all()
+ links = links.filter(db.Link.mtime < oldtime).order_by(db.Link.mtime)
# present results
fp = plugins.open_html(plugins.old, site)
- if not links:
+ if not links.count():
fp.write(
' <p class="description">\n'
' No pages were found that were older than %(old)d days old.\n'
Modified: webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py Fri Aug 19 21:28:54 2011 (r429)
+++ webcheck/plugins/problems.py Fri Aug 19 21:44:51 2011 (r430)
@@ -112,7 +112,7 @@
' <ul class="problems">\n'
% {'link': plugins.make_link(link)})
# list the problems
- for problem in link.pageproblems.order_by(db.PageProblem.message):
+ for problem in link.pageproblems:
fp.write(
' <li>%(problem)s</li>\n'
% {'problem': plugins.htmlescape(problem)})
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits
- webcheck commit: r430 - in webcheck: . parsers/html plugins, Commits of the webcheck project