lists.arthurdejong.org
RSS feed

webcheck commit: r430 - in webcheck: . parsers/html plugins

[Date Prev][Date Next] [Thread Prev][Thread Next]

webcheck commit: r430 - in webcheck: . parsers/html plugins



Author: arthur
Date: Fri Aug 19 21:44:51 2011
New Revision: 430
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=430

Log:
do some performance tuning to ensure that the reports are generated a little 
faster

Modified:
   webcheck/crawler.py
   webcheck/db.py
   webcheck/parsers/html/__init__.py
   webcheck/plugins/__init__.py
   webcheck/plugins/badlinks.py
   webcheck/plugins/external.py
   webcheck/plugins/new.py
   webcheck/plugins/notchkd.py
   webcheck/plugins/old.py
   webcheck/plugins/problems.py

Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/crawler.py Fri Aug 19 21:44:51 2011        (r430)
@@ -295,8 +295,9 @@
         try:
             # FIXME: if an URI has a username:passwd add the uri, username and 
password to the HTTPPasswordMgr
             request = urllib2.Request(link.url)
-            if link.parents:
-                request.add_header('Referer', iter(link.parents).next().url)
+            parent = link.parents.first()
+            if parent:
+                request.add_header('Referer', parent.url)
             response = urllib2.urlopen(request)
             link.mimetype = response.info().gettype()
             link.set_encoding(response.headers.getparam('charset'))

Modified: webcheck/db.py
==============================================================================
--- webcheck/db.py      Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/db.py      Fri Aug 19 21:44:51 2011        (r430)
@@ -23,10 +23,11 @@
 import urlparse
 
 from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import distinct, func
 from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, 
ForeignKey
 from sqlalchemy.orm import relationship, backref, sessionmaker
 from sqlalchemy.orm.session import object_session
-from sqlalchemy.sql.expression import ClauseElement
+from sqlalchemy.sql.expression import ClauseElement, union
 
 import config
 import debugio
@@ -40,15 +41,15 @@
 
 children = Table(
     'children', Base.metadata,
-    Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE')),
-    Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'))
+    Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), 
index=True),
+    Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), 
index=True)
     )
 
 
 embedded = Table(
     'embedded', Base.metadata,
-    Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE')),
-    Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'))
+    Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), 
index=True),
+    Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), 
index=True)
     )
 
 
@@ -68,9 +69,9 @@
     mimetype = Column(String)
     encoding = Column(String)
     size = Column(Integer)
-    mtime = Column(DateTime)
+    mtime = Column(DateTime, index=True)
     is_page = Column(Boolean, index=True)
-    title = Column(String)
+    title = Column(String, index=True)
     author = Column(String)
 
     # relationships between links
@@ -212,8 +213,24 @@
         return child.follow_link(visited)
 
     @property
+    def count_parents(self):
+        session = object_session(self)
+        p1 = 
session.query(func.count(distinct(children.c.parent_id))).filter(children.c.child_id
 == self.id)
+        p2 = 
session.query(func.count(distinct(embedded.c.parent_id))).filter(embedded.c.child_id
 == self.id)
+        return p1.scalar() + p2.scalar()
+
+    @property
     def parents(self):
-        return set(self.linked_from).union(self.embedded_in)
+        session = object_session(self)
+        #links = object_session(self).query(Link)
+        #links = links.join(children, Link.id == children.c.parent_id)
+        #links = links.join(embedded, Link.id == embedded.c.parent_id)
+        #return links.filter((children.c.child_id == self.id) |
+        #                    (embedded.c.child_id == self.id)).distinct()
+        parent_ids = 
union(session.query(children.c.parent_id).filter(children.c.child_id == 
self.id),
+                           
session.query(embedded.c.parent_id).filter(embedded.c.child_id == self.id))
+
+        return session.query(Link).filter(Link.id == 
parent_ids.c.children_parent_id).distinct()
 
 
 class LinkProblem(Base):
@@ -223,10 +240,10 @@
     __tablename__ = 'linkproblems'
 
     id = Column(Integer, primary_key=True)
-    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
-    link = relationship(Link, backref=backref('linkproblems',
-                        cascade='all,delete,delete-orphan', lazy='dynamic'))
-    message = Column(String)
+    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), 
index=True)
+    message = Column(String, index=True)
+    link = relationship(Link, backref=backref('linkproblems', order_by=message,
+                        cascade='all,delete,delete-orphan'))
 
     def __unicode__(self):
         return self.message
@@ -239,10 +256,10 @@
     __tablename__ = 'pageproblems'
 
     id = Column(Integer, primary_key=True)
-    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
-    link = relationship(Link, backref=backref('pageproblems',
-                        cascade='all,delete,delete-orphan', lazy='dynamic'))
-    message = Column(String)
+    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), 
index=True)
+    message = Column(String, index=True)
+    link = relationship(Link, backref=backref('pageproblems', order_by=message,
+                        cascade='all,delete,delete-orphan'))
 
     def __unicode__(self):
         return self.message
@@ -254,7 +271,7 @@
     __tablename__ = 'anchors'
 
     id = Column(Integer, primary_key=True)
-    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), 
index=True)
     link = relationship(Link, backref=backref('anchors',
                         lazy='dynamic',
                         cascade='all,delete,delete-orphan'))
@@ -270,12 +287,12 @@
     __tablename__ = 'reqanchors'
 
     id = Column(Integer, primary_key=True)
-    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), 
index=True)
     link = relationship(Link, backref=backref('reqanchors',
                         lazy='dynamic',
                         cascade='all,delete,delete-orphan',
                         ), primaryjoin='Link.id == RequestedAnchor.link_id')
-    parent_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+    parent_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), 
index=True)
     parent = relationship(Link, primaryjoin='Link.id == 
RequestedAnchor.parent_id')
     anchor = Column(String)
 

Modified: webcheck/parsers/html/__init__.py
==============================================================================
--- webcheck/parsers/html/__init__.py   Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/parsers/html/__init__.py   Fri Aug 19 21:44:51 2011        (r430)
@@ -36,10 +36,9 @@
 _entitypattern = re.compile('&(#[0-9]{1,6}|[a-zA-Z]{2,10});')
 
 
-def htmlescape(txt, inattr=False):
+def htmlescape(txt):
     """HTML escape the given string and return an ASCII clean string with
-    known entities and character entities for the other values.
-    If the inattr parameter is set quotes and newlines will also be escaped."""
+    known entities and character entities for the other values."""
     # check for empty string
     if not txt:
         return u''
@@ -50,17 +49,10 @@
     out = ''
     # loop over the characters of the string
     for c in txt:
-        if c == '"':
-            if inattr:
-                out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
-            else:
-                out += '"'
-        elif ord(c) in htmlentitydefs.codepoint2name:
+        if ord(c) in htmlentitydefs.codepoint2name:
             out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
         elif ord(c) > 126:
             out += '&#%d;' % ord(c)
-        elif inattr and c == u'\n':
-            out += '&#10;'
         else:
             out += c.encode('utf-8')
     return out

Modified: webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py        Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/plugins/__init__.py        Fri Aug 19 21:44:51 2011        (r430)
@@ -43,25 +43,21 @@
 
 Pluings can use the functions exported by this module."""
 
+import sys
+import time
+
+from sqlalchemy.orm import joinedload
 from sqlalchemy.orm.session import object_session
 
-import sys
-import debugio
 import config
-import time
+import db
+import debugio
 import parsers.html
 
 # reference function from html module
 htmlescape = parsers.html.htmlescape
 
 
-def get_title(link):
-    """Returns the title of a link if it is set otherwise returns url."""
-    if link.title is None or link.title == '':
-        return link.url
-    return link.title
-
-
 def _floatformat(f):
     """Return a float as a string while trying to keep it within three
     characters."""
@@ -87,54 +83,43 @@
         return '%d' % i
 
 
-def _mk_unicode(txt):
-    """Returns a unicode instance of the string."""
-    if not isinstance(txt, unicode):
-        txt = unicode(txt)
-    return txt
-
-
-def get_info(link):
+def _get_info(link):
     """Return a string with a summary of the information in the link."""
-    info = u'url: %s\n' % _mk_unicode(link.url)
+    info = u'url: %s\n' % link.url
     if link.status:
-        info += '%s\n' % _mk_unicode(link.status)
+        info += u'%s\n' % link.status
     if link.title:
-        info += 'title: %s\n' % link.title.strip()
+        info += u'title: %s\n' % link.title.strip()
     if link.author:
-        info += 'author: %s\n' % link.author.strip()
+        info += u'author: %s\n' % link.author.strip()
     if link.is_internal:
-        info += 'internal link'
+        info += u'internal link'
     else:
-        info += 'external link'
+        info += u'external link'
     if link.yanked:
-        if isinstance(link.yanked, unicode):
-            info += ', not checked (%s)\n' % link.yanked
-        if isinstance(link.yanked, str):
-            info += ', not checked (%s)\n' % _mk_unicode(link.yanked)
-        else:
-            info += ', not checked\n'
+        info += u', not checked (%s)\n' % link.yanked
     else:
-        info += '\n'
-    if link.redirectdepth > 0:
+        info += u'\n'
+    if link.redirectdepth:
         if link.children.count() > 0:
-            info += 'redirect: %s\n' % _mk_unicode(link.children.first().url)
+            info += u'redirect: %s\n' % link.children.first().url
         else:
-            info += 'redirect (not followed)\n'
-    if len(link.parents) == 1:
-        info += 'linked from 1 page\n'
-    elif len(link.parents) > 1:
-        info += 'linked from %d pages\n' % len(link.parents)
+            info += u'redirect (not followed)\n'
+    count = link.count_parents
+    if count == 1:
+        info += u'linked from 1 page\n'
+    elif count > 1:
+        info += u'linked from %d pages\n' % count
     if link.mtime:
-        info += 'last modified: %s\n' % time.ctime(link.mtime)
+        info += u'last modified: %s\n' % time.ctime(link.mtime)
     if link.size:
-        info += 'size: %s\n' % get_size(link.size)
+        info += u'size: %s\n' % get_size(link.size)
     if link.mimetype:
-        info += 'mime-type: %s\n' % _mk_unicode(link.mimetype)
+        info += u'mime-type: %s\n' % link.mimetype
     if link.encoding:
-        info += 'encoding: %s\n' % _mk_unicode(link.encoding)
+        info += u'encoding: %s\n' % link.encoding
     for problem in link.linkproblems:
-        info += 'problem: %s\n' % _mk_unicode(problem)
+        info += u'problem: %s\n' % problem.message
     # trim trailing newline
     return info.strip()
 
@@ -142,41 +127,27 @@
 def make_link(link, title=None):
     """Return an <a>nchor to a url with title. If url is in the Linklist and
     is external, insert "class=external" in the <a> tag."""
-    # try to fetch the link object for this url
-    if link.is_internal:
-        cssclass = 'internal'
-    else:
-        cssclass = 'external'
-    if title is None:
-        title = get_title(link)
-    target = ''
-    if config.REPORT_LINKS_IN_NEW_WINDOW:
-        target = 'target="_blank" '
-    # gather some information about the link to report
     return '<a href="%(url)s" %(target)sclass="%(cssclass)s" 
title="%(info)s">%(title)s</a>' % \
-            dict(url=htmlescape(link.url, True),
-                 target=target,
-                 cssclass=cssclass,
-                 info=htmlescape(get_info(link), True),
-                 title=htmlescape(title))
+            dict(url=htmlescape(link.url),
+                 target='target="_blank" ' if 
config.REPORT_LINKS_IN_NEW_WINDOW else '',
+                 cssclass='internal' if link.is_internal else 'external',
+                 info=htmlescape(_get_info(link)).replace('\n', '&#10;'),
+                 title=htmlescape(title or link.title or link.url))
 
 
 def print_parents(fp, link, indent='     '):
     """Write a list of parents to the output file descriptor.
     The output is indeted with the specified indent."""
-    parents = list(link.parents)
     # if there are no parents print nothing
-    if not parents:
+    count = link.count_parents
+    if not count:
         return
-    parents.sort(lambda a, b: cmp(a.title, b.title) or cmp(a.url, b.url))
+    parents = link.parents.order_by(db.Link.title, 
db.Link.url).options(joinedload(db.Link.linkproblems))[:config.PARENT_LISTLEN]
     fp.write(
       indent + '<div class="parents">\n' +
       indent + ' referenced from:\n' +
       indent + ' <ul>\n')
     more = 0
-    if len(parents) > config.PARENT_LISTLEN + 1:
-        more = len(parents) - config.PARENT_LISTLEN
-        parents = parents[:config.PARENT_LISTLEN]
     for parent in parents:
         fp.write(
           indent + '  <li>%(parent)s</li>\n'
@@ -283,7 +254,7 @@
       ' </head>\n'
       ' <body>\n'
       '  <h1 class="basename">Webcheck report for <a 
href="%(siteurl)s">%(sitetitle)s</a></h1>\n'
-      % {'sitetitle':   htmlescape(get_title(base)),
+      % {'sitetitle':   htmlescape(base.title or base.url),
          'plugintitle': htmlescape(plugin.__title__),
          'siteurl':     base.url,
          'version':     config.VERSION})

Modified: webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/plugins/badlinks.py        Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/plugins/badlinks.py        Fri Aug 19 21:44:51 2011        (r430)
@@ -28,6 +28,8 @@
 __author__ = 'Arthur de Jong'
 __outputfile__ = 'badlinks.html'
 
+from sqlalchemy.orm import joinedload
+
 import db
 import plugins
 
@@ -35,7 +37,7 @@
 def generate(site):
     """Present the list of bad links to the given file descriptor."""
     # find all links with link problems
-    links = site.links.filter(db.Link.linkproblems.any()).order_by(db.Link.url)
+    links = 
site.links.filter(db.Link.linkproblems.any()).order_by(db.Link.url).options(joinedload(db.Link.linkproblems))
     # present results
     fp = plugins.open_html(plugins.badlinks, site)
     if not links:

Modified: webcheck/plugins/external.py
==============================================================================
--- webcheck/plugins/external.py        Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/plugins/external.py        Fri Aug 19 21:44:51 2011        (r430)
@@ -28,6 +28,8 @@
 __author__ = 'Arthur de Jong'
 __outputfile__ = 'external.html'
 
+from sqlalchemy.orm import joinedload
+
 import db
 import plugins
 
@@ -51,7 +53,7 @@
       '    examination of the website.'
       '   </p>\n'
       '   <ol>\n')
-    for link in links:
+    for link in links.options(joinedload(db.Link.linkproblems)):
         fp.write(
           '    <li>\n'
           '     %(link)s\n'

Modified: webcheck/plugins/new.py
==============================================================================
--- webcheck/plugins/new.py     Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/plugins/new.py     Fri Aug 19 21:44:51 2011        (r430)
@@ -47,7 +47,7 @@
     links = links.filter(db.Link.mtime > 
newtime).order_by(db.Link.mtime.desc())
     # present results
     fp = plugins.open_html(plugins.new, site)
-    if not links:
+    if not links.count():
         fp.write(
           '   <p class="description">\n'
           '    No pages were found that were modified within the last %(new)d 
days.\n'

Modified: webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/plugins/notchkd.py Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/plugins/notchkd.py Fri Aug 19 21:44:51 2011        (r430)
@@ -28,6 +28,8 @@
 __author__ = 'Arthur de Jong'
 __outputfile__ = 'notchkd.html'
 
+from sqlalchemy.orm import joinedload
+
 import db
 import plugins
 
@@ -38,7 +40,7 @@
     links = site.links.filter(db.Link.yanked != None).order_by(db.Link.url)
     # present results
     fp = plugins.open_html(plugins.notchkd, site)
-    if not links:
+    if not links.count():
         fp.write(
           '   <p class="description">\n'
           '    All links have been checked.\n'
@@ -51,7 +53,7 @@
       '    at all during the examination of the website.\n'
       '   </p>\n'
       '   <ol>\n')
-    for link in links:
+    for link in links.options(joinedload(db.Link.linkproblems)):
         fp.write(
           '    <li>\n'
           '     %(link)s\n'

Modified: webcheck/plugins/old.py
==============================================================================
--- webcheck/plugins/old.py     Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/plugins/old.py     Fri Aug 19 21:44:51 2011        (r430)
@@ -44,10 +44,10 @@
     oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE
     # get all internal pages that are old
     links = site.links.filter_by(is_page=True, is_internal=True)
-    links = links.filter(db.Link.mtime < oldtime).order_by(db.Link.mtime).all()
+    links = links.filter(db.Link.mtime < oldtime).order_by(db.Link.mtime)
     # present results
     fp = plugins.open_html(plugins.old, site)
-    if not links:
+    if not links.count():
         fp.write(
           '   <p class="description">\n'
           '    No pages were found that were older than %(old)d days old.\n'

Modified: webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py        Fri Aug 19 21:28:54 2011        (r429)
+++ webcheck/plugins/problems.py        Fri Aug 19 21:44:51 2011        (r430)
@@ -112,7 +112,7 @@
               '     <ul class="problems">\n'
               % {'link': plugins.make_link(link)})
             # list the problems
-            for problem in link.pageproblems.order_by(db.PageProblem.message):
+            for problem in link.pageproblems:
                 fp.write(
                   '      <li>%(problem)s</li>\n'
                   % {'problem':  plugins.htmlescape(problem)})
-- 
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits