lists.arthurdejong.org
RSS feed

webcheck commit: r426 - in webcheck: . plugins

[Date Prev][Date Next] [Thread Prev][Thread Next]

webcheck commit: r426 - in webcheck: . plugins



Author: arthur
Date: Wed Aug 10 22:42:57 2011
New Revision: 426
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=426

Log:
re-enable the anchors plugin

Modified:
   webcheck/config.py
   webcheck/db.py
   webcheck/myurllib.py
   webcheck/plugins/anchors.py

Modified: webcheck/config.py
==============================================================================
--- webcheck/config.py  Wed Aug 10 22:41:24 2011        (r425)
+++ webcheck/config.py  Wed Aug 10 22:42:57 2011        (r426)
@@ -64,7 +64,7 @@
 REDIRECT_DEPTH = 5
 
 # The list of plugins that will be used to generate the report.
-PLUGINS = [ #'anchors',
+PLUGINS = [ 'anchors',
             'sitemap',
             'urllist',
             'images',

Modified: webcheck/db.py
==============================================================================
--- webcheck/db.py      Wed Aug 10 22:41:24 2011        (r425)
+++ webcheck/db.py      Wed Aug 10 22:42:57 2011        (r426)
@@ -172,31 +172,23 @@
 
     def add_anchor(self, anchor):
         """Indicate that this page contains the specified anchor."""
-        return # FIXME: implement/update
         # lowercase anchor
         anchor = anchor.lower()
-        # add anchor
-        if anchor in self.anchors:
+        if self.anchors.filter(Anchor.anchor == anchor).first():
             self.add_pageproblem(
               'anchor/id "%(anchor)s" defined multiple times'
               % { 'anchor':   anchor })
         else:
-            self.anchors.add(anchor)
+            self.anchors.append(Anchor(anchor=anchor))
 
     def add_reqanchor(self, parent, anchor):
         """Indicate that the specified link contains a reference to the
         specified anchor. This can be checked later."""
-        return # FIXME: implement/update
         # lowercase anchor
         anchor = anchor.lower()
-        # convert the url to a link object if we were called with a url
-        parent = self.__tolink(parent)
-        # add anchor
-        if anchor in self.reqanchors:
-            if parent not in self.reqanchors[anchor]:
-                self.reqanchors[anchor].add(parent)
-        else:
-            self.reqanchors[anchor] = set([parent])
+        # if RequestedAnchor doesn't exist, add it
+        if not self.reqanchors.filter((RequestedAnchor.parent_id == parent.id) & (RequestedAnchor.anchor == anchor)).first():
+            self.reqanchors.append(RequestedAnchor(parent_id=parent.id, anchor=anchor))
 
     def follow_link(self, visited=None):
         """If this link represents a redirect return the redirect target,
@@ -254,3 +246,38 @@
 
     def __unicode__(self):
         return self.message
+
+
+class Anchor(Base):
+    """The named anchors (IDs) found on the page."""
+
+    __tablename__ = 'anchors'
+
+    id = Column(Integer, primary_key=True)
+    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+    link = relationship(Link, backref=backref('anchors',
+                        lazy='dynamic',
+                        cascade='all,delete,delete-orphan'))
+    anchor = Column(String)
+
+    def __unicode__(self):
+        return self.anchor
+
+
+class RequestedAnchor(Base):
+    """The anchors (IDs) that other pages reference on this page."""
+
+    __tablename__ = 'reqanchors'
+
+    id = Column(Integer, primary_key=True)
+    link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+    link = relationship(Link, backref=backref('reqanchors',
+                        lazy='dynamic',
+                        cascade='all,delete,delete-orphan',
+                        ), primaryjoin='Link.id == RequestedAnchor.link_id')
+    parent_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
+    parent = relationship(Link, primaryjoin='Link.id == RequestedAnchor.parent_id')
+    anchor = Column(String)
+
+    def __unicode__(self):
+        return self.anchor

Modified: webcheck/myurllib.py
==============================================================================
--- webcheck/myurllib.py        Wed Aug 10 22:41:24 2011        (r425)
+++ webcheck/myurllib.py        Wed Aug 10 22:42:57 2011        (r426)
@@ -1,7 +1,7 @@
 
 # myurllib.py - general purpose URL handling library
 #
-# Copyright (C) 2007 Arthur de Jong
+# Copyright (C) 2007, 2011 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -83,7 +83,7 @@
     # make escaping consistent
     url = _normalize_escapes(url)
     # split the url in useful parts
-    (scheme, netloc, path, query) = urlparse.urlsplit(url)[:4]
+    (scheme, netloc, path, query, fragment) = urlparse.urlsplit(url)
     # remove any leading /../ parts
     if scheme in ( 'http', 'https' ):
         path = _leadingdotpattern.sub('', path)
@@ -108,8 +108,8 @@
     # get rid of double slashes in some paths
     if ( scheme == 'file' ):
         path = _doubleslashpattern.sub('/', path)
-    # put the url back together again (discarding fragment)
-    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
+    # put the url back together again
+    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
 
 def normalizeurl(url):
     """Return a normalized URL."""

Modified: webcheck/plugins/anchors.py
==============================================================================
--- webcheck/plugins/anchors.py Wed Aug 10 22:41:24 2011        (r425)
+++ webcheck/plugins/anchors.py Wed Aug 10 22:42:57 2011        (r426)
@@ -1,7 +1,7 @@
 
 # anchors.py - plugin check for missing anchors
 #
-# Copyright (C) 2006, 2007 Arthur de Jong
+# Copyright (C) 2006, 2007, 2011 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -27,21 +27,23 @@
 __title__ = 'missing anchors'
 __author__ = 'Arthur de Jong'
 
+from sqlalchemy.orm.session import object_session
+
+import db
+
+
 def generate(site):
     """Present the list of bad links to the given file descriptor."""
-    # find all links with requested anchors
-    links = [ x
-              for x in site.linkMap.values()
-              if len(x.reqanchors)>0 and x.isfetched ]
+    # find all fetched links with requested anchors
+    links = site.links.filter(db.Link.reqanchors.any()).filter(db.Link.fetched != None)
     # go over list and find missing anchors
     for link in links:
-        # check all requested anchors
+        # check that all requested anchors exist
         for anchor in link.reqanchors:
-            # if the anchor is there there is no prolem
-            if anchor in link.anchors:
-                continue
-            # report problem
-            for parent in link.reqanchors[anchor]:
-                parent.add_pageproblem(
-                  'reference to undefined anchor/id "%(anchor)s"'
-                  % { 'anchor': anchor })
+            # if the anchor is not there, report problem
+            if not link.anchors.filter(db.Anchor.anchor == anchor.anchor).first():
+                anchor.parent.add_pageproblem(
+                  u'bad link: %(url)s#%(anchor)s: unknown anchor'
+                  % {'url': link.url,
+                     'anchor': anchor })
+    # FIXME: commit changes in session
-- 
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits