lists.arthurdejong.org
RSS feed

webcheck commit: r425 - in webcheck: . plugins

[Date Prev][Date Next] [Thread Prev][Thread Next]

webcheck commit: r425 - in webcheck: . plugins



Author: arthur
Date: Wed Aug 10 22:41:24 2011
New Revision: 425
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=425

Log:
make all relationships into filterable collections and several smaller tweaks 
to improve database access

Modified:
   webcheck/db.py
   webcheck/plugins/__init__.py
   webcheck/plugins/problems.py
   webcheck/plugins/sitemap.py

Modified: webcheck/db.py
==============================================================================
--- webcheck/db.py      Wed Aug 10 22:30:43 2011        (r424)
+++ webcheck/db.py      Wed Aug 10 22:41:24 2011        (r425)
@@ -58,10 +58,9 @@
 
     id = Column(Integer, primary_key=True)
     url = Column(String, index=True, nullable=False, unique=True)
-    fetched = Column(DateTime, index=True)
     is_internal = Column(Boolean, index=True)
     yanked = Column(String, index=True)
-    depth = Column(Integer)
+    fetched = Column(DateTime, index=True)
 
     # information about the retrieved link
     status = Column(String)
@@ -76,36 +75,41 @@
 
     # relationships between links
     children = relationship('Link', secondary=children,
-        backref=backref('linked_from', collection_class=set),
+        backref=backref('linked_from', lazy='dynamic'),
         primaryjoin=(id == children.c.parent_id),
         secondaryjoin=(id == children.c.child_id),
-        collection_class=set)
+        lazy='dynamic')
     embedded = relationship('Link', secondary=embedded,
-        backref=backref('embedded_in', collection_class=set),
+        backref=backref('embedded_in', lazy='dynamic'),
         primaryjoin=(id == embedded.c.parent_id),
         secondaryjoin=(id == embedded.c.child_id),
-        collection_class=set)
+        lazy='dynamic')
 
     # crawling information
     redirectdepth = Column(Integer, default=0)
+    depth = Column(Integer)
 
     @staticmethod
     def clean_url(url):
         # normalise the URL, removing the fragment from the URL
         url = myurllib.normalizeurl(url)
-        (scheme, netloc, path, query) = urlparse.urlsplit(url)[0:4]
-        return urlparse.urlunsplit((scheme, netloc, path, query, ''))
+        return urlparse.urldefrag(myurllib.normalizeurl(url))[0]
 
     def _get_link(self, url):
         """Get a link object for the specified URL."""
         # get the session
         session = object_session(self)
-        # try to find the URL
-        url = self.clean_url(url)
+        # normalise the URL, removing the fragment from the URL
+        url, fragment = urlparse.urldefrag(myurllib.normalizeurl(url))
+        # try to find the link
         instance = session.query(Link).filter_by(url=url).first()
         if not instance:
             instance = Link(url=url)
             session.add(instance)
+        # mark that we were looking for an anchor/fragment
+        if fragment:
+            instance.add_reqanchor(self, fragment)
+        # return the link
         return instance
 
     def set_encoding(self, encoding):
@@ -156,7 +160,7 @@
         if not self.is_internal:
             return
         # add to children
-        self.children.add(self._get_link(url))
+        self.children.append(self._get_link(url))
 
     def add_embed(self, url):
         """Mark the given URL as used as an image on this page."""
@@ -164,7 +168,7 @@
         if not self.is_internal:
             return
         # add to embedded
-        self.embedded.add(self._get_link(url))
+        self.embedded.append(self._get_link(url))
 
     def add_anchor(self, anchor):
         """Indicate that this page contains the specified anchor."""
@@ -202,14 +206,14 @@
         if not self.redirectdepth:
             return self
         # if we don't know where this redirects, return None
-        if not self.children:
+        if not self.children.count():
             return None
         # avoid loops
         if not visited:
             visited = set()
         visited.add(self.url)
         # the first (and only) child is the redirect target
-        child = list(self.children)[0]
+        child = self.children.first()
         if child.url in visited:
             return None
         # check where we redirect to
@@ -228,8 +232,8 @@
 
     id = Column(Integer, primary_key=True)
     link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
-    link = relationship(Link, backref=backref('linkproblems', order_by=id,
-                        cascade='all,delete,delete-orphan'))
+    link = relationship(Link, backref=backref('linkproblems',
+                        cascade='all,delete,delete-orphan', lazy='dynamic'))
     message = Column(String)
 
     def __unicode__(self):
@@ -244,8 +248,8 @@
 
     id = Column(Integer, primary_key=True)
     link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
-    link = relationship(Link, backref=backref('pageproblems', order_by=id,
-                        cascade='all,delete,delete-orphan'))
+    link = relationship(Link, backref=backref('pageproblems',
+                        cascade='all,delete,delete-orphan', lazy='dynamic'))
     message = Column(String)
 
     def __unicode__(self):

Modified: webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py        Wed Aug 10 22:30:43 2011        (r424)
+++ webcheck/plugins/__init__.py        Wed Aug 10 22:41:24 2011        (r425)
@@ -43,6 +43,8 @@
 
 Pluings can use the functions exported by this module."""
 
+from sqlalchemy.orm.session import object_session
+
 import sys
 import debugio
 import config
@@ -110,8 +112,8 @@
     else:
         info += '\n'
     if link.redirectdepth > 0:
-        if len(link.children) > 0:
-            info += 'redirect: %s\n' % _mk_unicode(link.children.copy().pop().url)
+        if link.children.count() > 0:
+            info += 'redirect: %s\n' % _mk_unicode(link.children.first().url)
         else:
             info += 'redirect (not followed)\n'
     if len(link.parents) == 1:
@@ -301,3 +303,4 @@
         plugin = __import__('plugins.' + p, globals(), locals(), [p])
         # run the plugin
         plugin.generate(site)
+        object_session(site.links[0]).commit()

Modified: webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py        Wed Aug 10 22:30:43 2011        (r424)
+++ webcheck/plugins/problems.py        Wed Aug 10 22:41:24 2011        (r425)
@@ -110,10 +110,8 @@
               '     %(link)s\n'
               '     <ul class="problems">\n'
               % { 'link':    plugins.make_link(link) })
-            # sort problems by name
-            link.pageproblems.sort()
             # list the problems
-            for problem in link.pageproblems:
+            for problem in link.pageproblems.order_by(db.PageProblem.message):
                 fp.write(
                   '      <li>%(problem)s</li>\n'
                   % { 'problem':  plugins.htmlescape(problem) })

Modified: webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/plugins/sitemap.py Wed Aug 10 22:30:43 2011        (r424)
+++ webcheck/plugins/sitemap.py Wed Aug 10 22:41:24 2011        (r425)
@@ -28,8 +28,6 @@
 __author__ = 'Arthur de Jong'
 __outputfile__ = 'index.html'
 
-from sqlalchemy.orm.session import object_session
-
 import config
 import db
 import plugins
@@ -38,10 +36,8 @@
 def add_pagechildren(link, children, explored):
     """Determine the page children of this link, combining the children of
     embedded items and following redirects."""
-    links = object_session(link).query(db.Link)
     # get all internal children
-    qry = links.filter(db.Link.linked_from.contains(link))
-    qry = qry.filter(db.Link.is_internal == True)
+    qry = link.children.filter(db.Link.is_internal == True)
     if link.depth:
         qry = qry.filter((db.Link.depth > link.depth) | (db.Link.depth == None))
     # follow redirects
@@ -50,10 +46,9 @@
                     if y and y.is_page and y.is_internal and y.id not in explored)
     explored.update(x.id for x in children)
     # add embedded element's pagechildren (think frames)
-    for embed in link.embedded:
+    for embed in link.embedded.filter(db.Link.is_internal == True).filter(db.Link.is_page == True):
         # TODO: put this in a query
-        if embed.is_internal and embed.is_page and \
-           embed.id not in explored and \
+        if embed.id not in explored and \
            (embed.depth == None or embed.depth > link.depth):
             add_pagechildren(embed, children, explored)
 
-- 
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits