webcheck commit: r425 - in webcheck: . plugins
[Date Prev][Date Next]
[Thread Prev][Thread Next]
webcheck commit: r425 - in webcheck: . plugins
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r425 - in webcheck: . plugins
- Date: Wed, 10 Aug 2011 22:41:29 +0200 (CEST)
Author: arthur
Date: Wed Aug 10 22:41:24 2011
New Revision: 425
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=425
Log:
make all relationships into filterable collections and several smaller tweaks
to improve database access
Modified:
webcheck/db.py
webcheck/plugins/__init__.py
webcheck/plugins/problems.py
webcheck/plugins/sitemap.py
Modified: webcheck/db.py
==============================================================================
--- webcheck/db.py Wed Aug 10 22:30:43 2011 (r424)
+++ webcheck/db.py Wed Aug 10 22:41:24 2011 (r425)
@@ -58,10 +58,9 @@
id = Column(Integer, primary_key=True)
url = Column(String, index=True, nullable=False, unique=True)
- fetched = Column(DateTime, index=True)
is_internal = Column(Boolean, index=True)
yanked = Column(String, index=True)
- depth = Column(Integer)
+ fetched = Column(DateTime, index=True)
# information about the retrieved link
status = Column(String)
@@ -76,36 +75,41 @@
# relationships between links
children = relationship('Link', secondary=children,
- backref=backref('linked_from', collection_class=set),
+ backref=backref('linked_from', lazy='dynamic'),
primaryjoin=(id == children.c.parent_id),
secondaryjoin=(id == children.c.child_id),
- collection_class=set)
+ lazy='dynamic')
embedded = relationship('Link', secondary=embedded,
- backref=backref('embedded_in', collection_class=set),
+ backref=backref('embedded_in', lazy='dynamic'),
primaryjoin=(id == embedded.c.parent_id),
secondaryjoin=(id == embedded.c.child_id),
- collection_class=set)
+ lazy='dynamic')
# crawling information
redirectdepth = Column(Integer, default=0)
+ depth = Column(Integer)
@staticmethod
def clean_url(url):
# normalise the URL, removing the fragment from the URL
url = myurllib.normalizeurl(url)
- (scheme, netloc, path, query) = urlparse.urlsplit(url)[0:4]
- return urlparse.urlunsplit((scheme, netloc, path, query, ''))
+ return urlparse.urldefrag(myurllib.normalizeurl(url))[0]
def _get_link(self, url):
"""Get a link object for the specified URL."""
# get the session
session = object_session(self)
- # try to find the URL
- url = self.clean_url(url)
+ # normalise the URL, removing the fragment from the URL
+ url, fragment = urlparse.urldefrag(myurllib.normalizeurl(url))
+ # try to find the link
instance = session.query(Link).filter_by(url=url).first()
if not instance:
instance = Link(url=url)
session.add(instance)
+ # mark that we were looking for an anchor/fragment
+ if fragment:
+ instance.add_reqanchor(self, fragment)
+ # return the link
return instance
def set_encoding(self, encoding):
@@ -156,7 +160,7 @@
if not self.is_internal:
return
# add to children
- self.children.add(self._get_link(url))
+ self.children.append(self._get_link(url))
def add_embed(self, url):
"""Mark the given URL as used as an image on this page."""
@@ -164,7 +168,7 @@
if not self.is_internal:
return
# add to embedded
- self.embedded.add(self._get_link(url))
+ self.embedded.append(self._get_link(url))
def add_anchor(self, anchor):
"""Indicate that this page contains the specified anchor."""
@@ -202,14 +206,14 @@
if not self.redirectdepth:
return self
# if we don't know where this redirects, return None
- if not self.children:
+ if not self.children.count():
return None
# avoid loops
if not visited:
visited = set()
visited.add(self.url)
# the first (and only) child is the redirect target
- child = list(self.children)[0]
+ child = self.children.first()
if child.url in visited:
return None
# check where we redirect to
@@ -228,8 +232,8 @@
id = Column(Integer, primary_key=True)
link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
- link = relationship(Link, backref=backref('linkproblems', order_by=id,
- cascade='all,delete,delete-orphan'))
+ link = relationship(Link, backref=backref('linkproblems',
+ cascade='all,delete,delete-orphan', lazy='dynamic'))
message = Column(String)
def __unicode__(self):
@@ -244,8 +248,8 @@
id = Column(Integer, primary_key=True)
link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'))
- link = relationship(Link, backref=backref('pageproblems', order_by=id,
- cascade='all,delete,delete-orphan'))
+ link = relationship(Link, backref=backref('pageproblems',
+ cascade='all,delete,delete-orphan', lazy='dynamic'))
message = Column(String)
def __unicode__(self):
Modified: webcheck/plugins/__init__.py
==============================================================================
--- webcheck/plugins/__init__.py Wed Aug 10 22:30:43 2011 (r424)
+++ webcheck/plugins/__init__.py Wed Aug 10 22:41:24 2011 (r425)
@@ -43,6 +43,8 @@
Pluings can use the functions exported by this module."""
+from sqlalchemy.orm.session import object_session
+
import sys
import debugio
import config
@@ -110,8 +112,8 @@
else:
info += '\n'
if link.redirectdepth > 0:
- if len(link.children) > 0:
- info += 'redirect: %s\n' % _mk_unicode(link.children.copy().pop().url)
+ if link.children.count() > 0:
+ info += 'redirect: %s\n' % _mk_unicode(link.children.first().url)
else:
info += 'redirect (not followed)\n'
if len(link.parents) == 1:
@@ -301,3 +303,4 @@
plugin = __import__('plugins.' + p, globals(), locals(), [p])
# run the plugin
plugin.generate(site)
+ object_session(site.links[0]).commit()
Modified: webcheck/plugins/problems.py
==============================================================================
--- webcheck/plugins/problems.py Wed Aug 10 22:30:43 2011 (r424)
+++ webcheck/plugins/problems.py Wed Aug 10 22:41:24 2011 (r425)
@@ -110,10 +110,8 @@
' %(link)s\n'
' <ul class="problems">\n'
% { 'link': plugins.make_link(link) })
- # sort problems by name
- link.pageproblems.sort()
# list the problems
- for problem in link.pageproblems:
+ for problem in link.pageproblems.order_by(db.PageProblem.message):
fp.write(
' <li>%(problem)s</li>\n'
% { 'problem': plugins.htmlescape(problem) })
Modified: webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/plugins/sitemap.py Wed Aug 10 22:30:43 2011 (r424)
+++ webcheck/plugins/sitemap.py Wed Aug 10 22:41:24 2011 (r425)
@@ -28,8 +28,6 @@
__author__ = 'Arthur de Jong'
__outputfile__ = 'index.html'
-from sqlalchemy.orm.session import object_session
-
import config
import db
import plugins
@@ -38,10 +36,8 @@
def add_pagechildren(link, children, explored):
"""Determine the page children of this link, combining the children of
embedded items and following redirects."""
- links = object_session(link).query(db.Link)
# get all internal children
- qry = links.filter(db.Link.linked_from.contains(link))
- qry = qry.filter(db.Link.is_internal == True)
+ qry = link.children.filter(db.Link.is_internal == True)
if link.depth:
qry = qry.filter((db.Link.depth > link.depth) | (db.Link.depth == None))
# follow redirects
@@ -50,10 +46,9 @@
if y and y.is_page and y.is_internal and y.id not in explored)
explored.update(x.id for x in children)
# add embedded element's pagechildren (think frames)
- for embed in link.embedded:
+ for embed in link.embedded.filter(db.Link.is_internal == True).filter(db.Link.is_page == True):
# TODO: put this in a query
- if embed.is_internal and embed.is_page and \
- embed.id not in explored and \
+ if embed.id not in explored and \
(embed.depth == None or embed.depth > link.depth):
add_pagechildren(embed, children, explored)
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits
- webcheck commit: r425 - in webcheck: . plugins,
Commits of the webcheck project