webcheck branch master updated. 1.10.4-88-gb1a4579
[Date Prev][Date Next]
[Thread Prev][Thread Next]
webcheck branch master updated. 1.10.4-88-gb1a4579
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck branch master updated. 1.10.4-88-gb1a4579
- Date: Mon, 2 Dec 2013 23:45:56 +0100 (CET)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "webcheck".
The branch, master has been updated
via b1a4579b1389fc11bed06e1a6604c9bf4f37dfb5 (commit)
via 27bd8bff7d0ad778df6ca0e67380546aa1a80fb1 (commit)
from 55365f948b6697e6bcd02bc203b93d37a285762e (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://arthurdejong.org/git/webcheck/commit/?id=b1a4579b1389fc11bed06e1a6604c9bf4f37dfb5
commit b1a4579b1389fc11bed06e1a6604c9bf4f37dfb5
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Mon Dec 2 23:41:44 2013 +0100
Store link and page problems as unicode
This converts problems to unicode so they can be stored correctly by
SQLAlchemy. This amongst other things fixes a problem when the web
server returns a status message with non-ASCII characters.
diff --git a/webcheck/db.py b/webcheck/db.py
index c1cda32..297771d 100644
--- a/webcheck/db.py
+++ b/webcheck/db.py
@@ -121,6 +121,12 @@ class Link(Base):
# return the link
return instance
+ def _mk_unicode(self, message):
+ """Turn the message into a unicode object."""
+ if not isinstance(message, unicode):
+ message = unicode(message, encoding=self.encoding or 'utf-8', errors='replace')
+ return message
+
def set_encoding(self, encoding):
"""Set the encoding of the link doing some basic checks to see if
the encoding is supported."""
@@ -155,7 +161,7 @@ class Link(Base):
def add_linkproblem(self, message):
"""Indicate that something went wrong while retrieving this link."""
- self.linkproblems.append(LinkProblem(message=message))
+ self.linkproblems.append(LinkProblem(message=self._mk_unicode(message)))
def add_pageproblem(self, message):
"""Indicate that something went wrong with parsing the document."""
@@ -163,7 +169,7 @@ class Link(Base):
if not self.is_internal:
return
# TODO: only include a single problem once (e.g. multiple anchors)
- self.pageproblems.append(PageProblem(message=message))
+ self.pageproblems.append(PageProblem(message=self._mk_unicode(message)))
def add_child(self, url):
"""Add the specified URL as a child of this link."""
@@ -184,7 +190,7 @@ class Link(Base):
def add_anchor(self, anchor):
"""Indicate that this page contains the specified anchor."""
# lowercase anchor
- anchor = anchor.lower()
+ anchor = self._mk_unicode(anchor).lower()
if self.anchors.filter(Anchor.anchor == anchor).first():
self.add_pageproblem(
'anchor/id "%(anchor)s" defined multiple times'
@@ -196,7 +202,7 @@ class Link(Base):
"""Indicate that the specified link contains a reference to the
specified anchor. This can be checked later."""
# lowercase anchor
- anchor = anchor.lower()
+ anchor = self._mk_unicode(anchor).lower()
# if RequestedAnchor doesn't exist, add it
if not self.reqanchors.filter((RequestedAnchor.parent_id == parent.id)
& (RequestedAnchor.anchor == anchor)).first():
self.reqanchors.append(RequestedAnchor(parent_id=parent.id,
anchor=anchor))
http://arthurdejong.org/git/webcheck/commit/?id=27bd8bff7d0ad778df6ca0e67380546aa1a80fb1
commit 27bd8bff7d0ad778df6ca0e67380546aa1a80fb1
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Mon Dec 2 23:33:46 2013 +0100
Only convert content if link has encoding
This fixes an issue for calling tidy when the character encoding of the
page could not be determined.
diff --git a/webcheck/parsers/html/calltidy.py b/webcheck/parsers/html/calltidy.py
index 1404444..57b8efb 100644
--- a/webcheck/parsers/html/calltidy.py
+++ b/webcheck/parsers/html/calltidy.py
@@ -32,7 +32,8 @@ def parse(content, link):
# only call tidy on internal pages
if link.is_internal:
# force encoding of the content to UTF-8
- content = content.decode(link.encoding).encode('utf-8')
+ if link.encoding:
+ content = content.decode(link.encoding).encode('utf-8')
t = tidy.parseString(content, **config.TIDY_OPTIONS)
for err in t.errors:
# error messages are escaped so we unescape them
-----------------------------------------------------------------------
Summary of changes:
webcheck/db.py | 14 ++++++++++----
webcheck/parsers/html/calltidy.py | 3 ++-
2 files changed, 12 insertions(+), 5 deletions(-)
hooks/post-receive
--
webcheck
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits/
- webcheck branch master updated. 1.10.4-88-gb1a4579,
Commits of the webcheck project