webcheck commit: r419 - webcheck
[Date Prev][Date Next]
[Thread Prev][Thread Next]
webcheck commit: r419 - webcheck
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r419 - webcheck
- Date: Sat, 18 Jun 2011 20:08:53 +0200 (CEST)
Author: arthur
Date: Sat Jun 18 20:08:51 2011
New Revision: 419
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=419
Log:
monkeypatch the robotparser module to improve upon some functionality
Added:
webcheck/monkeypatch.py
Modified:
webcheck/webcheck.py
Added: webcheck/monkeypatch.py
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ webcheck/monkeypatch.py Sat Jun 18 20:08:51 2011 (r419)
@@ -0,0 +1,80 @@
+
+# monkeypatch.py - add missing functionality to standard modules
+#
+# Copyright (C) 2011 Arthur de Jong
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# The files produced as output from the software do not automatically fall
+# under the copyright of the software, unless explicitly stated otherwise.
+
+import re
+import urlparse
+import urllib
+import sys
+
+
+__all__ = []
+
+# This monkeypatches RuleLine.applies_to to support * and $ characters in
+# robots.txt path names.
+def my_applies_to(ruleline, filename):
+    """Return True if the robots.txt rule line matches filename.
+
+    ruleline is the rule being tested; only its path attribute is read and
+    the compiled expression is cached on it as ruleline.pattern so it is
+    built once per rule. filename is the string the rule is matched
+    against, starting at its first character.
+
+    A * in the rule path matches any run of characters and a $ anchors the
+    rule to the end of filename; every other character is matched
+    literally.
+    """
+    if not hasattr(ruleline, 'pattern'):
+        # the stored path is percent-escaped, so a wildcard shows up as %2A
+        # and an end-of-path marker as a trailing %24; unescape both here
+        path = ruleline.path.replace('%2A', '*')
+        if path.endswith('%24'):
+            path = path[:-3] + '$'
+        pat = []
+        for x in path:
+            if x == '*':
+                pat.append('.*')
+            elif x == '$':
+                pat.append(r'\Z')
+            else:
+                pat.append(re.escape(x))
+        # pass the flags as an argument: inline flags that are not at the
+        # start of an expression are rejected by newer versions of re
+        ruleline.pattern = re.compile(''.join(pat), re.M | re.S)
+    return bool(ruleline.pattern.match(filename))
+
+# Replace the stock RuleLine.applies_to with my_applies_to (defined above);
+# the assignment happens as soon as this module is imported.
+from robotparser import RuleLine
+RuleLine.applies_to = my_applies_to
+
+
+# Replacement for RobotFileParser.can_fetch that also includes the query
+# string in the part of the URL that is tested against the rules (taken
+# from http://bugs.python.org/issue6325). It is only installed on Python
+# versions before 2.7, where this should be fixed.
+if sys.version_info < (2, 7):
+
+    def my_can_fetch(rfp, useragent, url):
+        """Decide from the parsed robots.txt whether useragent may fetch url.
+
+        rfp is the parser object holding the parsed rules. Returns the
+        allowance of the first entry that applies to useragent, falling
+        back to the default entry and finally to True.
+        """
+        # blanket answers recorded on the parser take precedence
+        if rfp.disallow_all:
+            return False
+        if rfp.allow_all:
+            return True
+        # strip scheme and host but keep path, params, query and fragment,
+        # then re-escape the result; an empty result is tested as "/"
+        parts = urlparse.urlparse(urllib.unquote(url))
+        relative = urlparse.urlunparse(
+            ('', '', parts.path, parts.params, parts.query, parts.fragment))
+        target = urllib.quote(relative) or "/"
+        # the first entry matching the user agent decides
+        for candidate in rfp.entries:
+            if candidate.applies_to(useragent):
+                return candidate.allowance(target)
+        # no specific entry matched: try the default entry last
+        fallback = rfp.default_entry
+        if fallback:
+            return fallback.allowance(target)
+        # agent not found ==> access granted
+        return True
+
+    from robotparser import RobotFileParser
+    RobotFileParser.can_fetch = my_can_fetch
Modified: webcheck/webcheck.py
==============================================================================
--- webcheck/webcheck.py Sat Jun 18 16:09:50 2011 (r418)
+++ webcheck/webcheck.py Sat Jun 18 20:08:51 2011 (r419)
@@ -4,7 +4,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2008, 2010 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -25,17 +25,19 @@
"""This is the main webcheck module."""
-import config
-import crawler
-import plugins
-import debugio
import sys
import os
import re
-import serialize
import urlparse
import urllib
+import config
+import crawler
+import plugins
+import debugio
+import serialize
+import monkeypatch
+
debugio.loglevel = debugio.INFO
def print_version():
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits
- webcheck commit: r419 - webcheck, Commits of the webcheck project