lists.arthurdejong.org
RSS feed

webcheck commit: r419 - webcheck

[Date Prev][Date Next] [Thread Prev][Thread Next]

webcheck commit: r419 - webcheck



Author: arthur
Date: Sat Jun 18 20:08:51 2011
New Revision: 419
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=419

Log:
monkeypatch the robotparser module to improve upon some functionality

Added:
   webcheck/monkeypatch.py
Modified:
   webcheck/webcheck.py

Added: webcheck/monkeypatch.py
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ webcheck/monkeypatch.py     Sat Jun 18 20:08:51 2011        (r419)
@@ -0,0 +1,80 @@
+
+# monkeypatch.py - add missing functionality to standard modules
+#
+# Copyright (C) 2011 Arthur de Jong
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+#
+# The files produced as output from the software do not automatically fall
+# under the copyright of the software, unless explicitly stated otherwise.
+
+import re
+import urlparse
+import urllib
+import sys
+
+
+__all__ = []
+
+# This monkeypatches RuleLine.applies_to to support * and $ characters in
+# robots.txt path names.
+def my_applies_to(ruleline, filename):
+    if not hasattr(ruleline, 'pattern'):
+        pat = []
+        # we need to unescape the * from the path here
+        for x in ruleline.path.replace('%2A', '*'):
+            if x == '*':
+                pat.append('.*')
+            elif x == '$':
+                pat.append(r'\Z')
+            else:
+                pat.append(re.escape(x))
+        ruleline.pattern = re.compile(''.join(pat) + '(?ms)')
+    return bool(ruleline.pattern.match(filename))
+
+from robotparser import RuleLine
+RuleLine.applies_to = my_applies_to
+
+
+# This monkeypatches RobotFileParser.can_fetch to include the query string
+# into the tested part of the URL, taken from http://bugs.python.org/issue6325
+# this should be fixed in Python 2.7
+if sys.version_info < (2, 7):
+
+    def my_can_fetch(rfp, useragent, url):
+        """using the parsed robots.txt decide if useragent can fetch url"""
+        if rfp.disallow_all:
+            return False
+        if rfp.allow_all:
+            return True
+        # search for given user agent matches
+        # the first match counts
+        parsed_url = urlparse.urlparse(urllib.unquote(url))
+        url = urlparse.urlunparse(('', '', parsed_url.path,
+        parsed_url.params, parsed_url.query, parsed_url.fragment))
+        url = urllib.quote(url)
+        if not url:
+            url = "/"
+        for entry in rfp.entries:
+            if entry.applies_to(useragent):
+                return entry.allowance(url)
+        # try the default entry last
+        if rfp.default_entry:
+            return rfp.default_entry.allowance(url)
+        # agent not found ==> access granted
+        return True
+
+    from robotparser import RobotFileParser
+    RobotFileParser.can_fetch = my_can_fetch

Modified: webcheck/webcheck.py
==============================================================================
--- webcheck/webcheck.py        Sat Jun 18 16:09:50 2011        (r418)
+++ webcheck/webcheck.py        Sat Jun 18 20:08:51 2011        (r419)
@@ -4,7 +4,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2008, 2010 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -25,17 +25,19 @@
 
 """This is the main webcheck module."""
 
-import config
-import crawler
-import plugins
-import debugio
 import sys
 import os
 import re
-import serialize
 import urlparse
 import urllib
 
+import config
+import crawler
+import plugins
+import debugio
+import serialize
+import monkeypatch
+
 debugio.loglevel = debugio.INFO
 
 def print_version():
-- 
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits