webcheck commit: r417 - in webcheck: . schemes
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r417 - in webcheck: . schemes
- Date: Sat, 18 Jun 2011 16:08:42 +0200 (CEST)
Author: arthur
Date: Sat Jun 18 16:08:40 2011
New Revision: 417
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=417
Log:
switch to using urllib2 for crawling (this is mostly functional now)
Deleted:
webcheck/schemes/
Modified:
webcheck/crawler.py
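In outline, this change drops the per-scheme fetch modules and fetches everything through a single urllib2 opener that carries a persistent cookie jar and a descriptive User-agent. A minimal standalone sketch of that pattern (the User-agent string and URL below are placeholders, not webcheck's own values):

    import cookielib
    import urllib2

    # cookies are kept in an LWP-format jar so they survive between runs
    cookiejar = cookielib.LWPCookieJar('cookies.lwp')

    # one opener carries the cookie processor and a descriptive User-agent;
    # installing it makes plain urllib2.urlopen() use it everywhere
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
    opener.addheaders = [('User-agent', 'webcheck example/0.0')]  # placeholder agent
    urllib2.install_opener(opener)

    # fetch a page and read the metadata the crawler records (placeholder URL)
    response = urllib2.urlopen('http://www.example.org/')
    print response.code, response.info().gettype(), response.info().getencoding()
    content = response.read()

webcheck's actual setup, shown in the diff below, additionally registers a handler that refuses to follow redirects so they can be recorded and reported.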
Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Sun Mar 6 22:14:27 2011 (r416)
+++ webcheck/crawler.py Sat Jun 18 16:08:40 2011 (r417)
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
-# Copyright (C) 2005, 2006, 2007, 2008 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -32,11 +32,16 @@
import urlparse
import urllib
import robotparser
-import schemes
import parsers
import re
import time
import myurllib
+import urllib2
+import httplib
+import socket
+import atexit
+import cookielib
+import os
# this is a workaround for Python 2.3
try:
@@ -44,6 +49,37 @@
except NameError:
    from sets import Set as set
+# set up our cookie jar
+cookiejar = cookielib.LWPCookieJar('cookies.lwp')
+try:
+    cookiejar.load(ignore_discard=False, ignore_expires=False)
+except IOError:
+    pass
+atexit.register(cookiejar.save, ignore_discard=False, ignore_expires=False)
+
+class RedirectError(urllib2.HTTPError):
+    def __init__(self, url, code, msg, hdrs, fp, newurl):
+        self.newurl = newurl
+        urllib2.HTTPError.__init__(self, url, code, msg, hdrs, fp)
+
+class NoRedirectHandler(urllib2.HTTPRedirectHandler):
+
+    def redirect_request(self, req, fp, code, msg, headers, newurl):
+        raise RedirectError(req.get_full_url(), code, msg, headers, fp, newurl)
+
+
+# set up our custom opener that logs a meaningful user agent
+opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar),
+                              NoRedirectHandler())
+opener.addheaders = [
+    ('User-agent', 'webcheck %s' % config.VERSION),
+    ]
+if config.BYPASSHTTPCACHE:
+    opener.addheaders.append(('Cache-control', 'no-cache'))
+    opener.addheaders.append(('Pragma', 'no-cache'))
+
+urllib2.install_opener(opener)
+
+
# pattern for matching spaces
_spacepattern = re.compile(' ')
@@ -254,7 +290,7 @@
    def postprocess(self):
        """Do some basic post processing of the collected data, including
-        depth of every link."""
+        depth calculation of every link."""
        # build the list of urls that were set up with add_internal() that
        # do not have a parent (they form the base for the site)
        for url in self._internal_urls:
@@ -518,31 +554,65 @@
    def fetch(self):
        """Attempt to fetch the url (if isyanked is not True) and fill in link
        attributes (based on isinternal)."""
+        debugio.info(' %s' % self.url)
        # fully ignore links that should not be feteched
        if self.isyanked:
-            debugio.info(' %s' % self.url)
            debugio.info(' ' + self.isyanked)
            return
        # see if we can import the proper module for this scheme
-        schememodule = schemes.get_schememodule(self.scheme)
-        if schememodule is None:
-            self.isyanked = 'unsupported scheme (' + self.scheme + ')'
-            self._ischanged = True
-            debugio.info(' %s' % self.url)
-            debugio.info(' ' + self.isyanked)
+        try:
+            # FIXME: if an URI has a username:passwd add the uri, username and password to the HTTPPasswordMgr
+            request = urllib2.Request(self.url)
+            if self.parents:
+                request.add_header('Referer', iter(self.parents).next().url)
+            response = urllib2.urlopen(request)
+            self.mimetype = response.info().gettype()
+            self.set_encoding(response.info().getencoding())
+            # FIXME: get result code and other stuff
+            self.status = str(response.code)
+            # link.size = int(response.getheader('Content-length'))
+            # link.mtime = time.mktime(response.msg.getdate('Last-Modified'))
+            # if response.status == 301: link.add_linkproblem(str(response.status)+': '+response.reason)
+            # elif response.status != 200: link.add_linkproblem(str(response.status)+': '+response.reason)
+            # TODO: add checking for size
+        except RedirectError, e:
+            self.status = str(e.code)
+            debugio.info(' ' + str(e))
+            if e.code == 301:
+                self.add_linkproblem(str(e))
+            self.redirect(e.newurl)
            return
-        debugio.info(' %s' % self.url)
-        content = schememodule.fetch(self, parsers.get_mimetypes())
-        self.isfetched = True
-        self._ischanged = True
-        # skip parsing of content if we were returned nothing
-        if content is None:
+        except urllib2.HTTPError, e:
+            self.status = str(e.code)
+            debugio.info(' ' + str(e))
+            self.add_linkproblem(str(e))
+            return
+        except urllib2.URLError, e:
+            debugio.info(' ' + str(e))
+            self.add_linkproblem(str(e))
+            return
+        except KeyboardInterrupt:
+            # handle this in a higher-level exception handler
+            raise
+        except Exception, e:
+            # handle all other exceptions
+            debugio.warn('unknown exception caught: '+str(e))
+            self.add_linkproblem('error reading HTTP response: '+str(e))
+            import traceback
+            traceback.print_exc()
            return
+        finally:
+            self.isfetched = True
+            self._ischanged = True
        # find a parser for the content-type
        parsermodule = parsers.get_parsermodule(self.mimetype)
        if parsermodule is None:
            debugio.debug('crawler.Link.fetch(): unsupported content-type: %s'
                          % self.mimetype)
            return
+        # skip parsing of content if we were returned nothing
+        content = response.read()
+        if content is None:
+            return
        # parse the content
        debugio.debug('crawler.Link.fetch(): parsing using %s' %
                      parsermodule.__name__)
        try:
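For what the redirect handling above means in practice: NoRedirectHandler turns every HTTP redirect into a RedirectError that carries the target location, so fetch() can record the redirect and queue the new URL itself instead of letting urllib2 follow it silently. A rough illustration of catching it, assuming the opener from the diff is installed (the URL is a placeholder):

    try:
        response = urllib2.urlopen('http://www.example.org/old-page')
    except RedirectError, e:
        # e.code is the HTTP status (301, 302, ...); e.newurl is the target
        print 'redirected with %d to %s' % (e.code, e.newurl)
    except urllib2.HTTPError, e:
        # any other error status ends up here
        print 'HTTP error:', e

Because RedirectError subclasses urllib2.HTTPError, it has to be caught before the generic HTTPError clause, which is the order used in fetch() above.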
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits