webcheck commit: r424 - webcheck
[Date Prev][Date Next] [Thread Prev][Thread Next]
webcheck commit: r424 - webcheck
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r424 - webcheck
- Date: Wed, 10 Aug 2011 22:30:46 +0200 (CEST)
Author: arthur
Date: Wed Aug 10 22:30:43 2011
New Revision: 424
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=424
Log:
ensure that the cookies file is generated in the output directory
Modified:
webcheck/crawler.py
webcheck/webcheck.py
Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Wed Aug 10 22:11:18 2011 (r423)
+++ webcheck/crawler.py Wed Aug 10 22:30:43 2011 (r424)
@@ -46,35 +46,38 @@
import parsers
-# set up our cookie jar
-cookiejar = cookielib.LWPCookieJar('cookies.lwp')
-try:
- cookiejar.load(ignore_discard=False, ignore_expires=False)
-except IOError:
- pass
-atexit.register(cookiejar.save, ignore_discard=False, ignore_expires=False)
-
class RedirectError(urllib2.HTTPError):
def __init__(self, url, code, msg, hdrs, fp, newurl):
self.newurl = newurl
urllib2.HTTPError.__init__(self, url, code, msg, hdrs, fp)
+
class NoRedirectHandler(urllib2.HTTPRedirectHandler):
def redirect_request(self, req, fp, code, msg, headers, newurl):
raise RedirectError(req.get_full_url(), code, msg, headers, fp, newurl)
-# set up our custom opener that logs a meaningful user agent
-opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar), NoRedirectHandler())
-opener.addheaders = [
- ('User-agent', 'webcheck %s' % config.VERSION),
- ]
-if config.BYPASSHTTPCACHE:
- opener.addheaders.append(('Cache-control', 'no-cache'))
- opener.addheaders.append(('Pragma', 'no-cache'))
-
-urllib2.install_opener(opener)
+def setup_urllib2():
+ """Configure the urllib2 module to store cookies in the output
+ directory."""
+ filename = os.path.join(config.OUTPUT_DIR, 'cookies.lwp')
+ # set up our cookie jar
+ cookiejar = cookielib.LWPCookieJar(filename)
+ try:
+ cookiejar.load(ignore_discard=False, ignore_expires=False)
+ except IOError:
+ pass
+ atexit.register(cookiejar.save, ignore_discard=False, ignore_expires=False)
+ # set up our custom opener that sets a meaningful user agent
+ opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar), NoRedirectHandler())
+ opener.addheaders = [
+ ('User-agent', 'webcheck %s' % config.VERSION),
+ ]
+ if config.BYPASSHTTPCACHE:
+ opener.addheaders.append(('Cache-control', 'no-cache'))
+ opener.addheaders.append(('Pragma', 'no-cache'))
+ urllib2.install_opener(opener)
# pattern for matching spaces
Modified: webcheck/webcheck.py
==============================================================================
--- webcheck/webcheck.py Wed Aug 10 22:11:18 2011 (r423)
+++ webcheck/webcheck.py Wed Aug 10 22:30:43 2011 (r424)
@@ -241,6 +241,7 @@
"""Main program."""
# crawl through the website
debugio.info('checking site....')
+ crawler.setup_urllib2()
site.crawl() # this will take a while
debugio.info('done.')
# do postprocessing (building site structure, etc)
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits
- webcheck commit: r424 - webcheck,
Commits of the webcheck project