lists.arthurdejong.org
RSS feed

webcheck commit: r424 - webcheck

[Date Prev][Date Next] [Thread Prev][Thread Next]

webcheck commit: r424 - webcheck



Author: arthur
Date: Wed Aug 10 22:30:43 2011
New Revision: 424
URL: http://arthurdejong.org/viewvc/webcheck?view=rev&revision=424

Log:
ensure that the cookies file is generated in the output directory

Modified:
   webcheck/crawler.py
   webcheck/webcheck.py

Modified: webcheck/crawler.py
==============================================================================
--- webcheck/crawler.py Wed Aug 10 22:11:18 2011        (r423)
+++ webcheck/crawler.py Wed Aug 10 22:30:43 2011        (r424)
@@ -46,35 +46,38 @@
 import parsers
 
 
-# set up our cookie jar
-cookiejar = cookielib.LWPCookieJar('cookies.lwp')
-try:
-    cookiejar.load(ignore_discard=False, ignore_expires=False)
-except IOError:
-    pass
-atexit.register(cookiejar.save, ignore_discard=False, ignore_expires=False)
-
 class RedirectError(urllib2.HTTPError):
     def __init__(self, url, code, msg, hdrs, fp, newurl):
         self.newurl = newurl
         urllib2.HTTPError.__init__(self, url, code, msg, hdrs, fp)
 
+
 class NoRedirectHandler(urllib2.HTTPRedirectHandler):
 
     def redirect_request(self, req, fp, code, msg, headers, newurl):
         raise RedirectError(req.get_full_url(), code, msg, headers, fp, newurl)
 
 
-# set up our custom opener that logs a meaningful user agent
-opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar), NoRedirectHandler())
-opener.addheaders = [
-  ('User-agent', 'webcheck %s' % config.VERSION),
-  ]
-if config.BYPASSHTTPCACHE:
-    opener.addheaders.append(('Cache-control', 'no-cache'))
-    opener.addheaders.append(('Pragma', 'no-cache'))
-
-urllib2.install_opener(opener)
+def setup_urllib2():
+    """Configure the urllib2 module to store cookies in the output
+    directory."""
+    filename = os.path.join(config.OUTPUT_DIR, 'cookies.lwp')
+    # set up our cookie jar
+    cookiejar = cookielib.LWPCookieJar(filename)
+    try:
+        cookiejar.load(ignore_discard=False, ignore_expires=False)
+    except IOError:
+        pass
+    atexit.register(cookiejar.save, ignore_discard=False, ignore_expires=False)
+    # set up our custom opener that sets a meaningful user agent
+    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar), NoRedirectHandler())
+    opener.addheaders = [
+      ('User-agent', 'webcheck %s' % config.VERSION),
+      ]
+    if config.BYPASSHTTPCACHE:
+        opener.addheaders.append(('Cache-control', 'no-cache'))
+        opener.addheaders.append(('Pragma', 'no-cache'))
+    urllib2.install_opener(opener)
 
 
 # pattern for matching spaces

Modified: webcheck/webcheck.py
==============================================================================
--- webcheck/webcheck.py        Wed Aug 10 22:11:18 2011        (r423)
+++ webcheck/webcheck.py        Wed Aug 10 22:30:43 2011        (r424)
@@ -241,6 +241,7 @@
     """Main program."""
     # crawl through the website
     debugio.info('checking site....')
+    crawler.setup_urllib2()
     site.crawl() # this will take a while
     debugio.info('done.')
     # do postprocessing (building site structure, etc)
-- 
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits