webcheck commit: r450 - in webcheck: . webcheck
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck commit: r450 - in webcheck: . webcheck
- Date: Fri, 7 Oct 2011 15:21:24 +0200 (CEST)
Author: arthur
Date: Fri Oct 7 15:21:22 2011
New Revision: 450
URL: http://arthurdejong.org/viewvc/webcheck?revision=450&view=revision
Log:
rename Crawler.add_internal() to Crawler.add_base() and automatically
initialise database connection when needed
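
The heart of the change is that setup_database() is now idempotent, so
add_base(), crawl(), postprocess() and generate() can each call it on
demand instead of relying on cmd.py to do the setup up front. A minimal
sketch of that guard pattern (illustrative only; the committed code uses
a database_configed attribute and also creates the output directory and
the sqlite engine):

class LazySetup(object):

    def setup_database(self):
        # later calls return immediately once the database is configured
        if getattr(self, '_database_configured', False):
            return
        self._database_configured = True
        # one-time initialisation would go here: create the output
        # directory, bind the sqlite engine and create the tables

    def add_base(self, url):
        # make sure the database is available before using it
        self.setup_database()
        # cleaning and recording of the url is omitted in this sketch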
Modified:
webcheck/cmd.py
webcheck/webcheck/crawler.py
Modified: webcheck/cmd.py
==============================================================================
--- webcheck/cmd.py Fri Oct 7 15:11:23 2011 (r449)
+++ webcheck/cmd.py Fri Oct 7 15:21:22 2011 (r450)
@@ -150,11 +150,6 @@
print_usage()
print_tryhelp()
sys.exit(1)
- # ensure output directory exists
- if not os.path.isdir(config.OUTPUT_DIR):
- os.mkdir(config.OUTPUT_DIR)
- # set up database connection
- crawler.setup_database()
# add configuration to site
for pattern in internal_urls:
crawler.add_internal_re(pattern)
@@ -166,7 +161,7 @@
# if it does not look like a url it is probably a local file
if urlparse.urlsplit(arg)[0] == '':
arg = 'file://' + urllib.pathname2url(os.path.abspath(arg))
- crawler.add_internal(arg)
+ crawler.add_base(arg)
except getopt.error, reason:
sys.stderr.write('webcheck: %s\n' % reason)
print_tryhelp()
Modified: webcheck/webcheck/crawler.py
==============================================================================
--- webcheck/webcheck/crawler.py Fri Oct 7 15:11:23 2011 (r449)
+++ webcheck/webcheck/crawler.py Fri Oct 7 15:21:22 2011 (r450)
@@ -116,6 +116,13 @@
self.bases = []
def setup_database(self):
+ if hasattr(self, 'database_configed'):
+ return
+ self.database_configed = True
+ # ensure output directory exists
+ if not os.path.isdir(config.OUTPUT_DIR):
+ os.mkdir(config.OUTPUT_DIR)
+ # open the sqlite file
filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
engine = create_engine('sqlite:///' + filename)
Session.configure(bind=engine)
@@ -123,9 +130,12 @@
Base.metadata.create_all(engine)
# TODO: schema migration goes here
- def add_internal(self, url):
+ def add_base(self, url):
"""Add the given url and consider all urls below it to be internal.
These links are all marked for checking with the crawl() function."""
+ # ensure we have a connection to the database
+ self.setup_database()
+ # clean the URL and add it
url = Link.clean_url(url)
if url not in self._internal_urls:
self._internal_urls.add(url)
@@ -146,9 +156,9 @@
self._yanked_res[exp] = re.compile(exp, re.IGNORECASE)
def _is_internal(self, url):
- """Check whether the specified url is external or internal.
- This uses the urls marked with add_internal() and the regular
- expressions passed with add_external_re()."""
+ """Check whether the specified url is external or internal. This
+ uses the urls marked with add_base() and the regular expressions
+ passed with add_external_re()."""
# check if it is internal through the regexps
for regexp in self._internal_res.values():
if regexp.search(url) is not None:
@@ -245,10 +255,11 @@
return links.filter(Link.yanked == None)
def crawl(self):
- """Crawl the website based on the urls specified with
- add_internal(). If the serialization file pointer
- is specified the crawler writes out updated links to
- the file while crawling the site."""
+ """Crawl the website based on the urls specified with add_base().
+ If the serialization file pointer is specified the crawler writes
+ out updated links to the file while crawling the site."""
+ # connect to the database
+ self.setup_database()
# configure urllib2 to store cookies in the output directory
_setup_urllib2()
# get a database session
@@ -379,9 +390,11 @@
def postprocess(self):
"""Do some basic post processing of the collected data, including
depth calculation of every link."""
+ # ensure we have a connection to the database
+ self.setup_database()
# get a database session
session = Session()
- # build the list of urls that were set up with add_internal() that
+ # build the list of urls that were set up with add_base() that
# do not have a parent (they form the base for the site)
for url in self._internal_urls:
link = self.get_link(session, url).follow_link()
@@ -425,6 +438,9 @@
def generate(self):
"""Generate pages for plugins."""
+ # ensure we have a connection to the database
+ self.setup_database()
+ # call all the plugins
for plugin in config.PLUGINS:
# import the plugin
pluginmod = __import__(plugin, globals(), locals(), [plugin])
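
After this change a driver such as cmd.py only needs the high-level
calls; the output directory and database connection are created lazily.
An illustrative caller (not part of the commit; the import path and the
bare constructor are assumptions):

from webcheck.crawler import Crawler

site = Crawler()
site.add_base('http://www.example.org/')  # was add_internal() before r450
site.crawl()        # setup_database() is invoked internally when needed
site.postprocess()
site.generate()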