webcheck branch master updated. 1.10.4-78-g54bb33a
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck branch master updated. 1.10.4-78-g54bb33a
- Date: Sat, 28 Sep 2013 20:27:50 +0200 (CEST)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "webcheck".
The branch, master has been updated
via 54bb33a8f68704ee690352170040cbfb2aea10ea (commit)
via a07121261ca863c4815311eaa7e685e25be32b6b (commit)
from 0b341a9e87a7743726e25a8bcb2df7f698f369b0 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://arthurdejong.org/git/webcheck/commit/?id=54bb33a8f68704ee690352170040cbfb2aea10ea
commit 54bb33a8f68704ee690352170040cbfb2aea10ea
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Sep 22 22:49:07 2013 +0200
Use crawler.base_urls instead of crawler.bases
Exposing crawler.bases leaks the SQLAlchemy session to the plugins, which
seems to cause problems in some cases.
As a consequence of this change, the sitemap plugin now uses its own
session.
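For plugin authors the upshot is roughly the following (a minimal sketch, assuming only the Session and Link names that webcheck.db exports as shown in the diff below; the plugin itself and its report body are placeholders): resolve crawler.base_urls to Link objects in a session the plugin opens and closes itself, instead of reading crawler.bases.

    from webcheck.db import Session, Link

    def generate(crawler):
        """Hypothetical plugin: build a report from the base links."""
        session = Session()
        # look up the Link object for each configured base URL
        links = [session.query(Link).filter_by(url=url).first()
                 for url in crawler.base_urls]
        ...  # render the report from the Link objects
        session.close()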
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index d6f49f5..f16cf5a 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -121,7 +121,7 @@ class Crawler(object):
The available properties of this class are:
site_name - the name of the website that is crawled
- bases - a list of base link object
+ base_urls - a list of base URLs
plugins - a list of plugin modules used by the crawler
"""
@@ -162,17 +162,17 @@ class Crawler(object):
__import__(plugin, globals(), locals(), [plugin])
for plugin in config.PLUGINS]
# add base urls
- self._internal_urls = set()
+ self.base_urls = []
for url in self.cfg.base_urls:
# if it does not look like a url it is probably a local file
if urlparse.urlsplit(url)[0] == '':
url = 'file://' + urllib.pathname2url(os.path.abspath(url))
# clean the URL and add it
url = Link.clean_url(url)
- if url not in self._internal_urls:
- self._internal_urls.add(url)
- # list of base link objects
- self.bases = []
+ if url not in self.base_urls:
+ self.base_urls.append(url)
+ # set up empty site name
+ self.site_name = None
def setup_database(self):
if hasattr(self, 'database_configed'):
@@ -193,27 +193,19 @@ class Crawler(object):
"""Check whether the specified url is external or internal. This
uses the urls marked with add_base() and the regular expressions
passed with add_external_re()."""
- # check if it is internal through the regexps
for regexp in self._internal_res.values():
if regexp.search(url) is not None:
return True
- res = False
- # check that the url starts with an internal url
if config.BASE_URLS_ONLY:
- # the url must start with one of the _internal_urls
- for i in self._internal_urls:
- res |= (i == url[:len(i)])
+ # the url must start with one of the base URLs
+ if not any(url.startswith(x) for x in self.base_urls):
+ return False
else:
# the netloc must match a netloc of an _internal_url
netloc = urlparse.urlsplit(url)[1]
- for i in self._internal_urls:
- res |= (urlparse.urlsplit(i)[1] == netloc)
- # if it is not internal now, it never will be
- if not res:
- return False
- # check if it is external through the regexps
+ if not any((urlparse.urlsplit(x)[1] == netloc) for x in self.base_urls):
+ return False
for x in self._external_res.values():
- # if the url matches it is external and we can stop
if x.search(url):
return False
return True
@@ -303,7 +295,7 @@ class Crawler(object):
if not config.CONTINUE:
truncate_db()
# add all internal urls to the database
- for url in self._internal_urls:
+ for url in self.base_urls:
url = Link.clean_url(url)
self.get_link(session, url)
# add some URLs from the database that haven't been fetched
@@ -424,27 +416,28 @@ class Crawler(object):
session = Session()
# build the list of urls that were set up with add_base() that
# do not have a parent (they form the base for the site)
- for url in self._internal_urls:
+ bases = []
+ for url in list(self.base_urls):
link = self.get_link(session, url).follow_link()
if not link:
logger.warn('base link %s redirects to nowhere', url)
- continue
- # add the link to bases
- logger.debug('adding %s to bases', link.url)
- self.bases.append(link)
- # if we got no bases, just use the first internal one
- if not self.bases:
+ self.base_urls.remove(url)
+ else:
+ bases.append(link)
+ # if we got no base URLs, just use the first internal one we find
+ if not self.base_urls:
link = session.query(Link).filter(Link.is_internal == True).first()
- logger.debug('fallback to adding %s to bases', link.url)
- self.bases.append(link)
+ logger.debug('fallback to adding %s to base urls', link.url)
+ self.base_urls.append(link.url)
+ bases.append(link)
# set the site name
- self.site_name = self.bases[0].title or self.bases[0].url
+ self.site_name = bases[0].title or bases[0].url
# do a breadth first traversal of the website to determine depth
session.query(Link).update(dict(depth=None), synchronize_session=False)
session.commit()
depth = 0
- count = len(self.bases)
- for link in self.bases:
+ count = len(bases)
+ for link in bases:
link.depth = 0
session.commit()
while count > 0:
@@ -458,12 +451,13 @@ class Crawler(object):
session.commit()
depth += 1
# TODO: also handle embeds
+ session.commit()
+ session.close()
# see if any of the plugins want to do postprocessing
for plugin in self.plugins:
if hasattr(plugin, 'postprocess'):
logger.info(plugin.__name__)
plugin.postprocess(self)
- #session.close() do not close because bases uses the session
def generate(self):
"""Generate pages for plugins."""
diff --git a/webcheck/plugins/sitemap.py b/webcheck/plugins/sitemap.py
index 55bff59..5c13d59 100644
--- a/webcheck/plugins/sitemap.py
+++ b/webcheck/plugins/sitemap.py
@@ -29,7 +29,7 @@ __author__ = 'Arthur de Jong'
__outputfile__ = 'index.html'
from webcheck import config
-from webcheck.db import Link
+from webcheck.db import Session, Link
from webcheck.output import render
@@ -71,6 +71,9 @@ def explore(links, explored=None, depth=0):
def generate(crawler):
"""Output the sitemap."""
- links = explore(crawler.bases)
+ session = Session()
+ links = [session.query(Link).filter_by(url=url).first()
+ for url in crawler.base_urls]
+ links = explore(links)
render(__outputfile__, crawler=crawler, title=__title__,
links=links)
http://arthurdejong.org/git/webcheck/commit/?id=a07121261ca863c4815311eaa7e685e25be32b6b
commit a07121261ca863c4815311eaa7e685e25be32b6b
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sat Sep 28 20:02:14 2013 +0200
Introduce a site_name in the crawler
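The practical effect for report code is sketched below (hedged: the plugin and template names are made up for illustration). render() no longer derives a sitename keyword from crawler.bases; the base template reads crawler.site_name directly, so a plugin only has to pass the crawler along.

    from webcheck.output import render

    def generate(crawler):
        """Hypothetical plugin using the new site_name attribute."""
        # base.html renders 'Webcheck report for {{ crawler.site_name }}'
        render('myreport.html', crawler=crawler, title='my report')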
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index d126ca7..d6f49f5 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -120,6 +120,7 @@ class Crawler(object):
The available properties of this class are:
+ site_name - the name of the website that is crawled
bases - a list of base link object
plugins - a list of plugin modules used by the crawler
"""
@@ -154,6 +155,8 @@ class Crawler(object):
config.WAIT_BETWEEN_REQUESTS = self.cfg.wait
# map of scheme+netloc to robot parsers
self._robotparsers = {}
+ # set up empty site name
+ self.site_name = None
# load the plugins
self.plugins = [
__import__(plugin, globals(), locals(), [plugin])
@@ -434,6 +437,8 @@ class Crawler(object):
link = session.query(Link).filter(Link.is_internal == True).first()
logger.debug('fallback to adding %s to bases', link.url)
self.bases.append(link)
+ # set the site name
+ self.site_name = self.bases[0].title or self.bases[0].url
# do a breadth first traversal of the website to determine depth
session.query(Link).update(dict(depth=None), synchronize_session=False)
session.commit()
diff --git a/webcheck/output.py b/webcheck/output.py
index 356cf30..6811338 100644
--- a/webcheck/output.py
+++ b/webcheck/output.py
@@ -132,9 +132,6 @@ def render(output_file, **kwargs):
kwargs.setdefault('webcheck', webcheck)
kwargs.setdefault('output_file', output_file)
kwargs.setdefault('time', time.ctime(time.time()))
- crawler = kwargs.get('crawler', None)
- if crawler:
- kwargs.setdefault('sitename', crawler.bases[0].title or crawler.bases[0].url)
kwargs.setdefault('Link', Link)
kwargs.setdefault('config', config)
template = env.get_template(output_file)
diff --git a/webcheck/templates/base.html b/webcheck/templates/base.html
index 0cdb2d1..439fa3c 100644
--- a/webcheck/templates/base.html
+++ b/webcheck/templates/base.html
@@ -26,7 +26,7 @@
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
- <title>Webcheck report for {{ sitename }} ({{ title }})</title>
+ <title>Webcheck report for {{ crawler.site_name }} ({{ title }})</title>
<link rel="stylesheet" type="text/css" href="webcheck.css" />
<link rel="icon" href="favicon.ico" type="image/ico" />
<link rel="shortcut icon" href="favicon.ico" />
@@ -34,7 +34,7 @@
<meta name="Generator" content="webcheck {{ webcheck.__version__ }}" />
</head>
<body>
- <h1 class="basename">Webcheck report for {{ sitename }}</a></h1>
+ <h1 class="basename">Webcheck report for {{ crawler.site_name }}</a></h1>
<ul class="navbar">
{% for plugin in crawler.plugins %}
-----------------------------------------------------------------------
Summary of changes:
webcheck/crawler.py | 63 +++++++++++++++++++++---------------------
webcheck/output.py | 3 --
webcheck/plugins/sitemap.py | 7 +++--
webcheck/templates/base.html | 4 +--
4 files changed, 38 insertions(+), 39 deletions(-)
hooks/post-receive
--
webcheck
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits/