webcheck branch master updated. 1.10.4-78-g54bb33a
- From: Commits of the webcheck project <webcheck-commits [at] lists.arthurdejong.org>
- To: webcheck-commits [at] lists.arthurdejong.org
- Reply-to: webcheck-users [at] lists.arthurdejong.org
- Subject: webcheck branch master updated. 1.10.4-78-g54bb33a
- Date: Sat, 28 Sep 2013 20:27:50 +0200 (CEST)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "webcheck".
The branch, master has been updated
via 54bb33a8f68704ee690352170040cbfb2aea10ea (commit)
via a07121261ca863c4815311eaa7e685e25be32b6b (commit)
from 0b341a9e87a7743726e25a8bcb2df7f698f369b0 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://arthurdejong.org/git/webcheck/commit/?id=54bb33a8f68704ee690352170040cbfb2aea10ea
commit 54bb33a8f68704ee690352170040cbfb2aea10ea
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Sep 22 22:49:07 2013 +0200
Use crawler.base_urls instead of crawler.bases
Exposing crawler.bases leaks the SQLAlchemy session to the plugins, which
seems to cause problems in some cases.
As a consequence of this change, the sitemap plugin now uses its own
session.
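For plugin authors the upshot is roughly the following (a minimal sketch, assuming only the Session and Link names that webcheck.db exports as shown in the diff below; the plugin itself and its report body are placeholders): resolve crawler.base_urls to Link objects in a session the plugin opens and closes itself, instead of reading crawler.bases.

    from webcheck.db import Session, Link

    def generate(crawler):
        """Hypothetical plugin: build a report from the base links."""
        session = Session()
        # look up the Link object for each configured base URL
        links = [session.query(Link).filter_by(url=url).first()
                 for url in crawler.base_urls]
        ...  # render the report from the Link objects
        session.close()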
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index d6f49f5..f16cf5a 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -121,7 +121,7 @@ class Crawler(object):
The available properties of this class are:
site_name - the name of the website that is crawled
- bases - a list of base link object
+ base_urls - a list of base URLs
plugins - a list of plugin modules used by the crawler
"""
@@ -162,17 +162,17 @@ class Crawler(object):
__import__(plugin, globals(), locals(), [plugin])
for plugin in config.PLUGINS]
# add base urls
- self._internal_urls = set()
+ self.base_urls = []
for url in self.cfg.base_urls:
# if it does not look like a url it is probably a local file
if urlparse.urlsplit(url)[0] == '':
url = 'file://' + urllib.pathname2url(os.path.abspath(url))
# clean the URL and add it
url = Link.clean_url(url)
- if url not in self._internal_urls:
- self._internal_urls.add(url)
- # list of base link objects
- self.bases = []
+ if url not in self.base_urls:
+ self.base_urls.append(url)
+ # set up empty site name
+ self.site_name = None
def setup_database(self):
if hasattr(self, 'database_configed'):
@@ -193,27 +193,19 @@ class Crawler(object):
"""Check whether the specified url is external or internal. This
uses the urls marked with add_base() and the regular expressions
passed with add_external_re()."""
- # check if it is internal through the regexps
for regexp in self._internal_res.values():
if regexp.search(url) is not None:
return True
- res = False
- # check that the url starts with an internal url
if config.BASE_URLS_ONLY:
- # the url must start with one of the _internal_urls
- for i in self._internal_urls:
- res |= (i == url[:len(i)])
+ # the url must start with one of the base URLs
+ if not any(url.startswith(x) for x in self.base_urls):
+ return False
else:
# the netloc must match a netloc of an _internal_url
netloc = urlparse.urlsplit(url)[1]
- for i in self._internal_urls:
- res |= (urlparse.urlsplit(i)[1] == netloc)
- # if it is not internal now, it never will be
- if not res:
- return False
- # check if it is external through the regexps
+ if not any((urlparse.urlsplit(x)[1] == netloc) for x in self.base_urls):
+ return False
for x in self._external_res.values():
- # if the url matches it is external and we can stop
if x.search(url):
return False
return True
@@ -303,7 +295,7 @@ class Crawler(object):
if not config.CONTINUE:
truncate_db()
# add all internal urls to the database
- for url in self._internal_urls:
+ for url in self.base_urls:
url = Link.clean_url(url)
self.get_link(session, url)
# add some URLs from the database that haven't been fetched
@@ -424,27 +416,28 @@ class Crawler(object):
session = Session()
# build the list of urls that were set up with add_base() that
# do not have a parent (they form the base for the site)
- for url in self._internal_urls:
+ bases = []
+ for url in list(self.base_urls):
link = self.get_link(session, url).follow_link()
if not link:
logger.warn('base link %s redirects to nowhere', url)
- continue
- # add the link to bases
- logger.debug('adding %s to bases', link.url)
- self.bases.append(link)
- # if we got no bases, just use the first internal one
- if not self.bases:
+ self.base_urls.remove(url)
+ else:
+ bases.append(link)
+ # if we got no base URLs, just use the first internal one we find
+ if not self.base_urls:
link = session.query(Link).filter(Link.is_internal == True).first()
- logger.debug('fallback to adding %s to bases', link.url)
- self.bases.append(link)
+ logger.debug('fallback to adding %s to base urls', link.url)
+ self.base_urls.append(link.url)
+ bases.append(link)
# set the site name
- self.site_name = self.bases[0].title or self.bases[0].url
+ self.site_name = bases[0].title or bases[0].url
# do a breadth first traversal of the website to determine depth
session.query(Link).update(dict(depth=None), synchronize_session=False)
session.commit()
depth = 0
- count = len(self.bases)
- for link in self.bases:
+ count = len(bases)
+ for link in bases:
link.depth = 0
session.commit()
while count > 0:
@@ -458,12 +451,13 @@ class Crawler(object):
session.commit()
depth += 1
# TODO: also handle embeds
+ session.commit()
+ session.close()
# see if any of the plugins want to do postprocessing
for plugin in self.plugins:
if hasattr(plugin, 'postprocess'):
logger.info(plugin.__name__)
plugin.postprocess(self)
- #session.close() do not close because bases uses the session
def generate(self):
"""Generate pages for plugins."""
diff --git a/webcheck/plugins/sitemap.py b/webcheck/plugins/sitemap.py
index 55bff59..5c13d59 100644
--- a/webcheck/plugins/sitemap.py
+++ b/webcheck/plugins/sitemap.py
@@ -29,7 +29,7 @@ __author__ = 'Arthur de Jong'
__outputfile__ = 'index.html'
from webcheck import config
-from webcheck.db import Link
+from webcheck.db import Session, Link
from webcheck.output import render
@@ -71,6 +71,9 @@ def explore(links, explored=None, depth=0):
def generate(crawler):
"""Output the sitemap."""
- links = explore(crawler.bases)
+ session = Session()
+ links = [session.query(Link).filter_by(url=url).first()
+ for url in crawler.base_urls]
+ links = explore(links)
render(__outputfile__, crawler=crawler, title=__title__,
links=links)
http://arthurdejong.org/git/webcheck/commit/?id=a07121261ca863c4815311eaa7e685e25be32b6b
commit a07121261ca863c4815311eaa7e685e25be32b6b
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sat Sep 28 20:02:14 2013 +0200
Introduce a site_name in the crawler
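The practical effect for report code is sketched below (hedged: the plugin and template names are made up for illustration). render() no longer derives a sitename keyword from crawler.bases; the base template reads crawler.site_name directly, so a plugin only has to pass the crawler along.

    from webcheck.output import render

    def generate(crawler):
        """Hypothetical plugin using the new site_name attribute."""
        # base.html renders 'Webcheck report for {{ crawler.site_name }}'
        render('myreport.html', crawler=crawler, title='my report')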
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index d126ca7..d6f49f5 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -120,6 +120,7 @@ class Crawler(object):
The available properties of this class are:
+ site_name - the name of the website that is crawled
bases - a list of base link object
plugins - a list of plugin modules used by the crawler
"""
@@ -154,6 +155,8 @@ class Crawler(object):
config.WAIT_BETWEEN_REQUESTS = self.cfg.wait
# map of scheme+netloc to robot parsers
self._robotparsers = {}
+ # set up empty site name
+ self.site_name = None
# load the plugins
self.plugins = [
__import__(plugin, globals(), locals(), [plugin])
@@ -434,6 +437,8 @@ class Crawler(object):
link = session.query(Link).filter(Link.is_internal == True).first()
logger.debug('fallback to adding %s to bases', link.url)
self.bases.append(link)
+ # set the site name
+ self.site_name = self.bases[0].title or self.bases[0].url
# do a breadth first traversal of the website to determine depth
session.query(Link).update(dict(depth=None), synchronize_session=False)
session.commit()
diff --git a/webcheck/output.py b/webcheck/output.py
index 356cf30..6811338 100644
--- a/webcheck/output.py
+++ b/webcheck/output.py
@@ -132,9 +132,6 @@ def render(output_file, **kwargs):
kwargs.setdefault('webcheck', webcheck)
kwargs.setdefault('output_file', output_file)
kwargs.setdefault('time', time.ctime(time.time()))
- crawler = kwargs.get('crawler', None)
- if crawler:
- kwargs.setdefault('sitename', crawler.bases[0].title or crawler.bases[0].url)
kwargs.setdefault('Link', Link)
kwargs.setdefault('config', config)
template = env.get_template(output_file)
diff --git a/webcheck/templates/base.html b/webcheck/templates/base.html
index 0cdb2d1..439fa3c 100644
--- a/webcheck/templates/base.html
+++ b/webcheck/templates/base.html
@@ -26,7 +26,7 @@
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
- <title>Webcheck report for {{ sitename }} ({{ title }})</title>
+ <title>Webcheck report for {{ crawler.site_name }} ({{ title }})</title>
<link rel="stylesheet" type="text/css" href="webcheck.css" />
<link rel="icon" href="favicon.ico" type="image/ico" />
<link rel="shortcut icon" href="favicon.ico" />
@@ -34,7 +34,7 @@
<meta name="Generator" content="webcheck {{ webcheck.__version__ }}" />
</head>
<body>
- <h1 class="basename">Webcheck report for {{ sitename }}</a></h1>
+ <h1 class="basename">Webcheck report for {{ crawler.site_name }}</a></h1>
<ul class="navbar">
{% for plugin in crawler.plugins %}
-----------------------------------------------------------------------
Summary of changes:
webcheck/crawler.py | 63 +++++++++++++++++++++---------------------
webcheck/output.py | 3 --
webcheck/plugins/sitemap.py | 7 +++--
webcheck/templates/base.html | 4 +--
4 files changed, 38 insertions(+), 39 deletions(-)
hooks/post-receive
--
webcheck
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits/