webcheck branch master updated. 1.10.4-94-g2ad066d
- From: Commits of the webcheck project <webcheck-commits@lists.arthurdejong.org>
- To: webcheck-commits@lists.arthurdejong.org
- Reply-to: webcheck-users@lists.arthurdejong.org
- Subject: webcheck branch master updated. 1.10.4-94-g2ad066d
- Date: Sun, 15 Dec 2013 22:35:03 +0100 (CET)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "webcheck".
The branch, master has been updated
       via  2ad066ddad58d7d705f24b28aa822b793e6a47c8 (commit)
       via  0b2aca62c02f109142073aadc4f72956f9a5669a (commit)
       via  59eae290e519197517459b8431ec93b48ed318ae (commit)
       via  bb6955310348aea4b4cc4c9e898639829f281bb7 (commit)
       via  dbedc712f74ecd4b5fad8c100a909d81ee9d88b3 (commit)
       via  e22b5a57749855ec3a2de2241861ffff41a159a1 (commit)
      from  b1a4579b1389fc11bed06e1a6604c9bf4f37dfb5 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://arthurdejong.org/git/webcheck/commit/?id=2ad066ddad58d7d705f24b28aa822b793e6a47c8
commit 2ad066ddad58d7d705f24b28aa822b793e6a47c8
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 18:22:32 2013 +0100
Remove duplicate column definition
diff --git a/webcheck/db.py b/webcheck/db.py
index 77628cb..7b070a4 100644
--- a/webcheck/db.py
+++ b/webcheck/db.py
@@ -70,7 +70,6 @@ class Link(Base):
     # information about the retrieved link
     status = Column(String)
     mimetype = Column(String)
-    mimetype = Column(String)
     encoding = Column(String)
     size = Column(Integer)
     mtime = Column(DateTime, index=True)
http://arthurdejong.org/git/webcheck/commit/?id=0b2aca62c02f109142073aadc4f72956f9a5669a
commit 0b2aca62c02f109142073aadc4f72956f9a5669a
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 17:51:12 2013 +0100
Split functionality into Link.get_or_create()
This splits some common functionality from Link._get_child() and
Crawler.get_link() to the new Link.get_or_create() function.
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index 57c2730..c3f62ed 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -260,13 +260,7 @@ class Crawler(object):
         return None

     def _get_link(self, session, url):
-        # try to find the URL
-        url = Link.clean_url(url)
-        link = session.query(Link).filter_by(url=url).first()
-        if not link:
-            link = Link(url=url)
-            session.add(link)
-        return link
+        return Link.get_or_create(session, Link.clean_url(url))

     def _get_links_to_crawl(self, session):
         links = session.query(Link).filter(Link.fetched == None)
@@ -289,7 +283,6 @@ class Crawler(object):
         truncate_db()
         # add all internal urls to the database
         for url in self.base_urls:
-            url = Link.clean_url(url)
             self._get_link(session, url)
         # add some URLs from the database that haven't been fetched
         tocheck = self._get_links_to_crawl(session)
diff --git a/webcheck/db.py b/webcheck/db.py
index 9ecb487..77628cb 100644
--- a/webcheck/db.py
+++ b/webcheck/db.py
@@ -25,6 +25,7 @@ import urlparse

 from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, ForeignKey
 from sqlalchemy import create_engine
+from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import relationship, backref, sessionmaker
 from sqlalchemy.orm.session import object_session
@@ -98,6 +99,22 @@ class Link(Base):
"""normalise the URL, removing the fragment from the URL"""
return urlparse.urldefrag(normalizeurl(url))[0]
+ @staticmethod
+ def get_or_create(session, url):
+ """This expects a clean url."""
+ session.commit()
+ while True:
+ instance = session.query(Link).filter_by(url=url).first()
+ if instance:
+ return instance
+ try:
+ instance = Link(url=url)
+ session.add(instance)
+ session.commit()
+ return instance
+ except IntegrityError:
+ pass # will try again
+
def _get_child(self, url):
"""Get a link object for the specified URL."""
# get the session
@@ -105,15 +122,9 @@ class Link(Base):
         # normalise the URL, removing the fragment from the URL
         url, fragment = urlparse.urldefrag(normalizeurl(url))
         # try to find the link
-        instance = session.query(Link).filter_by(url=url).first()
-        if not instance:
-            if config.MAX_DEPTH != None and self.depth >= config.MAX_DEPTH:
-                logger.debug('link %s too deep', url)
-            instance = Link(url=url, depth=self.depth + 1)
-            session.add(instance)
-        else:
-            # we may have discovered a shorter path
-            instance.depth = min(instance.depth, self.depth + 1)
+        instance = self.get_or_create(session, url)
+        # we may have discovered a shorter path
+        instance.depth = min(instance.depth, self.depth + 1) or self.depth + 1
         # mark that we were looking for an anchor/fragment
         if fragment:
             instance.add_reqanchor(self, fragment)
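
The get_or_create() introduced above is the usual optimistic-insert pattern:
query first, INSERT on a miss, and go back around the loop when a concurrent
insert makes the INSERT fail with IntegrityError; the commit() calls keep the
session's view fresh across retries. In the _get_child() hunk, the trailing
"or self.depth + 1" handles a freshly created Link whose depth is still None,
since min(None, x) evaluates to None in Python 2. A minimal usage sketch,
assuming Link.url carries a unique constraint (which the IntegrityError
handling implies); the database path is illustrative and setup_db() comes from
the "Move SQLite initialisation to db module" commit below:

    from webcheck.db import Session, Link, setup_db

    setup_db('/tmp/webcheck.sqlite')
    session = Session()

    # clean_url() strips the fragment, so both calls pass the same URL;
    # the second call finds the row the first one inserted.
    a = Link.get_or_create(session, Link.clean_url('http://example.com/page'))
    b = Link.get_or_create(session, Link.clean_url('http://example.com/page#top'))
    assert a is b  # same identity within one session
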
http://arthurdejong.org/git/webcheck/commit/?id=59eae290e519197517459b8431ec93b48ed318ae
commit 59eae290e519197517459b8431ec93b48ed318ae
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 17:46:45 2013 +0100
Rename some functions
This should make some functions clearer and marks internal functions
with a leading underscore.
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index aee5cc8..57c2730 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -259,7 +259,7 @@ class Crawler(object):
         # fall back to allowing the url
         return None

-    def get_link(self, session, url):
+    def _get_link(self, session, url):
         # try to find the URL
         url = Link.clean_url(url)
         link = session.query(Link).filter_by(url=url).first()
@@ -268,7 +268,7 @@ class Crawler(object):
             session.add(link)
         return link

-    def get_links_to_crawl(self, session):
+    def _get_links_to_crawl(self, session):
         links = session.query(Link).filter(Link.fetched == None)
         if config.MAX_DEPTH != None:
             links = links.filter(Link.depth <= config.MAX_DEPTH)
@@ -290,9 +290,9 @@ class Crawler(object):
         # add all internal urls to the database
         for url in self.base_urls:
             url = Link.clean_url(url)
-            self.get_link(session, url)
+            self._get_link(session, url)
         # add some URLs from the database that haven't been fetched
-        tocheck = self.get_links_to_crawl(session)
+        tocheck = self._get_links_to_crawl(session)
         remaining = tocheck.count()
         tocheck = tocheck[:100]
         remaining -= len(tocheck)
@@ -304,7 +304,7 @@ class Crawler(object):
                 link.yanked = self._is_yanked(str(link.url))
             # see if there are any more links to check
             if not tocheck:
-                tocheck = self.get_links_to_crawl(session)
+                tocheck = self._get_links_to_crawl(session)
                 remaining = tocheck.count()
                 tocheck = tocheck[:100]
                 remaining -= len(tocheck)
@@ -312,9 +312,9 @@ class Crawler(object):
             if link.yanked or link.fetched:
                 continue
             # fetch the link's contents
-            response = self.fetch(link)
+            response = self._fetch_link(link)
             if response:
-                self.parse(link, response)
+                self._parse_response(link, response)
             # flush database changes
             session.commit()
             # sleep between requests if configured
@@ -327,9 +327,9 @@ class Crawler(object):
             session.commit()
         session.close()

-    def fetch(self, link):
-        """Attempt to fetch the url (if not yanked) and fill in link
-        attributes (based on is_internal)."""
+    def _fetch_link(self, link):
+        """Attempt to fetch the url and return content. This updates the
+        link with information retrieved."""
         logger.info(link.url)
         # mark the link as fetched to avoid loops
         link.fetched = datetime.datetime.now()
@@ -378,8 +378,8 @@ class Crawler(object):
             logger.exception('unknown exception caught: ' + str(e))
             link.add_linkproblem('error reading HTTP response: %s' % str(e))

-    def parse(self, link, response):
-        """Parse the fetched response."""
+    def _parse_response(self, link, response):
+        """Parse the fetched response content."""
         # find a parser for the content-type
         parsermodule = webcheck.parsers.get_parsermodule(link.mimetype)
         if parsermodule is None:
@@ -411,7 +411,7 @@ class Crawler(object):
         # do not have a parent (they form the base for the site)
         bases = []
         for url in list(self.base_urls):
-            link = self.get_link(session, url).follow_link()
+            link = self._get_link(session, url).follow_link()
             if not link:
                 logger.warn('base link %s redirects to nowhere', url)
                 self.base_urls.remove(url)
diff --git a/webcheck/db.py b/webcheck/db.py
index 1b1a050..9ecb487 100644
--- a/webcheck/db.py
+++ b/webcheck/db.py
@@ -98,7 +98,7 @@ class Link(Base):
"""normalise the URL, removing the fragment from the URL"""
return urlparse.urldefrag(normalizeurl(url))[0]
- def _get_link(self, url):
+ def _get_child(self, url):
"""Get a link object for the specified URL."""
# get the session
session = object_session(self)
@@ -176,7 +176,7 @@ class Link(Base):
         if not self.is_internal:
             return
         # add to children
-        self.children.append(self._get_link(url))
+        self.children.append(self._get_child(url))

     def add_embed(self, url):
         """Mark the given URL as used as an image on this page."""
@@ -184,7 +184,7 @@ class Link(Base):
         if not self.is_internal:
             return
         # add to embedded
-        self.embedded.append(self._get_link(url))
+        self.embedded.append(self._get_child(url))

     def add_anchor(self, anchor):
         """Indicate that this page contains the specified anchor."""
http://arthurdejong.org/git/webcheck/commit/?id=bb6955310348aea4b4cc4c9e898639829f281bb7
commit bb6955310348aea4b4cc4c9e898639829f281bb7
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 17:26:17 2013 +0100
Small simplification
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index 006db1b..aee5cc8 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -342,7 +342,7 @@ class Crawler(object):
                 request.add_header('Referer', parent.url)
             response = urllib2.urlopen(request, timeout=config.IOTIMEOUT)
             info = response.info()
-            link.mimetype = response.info().gettype()
+            link.mimetype = info.gettype()
             link.set_encoding(response.headers.getparam('charset'))
             # get result code and other stuff
             link.status = str(response.code)
http://arthurdejong.org/git/webcheck/commit/?id=dbedc712f74ecd4b5fad8c100a909d81ee9d88b3
commit dbedc712f74ecd4b5fad8c100a909d81ee9d88b3
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 16:47:45 2013 +0100
Move SQLite initialisation to db module
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index b933e28..006db1b 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -41,12 +41,10 @@ import urllib2
 import urlparse

 from webcheck import config
-from webcheck.db import Session, Base, Link, truncate_db
+from webcheck.db import Session, Link, setup_db, truncate_db
 from webcheck.output import install_file
 import webcheck.parsers

-from sqlalchemy import create_engine
-
 logger = logging.getLogger(__name__)
@@ -179,16 +177,10 @@ class Crawler(object):
         if hasattr(self, 'database_configed'):
             return
         self.database_configed = True
-        # ensure output directory exists
         if not os.path.isdir(config.OUTPUT_DIR):
             os.mkdir(config.OUTPUT_DIR)
-        # open the sqlite file
         filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
-        engine = create_engine('sqlite:///' + filename)
-        Session.configure(bind=engine)
-        # ensure that all tables are created
-        Base.metadata.create_all(engine)
-        # TODO: schema migraton goes here
+        setup_db(filename)

     def _is_internal(self, url):
         """Check whether the specified url is external or internal. This
diff --git a/webcheck/db.py b/webcheck/db.py
index 297771d..1b1a050 100644
--- a/webcheck/db.py
+++ b/webcheck/db.py
@@ -1,7 +1,7 @@
 # db.py - database access layer for webcheck
 #
-# Copyright (C) 2011 Arthur de Jong
+# Copyright (C) 2011, 2013 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
@@ -23,9 +23,9 @@
 import logging
 import urlparse

-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import func
 from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, ForeignKey
+from sqlalchemy import create_engine
+from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import relationship, backref, sessionmaker
 from sqlalchemy.orm.session import object_session
 from sqlalchemy.sql.expression import union
@@ -37,7 +37,6 @@ from webcheck.myurllib import normalizeurl
 logger = logging.getLogger(__name__)

-
 # provide session and schema classes
 Session = sessionmaker()
 Base = declarative_base()
@@ -309,6 +308,14 @@ class RequestedAnchor(Base):
         return self.anchor


+def setup_db(filename):
+    # open the sqlite file
+    engine = create_engine('sqlite:///' + filename)
+    Session.configure(bind=engine)
+    # ensure that all tables are created
+    Base.metadata.create_all(engine)
+
+
 def truncate_db():
     """Clear all tables in the database."""
     session = Session()
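
With the engine setup factored out like this, other tools can open a
previously generated crawl database without going through the crawler. A
minimal sketch (the file path is illustrative):

    from webcheck.db import Session, Link, setup_db

    # bind the global Session factory and create any missing tables
    setup_db('output/webcheck.sqlite')
    session = Session()
    # count the links that have been fetched so far
    print session.query(Link).filter(Link.fetched != None).count()
    session.close()
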
http://arthurdejong.org/git/webcheck/commit/?id=e22b5a57749855ec3a2de2241861ffff41a159a1
commit e22b5a57749855ec3a2de2241861ffff41a159a1
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 16:51:00 2013 +0100
Remove annoying debug log message
diff --git a/webcheck/parsers/html/__init__.py b/webcheck/parsers/html/__init__.py
index 3a372df..b70481b 100644
--- a/webcheck/parsers/html/__init__.py
+++ b/webcheck/parsers/html/__init__.py
@@ -1,7 +1,7 @@
 # html.py - parser functions for html content
 #
-# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2011, 2013 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
@@ -98,7 +98,6 @@ def parse(content, link):
     if config.TIDY_OPTIONS:
         try:
             import webcheck.parsers.html.calltidy
-            logger.debug('the Tidy parser is ok')
             webcheck.parsers.html.calltidy.parse(content, link)
         except ImportError:
             logger.warn('tidy library (python-utidylib) is unavailable')
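
The surrounding try/except is the guarded-import idiom for an optional
dependency: the Tidy wrapper is only imported when needed and an ImportError
is downgraded to a warning, so webcheck keeps working without python-utidylib
installed. A generic sketch of the idiom (uTidylib's top-level module is
assumed to be tidy):

    import logging

    logger = logging.getLogger(__name__)

    try:
        import tidy  # optional dependency; absence is not fatal
        HAVE_TIDY = True
    except ImportError:
        HAVE_TIDY = False
        logger.warn('tidy library (python-utidylib) is unavailable')
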
-----------------------------------------------------------------------
Summary of changes:
 webcheck/crawler.py               |   49 +++++++++++++----------------------
 webcheck/db.py                    |   51 ++++++++++++++++++++++++-------------
 webcheck/parsers/html/__init__.py |    3 +--
 3 files changed, 52 insertions(+), 51 deletions(-)
hooks/post-receive
--
webcheck
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits/