webcheck branch master updated. 1.10.4-94-g2ad066d
- From: Commits of the webcheck project <webcheck-commits@lists.arthurdejong.org>
- To: webcheck-commits@lists.arthurdejong.org
- Reply-to: webcheck-users@lists.arthurdejong.org
- Subject: webcheck branch master updated. 1.10.4-94-g2ad066d
- Date: Sun, 15 Dec 2013 22:35:03 +0100 (CET)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "webcheck".
The branch, master has been updated
       via  2ad066ddad58d7d705f24b28aa822b793e6a47c8 (commit)
       via  0b2aca62c02f109142073aadc4f72956f9a5669a (commit)
       via  59eae290e519197517459b8431ec93b48ed318ae (commit)
       via  bb6955310348aea4b4cc4c9e898639829f281bb7 (commit)
       via  dbedc712f74ecd4b5fad8c100a909d81ee9d88b3 (commit)
       via  e22b5a57749855ec3a2de2241861ffff41a159a1 (commit)
      from  b1a4579b1389fc11bed06e1a6604c9bf4f37dfb5 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://arthurdejong.org/git/webcheck/commit/?id=2ad066ddad58d7d705f24b28aa822b793e6a47c8
commit 2ad066ddad58d7d705f24b28aa822b793e6a47c8
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 18:22:32 2013 +0100
Remove duplicate column definition
diff --git a/webcheck/db.py b/webcheck/db.py
index 77628cb..7b070a4 100644
--- a/webcheck/db.py
+++ b/webcheck/db.py
@@ -70,7 +70,6 @@ class Link(Base):
     # information about the retrieved link
     status = Column(String)
     mimetype = Column(String)
-    mimetype = Column(String)
     encoding = Column(String)
     size = Column(Integer)
     mtime = Column(DateTime, index=True)
http://arthurdejong.org/git/webcheck/commit/?id=0b2aca62c02f109142073aadc4f72956f9a5669a
commit 0b2aca62c02f109142073aadc4f72956f9a5669a
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 17:51:12 2013 +0100
Split functionality into Link.get_or_create()
This splits some common functionality from Link._get_child() and
Crawler.get_link() to the new Link.get_or_create() function.
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index 57c2730..c3f62ed 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -260,13 +260,7 @@ class Crawler(object):
         return None

     def _get_link(self, session, url):
-        # try to find the URL
-        url = Link.clean_url(url)
-        link = session.query(Link).filter_by(url=url).first()
-        if not link:
-            link = Link(url=url)
-            session.add(link)
-        return link
+        return Link.get_or_create(session, Link.clean_url(url))

     def _get_links_to_crawl(self, session):
         links = session.query(Link).filter(Link.fetched == None)
@@ -289,7 +283,6 @@ class Crawler(object):
         truncate_db()
         # add all internal urls to the database
         for url in self.base_urls:
-            url = Link.clean_url(url)
             self._get_link(session, url)
         # add some URLs from the database that haven't been fetched
         tocheck = self._get_links_to_crawl(session)
diff --git a/webcheck/db.py b/webcheck/db.py
index 9ecb487..77628cb 100644
--- a/webcheck/db.py
+++ b/webcheck/db.py
@@ -25,6 +25,7 @@ import urlparse

 from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, ForeignKey
 from sqlalchemy import create_engine
+from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import relationship, backref, sessionmaker
 from sqlalchemy.orm.session import object_session
@@ -98,6 +99,22 @@ class Link(Base):
"""normalise the URL, removing the fragment from the URL"""
return urlparse.urldefrag(normalizeurl(url))[0]
+ @staticmethod
+ def get_or_create(session, url):
+ """This expects a clean url."""
+ session.commit()
+ while True:
+ instance = session.query(Link).filter_by(url=url).first()
+ if instance:
+ return instance
+ try:
+ instance = Link(url=url)
+ session.add(instance)
+ session.commit()
+ return instance
+ except IntegrityError:
+ pass # will try again
+
def _get_child(self, url):
"""Get a link object for the specified URL."""
# get the session
@@ -105,15 +122,9 @@ class Link(Base):
         # normalise the URL, removing the fragment from the URL
         url, fragment = urlparse.urldefrag(normalizeurl(url))
         # try to find the link
-        instance = session.query(Link).filter_by(url=url).first()
-        if not instance:
-            if config.MAX_DEPTH != None and self.depth >= config.MAX_DEPTH:
-                logger.debug('link %s too deep', url)
-            instance = Link(url=url, depth=self.depth + 1)
-            session.add(instance)
-        else:
-            # we may have discovered a shorter path
-            instance.depth = min(instance.depth, self.depth + 1)
+        instance = self.get_or_create(session, url)
+        # we may have discovered a shorter path
+        instance.depth = min(instance.depth, self.depth + 1) or self.depth + 1
         # mark that we were looking for an anchor/fragment
         if fragment:
             instance.add_reqanchor(self, fragment)
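
The get_or_create() introduced above is the usual optimistic-insert pattern:
query first, INSERT on a miss, and go back around the loop when a concurrent
insert makes the INSERT fail with IntegrityError; the commit() calls keep the
session's view fresh across retries. In the _get_child() hunk, the trailing
"or self.depth + 1" handles a freshly created Link whose depth is still None,
since min(None, x) evaluates to None in Python 2. A minimal usage sketch,
assuming Link.url carries a unique constraint (which the IntegrityError
handling implies); the database path is illustrative and setup_db() comes from
the "Move SQLite initialisation to db module" commit below:

    from webcheck.db import Session, Link, setup_db

    setup_db('/tmp/webcheck.sqlite')
    session = Session()

    # clean_url() strips the fragment, so both calls pass the same URL;
    # the second call finds the row the first one inserted.
    a = Link.get_or_create(session, Link.clean_url('http://example.com/page'))
    b = Link.get_or_create(session, Link.clean_url('http://example.com/page#top'))
    assert a is b  # same identity within one session
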
http://arthurdejong.org/git/webcheck/commit/?id=59eae290e519197517459b8431ec93b48ed318ae
commit 59eae290e519197517459b8431ec93b48ed318ae
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 17:46:45 2013 +0100
Rename some functions
This should make some functions clearer and marks internal functions
with a leading underscore.
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index aee5cc8..57c2730 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -259,7 +259,7 @@ class Crawler(object):
         # fall back to allowing the url
         return None

-    def get_link(self, session, url):
+    def _get_link(self, session, url):
         # try to find the URL
         url = Link.clean_url(url)
         link = session.query(Link).filter_by(url=url).first()
@@ -268,7 +268,7 @@ class Crawler(object):
             session.add(link)
         return link

-    def get_links_to_crawl(self, session):
+    def _get_links_to_crawl(self, session):
         links = session.query(Link).filter(Link.fetched == None)
         if config.MAX_DEPTH != None:
             links = links.filter(Link.depth <= config.MAX_DEPTH)
@@ -290,9 +290,9 @@ class Crawler(object):
         # add all internal urls to the database
         for url in self.base_urls:
             url = Link.clean_url(url)
-            self.get_link(session, url)
+            self._get_link(session, url)
         # add some URLs from the database that haven't been fetched
-        tocheck = self.get_links_to_crawl(session)
+        tocheck = self._get_links_to_crawl(session)
         remaining = tocheck.count()
         tocheck = tocheck[:100]
         remaining -= len(tocheck)
@@ -304,7 +304,7 @@ class Crawler(object):
                 link.yanked = self._is_yanked(str(link.url))
             # see if there are any more links to check
             if not tocheck:
-                tocheck = self.get_links_to_crawl(session)
+                tocheck = self._get_links_to_crawl(session)
                 remaining = tocheck.count()
                 tocheck = tocheck[:100]
                 remaining -= len(tocheck)
@@ -312,9 +312,9 @@ class Crawler(object):
             if link.yanked or link.fetched:
                 continue
             # fetch the link's contents
-            response = self.fetch(link)
+            response = self._fetch_link(link)
             if response:
-                self.parse(link, response)
+                self._parse_response(link, response)
             # flush database changes
             session.commit()
             # sleep between requests if configured
@@ -327,9 +327,9 @@ class Crawler(object):
             session.commit()
         session.close()

-    def fetch(self, link):
-        """Attempt to fetch the url (if not yanked) and fill in link
-        attributes (based on is_internal)."""
+    def _fetch_link(self, link):
+        """Attempt to fetch the url and return content. This updates the
+        link with information retrieved."""
         logger.info(link.url)
         # mark the link as fetched to avoid loops
         link.fetched = datetime.datetime.now()
@@ -378,8 +378,8 @@ class Crawler(object):
             logger.exception('unknown exception caught: ' + str(e))
             link.add_linkproblem('error reading HTTP response: %s' % str(e))

-    def parse(self, link, response):
-        """Parse the fetched response."""
+    def _parse_response(self, link, response):
+        """Parse the fetched response content."""
         # find a parser for the content-type
         parsermodule = webcheck.parsers.get_parsermodule(link.mimetype)
         if parsermodule is None:
@@ -411,7 +411,7 @@ class Crawler(object):
         # do not have a parent (they form the base for the site)
         bases = []
         for url in list(self.base_urls):
-            link = self.get_link(session, url).follow_link()
+            link = self._get_link(session, url).follow_link()
             if not link:
                 logger.warn('base link %s redirects to nowhere', url)
                 self.base_urls.remove(url)
diff --git a/webcheck/db.py b/webcheck/db.py
index 1b1a050..9ecb487 100644
--- a/webcheck/db.py
+++ b/webcheck/db.py
@@ -98,7 +98,7 @@ class Link(Base):
"""normalise the URL, removing the fragment from the URL"""
return urlparse.urldefrag(normalizeurl(url))[0]
- def _get_link(self, url):
+ def _get_child(self, url):
"""Get a link object for the specified URL."""
# get the session
session = object_session(self)
@@ -176,7 +176,7 @@ class Link(Base):
         if not self.is_internal:
             return
         # add to children
-        self.children.append(self._get_link(url))
+        self.children.append(self._get_child(url))

     def add_embed(self, url):
         """Mark the given URL as used as an image on this page."""
@@ -184,7 +184,7 @@ class Link(Base):
         if not self.is_internal:
             return
         # add to embedded
-        self.embedded.append(self._get_link(url))
+        self.embedded.append(self._get_child(url))

     def add_anchor(self, anchor):
         """Indicate that this page contains the specified anchor."""
http://arthurdejong.org/git/webcheck/commit/?id=bb6955310348aea4b4cc4c9e898639829f281bb7
commit bb6955310348aea4b4cc4c9e898639829f281bb7
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 17:26:17 2013 +0100
Small simplification
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index 006db1b..aee5cc8 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -342,7 +342,7 @@ class Crawler(object):
                 request.add_header('Referer', parent.url)
             response = urllib2.urlopen(request, timeout=config.IOTIMEOUT)
             info = response.info()
-            link.mimetype = response.info().gettype()
+            link.mimetype = info.gettype()
             link.set_encoding(response.headers.getparam('charset'))
             # get result code and other stuff
             link.status = str(response.code)
http://arthurdejong.org/git/webcheck/commit/?id=dbedc712f74ecd4b5fad8c100a909d81ee9d88b3
commit dbedc712f74ecd4b5fad8c100a909d81ee9d88b3
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 16:47:45 2013 +0100
Move SQLite initialisation to db module
diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index b933e28..006db1b 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -41,12 +41,10 @@ import urllib2
 import urlparse

 from webcheck import config
-from webcheck.db import Session, Base, Link, truncate_db
+from webcheck.db import Session, Link, setup_db, truncate_db
 from webcheck.output import install_file
 import webcheck.parsers

-from sqlalchemy import create_engine
-
 logger = logging.getLogger(__name__)
@@ -179,16 +177,10 @@ class Crawler(object):
         if hasattr(self, 'database_configed'):
             return
         self.database_configed = True
-        # ensure output directory exists
         if not os.path.isdir(config.OUTPUT_DIR):
             os.mkdir(config.OUTPUT_DIR)
-        # open the sqlite file
         filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
-        engine = create_engine('sqlite:///' + filename)
-        Session.configure(bind=engine)
-        # ensure that all tables are created
-        Base.metadata.create_all(engine)
-        # TODO: schema migraton goes here
+        setup_db(filename)

     def _is_internal(self, url):
         """Check whether the specified url is external or internal. This
diff --git a/webcheck/db.py b/webcheck/db.py
index 297771d..1b1a050 100644
--- a/webcheck/db.py
+++ b/webcheck/db.py
@@ -1,7 +1,7 @@
 # db.py - database access layer for webcheck
 #
-# Copyright (C) 2011 Arthur de Jong
+# Copyright (C) 2011, 2013 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
@@ -23,9 +23,9 @@
 import logging
 import urlparse

-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import func
 from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, ForeignKey
+from sqlalchemy import create_engine
+from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import relationship, backref, sessionmaker
 from sqlalchemy.orm.session import object_session
 from sqlalchemy.sql.expression import union
@@ -37,7 +37,6 @@ from webcheck.myurllib import normalizeurl
 logger = logging.getLogger(__name__)

-
 # provide session and schema classes
 Session = sessionmaker()
 Base = declarative_base()
@@ -309,6 +308,14 @@ class RequestedAnchor(Base):
         return self.anchor


+def setup_db(filename):
+    # open the sqlite file
+    engine = create_engine('sqlite:///' + filename)
+    Session.configure(bind=engine)
+    # ensure that all tables are created
+    Base.metadata.create_all(engine)
+
+
 def truncate_db():
     """Clear all tables in the database."""
     session = Session()
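
With the engine setup factored out like this, other tools can open a
previously generated crawl database without going through the crawler. A
minimal sketch (the file path is illustrative):

    from webcheck.db import Session, Link, setup_db

    # bind the global Session factory and create any missing tables
    setup_db('output/webcheck.sqlite')
    session = Session()
    # count the links that have been fetched so far
    print session.query(Link).filter(Link.fetched != None).count()
    session.close()
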
http://arthurdejong.org/git/webcheck/commit/?id=e22b5a57749855ec3a2de2241861ffff41a159a1
commit e22b5a57749855ec3a2de2241861ffff41a159a1
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Dec 15 16:51:00 2013 +0100
Remove annoying debug log message
diff --git a/webcheck/parsers/html/__init__.py b/webcheck/parsers/html/__init__.py
index 3a372df..b70481b 100644
--- a/webcheck/parsers/html/__init__.py
+++ b/webcheck/parsers/html/__init__.py
@@ -1,7 +1,7 @@
 # html.py - parser functions for html content
 #
-# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007, 2008, 2011, 2013 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
@@ -98,7 +98,6 @@ def parse(content, link):
     if config.TIDY_OPTIONS:
         try:
             import webcheck.parsers.html.calltidy
-            logger.debug('the Tidy parser is ok')
             webcheck.parsers.html.calltidy.parse(content, link)
         except ImportError:
             logger.warn('tidy library (python-utidylib) is unavailable')
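
The surrounding try/except is the guarded-import idiom for an optional
dependency: the Tidy wrapper is only imported when needed and an ImportError
is downgraded to a warning, so webcheck keeps working without python-utidylib
installed. A generic sketch of the idiom (uTidylib's top-level module is
assumed to be tidy):

    import logging

    logger = logging.getLogger(__name__)

    try:
        import tidy  # optional dependency; absence is not fatal
        HAVE_TIDY = True
    except ImportError:
        HAVE_TIDY = False
        logger.warn('tidy library (python-utidylib) is unavailable')
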
-----------------------------------------------------------------------
Summary of changes:
 webcheck/crawler.py               |   49 +++++++++++++----------------------
 webcheck/db.py                    |   51 ++++++++++++++++++++++++-------------
 webcheck/parsers/html/__init__.py |    3 +--
 3 files changed, 52 insertions(+), 51 deletions(-)
hooks/post-receive
--
webcheck
--
To unsubscribe send an email to
webcheck-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/webcheck-commits/