webcheck commit: r448 - in webcheck: . webcheck webcheck/parsers/html webcheck/plugins

Author: arthur
Date: Fri Oct  7 13:19:31 2011
New Revision: 448
URL: http://arthurdejong.org/viewvc/webcheck?revision=448&view=revision

Log:
rename Site to Crawler
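
For reference, a minimal usage sketch of the renamed class, pieced together
from the cmd.py hunks below (the URL and output directory are made-up
examples; as in cmd.py, the output directory is created before
setup_database() is called):

    import os
    from webcheck import config, Crawler

    config.OUTPUT_DIR = '/tmp/webcheck-report'   # hypothetical location
    if not os.path.isdir(config.OUTPUT_DIR):
        os.mkdir(config.OUTPUT_DIR)

    crawler = Crawler()                # was: Site()
    crawler.setup_database()
    crawler.add_internal('http://www.example.org/')
    crawler.crawl()                    # fetch and examine the site
    crawler.postprocess()              # build site structure, run plugin postprocessing
    crawler.generate()                 # write per-plugin reports to OUTPUT_DIR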

Modified:
   webcheck/cmd.py
   webcheck/webcheck/__init__.py
   webcheck/webcheck/crawler.py
   webcheck/webcheck/parsers/html/htmlparser.py
   webcheck/webcheck/plugins/__init__.py
   webcheck/webcheck/plugins/about.py
   webcheck/webcheck/plugins/anchors.py
   webcheck/webcheck/plugins/badlinks.py
   webcheck/webcheck/plugins/external.py
   webcheck/webcheck/plugins/images.py
   webcheck/webcheck/plugins/new.py
   webcheck/webcheck/plugins/notchkd.py
   webcheck/webcheck/plugins/notitles.py
   webcheck/webcheck/plugins/old.py
   webcheck/webcheck/plugins/problems.py
   webcheck/webcheck/plugins/sitemap.py
   webcheck/webcheck/plugins/size.py
   webcheck/webcheck/plugins/urllist.py

Modified: webcheck/cmd.py
==============================================================================
--- webcheck/cmd.py     Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/cmd.py     Fri Oct  7 13:19:31 2011        (r448)
@@ -33,8 +33,7 @@
 
 import webcheck
 import webcheck.monkeypatch
-from webcheck.crawler import Site
-from webcheck import config, debugio
+from webcheck import config, debugio, Crawler
 
 
 debugio.loglevel = debugio.INFO
@@ -97,7 +96,7 @@
       % {'redirects': config.REDIRECT_DEPTH})
 
 
-def parse_args(site):
+def parse_args(crawler):
     """Parse command-line arguments."""
     import getopt
     try:
@@ -155,19 +154,19 @@
         if not os.path.isdir(config.OUTPUT_DIR):
             os.mkdir(config.OUTPUT_DIR)
         # set up database connection
-        site.setup_database()
+        crawler.setup_database()
         # add configuration to site
         for pattern in internal_urls:
-            site.add_internal_re(pattern)
+            crawler.add_internal_re(pattern)
         for pattern in external_urls:
-            site.add_external_re(pattern)
+            crawler.add_external_re(pattern)
         for pattern in yank_urls:
-            site.add_yanked_re(pattern)
+            crawler.add_yanked_re(pattern)
         for arg in args:
             # if it does not look like a url it is probably a local file
             if urlparse.urlsplit(arg)[0] == '':
                 arg = 'file://' + urllib.pathname2url(os.path.abspath(arg))
-            site.add_internal(arg)
+            crawler.add_internal(arg)
     except getopt.error, reason:
         sys.stderr.write('webcheck: %s\n' % reason)
         print_tryhelp()
@@ -177,30 +176,30 @@
         sys.exit(1)
 
 
-def main(site):
+def main(crawler):
     """Main program."""
     # crawl through the website
     debugio.info('checking site....')
-    site.crawl()  # this will take a while
+    crawler.crawl()  # this will take a while
     debugio.info('done.')
     # do postprocessing (building site structure, etc)
     debugio.info('postprocessing....')
-    site.postprocess()
+    crawler.postprocess()
     debugio.info('done.')
     # now we can write out the files
     # start with the frame-description page
     debugio.info('generating reports...')
     # for every plugin, generate a page
-    site.generate()
+    crawler.generate()
     debugio.info('done.')
 
 
 if __name__ == '__main__':
     try:
-        # initialize site object
-        site = Site()
+        # initialize crawler object
+        crawler = Crawler()
         # parse command-line arguments
-        parse_args(site)
+        parse_args(crawler)
         # run the main program
         if PROFILE:
             fname = os.path.join(config.OUTPUT_DIR, 'webcheck.prof')
@@ -213,12 +212,12 @@
                 sqltap.start()
             except ImportError:
                 pass
-            cProfile.run('main(site)', fname)
+            cProfile.run('main(crawler)', fname)
             if 'sqltap' in locals():
                 statistics = sqltap.collect()
                 sqltap.report(statistics, os.path.join(config.OUTPUT_DIR, 'sqltap.html'))
         else:
-            main(site)
+            main(crawler)
     except KeyboardInterrupt:
         sys.stderr.write('Interrupted\n')
         sys.exit(1)

Modified: webcheck/webcheck/__init__.py
==============================================================================
--- webcheck/webcheck/__init__.py       Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/__init__.py       Fri Oct  7 13:19:31 2011        (r448)
@@ -23,3 +23,5 @@
 
 __version__ = '1.10.4'
 __homepage__ = 'http://arthurdejong.org/webcheck/'
+
+from webcheck.crawler import Crawler

Modified: webcheck/webcheck/crawler.py
==============================================================================
--- webcheck/webcheck/crawler.py        Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/crawler.py        Fri Oct  7 13:19:31 2011        (r448)
@@ -22,7 +22,7 @@
 # The files produced as output from the software do not automatically fall
 # under the copyright of the software, unless explicitly stated otherwise.
 
-"""General module to do site-checking. This module contains the Site class
+"""General module to do site-checking. This module contains the Crawler class
 containing the state for the crawled site and some functions to access and
 manipulate the crawling of the website. This module also contains the Link
 class that holds all the link related properties."""
@@ -91,8 +91,7 @@
 _anchorpattern = re.compile('#([^#]+)$')
 
 
-# TODO: rename Site to Crawler
-class Site(object):
+class Crawler(object):
     """Class to represent gathered data of a site.
 
     The available properties of this class are:
@@ -101,7 +100,7 @@
    """
 
     def __init__(self):
-        """Creates an instance of the Site class and initializes the
+        """Creates an instance of the Crawler class and initializes the
         state of the site."""
         # list of internal urls
         self._internal_urls = set()

Modified: webcheck/webcheck/parsers/html/htmlparser.py
==============================================================================
--- webcheck/webcheck/parsers/html/htmlparser.py        Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/parsers/html/htmlparser.py        Fri Oct  7 13:19:31 2011        (r448)
@@ -32,6 +32,7 @@
 from webcheck import debugio
 from webcheck.myurllib import normalizeurl
 from webcheck.parsers.html import htmlunescape
+import webcheck.parsers.css
 
 
 # pattern for matching numeric html entities
@@ -211,7 +212,6 @@
         # pick up any tags with a style attribute
         if 'style' in attrs:
             # delegate handling of inline css to css module
-            import webcheck.parsers.css
             webcheck.parsers.css.parse(attrs['style'], self.link, self.base)
 
     def handle_endtag(self, tag):
@@ -221,7 +221,6 @@
             self.collect = None
         elif tag == 'style' and self.collect is not None:
             # delegate handling of inline css to css module
-            import webcheck.parsers.css
             webcheck.parsers.css.parse(self.collect, self.link, self.base)
 
     def handle_data(self, data):

Modified: webcheck/webcheck/plugins/__init__.py
==============================================================================
--- webcheck/webcheck/plugins/__init__.py       Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/__init__.py       Fri Oct  7 13:19:31 2011        (r448)
@@ -28,7 +28,7 @@
 the generate() function. Each plugin should export the following
 fields:
 
-    generate(site)
+    generate(crawler)
         Based on the site generate all the output files as needed.
     __title__
         A short description of the plugin that is used when linking
@@ -182,12 +182,12 @@
     fp.write('  </ul>\n')
 
 
-def open_html(plugin, site):
+def open_html(plugin, crawler):
     """Print an html fragment for the start of an html page."""
     # open the file
     fp = open_file(plugin.__outputfile__)
     # get the first base url
-    base = site.bases[0]
+    base = crawler.bases[0]
     # write basic html head
     fp.write(
       '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'

Modified: webcheck/webcheck/plugins/about.py
==============================================================================
--- webcheck/webcheck/plugins/about.py  Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/about.py  Fri Oct  7 13:19:31 2011        (r448)
@@ -36,9 +36,9 @@
 import webcheck.plugins
 
 
-def generate(site):
+def generate(crawler):
     """Output a list of modules, it's authors and the webcheck version."""
-    fp = webcheck.plugins.open_html(webcheck.plugins.about, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.about, crawler)
     session = Session()
     # TODO: xxx links were fetched, xxx pages were examined and a total of xxx notes and problems were found
     # TODO: include some runtime information (e.g. supported schemes, user configuration, etc)

Modified: webcheck/webcheck/plugins/anchors.py
==============================================================================
--- webcheck/webcheck/plugins/anchors.py        Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/anchors.py        Fri Oct  7 13:19:31 2011        (r448)
@@ -30,7 +30,7 @@
 from webcheck.db import Session, Link, Anchor
 
 
-def postprocess(site):
+def postprocess(crawler):
     """Add all missing anchors as page problems to the referring page."""
     session = Session()
     # find all fetched links with requested anchors

Modified: webcheck/webcheck/plugins/badlinks.py
==============================================================================
--- webcheck/webcheck/plugins/badlinks.py       Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/badlinks.py       Fri Oct  7 13:19:31 2011        (r448)
@@ -34,7 +34,7 @@
 import webcheck.plugins
 
 
-def postporcess(site):
+def postporcess(crawler):
     """Add all bad links as pageproblems on pages where they are linked."""
     session = Session()
     # find all links with link problems
@@ -48,13 +48,13 @@
     session.commit()
 
 
-def generate(site):
+def generate(crawler):
     """Present the list of bad links."""
     session = Session()
     # find all links with link problems
     links = session.query(Link).filter(Link.linkproblems.any()).order_by(Link.url).options(joinedload(Link.linkproblems))
     # present results
-    fp = webcheck.plugins.open_html(webcheck.plugins.badlinks, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.badlinks, crawler)
     if not links:
         fp.write(
           '   <p class="description">\n'

Modified: webcheck/webcheck/plugins/external.py
==============================================================================
--- webcheck/webcheck/plugins/external.py       Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/external.py       Fri Oct  7 13:19:31 2011        (r448)
@@ -34,13 +34,13 @@
 import webcheck.plugins
 
 
-def generate(site):
+def generate(crawler):
     """Generate the list of external links."""
     session = Session()
     # get all external links
     links = session.query(Link).filter(Link.is_internal != True).order_by(Link.url)
     # present results
-    fp = webcheck.plugins.open_html(webcheck.plugins.external, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.external, crawler)
     if not links:
         fp.write(
           '   <p class="description">'

Modified: webcheck/webcheck/plugins/images.py
==============================================================================
--- webcheck/webcheck/plugins/images.py Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/images.py Fri Oct  7 13:19:31 2011        (r448)
@@ -32,7 +32,7 @@
 import webcheck.plugins
 
 
-def generate(site):
+def generate(crawler):
     """Generate a list of image URLs that were found."""
     session = Session()
     # get non-page links that have an image/* mimetype
@@ -41,7 +41,7 @@
     links = links.filter(Link.mimetype.startswith('image/'))
     links = links.order_by(Link.url)
     # present results
-    fp = webcheck.plugins.open_html(webcheck.plugins.images, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.images, crawler)
     if not links:
         fp.write(
           '   <p class="description">\n'

Modified: webcheck/webcheck/plugins/new.py
==============================================================================
--- webcheck/webcheck/plugins/new.py    Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/new.py    Fri Oct  7 13:19:31 2011        (r448)
@@ -38,7 +38,7 @@
 SECS_PER_DAY = 60 * 60 * 24
 
 
-def generate(site):
+def generate(crawler):
     """Output the list of recently modified pages."""
     session = Session()
     # the time for which links are considered new
@@ -47,7 +47,7 @@
     links = session.query(Link).filter_by(is_page=True, is_internal=True)
     links = links.filter(Link.mtime > newtime).order_by(Link.mtime.desc())
     # present results
-    fp = webcheck.plugins.open_html(webcheck.plugins.new, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.new, crawler)
     if not links.count():
         fp.write(
           '   <p class="description">\n'

Modified: webcheck/webcheck/plugins/notchkd.py
==============================================================================
--- webcheck/webcheck/plugins/notchkd.py        Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/notchkd.py        Fri Oct  7 13:19:31 2011        (r448)
@@ -34,13 +34,13 @@
 import webcheck.plugins
 
 
-def generate(site):
+def generate(crawler):
     """Output the list of not checked pages."""
     session = Session()
     # get all yanked urls
     links = session.query(Link).filter(Link.yanked != None).order_by(Link.url)
     # present results
-    fp = webcheck.plugins.open_html(webcheck.plugins.notchkd, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.notchkd, crawler)
     if not links.count():
         fp.write(
           '   <p class="description">\n'

Modified: webcheck/webcheck/plugins/notitles.py
==============================================================================
--- webcheck/webcheck/plugins/notitles.py       Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/notitles.py       Fri Oct  7 13:19:31 2011        (r448)
@@ -34,7 +34,7 @@
 import webcheck.plugins
 
 
-def postprocess(site):
+def postprocess(crawler):
     """Add page problems for all pages without a title."""
     session = Session()
     # get all internal pages without a title
@@ -46,7 +46,7 @@
     session.commit()
 
 
-def generate(site):
+def generate(crawler):
     """Output the list of pages without a title."""
     session = Session()
     # get all internal pages without a title
@@ -54,7 +54,7 @@
     links = links.filter((char_length(Link.title) == 0) |
                          (Link.title == None)).order_by(Link.url)
     # present results
-    fp = webcheck.plugins.open_html(webcheck.plugins.notitles, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.notitles, crawler)
     if not links.count():
         fp.write(
           '   <p class="description">\n'

Modified: webcheck/webcheck/plugins/old.py
==============================================================================
--- webcheck/webcheck/plugins/old.py    Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/old.py    Fri Oct  7 13:19:31 2011        (r448)
@@ -38,7 +38,7 @@
 SECS_PER_DAY = 60 * 60 * 24
 
 
-def generate(site):
+def generate(crawler):
     """Output the list of outdated pages to the specified file descriptor."""
     session = Session()
     # the time for which links are considered old
@@ -47,7 +47,7 @@
     links = session.query(Link).filter_by(is_page=True, is_internal=True)
     links = links.filter(Link.mtime < oldtime).order_by(Link.mtime)
     # present results
-    fp = webcheck.plugins.open_html(webcheck.plugins.old, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.old, crawler)
     if not links.count():
         fp.write(
           '   <p class="description">\n'

Modified: webcheck/webcheck/plugins/problems.py
==============================================================================
--- webcheck/webcheck/plugins/problems.py       Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/problems.py       Fri Oct  7 13:19:31 2011        (r448)
@@ -46,7 +46,7 @@
     return name
 
 
-def generate(site):
+def generate(crawler):
     """Output the overview of problems per author."""
     session = Session()
     # make a list of problems per author
@@ -65,7 +65,7 @@
             problem_db[author].append(link)
         else:
             problem_db[author] = [link]
-    fp = webcheck.plugins.open_html(webcheck.plugins.problems, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.problems, crawler)
     if not problem_db:
         fp.write(
           '   <p class="description">\n'

Modified: webcheck/webcheck/plugins/sitemap.py
==============================================================================
--- webcheck/webcheck/plugins/sitemap.py        Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/sitemap.py        Fri Oct  7 13:19:31 2011        (r448)
@@ -78,17 +78,17 @@
     fp.write(indent + '</li>\n')
 
 
-def generate(site):
+def generate(crawler):
     """Output the sitemap."""
-    fp = webcheck.plugins.open_html(webcheck.plugins.sitemap, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.sitemap, crawler)
     # output the site structure using breadth first traversal
     fp.write(
       '   <p class="description">\n'
       '    This an overview of the crawled site.\n'
       '   </p>\n'
       '   <ul>\n')
-    explored = set(x.id for x in site.bases)
-    for l in site.bases:
+    explored = set(x.id for x in crawler.bases)
+    for l in crawler.bases:
         _explore(fp, l, explored)
     fp.write(
       '   </ul>\n')

Modified: webcheck/webcheck/plugins/size.py
==============================================================================
--- webcheck/webcheck/plugins/size.py   Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/size.py   Fri Oct  7 13:19:31 2011        (r448)
@@ -55,7 +55,7 @@
     return link.total_size
 
 
-def generate(site):
+def generate(crawler):
     """Output the list of large pages."""
     session = Session()
     # get all internal pages and get big links
@@ -65,7 +65,7 @@
     # sort links by size (biggest first)
     links.sort(lambda a, b: cmp(b.total_size, a.total_size))
     # present results
-    fp = webcheck.plugins.open_html(webcheck.plugins.size, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.size, crawler)
     if not links:
         fp.write(
           '   <p class="description">\n'

Modified: webcheck/webcheck/plugins/urllist.py
==============================================================================
--- webcheck/webcheck/plugins/urllist.py        Fri Oct  7 12:52:35 2011        (r447)
+++ webcheck/webcheck/plugins/urllist.py        Fri Oct  7 13:19:31 2011        (r448)
@@ -30,10 +30,10 @@
 import webcheck.plugins
 
 
-def generate(site):
+def generate(crawler):
     """Output a sorted list of URLs."""
     session = Session()
-    fp = webcheck.plugins.open_html(webcheck.plugins.urllist, site)
+    fp = webcheck.plugins.open_html(webcheck.plugins.urllist, crawler)
     fp.write(
       '   <p class="description">\n'
       '    This is the list of all urls encountered during the examination of\n'