lists.arthurdejong.org
RSS feed

python-stdnum branch master updated. 1.9-3-g04f78fb

[Date Prev][Date Next] [Thread Prev][Thread Next]

python-stdnum branch master updated. 1.9-3-g04f78fb



This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "python-stdnum".

The branch, master has been updated
       via  04f78fb6613cccbd78e32a569ccf4ec2e5e1d478 (commit)
       via  bae6f19f265ae6f45c6f42649fd70d1c005added (commit)
       via  08d105392029bf430de4a854cf250215ecebf6ba (commit)
      from  d9defc8b514e5f2d9c545de23054e416bd7bd2ab (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
https://arthurdejong.org/git/python-stdnum/commit/?id=04f78fb6613cccbd78e32a569ccf4ec2e5e1d478

commit 04f78fb6613cccbd78e32a569ccf4ec2e5e1d478
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Tue May 1 23:04:41 2018 +0200

    Fix encoding issues in online check
    
    This ensures that all text is unicode internally and encoded to UTF-8 on
    response.

diff --git a/online_check/stdnum.wsgi b/online_check/stdnum.wsgi
index 4ac1c36..f4d430c 100755
--- a/online_check/stdnum.wsgi
+++ b/online_check/stdnum.wsgi
@@ -18,17 +18,17 @@
 # 02110-1301 USA
 
 import cgi
+import inspect
 import json
 import os
 import re
 import sys
-import inspect
 
 sys.stdout = sys.stderr
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'python-stdnum'))
 
 from stdnum.util import (
-    get_number_modules, get_module_name, get_module_description)
+    get_module_description, get_module_name, get_number_modules, to_unicode)
 
 
 _template = None
@@ -41,11 +41,11 @@ def get_conversions(module, number):
             args, varargs, varkw, defaults = inspect.getargspec(func)
             if defaults:
                 args = args[:-len(defaults)]
-            if args == ['number']:
+            if args == ['number'] and not name.endswith('binary'):
                 try:
                     conversion = func(number)
                     if conversion != number:
-                        yield (name[3:], conversion)
+                        yield (name[3:], to_unicode(conversion))
                 except Exception:
                     pass
 
@@ -59,8 +59,8 @@ def info(module, number):
         compact=compactfn(number),
         valid=module.is_valid(number),
         module=module.__name__.split('.', 1)[1],
-        name=get_module_name(module),
-        description=get_module_description(module),
+        name=to_unicode(get_module_name(module)),
+        description=to_unicode(get_module_description(module)),
         conversions=dict(get_conversions(module, number)))
 
 
@@ -89,26 +89,26 @@ def application(environ, start_response):
         basedir = os.path.join(
             environ['DOCUMENT_ROOT'],
             os.path.dirname(environ['SCRIPT_NAME']).strip('/'))
-        _template = open(os.path.join(basedir, 'template.html'), 'r').read()
+        _template = to_unicode(open(os.path.join(basedir, 'template.html'), 'rt').read())
     is_ajax = environ.get(
         'HTTP_X_REQUESTED_WITH', '').lower() == 'xmlhttprequest'
     parameters = cgi.parse_qs(environ.get('QUERY_STRING', ''))
     results = []
     number = ''
     if 'number' in parameters:
-        number = parameters['number'][0]
+        number = to_unicode(parameters['number'][0])
         results = [
             info(module, number)
             for module in get_number_modules()
             if module.is_valid(number)]
-    if 'HTTP_X_REQUESTED_WITH' in environ:
+    if is_ajax:
         start_response('200 OK', [
             ('Content-Type', 'application/json'),
             ('Vary', 'X-Requested-With')])
         return [json.dumps(results, indent=2, sort_keys=True)]
     start_response('200 OK', [
-        ('Content-Type', 'text/html'),
+        ('Content-Type', 'text/html; charset=utf-8'),
         ('Vary', 'X-Requested-With')])
-    return _template % dict(
+    return [(_template % dict(
         value=cgi.escape(number, True),
-        results='\n'.join(format(data) for data in results))
+        results=u'\n'.join(format(data) for data in results))).encode('utf-8')]

https://arthurdejong.org/git/python-stdnum/commit/?id=bae6f19f265ae6f45c6f42649fd70d1c005added

commit bae6f19f265ae6f45c6f42649fd70d1c005added
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Tue May 1 22:54:03 2018 +0200

    Fix an issue with format of Mexican tax numbers
    
    Fix an issue where the format accepted a mix of personal and company
    numbers in validation, resulting in a raised ValueError exception.

diff --git a/stdnum/mx/rfc.py b/stdnum/mx/rfc.py
index b186950..80b3de2 100644
--- a/stdnum/mx/rfc.py
+++ b/stdnum/mx/rfc.py
@@ -67,14 +67,6 @@ from stdnum.exceptions import *
 from stdnum.util import clean, to_unicode
 
 
-# regular expression for matching numbers
-_rfc_re = re.compile(u'^[A-Z&Ñ]{3,4}[0-9]{6}[0-9A-Z]{0,5}$')
-
-
-# regular expression for matching the last 3 check digits
-_check_digits_re = re.compile(u'^[1-9A-V][1-9A-Z][0-9A]$')
-
-
 # these values should not appear as first part of a personal number
 _name_blacklist = set([
     'BUEI', 'BUEY', 'CACA', 'CACO', 'CAGA', 'CAGO', 'CAKA', 'CAKO', 'COGE',
@@ -120,20 +112,22 @@ def validate(number, validate_check_digits=False):
     """Check if the number is a valid RFC."""
     number = compact(number)
     n = to_unicode(number)
-    if not _rfc_re.match(n):
-        raise InvalidFormat()
     if len(n) in (10, 13):
         # number assigned to person
+        if not re.match(u'^[A-Z&Ñ]{4}[0-9]{6}[0-9A-Z]{0,3}$', n):
+            raise InvalidFormat()
         if n[:4] in _name_blacklist:
             raise InvalidComponent()
         _get_date(n[4:10])
     elif len(n) == 12:
         # number assigned to company
+        if not re.match(u'^[A-Z&Ñ]{3}[0-9]{6}[0-9A-Z]{3}$', n):
+            raise InvalidFormat()
         _get_date(n[3:9])
     else:
         raise InvalidLength()
     if validate_check_digits and len(n) >= 12:
-        if not _check_digits_re.match(n[-3:]):
+        if not re.match(u'^[1-9A-V][1-9A-Z][0-9A]$', n[-3:]):
             raise InvalidComponent()
         if n[-1] != calc_check_digit(n[:-1]):
             raise InvalidChecksum()
diff --git a/tests/test_mx_rfc.doctest b/tests/test_mx_rfc.doctest
index eba887d..7fa5c34 100644
--- a/tests/test_mx_rfc.doctest
+++ b/tests/test_mx_rfc.doctest
@@ -41,6 +41,14 @@ Traceback (most recent call last):
 InvalidFormat: ...
 
 
+The first four digits should only be letters for 10 or 13-digit numbers.
+
+>>> rfc.validate('ABCD 12345678')
+Traceback (most recent call last):
+    ...
+InvalidFormat: ...
+
+
 The first four digits of a personal number should not be one of the
 blacklisted words.
 

https://arthurdejong.org/git/python-stdnum/commit/?id=08d105392029bf430de4a854cf250215ecebf6ba

commit 08d105392029bf430de4a854cf250215ecebf6ba
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Tue May 1 22:52:38 2018 +0200

    Make unicode conversion standard
    
    A few modules use non-ASCII characters in numbers. This introduces a
    to_unicode() function in util so that it can be used by multiple
    modules.

diff --git a/stdnum/es/referenciacatastral.py b/stdnum/es/referenciacatastral.py
index 129bb16..c4d913c 100644
--- a/stdnum/es/referenciacatastral.py
+++ b/stdnum/es/referenciacatastral.py
@@ -55,7 +55,7 @@ InvalidChecksum: ...
 """
 
 from stdnum.exceptions import *
-from stdnum.util import clean
+from stdnum.util import clean, to_unicode
 
 
 alphabet = u'ABCDEFGHIJKLMNÑOPQRSTUVWXYZ0123456789'
@@ -89,16 +89,9 @@ def _check_digit(number):
     return 'MQWERTYUIOPASDFGHJKLBZX'[s % 23]
 
 
-def _force_unicode(number):
-    """Convert the number to unicode."""
-    if not hasattr(number, 'isnumeric'):  # pragma: no cover (Python 2 code)
-        number = number.decode('utf-8')
-    return number
-
-
 def calc_check_digits(number):
     """Calculate the check digits for the number."""
-    number = _force_unicode(compact(number))
+    number = to_unicode(compact(number))
     return (
         _check_digit(number[0:7] + number[14:18]) +
         _check_digit(number[7:14] + number[14:18]))
@@ -108,7 +101,7 @@ def validate(number):
     """Check if the number is a valid Cadastral Reference. This checks the
     length, formatting and check digits."""
     number = compact(number)
-    n = _force_unicode(number)
+    n = to_unicode(number)
     if not all(c in alphabet for c in n):
         raise InvalidFormat()
     if len(n) != 20:
diff --git a/stdnum/mx/rfc.py b/stdnum/mx/rfc.py
index 87acbe8..b186950 100644
--- a/stdnum/mx/rfc.py
+++ b/stdnum/mx/rfc.py
@@ -64,15 +64,15 @@ import datetime
 import re
 
 from stdnum.exceptions import *
-from stdnum.util import clean
+from stdnum.util import clean, to_unicode
 
 
 # regular expression for matching numbers
-_rfc_re = re.compile(r'^[A-Z&Ñ]{3,4}[0-9]{6}[0-9A-Z]{0,5}$')
+_rfc_re = re.compile(u'^[A-Z&Ñ]{3,4}[0-9]{6}[0-9A-Z]{0,5}$')
 
 
 # regular expression for matching the last 3 check digits
-_check_digits_re = re.compile(r'^[1-9A-V][1-9A-Z][0-9A]$')
+_check_digits_re = re.compile(u'^[1-9A-V][1-9A-Z][0-9A]$')
 
 
 # these values should not appear as first part of a personal number
@@ -86,7 +86,7 @@ _name_blacklist = set([
 
 
 # characters used for checksum calculation,
-_alphabet = '0123456789ABCDEFGHIJKLMN&OPQRSTUVWXYZ Ñ'
+_alphabet = u'0123456789ABCDEFGHIJKLMN&OPQRSTUVWXYZ Ñ'
 
 
 def compact(number):
@@ -110,6 +110,7 @@ def _get_date(number):
 def calc_check_digit(number):
     """Calculate the check digit. The number passed should not have the
     check digit included."""
+    number = to_unicode(number)
     number = ('   ' + number)[-12:]
     check = sum(_alphabet.index(n) * (13 - i) for i, n in enumerate(number))
     return _alphabet[(11 - check) % 11]
@@ -118,22 +119,23 @@ def calc_check_digit(number):
 def validate(number, validate_check_digits=False):
     """Check if the number is a valid RFC."""
     number = compact(number)
-    if not _rfc_re.match(number):
+    n = to_unicode(number)
+    if not _rfc_re.match(n):
         raise InvalidFormat()
-    if len(number) in (10, 13):
+    if len(n) in (10, 13):
         # number assigned to person
-        if number[:4] in _name_blacklist:
+        if n[:4] in _name_blacklist:
             raise InvalidComponent()
-        _get_date(number[4:10])
-    elif len(number) == 12:
+        _get_date(n[4:10])
+    elif len(n) == 12:
         # number assigned to company
-        _get_date(number[3:9])
+        _get_date(n[3:9])
     else:
         raise InvalidLength()
-    if validate_check_digits and len(number) >= 12:
-        if not _check_digits_re.match(number[-3:]):
+    if validate_check_digits and len(n) >= 12:
+        if not _check_digits_re.match(n[-3:]):
             raise InvalidComponent()
-        if number[-1] != calc_check_digit(number[:-1]):
+        if n[-1] != calc_check_digit(n[:-1]):
             raise InvalidChecksum()
     return number
 
diff --git a/stdnum/util.py b/stdnum/util.py
index 3e04c79..8fa082a 100644
--- a/stdnum/util.py
+++ b/stdnum/util.py
@@ -129,6 +129,16 @@ def clean(number, deletechars=''):
     return ''.join(x for x in number if x not in deletechars)
 
 
+def to_unicode(text):
+    """Convert the specified text to a unicode string."""
+    if not isinstance(text, type(u'')):
+        try:
+            return text.decode('utf-8')
+        except UnicodeDecodeError:
+            return text.decode('iso-8859-15')
+    return text
+
+
 def get_number_modules(base='stdnum'):
     """Yield all the number validation modules under the specified module."""
     __import__(base)
diff --git a/tests/test_util.doctest b/tests/test_util.doctest
index c4dc072..959a544 100644
--- a/tests/test_util.doctest
+++ b/tests/test_util.doctest
@@ -24,7 +24,21 @@ meant for internal use by stdnum modules and is not guaranteed to remain
 stable and as such not part of the public API of stdnum.
 
 >>> from stdnum.util import (
-...     get_number_modules, get_module_name, get_module_description)
+...     get_number_modules, get_module_name, get_module_description,
+...     to_unicode)
+
+
+The to_unicode() function is used to force conversion of a string to unicode
+if it is not already a unicode string. This is mostly used to convert numbers
+with non-ASCII characters in it.
+
+>>> n_str = b'\xc3\x91'.decode('utf-8')  # Ñ character as unicode string
+>>> to_unicode(n_str) ==  n_str
+True
+>>> to_unicode(n_str.encode('utf-8')) ==  n_str
+True
+>>> to_unicode(n_str.encode('iso-8859-1')) ==  n_str
+True
 
 
 The get_module_name() function is used in the example WSGI application and

-----------------------------------------------------------------------

Summary of changes:
 online_check/stdnum.wsgi         | 24 ++++++++++++------------
 stdnum/es/referenciacatastral.py | 13 +++----------
 stdnum/mx/rfc.py                 | 36 ++++++++++++++++--------------------
 stdnum/util.py                   | 10 ++++++++++
 tests/test_mx_rfc.doctest        |  8 ++++++++
 tests/test_util.doctest          | 16 +++++++++++++++-
 6 files changed, 64 insertions(+), 43 deletions(-)


hooks/post-receive
-- 
python-stdnum
-- 
To unsubscribe send an email to
python-stdnum-commits-unsubscribe@lists.arthurdejong.org or see
https://lists.arthurdejong.org/python-stdnum-commits/