lists.arthurdejong.org
RSS feed

python-stdnum branch master updated. 0.8.1-1-gf122c88

[Date Prev][Date Next] [Thread Prev][Thread Next]

python-stdnum branch master updated. 0.8.1-1-gf122c88



This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "python-stdnum".

The branch, master has been updated
       via  f122c882ba969846c94dc492016c9d7ef1085266 (commit)
      from  c042f0224ee767a2cc2b9493d091c32b190cbeca (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://arthurdejong.org/git/python-stdnum/commit/?id=f122c882ba969846c94dc492016c9d7ef1085266

commit f122c882ba969846c94dc492016c9d7ef1085266
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Fri Nov 8 21:40:22 2013 +0100

    Try to replace Unicode characters with ASCII
    
    This changes the stdnum.util.clean() method that is used by all modules
    to replace alternative Unicode dashes, dots, etc. by their ASCII
    equivalent so the numbers will be automatically converted and validated.
    
    Inspiration for this change came from
    https://github.com/JNRowe/pyisbn/pull/6

diff --git a/stdnum/util.py b/stdnum/util.py
index a7db8cf..b0ac4a5 100644
--- a/stdnum/util.py
+++ b/stdnum/util.py
@@ -1,4 +1,5 @@
 # util.py - common utility functions
+# coding: utf-8
 #
 # Copyright (C) 2012, 2013 Arthur de Jong
 #
@@ -28,6 +29,7 @@ import pkgutil
 import pydoc
 import re
 import sys
+import unicodedata
 
 from stdnum.exceptions import *
 
@@ -35,16 +37,88 @@ from stdnum.exceptions import *
 _strip_doctest_re = re.compile('^>>> .*\Z', re.DOTALL | re.MULTILINE)
 
 
-def clean(number, deletechars):
+def _mk_char_map(mapping):
+    """Transform a dictionary with comma separated unicode character names
+    to tuples with unicode characters as key."""
+    for key, value in mapping.items():
+        for char in key.split(','):
+            try:
+                yield (unicodedata.lookup(char), value)
+            except KeyError:
+                pass
+
+
+# build mapping of Unicode characters to equivalent ASCII characters
+_char_map = dict(_mk_char_map({
+    'HYPHEN-MINUS,ARMENIAN HYPHEN,HEBREW PUNCTUATION MAQAF,HYPHEN,'
+    'NON-BREAKING HYPHEN,FIGURE DASH,EN DASH,EM DASH,HORIZONTAL BAR,'
+    'SMALL HYPHEN-MINUS,FULLWIDTH HYPHEN-MINUS,MONGOLIAN NIRUGU,OVERLINE,'
+    'HYPHEN BULLET,MACRON,MODIFIER LETTER MINUS SIGN,FULLWIDTH MACRON,'
+    'OGHAM SPACE MARK,SUPERSCRIPT MINUS,SUBSCRIPT MINUS,MINUS SIGN,'
+    'HORIZONTAL LINE EXTENSION,HORIZONTAL SCAN LINE-1,HORIZONTAL SCAN LINE-3,'
+    'HORIZONTAL SCAN LINE-7,HORIZONTAL SCAN LINE-9,STRAIGHTNESS': '-',
+    'ASTERISK,ARABIC FIVE POINTED STAR,SYRIAC HARKLEAN ASTERISCUS,'
+    'FLOWER PUNCTUATION MARK,VAI FULL STOP,SMALL ASTERISK,FULLWIDTH ASTERISK,'
+    'ASTERISK OPERATOR,STAR OPERATOR,HEAVY ASTERISK,LOW ASTERISK,'
+    'OPEN CENTRE ASTERISK,EIGHT SPOKED ASTERISK,SIXTEEN POINTED ASTERISK,'
+    'TEARDROP-SPOKED ASTERISK,OPEN CENTRE TEARDROP-SPOKED ASTERISK,'
+    'HEAVY TEARDROP-SPOKED ASTERISK,EIGHT TEARDROP-SPOKED PROPELLER ASTERISK,'
+    'HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK,'
+    'ARABIC FIVE POINTED STAR': '*',
+    'COMMA,ARABIC COMMA,SINGLE LOW-9 QUOTATION MARK,IDEOGRAPHIC COMMA,'
+    'ARABIC DECIMAL SEPARATOR,ARABIC THOUSANDS SEPARATOR,PRIME,RAISED COMMA,'
+    'PRESENTATION FORM FOR VERTICAL COMMA,SMALL COMMA,'
+    'SMALL IDEOGRAPHIC COMMA,FULLWIDTH COMMA,CEDILLA': ',',
+    'FULL STOP,MIDDLE DOT,GREEK ANO TELEIA,ARABIC FULL STOP,'
+    'IDEOGRAPHIC FULL STOP,SYRIAC SUPRALINEAR FULL STOP,'
+    'SYRIAC SUBLINEAR FULL STOP,SAMARITAN PUNCTUATION NEQUDAA,'
+    'TIBETAN MARK INTERSYLLABIC TSHEG,TIBETAN MARK DELIMITER TSHEG BSTAR,'
+    'RUNIC SINGLE PUNCTUATION,BULLET,ONE DOT LEADER,HYPHENATION POINT,'
+    'WORD SEPARATOR MIDDLE DOT,RAISED DOT,KATAKANA MIDDLE DOT,'
+    'SMALL FULL STOP,FULLWIDTH FULL STOP,HALFWIDTH KATAKANA MIDDLE DOT,'
+    'AEGEAN WORD SEPARATOR DOT,PHOENICIAN WORD SEPARATOR,'
+    'KHAROSHTHI PUNCTUATION DOT,DOT ABOVE,ARABIC SYMBOL DOT ABOVE,'
+    'ARABIC SYMBOL DOT BELOW,BULLET OPERATOR,DOT OPERATOR': '.',
+    'SOLIDUS,SAMARITAN PUNCTUATION ARKAANU,FULLWIDTH SOLIDUS,DIVISION SLASH,'
+    'MATHEMATICAL RISING DIAGONAL,BIG SOLIDUS,FRACTION SLASH': '/',
+    'COLON,ETHIOPIC WORDSPACE,RUNIC MULTIPLE PUNCTUATION,MONGOLIAN COLON,'
+    'PRESENTATION FORM FOR VERTICAL COLON,FULLWIDTH COLON,'
+    'PRESENTATION FORM FOR VERTICAL TWO DOT LEADER,SMALL COLON': ':',
+    'SPACE,NO-BREAK SPACE,EN QUAD,EM QUAD,EN SPACE,EM SPACE,'
+    'THREE-PER-EM SPACE,FOUR-PER-EM SPACE,SIX-PER-EM SPACE,FIGURE SPACE,'
+    'PUNCTUATION SPACE,THIN SPACE,HAIR SPACE,NARROW NO-BREAK SPACE,'
+    'MEDIUM MATHEMATICAL SPACE,IDEOGRAPHIC SPACE': ' ',
+    }))
+
+
+def _clean_chars(number):
+    """Replace various Unicode characters with their ASCII counterpart."""
+    return ''.join(_char_map.get(x, x) for x in number)
+
+
+def clean(number, deletechars=''):
     """Remove the specified characters from the supplied number.
 
     >>> clean('123-456:78 9', ' -:')
     '123456789'
+    >>> clean('1–2—3―4')
+    '1-2-3-4'
     """
     try:
-        return ''.join(x for x in number if x not in deletechars)
+        number = ''.join(x for x in number)
     except:
         raise InvalidFormat()
+    if sys.version < '3' and isinstance(number, str):  # pragma: no cover (Python 2/3 specific code)
+        try:
+            number = _clean_chars(number.decode()).encode()
+        except UnicodeError:
+            try:
+                number = _clean_chars(number.decode('utf-8')).encode('utf-8')
+            except UnicodeError:
+                pass
+    else:  # pragma: no cover (Python 2/3 specific code)
+        number = _clean_chars(number)
+    return ''.join(x for x in number if x not in deletechars)
 
 
 def get_number_modules(base='stdnum'):

-----------------------------------------------------------------------

Summary of changes:
 stdnum/util.py |   78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 76 insertions(+), 2 deletions(-)


hooks/post-receive
-- 
python-stdnum
-- 
To unsubscribe send an email to
python-stdnum-commits-unsubscribe@lists.arthurdejong.org or see
http://lists.arthurdejong.org/python-stdnum-commits/