python-stdnum branch master updated. 1.16-5-g38c368d

[Date Prev][Date Next] [Thread Prev][Thread Next]
From: Commits of the python-stdnum project <python-stdnum-commits [at] lists.arthurdejong.org>
To: python-stdnum-commits [at] lists.arthurdejong.org
Reply-to: python-stdnum-users [at] lists.arthurdejong.org, python-stdnum-commits [at] lists.arthurdejong.org
Subject: python-stdnum branch master updated. 1.16-5-g38c368d
Date: Sun, 11 Apr 2021 19:31:37 +0200 (CEST)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "python-stdnum".

The branch, master has been updated
       via  38c368de1c5977aa46c6cd99c276ce8fdca12cca (commit)
       via  b7901d6ed15bf90b889a54a1c3b85dea8628ae1e (commit)
      from  7e69090bf888d7a624cc51bc73c069f91c10f321 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
https://arthurdejong.org/git/python-stdnum/commit/?id=38c368de1c5977aa46c6cd99c276ce8fdca12cca

commit 38c368de1c5977aa46c6cd99c276ce8fdca12cca
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Sun Apr 11 17:54:39 2021 +0200

    Only process the shortest matches in the numdb module
    
    This ensures that matching numbers is done consistently when the numdb
    database file has conflicting information about the length of numbers.
    
    This also refactors the _find() function to be simpler and reduces the
    number of recursive calls that have to be done.
    
    The tests have been re-formatted to use pprint to make it easier to spot
    differences if any of the tests fail (instead of just saying expected
    True, got False).
    
    Closes https://github.com/arthurdejong/python-stdnum/issues/257

diff --git a/stdnum/numdb.py b/stdnum/numdb.py
index 003708f..e22dce6 100644
--- a/stdnum/numdb.py
+++ b/stdnum/numdb.py
@@ -1,6 +1,6 @@
 # numdb.py - module for handling hierarchically organised numbers
 #
-# Copyright (C) 2010-2019 Arthur de Jong
+# Copyright (C) 2010-2021 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -39,47 +39,21 @@ To split a number:
 
 To split the number and get properties for each part:
 
->>> dbfile.info('01006') == [
-...     ('0',   {'prop1': 'foo'}),
-...     ('100', {'prop2': 'bar'}),
-...     ('6',   {}),
-... ]
-True
->>> dbfile.info('02006') == [
-...     ('0',   {'prop1': 'foo'}),
-...     ('200', {'prop2': 'bar', 'prop3': 'baz'}),
-...     ('6',   {}),
-... ]
-True
->>> dbfile.info('03456') == [
-...     ('0', {'prop1': 'foo'}),
-...     ('345', {'prop2': 'bar', 'prop3': 'baz'}),
-...     ('6', {}),
-... ]
-True
->>> dbfile.info('902006') == [
-...     ('90', {'prop1': 'booz'}),
-...     ('20', {'prop2': 'foo'}),
-...     ('06', {}),
-... ]
-True
->>> dbfile.info('909856') == [
-...     ('90', {'prop1': 'booz'}),
-...     ('985', {'prop2': 'fooz'}),
-...     ('6', {}),
-... ]
-True
->>> dbfile.info('9889') == [
-...     ('98', {'prop1': 'booz'}),
-...     ('89', {'prop2': 'foo'}),
-... ]
-True
->>> dbfile.info('633322') == [
-...     ('6', {'prop1': 'boo'}),
-...     ('333', {'prop2': 'bar', 'prop3': 'baz', 'prop4': 'bla'}),
-...     ('22', {}),
-... ]
-True
+>>> import pprint
+>>> pprint.pprint(dbfile.info('01006'))
+[('0', {'prop1': 'foo'}), ('100', {'prop2': 'bar'}), ('6', {})]
+>>> pprint.pprint(dbfile.info('02006'))
+[('0', {'prop1': 'foo'}), ('200', {'prop2': 'bar', 'prop3': 'baz'}), ('6', {})]
+>>> pprint.pprint(dbfile.info('03456'))
+[('0', {'prop1': 'foo'}), ('345', {'prop2': 'bar', 'prop3': 'baz'}), ('6', {})]
+>>> pprint.pprint(dbfile.info('902006'))
+[('90', {'prop1': 'booz'}), ('20', {'prop2': 'foo'}), ('06', {})]
+>>> pprint.pprint(dbfile.info('909856'))
+[('90', {'prop1': 'booz'}), ('985', {'prop2': 'fooz'}), ('6', {})]
+>>> pprint.pprint(dbfile.info('9889'))
+[('98', {'prop1': 'booz'}), ('89', {'prop2': 'foo'})]
+>>> pprint.pprint(dbfile.info('633322'))
+[('6', {'prop1': 'boo'}), ('333', {'prop2': 'bar', 'prop3': 'baz', 'prop4': 'bla'}), ('22', {})]
 
 """
 
@@ -114,41 +88,27 @@ class NumDB():
         """Construct an empty database."""
         self.prefixes = []
 
-    @staticmethod
-    def _merge(results):
-        """Merge the provided list of possible results into a single result
-        list (this is a generator)."""
-        # expand the results to all have the same length
-        ml = max(len(x) for x in results)
-        results = [x + (ml - len(x)) * [None]
-                   for x in results]
-        # go over each part
-        for parts in zip(*results):
-            # regroup parts into parts list and properties list
-            partlist, proplist = list(zip(*(x for x in parts if x)))
-            part = min(partlist, key=len)
-            props = {}
-            for p in proplist:
-                props.update(p)
-            yield part, props
-
     @staticmethod
     def _find(number, prefixes):
         """Lookup the specified number in the list of prefixes, this will
         return basically what info() should return but works recursively."""
         if not number:
             return []
-        results = []
-        if prefixes:
-            for length, low, high, props, children in prefixes:
-                if low <= number[:length] <= high and len(number) >= length:
-                    results.append([(number[:length], props)] +
-                                   NumDB._find(number[length:], children))
-        # not-found fallback
-        if not results:
-            return [(number, {})]
-        # merge the results into a single result
-        return list(NumDB._merge(results))
+        part = number
+        properties = {}
+        next_prefixes = []
+        # go over prefixes and find matches
+        for length, low, high, props, children in prefixes:
+            if len(part) >= length and low <= part[:length] <= high:
+                # only use information from the shortest match
+                if length < len(part):
+                    part = part[:length]
+                    properties = {}
+                    next_prefixes = []
+                properties.update(props)
+                next_prefixes.extend(children or [])
+        # return first part and recursively find next matches
+        return [(part, properties)] + NumDB._find(number[len(part):], next_prefixes)
 
     def info(self, number):
         """Split the provided number in components and associate properties
diff --git a/tests/numdb-test.dat b/tests/numdb-test.dat
index 6b25454..366307e 100644
--- a/tests/numdb-test.dat
+++ b/tests/numdb-test.dat
@@ -1,3 +1,4 @@
+# numdb-test.dat: used for testing the stdnum.numdb module
 # this is a comment line
 0-8 prop1="foo"
   100-999 prop2="bar"
@@ -5,5 +6,7 @@
 6 prop1="boo"
   333 prop4="bla"
 90-99 prop1="booz"
+  200 comment1="this value will be ignored because a shorter one matches"
   00-89 prop2="foo"
+  200 comment2="this value will also be ignored"
   900-999 prop2="fooz"

https://arthurdejong.org/git/python-stdnum/commit/?id=b7901d6ed15bf90b889a54a1c3b85dea8628ae1e

commit b7901d6ed15bf90b889a54a1c3b85dea8628ae1e
Author: Arthur de Jong <arthur@arthurdejong.org>
Date:   Sun Apr 11 15:42:06 2021 +0200

    Stop non-operational MNCs from confusing IMSI dataset
    
    This only includes data from non-operational (status "Not operational"
    according to Wikipedia) Mobile Network Code operators in the generated
    data file if they would not confuse the lookup of operational numbers.
    
    This avoids problems with the "030" to "039" non-operational ranges
    conflicting with the "03" operational range. This ensures that only the
    "03" value is kept. For historical completeness we keep the other
    non-operational values.
    
    Closes https://github.com/arthurdejong/python-stdnum/issues/257

diff --git a/update/imsi.py b/update/imsi.py
index fd20e26..a6ab392 100755
--- a/update/imsi.py
+++ b/update/imsi.py
@@ -2,7 +2,7 @@
 
 # update/imsi.py - script to donwload from Wikipedia to build the database
 #
-# Copyright (C) 2011-2019 Arthur de Jong
+# Copyright (C) 2011-2021 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -126,15 +126,10 @@ def cleanup_value(val):
     val = val.replace('United Kingdom|UK', 'United Kingdom')
     val = val.replace('United States|US', 'United States')
     val = val.replace('New Zealand|NZ', 'New Zealand').strip()
+    val = val.replace('</sup>', '').strip()
     return cleanup_replacements.get(val, val)
 
 
-def update_mncs(data, mcc, mnc, **kwargs):
-    """Merge provided mnc information with the data that is already stored
-    in mccs."""
-    data[mcc][mnc].update(dict((k, cleanup_value(v)) for k, v in kwargs.items() if v))
-
-
 # This matches a heading on the Wikipedia page, e.g.
 # ==== [[Albania]] - AL ====
 _mnc_country_re = re.compile(
@@ -153,10 +148,10 @@ _mnc_line_re = re.compile(
     r')?)?)?)?)?')
 
 
-def get_mncs_from_wikipedia(data):
-    """Update the collection of Mobile Country Codes from Wikipedia.
-    This parses a Wikipedia page to extract the MCC and MNC, the first
-    part of any IMSI, and stores the results."""
+def get_mncs_from_wikipedia():
+    """Return the collection of Mobile Country Codes from Wikipedia.
+    This parses Wikipedia pages to extract the MCC and MNC, the first
+    part of any IMSI, and extracts other available data."""
     for page in wikipedia_pages:
         url = 'https://en.wikipedia.org/w/index.php?title=%s&action=raw' % (
             page.replace(' ', '_'))
@@ -175,11 +170,15 @@ def get_mncs_from_wikipedia(data):
             match = _mnc_line_re.match(line)
             if match:
                 for mnc in str2range(match.group('mnc')):
-                    update_mncs(data, match.group('mcc'), mnc,
-                                country=country, cc=cc, brand=match.group('brand'),
-                                operator=match.group('operator'),
-                                status=match.group('status'),
-                                bands=match.group('bands'))
+                    info = dict(
+                        country=country,
+                        cc=cc,
+                        brand=match.group('brand'),
+                        operator=match.group('operator'),
+                        status=match.group('status'),
+                        bands=match.group('bands'))
+                    info = dict((k, cleanup_value(v)) for k, v in info.items() if v)
+                    yield (match.group('mcc'), mnc, info)
 
 
 def str2range(x):
@@ -200,7 +199,17 @@ def str2range(x):
 if __name__ == '__main__':
     # download/parse the information
     data = defaultdict(lambda: defaultdict(dict))
-    get_mncs_from_wikipedia(data)
+    not_operational = defaultdict(lambda: defaultdict(dict))
+    for mcc, mnc, info in get_mncs_from_wikipedia():
+        if info.get('status', '').lower() == 'not operational':
+            not_operational[mcc][mnc].update(info)
+        else:
+            data[mcc][mnc].update(info)
+    # merge not operational entries as long as they do not conflict
+    for mcc, mncs in not_operational.items():
+        for mnc, info in mncs.items():
+            if not data[mcc][mnc] and not data[mcc][mnc[:2]]:
+                data[mcc][mnc].update(info)
     # print header
     print('# generated from various sources')
     print('# https://en.wikipedia.org/wiki/Mobile_country_code')
@@ -211,7 +220,7 @@ if __name__ == '__main__':
     for mcc in mcc_list:
         print('%s' % mcc)
         # build an ordered list of mncs
-        mnc_list = sorted(data[mcc].keys())
+        mnc_list = sorted(mnc for mnc, info in data[mcc].items() if info)
         for mnc in mnc_list:
             info = data[mcc][mnc]
             infokeys = sorted(info.keys())

-----------------------------------------------------------------------

Summary of changes:
 stdnum/numdb.py      | 102 ++++++++++++++++-----------------------------------
 tests/numdb-test.dat |   3 ++
 update/imsi.py       |  45 ++++++++++++++---------
 3 files changed, 61 insertions(+), 89 deletions(-)


hooks/post-receive
-- 
python-stdnum
python-stdnum branch master updated. 1.16-5-g38c368d, Commits of the python-stdnum project
Prev by Date: python-stdnum branch master updated. 1.16-3-g7e69090
Next by Date: python-stdnum branch master updated. 1.16-6-g175b1e5
Previous by thread: python-stdnum branch master updated. 1.16-3-g7e69090
Next by thread: python-stdnum branch master updated. 1.16-6-g175b1e5