python-stdnum branch master updated. 1.11-54-g5b835bb
- From: Commits of the python-stdnum project <python-stdnum-commits [at] lists.arthurdejong.org>
- To: python-stdnum-commits [at] lists.arthurdejong.org
- Reply-to: python-stdnum-users [at] lists.arthurdejong.org, python-stdnum-commits [at] lists.arthurdejong.org
- Subject: python-stdnum branch master updated. 1.11-54-g5b835bb
- Date: Sun, 27 Oct 2019 21:42:25 +0100 (CET)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "python-stdnum".
The branch, master has been updated
via 5b835bb22b08386a43c704550ebc5abc6daf6397 (commit)
via 29de83e4f6bd3b3d86ca4f7e12181b7b6087cf66 (commit)
via 67b747ba43710cd7f929babc3eab0aff7f67d9a8 (commit)
via 0915b55c80a1bb328f3a1044e34934bf6b5fa04e (commit)
via 40961fc0a014c72c4981d3878b886f19ec3f2f9a (commit)
via c4ad714866b7082983686d0ad6ef4e7640488667 (commit)
via c9ad8d300bd88da12a4308ad08e4e9bd1b47c9d9 (commit)
from 7f3dcf05cfc0bf2a4deeb656c20929c9527ff95e (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
https://arthurdejong.org/git/python-stdnum/commit/?id=5b835bb22b08386a43c704550ebc5abc6daf6397
commit 5b835bb22b08386a43c704550ebc5abc6daf6397
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 20:02:47 2019 +0100
Parse multiple Wikipedia pages for full MCC/MNC list
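For context, a minimal sketch of what the new code in the diff below does: each page title is turned into a raw-wikitext URL, and table rows are matched field by field (the sample row is the one quoted in the diff; the regex here is shortened to the first two fields):

    import re

    # Build the raw-wikitext URL for one of the wikipedia_pages entries
    page = 'Mobile Network Codes in ITU region 2xx (Europe)'
    url = 'https://en.wikipedia.org/w/index.php?title=%s&action=raw' % page.replace(' ', '_')

    # A table row is first rewritten ('||' becomes two backslashes, as in the
    # script) and then matched; only the mcc/mnc groups are shown here.
    line = '| 232 || 02 || || A1 Telekom Austria || Reserved || ||'.replace('||', '\\\\')
    match = re.match(r'^\|\s*(?P<mcc>[0-9]+)\s*\\\\\s*(?P<mnc>[0-9]+)', line)
    print(url)
    print(match.group('mcc'), match.group('mnc'))  # 232 02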
diff --git a/update/imsi.py b/update/imsi.py
index f8a37dd..d2abb83 100755
--- a/update/imsi.py
+++ b/update/imsi.py
@@ -27,8 +27,19 @@ from collections import defaultdict
import requests
-# URLs that are downloaded
-mcc_list_url = 'https://en.wikipedia.org/w/index.php?title=Mobile_country_code&action=raw'
+# The wikipedia pages to download
+wikipedia_pages = (
+ 'Mobile country code',
+ 'Mobile Network Codes in ITU region 2xx (Europe)',
+ 'Mobile Network Codes in ITU region 3xx (North America)',
+ 'Mobile Network Codes in ITU region 4xx (Asia)',
+ 'Mobile Network Codes in ITU region 5xx (Oceania)',
+ 'Mobile Network Codes in ITU region 6xx (Africa)',
+ 'Mobile Network Codes in ITU region 7xx (South America)',
+)
+
+# Sadly the full list requires an account at ITU-T:
+# https://www.itu.int/net/ITU-T/inrdb/
cleanup_replacements = {
@@ -115,39 +126,51 @@ def update_mncs(data, mcc, mnc, **kwargs):
data[mcc][mnc].update(dict((k, cleanup_value(v)) for k, v in kwargs.items() if v))
+# This matches a heading on the Wikipedia page, e.g.
+# ==== [[Albania]] - AL ====
+_mnc_country_re = re.compile(
+ r'^[=]{2,4}\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+[=]{2,4}$')
+
+# This matches a line containing a MCC/MNC, e.g.
+# | 232 || 02 || || A1 Telekom Austria || Reserved || ||
+_mnc_line_re = re.compile(
+ r'^\|\s*(?P<mcc>[0-9]+)' +
+ r'\s*\\\\\s*(?P<mnc>[0-9]+)' +
+ r'(\s*\\\\\s*(?P<brand>[^\\]*)' +
+ r'(\s*\\\\\s*(?P<operator>[^\\]*)' +
+ r'(\s*\\\\\s*(?P<status>[^\\]*)' +
+ r'(\s*\\\\\s*(?P<bands>[^\\]*)' +
+ r'(\s*\\\\\s*(?P<notes>[^\\]*)' +
+ r')?)?)?)?)?')
+
+
def get_mncs_from_wikipedia(data):
"""Update the collection of Mobile Country Codes from Wikipedia.
This parses a Wikipedia page to extract the MCC and MNC, the first
part of any IMSI, and stores the results."""
- mnc_country_re = re.compile(r'^[=]{2,4}\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+[=]{2,4}$')
- mnc_line_re = re.compile(r'^\|\s*(?P<mcc>[0-9]+)' +
- r'\s*\\\\\s*(?P<mnc>[0-9]+)' +
- r'(\s*\\\\\s*(?P<brand>[^\\]*)' +
- r'(\s*\\\\\s*(?P<operator>[^\\]*)' +
- r'(\s*\\\\\s*(?P<status>[^\\]*)' +
- r'(\s*\\\\\s*(?P<bands>[^\\]*)' +
- r'(\s*\\\\\s*(?P<notes>[^\\]*)' +
- r')?)?)?)?)?')
- response = requests.get(mcc_list_url)
- response.raise_for_status()
- country = cc = ''
- for line in response.iter_lines(decode_unicode=True):
- line = line.strip()
- match = mnc_country_re.match(line)
- if match:
- country = match.group('country')
- cc = (match.group('cc') or '').lower()
- if '||' not in line:
- continue
- line = line.replace('||', '\\\\')
- match = mnc_line_re.match(line)
- if match:
- for mnc in str2range(match.group('mnc')):
- update_mncs(data, match.group('mcc'), mnc,
- country=country, cc=cc, brand=match.group('brand'),
- operator=match.group('operator'),
- status=match.group('status'),
- bands=match.group('bands'))
+ for page in wikipedia_pages:
+ url = 'https://en.wikipedia.org/w/index.php?title=%s&action=raw' % (
+ page.replace(' ', '_'))
+ response = requests.get(url)
+ response.raise_for_status()
+ country = cc = ''
+ for line in response.iter_lines(decode_unicode=True):
+ line = line.strip()
+ match = _mnc_country_re.match(line)
+ if match:
+ country = match.group('country')
+ cc = (match.group('cc') or '').lower()
+ if '||' not in line:
+ continue
+ line = line.replace('||', '\\\\')
+ match = _mnc_line_re.match(line)
+ if match:
+ for mnc in str2range(match.group('mnc')):
+ update_mncs(data, match.group('mcc'), mnc,
+ country=country, cc=cc, brand=match.group('brand'),
+ operator=match.group('operator'),
+ status=match.group('status'),
+ bands=match.group('bands'))
def str2range(x):
@@ -171,7 +194,7 @@ if __name__ == '__main__':
get_mncs_from_wikipedia(data)
# print header
print('# generated from various sources')
- print('# %s' % mcc_list_url)
+ print('# https://en.wikipedia.org/wiki/Mobile_country_code')
# build an ordered list of mccs
mcc_list = list(data.keys())
mcc_list.sort()
@@ -184,7 +207,7 @@ if __name__ == '__main__':
info = data[mcc][mnc]
infokeys = sorted(info.keys())
print(' %s%s' % (mnc, ''.join([' %s="%s"' % (k, info[k]) for k in infokeys if info[k]])))
- # try to get the length of mnc's
+ # try to get the length of mncs
try:
length = len(mnc_list[0])
if all(len(x) == length for x in mnc_list):
https://arthurdejong.org/git/python-stdnum/commit/?id=29de83e4f6bd3b3d86ca4f7e12181b7b6087cf66
commit 29de83e4f6bd3b3d86ca4f7e12181b7b6087cf66
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 18:26:29 2019 +0100
Make the IEEE OUI data more compact
This groups consecutive assignments into a range to make the dat file a
little more readable.
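A minimal standalone sketch of the same grouping idea (not the script's join_items() itself; the assignment values below are made up, not from the registry):

    def compact(items):
        """Collapse runs of consecutive hex values into 'first-last' ranges."""
        length = len(items[0])
        values = [int(v, 16) for v in items]
        runs = []
        for value in values:
            if runs and value == runs[-1][1] + 1:
                runs[-1][1] = value          # extend the current run
            else:
                runs.append([value, value])  # start a new run
        fmt = '%%0%dX' % length
        return ','.join(
            fmt % a if a == b else (fmt + '-' + fmt) % (a, b) for a, b in runs)

    print(compact(['08ED02', '08ED03', '08ED04', '0B5A9D']))
    # prints: 08ED02-08ED04,0B5A9D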
diff --git a/update/oui.py b/update/oui.py
index ed7f1e6..93a62ba 100755
--- a/update/oui.py
+++ b/update/oui.py
@@ -44,13 +44,35 @@ def download_csv(url):
response = requests.get(url)
response.raise_for_status()
for row in csv.DictReader(line.decode('utf-8') for line in response.iter_lines()):
- yield (
- row['Assignment'],
- row['Organization Name'].strip().replace('"', '%'))
+ o = row['Organization Name'].strip().replace('"', '%')
+ if o not in ('IEEE Registration Authority', 'Private'):
+ yield (row['Assignment'], o)
+
+
+def join_items(items):
+ """Join the list of items, combining consecutive numbers."""
+ length = len(items[0])
+ items = [int(b, 16) for b in items]
+ first = None
+ prev = None
+ res = ''
+ for item in items:
+ if first is not None and item == prev + 1:
+ # this item is consecutive to the previous: make a range
+ if prev > first:
+ # replace the previous value
+ res = res[:-length - 1]
+ res += '-%%0%dX' % length % item
+ prev = item
+ else:
+ # this is a new item, add a new one to the list
+ res += ',%%0%dX' % length % item
+ first = prev = item
+ return res.strip(',')
if __name__ == '__main__':
- # download the MAC Address Block Large (MA-L) list
+ # download the MAC Address Block Large (MA-L) list and group by org
toplevel = defaultdict(list)
for a, o in download_csv(mal_url):
toplevel[o].append(a)
@@ -63,11 +85,11 @@ if __name__ == '__main__':
print('# %s' % mal_url)
print('# %s' % mam_url)
print('# %s' % mas_url)
+ # output full-length assignments
for a, o in sorted((tuple(sorted(a)), o) for o, a in toplevel.items()):
- if o not in ('IEEE Registration Authority', 'Private'):
- print('%s o="%s"' % (','.join(a), o))
+ print('%s o="%s"' % (join_items(a), o))
+ # output assignments that are subdivided
for a in sorted(nested.keys()):
print('%s' % a)
for s, o in sorted(nested[a].items()):
- if o not in ('IEEE Registration Authority', 'Private'):
- print(' %s o="%s"' % (s, o))
+ print(' %s o="%s"' % (s, o))
https://arthurdejong.org/git/python-stdnum/commit/?id=67b747ba43710cd7f929babc3eab0aff7f67d9a8
commit 67b747ba43710cd7f929babc3eab0aff7f67d9a8
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 17:52:56 2019 +0100
Switch update scripts to Python 3
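Most of the diff below is mechanical (shebangs, bytes/str handling, dict methods); the recurring Python 3 incompatibilities it fixes look roughly like this (illustrative snippet, not taken from the repository):

    data = {'b': 2, 'a': 1}

    # Python 2 allowed:  keys = data.keys(); keys.sort()
    # In Python 3, keys() returns a view, so sort explicitly:
    keys = sorted(data.keys())

    # dict.iteritems() / dict.iterkeys() are gone; items() / keys() replace them:
    pairs = sorted(data.items())

    # print() handles unicode text directly, so the .encode('utf-8') calls go
    # away, and requests' iter_lines() yields bytes unless decode_unicode=True
    # is passed.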
diff --git a/update/at_postleitzahl.py b/update/at_postleitzahl.py
index 596f5d0..c36d8a4 100755
--- a/update/at_postleitzahl.py
+++ b/update/at_postleitzahl.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# coding: utf-8
# update/at_postleitzahl.py - download list of Austrian postal codes
@@ -99,5 +99,4 @@ if __name__ == '__main__':
print('# %s' % base_url)
# build an ordered list of postal codes
for code, location, region in sorted(get_postal_codes(download_url)):
- info = '%s location="%s" region="%s"' % (code, location, region)
- print(info.encode('utf-8'))
+ print('%s location="%s" region="%s"' % (code, location, region))
diff --git a/update/be_banks.py b/update/be_banks.py
index a0d6f17..890bfbe 100755
--- a/update/be_banks.py
+++ b/update/be_banks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# coding: utf-8
# update/be_banks.py - script to donwload Bank list from Belgian National Bank
@@ -89,4 +89,4 @@ if __name__ == '__main__':
info += ' bic="%s"' % bic
if bank:
info += ' bank="%s"' % bank
- print(info.encode('utf-8'))
+ print(info)
diff --git a/update/cn_loc.py b/update/cn_loc.py
index 10a33ed..7fc1f09 100755
--- a/update/cn_loc.py
+++ b/update/cn_loc.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/cn_loc.py - script to fetch data from the CN Open Data community
#
@@ -25,7 +25,6 @@ Github."""
from __future__ import print_function, unicode_literals
-import codecs
import sys
from collections import OrderedDict
from datetime import datetime
@@ -84,21 +83,11 @@ def group_data(data_collection):
yield code, name, prefecture_name, province_name
-def print_data_file(file):
+if __name__ == '__main__':
"""Output a data file in the right format."""
- print("# generated from National Bureau of Statistics of the People's",
- file=file)
- print('# Republic of China, downloaded from %s' % data_url, file=file)
- print('# %s' % datetime.utcnow(), file=file)
- print('Downloading...', file=sys.stderr)
+ print("# generated from National Bureau of Statistics of the People's")
+ print('# Republic of China, downloaded from %s' % data_url)
+ print('# %s' % datetime.utcnow())
data_collection = fetch_data()
- print('Generating...', file=sys.stderr)
for data in group_data(data_collection):
- print('%s county="%s" prefecture="%s" province="%s"' % data, file=file)
-
-
-if __name__ == '__main__':
- if sys.stdout.isatty():
- print_data_file(sys.stdout)
- else:
- print_data_file(codecs.getwriter('utf-8')(sys.stdout))
+ print('%s county="%s" prefecture="%s" province="%s"' % data)
diff --git a/update/do_whitelists.py b/update/do_whitelists.py
index 429fd7d..3aea8d3 100755
--- a/update/do_whitelists.py
+++ b/update/do_whitelists.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# coding: utf-8
# update/do_whitelists.py - script to update do.rnc and do.cedula whitelists
@@ -24,6 +24,7 @@
Internos (DGII) web site with lists of all RNC and Cedula values and outputs
new whitelists for these modules."""
+import io
import os.path
import sys
import tempfile
@@ -51,7 +52,7 @@ def handle_zipfile(f):
invalidcedula = set()
# read the information from the ZIP file
z = zipfile.ZipFile(f, 'r')
- for line in z.open('TMP/DGII_RNC.TXT'):
+ for line in io.TextIOWrapper(z.open('TMP/DGII_RNC.TXT'), encoding='iso8859-15'):
number = line.split('|', 1)[0].strip()
if number.isdigit():
if len(number) <= 9:
diff --git a/update/iban.py b/update/iban.py
index d563643..56a589a 100755
--- a/update/iban.py
+++ b/update/iban.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/iban.py - script to download and parse data from the IBAN registry
#
@@ -50,7 +50,7 @@ if __name__ == '__main__':
print('# downloaded from %s' % download_url)
values = defaultdict(dict)
# the file is CSV but the data is in columns instead of rows
- for row in csv.reader(response.iter_lines(), delimiter='\t', quotechar='"'):
+ for row in csv.reader(response.iter_lines(decode_unicode=True), delimiter='\t', quotechar='"'):
# skip first row
if row[0] != 'Data element':
# first column contains label
diff --git a/update/imsi.py b/update/imsi.py
index 034067e..f8a37dd 100755
--- a/update/imsi.py
+++ b/update/imsi.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/imsi.py - script to donwload from Wikipedia to build the database
#
@@ -131,7 +131,7 @@ def get_mncs_from_wikipedia(data):
response = requests.get(mcc_list_url)
response.raise_for_status()
country = cc = ''
- for line in response.iter_lines():
+ for line in response.iter_lines(decode_unicode=True):
line = line.strip()
match = mnc_country_re.match(line)
if match:
@@ -179,12 +179,10 @@ if __name__ == '__main__':
for mcc in mcc_list:
print('%s' % mcc)
# build an ordered list of mncs
- mnc_list = data[mcc].keys()
- mnc_list.sort()
+ mnc_list = sorted(data[mcc].keys())
for mnc in mnc_list:
info = data[mcc][mnc]
- infokeys = info.keys()
- infokeys.sort()
+ infokeys = sorted(info.keys())
print(' %s%s' % (mnc, ''.join([' %s="%s"' % (k, info[k]) for k in infokeys if info[k]])))
# try to get the length of mnc's
try:
diff --git a/update/isil.py b/update/isil.py
index 860e0ec..dedd307 100755
--- a/update/isil.py
+++ b/update/isil.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/isil.py - script to donwload ISIL agencies
#
@@ -38,7 +38,7 @@ download_url = 'https://english.slks.dk/libraries/library-standards/isil/'
def clean(td):
"""Clean up the element removing unneeded stuff from it."""
s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
- return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
+ return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip()
if __name__ == '__main__':
@@ -67,4 +67,4 @@ if __name__ == '__main__':
print(
'%s$ %s' % (
cc, ' '.join(
- ['%s="%s"' % (x, y) for x, y in props.iteritems()])))
+ '%s="%s"' % (x, y) for x, y in sorted(props.items()))))
diff --git a/update/my_bp.py b/update/my_bp.py
index 50f8b3a..672d3f8 100755
--- a/update/my_bp.py
+++ b/update/my_bp.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/my_bp.py - script to download data from Malaysian government site
#
@@ -44,7 +44,7 @@ spaces_re = re.compile(r'\s+', re.UNICODE)
def clean(td):
"""Clean up the element removing unneeded stuff from it."""
s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
- return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
+ return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip()
def parse(content):
@@ -82,7 +82,7 @@ if __name__ == '__main__':
print('# %s' % state_list_url)
print('# %s' % country_list_url)
print('')
- for bp in sorted(results.iterkeys()):
+ for bp in sorted(results.keys()):
res = bp
row = results[bp]
if 'state' in row:
diff --git a/update/numlist.py b/update/numlist.py
index 8d6d086..ec5c209 100755
--- a/update/numlist.py
+++ b/update/numlist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/numlist.py - script to get a list of number formats in stdnum
#
diff --git a/update/nz_banks.py b/update/nz_banks.py
index e3116e7..04a5463 100755
--- a/update/nz_banks.py
+++ b/update/nz_banks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# coding: utf-8
# update/nz_banks.py - script to download Bank list from Bank Branch Register
diff --git a/update/oui.py b/update/oui.py
index 8ff2e19..ed7f1e6 100755
--- a/update/oui.py
+++ b/update/oui.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/oui.py - script to download and parse data from the IEEE registry
#
@@ -43,7 +43,7 @@ def download_csv(url):
organisation names."""
response = requests.get(url)
response.raise_for_status()
- for row in csv.DictReader(response.iter_lines()):
+ for row in csv.DictReader(line.decode('utf-8') for line in response.iter_lines()):
yield (
row['Assignment'],
row['Organization Name'].strip().replace('"', '%'))
https://arthurdejong.org/git/python-stdnum/commit/?id=0915b55c80a1bb328f3a1044e34934bf6b5fa04e
commit 0915b55c80a1bb328f3a1044e34934bf6b5fa04e
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 16:25:32 2019 +0100
Switch update scripts to use requests
This makes the scripts more consistent.
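The same substitution recurs throughout the diff below; in outline (the URL is one of the registry URLs the scripts already use, shown here only as an example):

    import requests

    download_url = 'https://www.isbn-international.org/export_rangemessage.xml'

    # Previously:  f = urllib.urlopen(download_url); content = f.read()
    response = requests.get(download_url)
    response.raise_for_status()        # fail loudly on HTTP errors
    content = response.content         # raw bytes; response.text is decoded str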
diff --git a/update/at_postleitzahl.py b/update/at_postleitzahl.py
index 89308b7..596f5d0 100755
--- a/update/at_postleitzahl.py
+++ b/update/at_postleitzahl.py
@@ -26,9 +26,9 @@ from __future__ import print_function, unicode_literals
import os
import os.path
-import urllib
import lxml.html
+import requests
import xlrd
@@ -58,8 +58,9 @@ regions = {
def find_download_url():
"""Extract the spreadsheet URL from the Austrian Post website."""
- f = urllib.urlopen(base_url)
- document = lxml.html.parse(f)
+ response = requests.get(base_url)
+ response.raise_for_status()
+ document = lxml.html.document_fromstring(response.content)
url = [
a.get('href')
for a in document.findall('.//a[@href]')
@@ -69,9 +70,10 @@ def find_download_url():
def get_postal_codes(download_url):
"""Download the Austrian postal codes spreadsheet."""
- content = urllib.urlopen(download_url).read()
+ response = requests.get(download_url)
+ response.raise_for_status()
workbook = xlrd.open_workbook(
- file_contents=content, logfile=open(os.devnull, 'w'))
+ file_contents=response.content, logfile=open(os.devnull, 'w'))
sheet = workbook.sheet_by_index(0)
rows = sheet.get_rows()
# the first row contains the column names
@@ -92,7 +94,7 @@ if __name__ == '__main__':
# download/parse the information
download_url = find_download_url()
# print header
- print('# generated from %s downloaded from ' %
+ print('# generated from %s downloaded from' %
os.path.basename(download_url))
print('# %s' % base_url)
# build an ordered list of postal codes
diff --git a/update/be_banks.py b/update/be_banks.py
index 3c3a96b..a0d6f17 100755
--- a/update/be_banks.py
+++ b/update/be_banks.py
@@ -3,7 +3,7 @@
# update/be_banks.py - script to donwload Bank list from Belgian National Bank
#
-# Copyright (C) 2018 Arthur de Jong
+# Copyright (C) 2018-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -24,8 +24,8 @@
IBAN and BIC codes as published by the Belgian National Bank."""
import os.path
-import urllib
+import requests
import xlrd
@@ -74,8 +74,9 @@ def get_values(sheet):
if __name__ == '__main__':
- document = urllib.urlopen(download_url).read()
- workbook = xlrd.open_workbook(file_contents=document)
+ response = requests.get(download_url)
+ response.raise_for_status()
+ workbook = xlrd.open_workbook(file_contents=response.content)
sheet = workbook.sheet_by_index(0)
version = sheet.cell(0, 0).value
print('# generated from %s downloaded from' %
diff --git a/update/cn_loc.py b/update/cn_loc.py
index 96a13f4..10a33ed 100755
--- a/update/cn_loc.py
+++ b/update/cn_loc.py
@@ -3,7 +3,7 @@
# update/cn_loc.py - script to fetch data from the CN Open Data community
#
# Copyright (C) 2014-2015 Jiangge Zhang
-# Copyright (C) 2015-2018 Arthur de Jong
+# Copyright (C) 2015-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -60,6 +60,7 @@ def fetch_data():
data_collection = OrderedDict()
for revision in data_revisions:
response = requests.get('%s/raw/release/%s.txt' % (data_url, revision))
+ response.raise_for_status()
if response.ok:
print('%s is fetched' % revision, file=sys.stderr)
else:
diff --git a/update/do_whitelists.py b/update/do_whitelists.py
index f242c51..429fd7d 100755
--- a/update/do_whitelists.py
+++ b/update/do_whitelists.py
@@ -3,7 +3,7 @@
# update/do_whitelists.py - script to update do.rnc and do.cedula whitelists
#
-# Copyright (C) 2017 Arthur de Jong
+# Copyright (C) 2017-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -25,13 +25,13 @@ Internos (DGII) web site with lists of all RNC and Cedula values and outputs
new whitelists for these modules."""
import os.path
-import shutil
import sys
import tempfile
import textwrap
-import urllib
import zipfile
+import requests
+
# Ensure that our local stdnum implementation is used
sys.path.insert(0, os.path.normpath(
@@ -41,7 +41,7 @@ from stdnum.do import cedula, rnc # noqa, isort:skip
# The URL of the zip file with all valid numbers
-download_url = 'http://www.dgii.gov.do/app/WebApps/Consultas/rnc/DGII_RNC.zip'
+download_url = 'https://www.dgii.gov.do/app/WebApps/Consultas/rnc/DGII_RNC.zip'
def handle_zipfile(f):
@@ -53,13 +53,14 @@ def handle_zipfile(f):
z = zipfile.ZipFile(f, 'r')
for line in z.open('TMP/DGII_RNC.TXT'):
number = line.split('|', 1)[0].strip()
- if len(number) <= 9:
- if not rnc.is_valid(number):
- invalidrnc.add(number)
- else:
- if not cedula.is_valid(number):
- invalidcedula.add(number)
- # return invalid numbers
+ if number.isdigit():
+ if len(number) <= 9:
+ if not rnc.is_valid(number):
+ invalidrnc.add(number)
+ else:
+ if not cedula.is_valid(number):
+ invalidcedula.add(number)
+ # return known but invalid numbers
return invalidrnc, invalidcedula
@@ -68,11 +69,12 @@ if __name__ == '__main__':
# Download and read the ZIP file with valid data
with tempfile.TemporaryFile() as tmp:
# Download the zip file to a temporary file
- download = urllib.urlopen(download_url)
+ response = requests.get(download_url, stream=True)
+ response.raise_for_status()
print('%s: %s' % (
os.path.basename(download_url),
- download.info().get('Last-Modified')))
- shutil.copyfileobj(download, tmp)
+ response.headers.get('last-modified')))
+ tmp.write(response.content)
# Open the temporary file as a zip file and read contents
# (we cannot do this streaming because zipfile requires seek)
invalidrnc, invalidcedula = handle_zipfile(tmp)
diff --git a/update/eu_nace.py b/update/eu_nace.py
index b772cd4..ec53095 100755
--- a/update/eu_nace.py
+++ b/update/eu_nace.py
@@ -23,10 +23,10 @@
Metadata Server and extracts the information that is used for validating NACE
codes."""
-import cgi
-import urllib.request
+import re
import lxml.etree
+import requests
# the location of the Statistical Classification file
@@ -34,14 +34,15 @@ download_url = 'https://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?Targ
if __name__ == '__main__':
- f = urllib.request.urlopen(download_url)
- _, params = cgi.parse_header(f.info().get('Content-Disposition', ''))
- filename = params.get('filename', '?')
+ response = requests.get(download_url)
+ response.raise_for_status()
+ content_disposition = response.headers.get('content-disposition', '')
+ filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"')
print('# generated from %s, downloaded from' % filename)
print('# %s' % download_url)
# parse XML document
- document = lxml.etree.parse(f)
+ document = lxml.etree.fromstring(response.content)
# output header
print('# %s: %s' % (
diff --git a/update/iban.py b/update/iban.py
index d199c40..d563643 100755
--- a/update/iban.py
+++ b/update/iban.py
@@ -2,7 +2,7 @@
# update/iban.py - script to download and parse data from the IBAN registry
#
-# Copyright (C) 2011-2018 Arthur de Jong
+# Copyright (C) 2011-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -24,9 +24,10 @@ Financial Telecommunication which is the official IBAN registrar) to get
the data needed to correctly parse and validate IBANs."""
import csv
-import urllib
from collections import defaultdict
+import requests
+
# The place where the current version of
# swift_standards_infopaper_ibanregistry_1.txt can be downloaded.
@@ -42,13 +43,14 @@ def get_country_codes(line):
return [x.strip()[:2] for x in line['iban structure'].split(',')]
-def parse(f):
- """Parse the specified file."""
+if __name__ == '__main__':
+ response = requests.get(download_url)
+ response.raise_for_status()
print('# generated from swift_standards_infopaper_ibanregistry_1.txt,')
print('# downloaded from %s' % download_url)
values = defaultdict(dict)
# the file is CSV but the data is in columns instead of rows
- for row in csv.reader(f, delimiter='\t', quotechar='"'):
+ for row in csv.reader(response.iter_lines(), delimiter='\t',
quotechar='"'):
# skip first row
if row[0] != 'Data element':
# first column contains label
@@ -71,8 +73,3 @@ def parse(f):
# TODO: use "Bank identifier position within the BBAN" field
# to add labels to the ranges (Bank identifier and Branch
# Identifier)
-
-
-if __name__ == '__main__':
- f = urllib.urlopen(download_url)
- parse(f)
diff --git a/update/imsi.py b/update/imsi.py
index d91d377..034067e 100755
--- a/update/imsi.py
+++ b/update/imsi.py
@@ -2,7 +2,7 @@
# update/imsi.py - script to donwload from Wikipedia to build the database
#
-# Copyright (C) 2011-2018 Arthur de Jong
+# Copyright (C) 2011-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -22,9 +22,10 @@
"""This extracts a IMSI country and operator code from Wikipedia."""
import re
-import urllib
from collections import defaultdict
+import requests
+
# URLs that are downloaded
mcc_list_url = 'https://en.wikipedia.org/w/index.php?title=Mobile_country_code&action=raw'
@@ -127,9 +128,10 @@ def get_mncs_from_wikipedia(data):
r'(\s*\\\\\s*(?P<bands>[^\\]*)' +
r'(\s*\\\\\s*(?P<notes>[^\\]*)' +
r')?)?)?)?)?')
- f = urllib.urlopen(mcc_list_url)
+ response = requests.get(mcc_list_url)
+ response.raise_for_status()
country = cc = ''
- for line in f.readlines():
+ for line in response.iter_lines():
line = line.strip()
match = mnc_country_re.match(line)
if match:
diff --git a/update/isbn.py b/update/isbn.py
index c9203e1..658ba07 100755
--- a/update/isbn.py
+++ b/update/isbn.py
@@ -25,10 +25,8 @@ ranges for those prefixes suitable for the numdb module.
This data is needed
to correctly split ISBNs into an EAN.UCC prefix, a group prefix, a registrant,
an item number and a check-digit."""
-import ssl
-import urllib.request
-
import lxml.etree
+import requests
# the location of the ISBN Ranges XML file
@@ -58,11 +56,11 @@ def wrap(text):
if __name__ == '__main__':
print('# generated from RangeMessage.xml, downloaded from')
print('# %s' % download_url)
- ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
- f = urllib.request.urlopen(download_url, context=ctx)
+ response = requests.get(download_url)
+ response.raise_for_status()
# parse XML document
- document = lxml.etree.parse(f)
+ document = lxml.etree.fromstring(response.content)
# dump data from document
print('# file serial %s' % document.find('./MessageSerialNumber').text.strip())
diff --git a/update/isil.py b/update/isil.py
index aa51c55..860e0ec 100755
--- a/update/isil.py
+++ b/update/isil.py
@@ -24,9 +24,9 @@ and screen-scrapes the national and non-national ISIL agencies and
code prefixes."""
import re
-import urllib
import lxml.html
+import requests
spaces_re = re.compile(r'\s+', re.UNICODE)
@@ -41,12 +41,13 @@ def clean(td):
return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
-def parse(f):
- """Parse the specified file."""
+if __name__ == '__main__':
+ response = requests.get(download_url)
+ response.raise_for_status()
print('# generated from ISIL Registration Authority, downloaded from')
print('# %s' % download_url)
# We hack the HTML to insert missing <TR> elements
- content = f.read().replace('</TR>', '</TR><TR>')
+ content = response.text.replace('</TR>', '</TR><TR>')
document = lxml.html.document_fromstring(content)
# find all table rows
for tr in document.findall('.//tr'):
@@ -67,9 +68,3 @@ def parse(f):
'%s$ %s' % (
cc, ' '.join(
['%s="%s"' % (x, y) for x, y in props.iteritems()])))
-
-
-if __name__ == '__main__':
- # f = open('isil.html', 'r')
- f = urllib.urlopen(download_url)
- parse(f)
diff --git a/update/oui.py b/update/oui.py
index b3f808e..8ff2e19 100755
--- a/update/oui.py
+++ b/update/oui.py
@@ -2,7 +2,7 @@
# update/oui.py - script to download and parse data from the IEEE registry
#
-# Copyright (C) 2018 Arthur de Jong
+# Copyright (C) 2018-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -25,10 +25,11 @@ and produces data files that can be use by python-stdnum to look up
manufacturers by MAC address."""
import csv
-import urllib
from collections import defaultdict
from itertools import chain
+import requests
+
# The URLs of the MA-L, MA-M and MA-S registries that are downloaded to
# construct a full list of manufacturer prefixes.
@@ -40,7 +41,9 @@ mas_url = 'http://standards-oui.ieee.org/oui36/oui36.csv'
def download_csv(url):
"""Download the list from the site and provide assignment and
organisation names."""
- for row in csv.DictReader(urllib.urlopen(url)):
+ response = requests.get(url)
+ response.raise_for_status()
+ for row in csv.DictReader(response.iter_lines()):
yield (
row['Assignment'],
row['Organization Name'].strip().replace('"', '%'))
https://arthurdejong.org/git/python-stdnum/commit/?id=40961fc0a014c72c4981d3878b886f19ec3f2f9a
commit 40961fc0a014c72c4981d3878b886f19ec3f2f9a
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 15:05:13 2019 +0100
Switch update scripts to lxml
This avoids an extra dependency on Beautiful Soup and makes the scripts
more consistent.
This also includes a fix in the ISIL because of website changes.
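The BeautifulSoup-to-lxml change mostly swaps find()/findAll() calls for ElementPath expressions; a small self-contained sketch of the pattern used in the at_postleitzahl.py hunk below (the HTML snippet is made up):

    import lxml.html

    html = '<html><body><a href="/downloads/PLZ_Verzeichnis_2019.xls">PLZ</a></body></html>'
    document = lxml.html.document_fromstring(html)

    # BeautifulSoup: soup.find('a', attrs=dict(href=re.compile(r'.*/downloads/PLZ_Verzeichnis.*')))['href']
    url = [
        a.get('href')
        for a in document.findall('.//a[@href]')
        if '/downloads/PLZ_Verzeichnis' in a.get('href')][0]
    print(url)  # /downloads/PLZ_Verzeichnis_2019.xls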
diff --git a/update/at_postleitzahl.py b/update/at_postleitzahl.py
index 173ee95..89308b7 100755
--- a/update/at_postleitzahl.py
+++ b/update/at_postleitzahl.py
@@ -26,9 +26,9 @@ from __future__ import print_function, unicode_literals
import os
import os.path
-import re
import urllib
+import lxml.html
import xlrd
@@ -37,11 +37,6 @@ try:
except ImportError:
from urlparse import urljoin
-try:
- from bs4 import BeautifulSoup
-except ImportError:
- from BeautifulSoup import BeautifulSoup
-
# The page that contains a link to the downloadable spreadsheet with current
# Austrian postal codes
@@ -64,19 +59,19 @@ regions = {
def find_download_url():
"""Extract the spreadsheet URL from the Austrian Post website."""
f = urllib.urlopen(base_url)
- soup = BeautifulSoup(f)
- url = soup.find(
- 'a',
- attrs=dict(
- href=re.compile(r'.*/downloads/PLZ_Verzeichnis.*')))['href']
+ document = lxml.html.parse(f)
+ url = [
+ a.get('href')
+ for a in document.findall('.//a[@href]')
+ if '/downloads/PLZ_Verzeichnis' in a.get('href')][0]
return urljoin(base_url, url.split('?')[0])
def get_postal_codes(download_url):
"""Download the Austrian postal codes spreadsheet."""
- document = urllib.urlopen(download_url).read()
+ content = urllib.urlopen(download_url).read()
workbook = xlrd.open_workbook(
- file_contents=document, logfile=open(os.devnull, 'w'))
+ file_contents=content, logfile=open(os.devnull, 'w'))
sheet = workbook.sheet_by_index(0)
rows = sheet.get_rows()
# the first row contains the column names
diff --git a/update/eu_nace.py b/update/eu_nace.py
index af831f1..b772cd4 100755
--- a/update/eu_nace.py
+++ b/update/eu_nace.py
@@ -2,7 +2,7 @@
# update/eu_nace.py - script to get the NACE v2 catalogue
#
-# Copyright (C) 2017-2018 Arthur de Jong
+# Copyright (C) 2017-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -25,11 +25,12 @@ codes."""
import cgi
import urllib.request
-from xml.etree import ElementTree
+import lxml.etree
-# the location of the ISBN Ranges XML file
-download_url = 'http://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?TargetUrl=ACT_OTH_CLS_DLD&StrNom=NACE_REV2&StrFormat=XML&StrLanguageCode=EN'
+
+# the location of the Statistical Classification file
+download_url = 'https://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?TargetUrl=ACT_OTH_CLS_DLD&StrNom=NACE_REV2&StrFormat=XML&StrLanguageCode=EN'
if __name__ == '__main__':
@@ -40,20 +41,19 @@ if __name__ == '__main__':
print('# %s' % download_url)
# parse XML document
- doc = ElementTree.parse(f).getroot()
+ document = lxml.etree.parse(f)
# output header
print('# %s: %s' % (
- doc.find('Classification').get('id'),
- doc.find('Classification/Label/LabelText[@language="EN"]').text))
+ document.find('./Classification').get('id'),
+ document.find('./Classification/Label/LabelText[@language="EN"]').text))
- for item in doc.findall('Classification/Item'):
+ for item in document.findall('./Classification/Item'):
number = item.get('id')
level = int(item.get('idLevel', 0))
- label = item.find('Label/LabelText[@language="EN"]').text
+ label = item.find('./Label/LabelText[@language="EN"]').text
isic = item.find(
- 'Property[@genericName="ISIC4_REF"]/PropertyQualifier/' +
- 'PropertyText').text
+ './Property[@genericName="ISIC4_REF"]/PropertyQualifier/PropertyText').text
if level == 1:
section = number
print('%s label="%s" isic="%s"' % (number, label, isic))
diff --git a/update/isbn.py b/update/isbn.py
index 690457b..c9203e1 100755
--- a/update/isbn.py
+++ b/update/isbn.py
@@ -2,7 +2,7 @@
# update/isbn.py - script to get ISBN prefix data
#
-# Copyright (C) 2010-2018 Arthur de Jong
+# Copyright (C) 2010-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -27,7 +27,8 @@ an item number and a check-digit."""
import ssl
import urllib.request
-from xml.etree import ElementTree
+
+import lxml.etree
# the location of the ISBN Ranges XML file
@@ -36,12 +37,12 @@ download_url = 'https://www.isbn-international.org/export_rangemessage.xml'
def ranges(group):
"""Provide the ranges for the group."""
- for rule in group.find('Rules').findall('Rule'):
- length = int(rule.find('Length').text.strip())
+ for rule in group.findall('./Rules/Rule'):
+ length = int(rule.find('./Length').text.strip())
if length:
yield '-'.join(
x[:length]
- for x in rule.find('Range').text.strip().split('-'))
+ for x in rule.find('./Range').text.strip().split('-'))
def wrap(text):
@@ -61,20 +62,20 @@ if __name__ == '__main__':
f = urllib.request.urlopen(download_url, context=ctx)
# parse XML document
- msg = ElementTree.parse(f).getroot()
+ document = lxml.etree.parse(f)
# dump data from document
- print('# file serial %s' % msg.find('MessageSerialNumber').text.strip())
- print('# file date %s' % msg.find('MessageDate').text.strip())
+ print('# file serial %s' % document.find('./MessageSerialNumber').text.strip())
+ print('# file date %s' % document.find('./MessageDate').text.strip())
top_groups = dict(
- (x.find('Prefix').text.strip(), x)
- for x in msg.find('EAN.UCCPrefixes').findall('EAN.UCC'))
+ (x.find('./Prefix').text.strip(), x)
+ for x in document.findall('./EAN.UCCPrefixes/EAN.UCC'))
prevtop = None
- for group in msg.find('RegistrationGroups').findall('Group'):
- top, prefix = group.find('Prefix').text.strip().split('-')
- agency = group.find('Agency').text.strip()
+ for group in document.findall('./RegistrationGroups/Group'):
+ top, prefix = group.find('./Prefix').text.strip().split('-')
+ agency = group.find('./Agency').text.strip()
if top != prevtop:
print(top)
for line in wrap(','.join(ranges(top_groups[top]))):
diff --git a/update/isil.py b/update/isil.py
index efa4163..aa51c55 100755
--- a/update/isil.py
+++ b/update/isil.py
@@ -26,11 +26,7 @@ code prefixes."""
import re
import urllib
-
-try:
- from bs4 import BeautifulSoup
-except ImportError:
- from BeautifulSoup import BeautifulSoup
+import lxml.html
spaces_re = re.compile(r'\s+', re.UNICODE)
@@ -39,8 +35,9 @@ spaces_re = re.compile(r'\s+', re.UNICODE)
download_url = 'https://english.slks.dk/libraries/library-standards/isil/'
-def clean(s):
- """Clean up the string removing unneeded stuff from it."""
+def clean(td):
+ """Clean up the element removing unneeded stuff from it."""
+ s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
@@ -50,23 +47,22 @@ def parse(f):
print('# %s' % download_url)
# We hack the HTML to insert missing <TR> elements
content = f.read().replace('</TR>', '</TR><TR>')
- soup = BeautifulSoup(content)
+ document = lxml.html.document_fromstring(content)
# find all table rows
- for tr in soup.findAll('tr'):
+ for tr in document.findall('.//tr'):
# find the rows with four columns of text
- tds = tr.findAll('td', attrs={'class': 'text'}, recursive=False)
- if len(tds) == 4:
+ tds = tr.findall('td')
+ if len(tds) == 4 and clean(tds[0]).lower() != 'code':
props = {}
- cc = clean(tds[0].string)
- if tds[1].string:
- props['country'] = clean(tds[1].contents[0])
- ra_a = tds[2].find('a')
- if ra_a:
- props['ra'] = clean(ra_a.string)
- props['ra_url'] = clean(ra_a['href'])
- elif tds[2].string:
- props['ra'] = clean(tds[2].string)
- # we could also get the search urls from tds[3].findAll('a')
+ cc = clean(tds[0])
+ if tds[1].find('p') is not None:
+ props['country'] = clean(tds[1])
+ ra_a = tds[2].find('.//a')
+ if ra_a is not None:
+ props['ra'] = clean(tds[2])
+ props['ra_url'] = ra_a.get('href')
+ else:
+ props['ra'] = clean(tds[2])
print(
'%s$ %s' % (
cc, ' '.join(
diff --git a/update/my_bp.py b/update/my_bp.py
index c1dd017..50f8b3a 100755
--- a/update/my_bp.py
+++ b/update/my_bp.py
@@ -25,15 +25,10 @@ birthplace code from the National Registration Department of Malaysia."""
import re
from collections import defaultdict
+import lxml.html
import requests
-try:
- from bs4 import BeautifulSoup
-except ImportError:
- from BeautifulSoup import BeautifulSoup
-
-
# URLs that are downloaded
state_list_url = 'https://www.jpn.gov.my/kod-negeri/'
country_list_url = 'https://www.jpn.gov.my/en/kod-negara/'
@@ -46,21 +41,19 @@ user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdej
spaces_re = re.compile(r'\s+', re.UNICODE)
-def clean(s):
- """Clean up the string removing unneeded stuff from it."""
+def clean(td):
+ """Clean up the element removing unneeded stuff from it."""
+ s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
-def parse(f):
+def parse(content):
"""Parse the specified file."""
- soup = BeautifulSoup(f)
+ document = lxml.html.document_fromstring(content)
# find all table rows
- for tr in soup.find('div', {'class': 'box-content'}).findAll('tr'):
- # find the rows with four columns of text
- tds = [
- clean(''.join(x.string for x in td.findAll(text=True)))
- for td in tr.findAll('td')
- ]
+ for tr in document.findall('.//div[@class="box-content"]//tr'):
+ tds = [clean(td) for td in tr.findall('td')]
+ # table has two columns
if len(tds) >= 2 and tds[0] and tds[1]:
yield tds[0], tds[1]
if len(tds) >= 4 and tds[2] and tds[3]:
@@ -74,13 +67,15 @@ if __name__ == '__main__':
results = defaultdict(lambda: defaultdict(set))
# read the states
response = requests.get(state_list_url, headers=headers, verify='update/my_bp.crt')
- for state, bps in parse(response.text):
+ response.raise_for_status()
+ for state, bps in parse(response.content):
for bp in bps.split(','):
results[bp.strip()]['state'] = state
results[bp.strip()]['countries'].add('Malaysia')
# read the countries
response = requests.get(country_list_url, headers=headers, verify='update/my_bp.crt')
- for country, bp in parse(response.text):
+ response.raise_for_status()
+ for country, bp in parse(response.content):
results[bp]['countries'].add(country)
# print the results
print('# generated from National Registration Department of Malaysia, downloaded from')
diff --git a/update/requirements.txt b/update/requirements.txt
index c74ee0c..16e367c 100644
--- a/update/requirements.txt
+++ b/update/requirements.txt
@@ -1,3 +1,3 @@
-beautifulsoup4
+lxml
requests
xlrd
https://arthurdejong.org/git/python-stdnum/commit/?id=c4ad714866b7082983686d0ad6ef4e7640488667
commit c4ad714866b7082983686d0ad6ef4e7640488667
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 12:30:25 2019 +0100
Work around incorrect jpn.gov.my certificate chain
The intermediate certificate for jpn.gov.my is missing from the
certificate chain that is returned by the server since the server
switched to HTTPS.
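In practice the workaround just points requests at a local PEM bundle that carries the missing intermediate (plus its root); a sketch of the call the scripts now make (the real script also sends a custom User-Agent header, omitted here):

    import requests

    # update/my_bp.crt contains the Entrust intermediate and root certificates,
    # so TLS verification succeeds even though the server's chain is incomplete.
    response = requests.get('https://www.jpn.gov.my/kod-negeri/',
                            verify='update/my_bp.crt')
    response.raise_for_status()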
diff --git a/update/my_bp.crt b/update/my_bp.crt
new file mode 100644
index 0000000..fe5a24d
--- /dev/null
+++ b/update/my_bp.crt
@@ -0,0 +1,55 @@
+-----BEGIN CERTIFICATE-----
+MIIFDjCCA/agAwIBAgIMDulMwwAAAABR03eFMA0GCSqGSIb3DQEBCwUAMIG+MQsw
+CQYDVQQGEwJVUzEWMBQGA1UEChMNRW50cnVzdCwgSW5jLjEoMCYGA1UECxMfU2Vl
+IHd3dy5lbnRydXN0Lm5ldC9sZWdhbC10ZXJtczE5MDcGA1UECxMwKGMpIDIwMDkg
+RW50cnVzdCwgSW5jLiAtIGZvciBhdXRob3JpemVkIHVzZSBvbmx5MTIwMAYDVQQD
+EylFbnRydXN0IFJvb3QgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkgLSBHMjAeFw0x
+NTEwMDUxOTEzNTZaFw0zMDEyMDUxOTQzNTZaMIG6MQswCQYDVQQGEwJVUzEWMBQG
+A1UEChMNRW50cnVzdCwgSW5jLjEoMCYGA1UECxMfU2VlIHd3dy5lbnRydXN0Lm5l
+dC9sZWdhbC10ZXJtczE5MDcGA1UECxMwKGMpIDIwMTIgRW50cnVzdCwgSW5jLiAt
+IGZvciBhdXRob3JpemVkIHVzZSBvbmx5MS4wLAYDVQQDEyVFbnRydXN0IENlcnRp
+ZmljYXRpb24gQXV0aG9yaXR5IC0gTDFLMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8A
+MIIBCgKCAQEA2j+W0E25L0Tn2zlem1DuXKVh2kFnUwmqAJqOV38pa9vH4SEkqjrQ
+jUcj0u1yFvCRIdJdt7hLqIOPt5EyaM/OJZMssn2XyP7BtBe6CZ4DkJN7fEmDImiK
+m95HwzGYei59QAvS7z7Tsoyqj0ip/wDoKVgG97aTWpRzJiatWA7lQrjV6nN5ZGhT
+JbiEz5R6rgZFDKNrTdDGvuoYpDbwkrK6HIiPOlJ/915tgxyd8B/lw9bdpXiSPbBt
+LOrJz5RBGXFEaLpHPATpXbo+8DX3Fbae8i4VHj9HyMg4p3NFXU2wO7GOFyk36t0F
+ASK7lDYqjVs1/lMZLwhGwSqzGmIdTivZGwIDAQABo4IBDDCCAQgwDgYDVR0PAQH/
+BAQDAgEGMBIGA1UdEwEB/wQIMAYBAf8CAQAwMwYIKwYBBQUHAQEEJzAlMCMGCCsG
+AQUFBzABhhdodHRwOi8vb2NzcC5lbnRydXN0Lm5ldDAwBgNVHR8EKTAnMCWgI6Ah
+hh9odHRwOi8vY3JsLmVudHJ1c3QubmV0L2cyY2EuY3JsMDsGA1UdIAQ0MDIwMAYE
+VR0gADAoMCYGCCsGAQUFBwIBFhpodHRwOi8vd3d3LmVudHJ1c3QubmV0L3JwYTAd
+BgNVHQ4EFgQUgqJwdN28Uz/Pe9T3zX+nYMYKTL8wHwYDVR0jBBgwFoAUanImetAe
+733nO2lR1GyNn5ASZqswDQYJKoZIhvcNAQELBQADggEBADnVjpiDYcgsY9NwHRkw
+y/YJrMxp1cncN0HyMg/vdMNY9ngnCTQIlZIv19+4o/0OgemknNM/TWgrFTEKFcxS
+BJPok1DD2bHi4Wi3Ogl08TRYCj93mEC45mj/XeTIRsXsgdfJghhcg85x2Ly/rJkC
+k9uUmITSnKa1/ly78EqvIazCP0kkZ9Yujs+szGQVGHLlbHfTUqi53Y2sAEo1GdRv
+c6N172tkw+CNgxKhiucOhk3YtCAbvmqljEtoZuMrx1gL+1YQ1JH7HdMxWBCMRON1
+exCdtTix9qrKgWRs6PLigVWXUX/hwidQosk8WwBD9lu51aX8/wdQQGcHsFXwt35u
+Lcw=
+-----END CERTIFICATE-----
+-----BEGIN CERTIFICATE-----
+MIIEPjCCAyagAwIBAgIESlOMKDANBgkqhkiG9w0BAQsFADCBvjELMAkGA1UEBhMC
+VVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50
+cnVzdC5uZXQvbGVnYWwtdGVybXMxOTA3BgNVBAsTMChjKSAyMDA5IEVudHJ1c3Qs
+IEluYy4gLSBmb3IgYXV0aG9yaXplZCB1c2Ugb25seTEyMDAGA1UEAxMpRW50cnVz
+dCBSb290IENlcnRpZmljYXRpb24gQXV0aG9yaXR5IC0gRzIwHhcNMDkwNzA3MTcy
+NTU0WhcNMzAxMjA3MTc1NTU0WjCBvjELMAkGA1UEBhMCVVMxFjAUBgNVBAoTDUVu
+dHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50cnVzdC5uZXQvbGVnYWwt
+dGVybXMxOTA3BgNVBAsTMChjKSAyMDA5IEVudHJ1c3QsIEluYy4gLSBmb3IgYXV0
+aG9yaXplZCB1c2Ugb25seTEyMDAGA1UEAxMpRW50cnVzdCBSb290IENlcnRpZmlj
+YXRpb24gQXV0aG9yaXR5IC0gRzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK
+AoIBAQC6hLZy254Ma+KZ6TABp3bqMriVQRrJ2mFOWHLP/vaCeb9zYQYKpSfYs1/T
+RU4cctZOMvJyig/3gxnQaoCAAEUesMfnmr8SVycco2gvCoe9amsOXmXzHHfV1IWN
+cCG0szLni6LVhjkCsbjSR87kyUnEO6fe+1R9V77w6G7CebI6C1XiUJgWMhNcL3hW
+wcKUs/Ja5CeanyTXxuzQmyWC48zCxEXFjJd6BmsqEZ+pCm5IO2/b1BEZQvePB7/1
+U1+cPvQXLOZprE4yTGJ36rfo5bs0vBmLrpxR57d+tVOxMyLlbc9wPBr64ptntoP0
+jaWvYkxN4FisZDQSA/i2jZRjJKRxAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAP
+BgNVHRMBAf8EBTADAQH/MB0GA1UdDgQWBBRqciZ60B7vfec7aVHUbI2fkBJmqzAN
+BgkqhkiG9w0BAQsFAAOCAQEAeZ8dlsa2eT8ijYfThwMEYGprmi5ZiXMRrEPR9RP/
+jTkrwPK9T3CMqS/qF8QLVJ7UG5aYMzyorWKiAHarWWluBh1+xLlEjZivEtRh2woZ
+Rkfz6/djwUAFQKXSt/S1mja/qYh2iARVBCuch38aNzx+LaUa2NSJXsq9rD1s2G2v
+1fN2D807iDginWyTmsQ9v4IbZT+mD12q/OWyFcq1rca8PdCE6OoGcrBNOTJ4vz4R
+nAuknZoh8/CbCzB428Hch0P+vGOaysXCHMnHjf87ElgI5rY97HosTvuDls4MPGmH
+VHOkc8KT/1EQrBVUAdj8BbGJoX90g5pJ19xOe4pIb4tF9g==
+-----END CERTIFICATE-----
diff --git a/update/my_bp.py b/update/my_bp.py
index 7337db3..c1dd017 100755
--- a/update/my_bp.py
+++ b/update/my_bp.py
@@ -35,8 +35,8 @@ except ImportError:
# URLs that are downloaded
-state_list_url = 'http://www.jpn.gov.my/informasi/kod-negeri/'
-country_list_url = 'http://www.jpn.gov.my/en/informasi/kod-negara/'
+state_list_url = 'https://www.jpn.gov.my/kod-negeri/'
+country_list_url = 'https://www.jpn.gov.my/en/kod-negara/'
# The user agent that will be passed in requests
@@ -73,13 +73,13 @@ if __name__ == '__main__':
}
results = defaultdict(lambda: defaultdict(set))
# read the states
- response = requests.get(state_list_url, headers=headers)
+ response = requests.get(state_list_url, headers=headers, verify='update/my_bp.crt')
for state, bps in parse(response.text):
for bp in bps.split(','):
results[bp.strip()]['state'] = state
results[bp.strip()]['countries'].add('Malaysia')
# read the countries
- response = requests.get(country_list_url, headers=headers)
+ response = requests.get(country_list_url, headers=headers, verify='update/my_bp.crt')
for country, bp in parse(response.text):
results[bp]['countries'].add(country)
# print the results
https://arthurdejong.org/git/python-stdnum/commit/?id=c9ad8d300bd88da12a4308ad08e4e9bd1b47c9d9
commit c9ad8d300bd88da12a4308ad08e4e9bd1b47c9d9
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 12:02:03 2019 +0100
Fix New Zealand Bank Branch Register update script
There is now a direct URL for the XLS file and there is no longer a need
to search the page for a link.
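Since the XLS now comes from a fixed URL, the script recovers the served file name from the Content-Disposition header instead of from the link text; the parsing looks like this (the header value below is made up):

    import re

    # e.g. requests' response.headers.get('content-disposition', '')
    content_disposition = 'attachment; filename="Bank_Branch_Register_2019-10.xls"'
    filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"')
    print(filename)  # Bank_Branch_Register_2019-10.xls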
diff --git a/update/nz_banks.py b/update/nz_banks.py
index 61bc399..e3116e7 100755
--- a/update/nz_banks.py
+++ b/update/nz_banks.py
@@ -23,7 +23,6 @@
"""This script downloads the list of banks with bank codes as used in the
New Zealand bank account numbers."""
-import os.path
import re
from collections import OrderedDict, defaultdict
@@ -31,33 +30,8 @@ import requests
import xlrd
-try:
- from urllib.parse import urljoin
-except ImportError:
- from urlparse import urljoin
-
-
-try:
- from bs4 import BeautifulSoup
-except ImportError:
- from BeautifulSoup import BeautifulSoup
-
-
# The page that contains a link to the latest XLS version of the codes.
-download_url = (
- 'https://www.paymentsnz.co.nz' +
- '/resources/industry-registers/bank-branch-register/')
-
-
-def find_download_url():
- """Find the spreadsheet URL on the New Zealand Bank Branch Register."""
- response = requests.get(download_url)
- soup = BeautifulSoup(response.content)
- url = soup.find(
- 'a',
- attrs=dict(
- href=re.compile(r'/documents/.*/Bank_Branch_Register_.*.xls')))['href']
- return urljoin(download_url, url)
+download_url = 'https://www.paymentsnz.co.nz/resources/industry-registers/bank-branch-register/download/xls/'
def get_values(sheet):
@@ -91,15 +65,15 @@ def branch_list(branches):
if __name__ == '__main__':
- # download/parse the information
- url = find_download_url()
# parse the download as an XLS
- response = requests.get(url)
+ response = requests.get(download_url)
+ response.raise_for_status()
+ content_disposition = response.headers.get('content-disposition', '')
+ filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"')
workbook = xlrd.open_workbook(file_contents=response.content)
sheet = workbook.sheet_by_index(0)
# print header
- print('# generated from %s downloaded from ' %
- os.path.basename(url))
+ print('# generated from %s downloaded from ' % filename)
print('# %s' % download_url)
# build banks list from spreadsheet
banks = defaultdict(dict)
-----------------------------------------------------------------------
Summary of changes:
update/at_postleitzahl.py | 34 +++++++---------
update/be_banks.py | 13 +++---
update/cn_loc.py | 26 ++++--------
update/do_whitelists.py | 35 ++++++++--------
update/eu_nace.py | 33 +++++++--------
update/iban.py | 19 ++++-----
update/imsi.py | 101 ++++++++++++++++++++++++++++------------------
update/isbn.py | 33 ++++++++-------
update/isil.py | 59 ++++++++++++---------------
update/my_bp.crt | 55 +++++++++++++++++++++++++
update/my_bp.py | 45 +++++++++------------
update/numlist.py | 2 +-
update/nz_banks.py | 40 ++++--------------
update/oui.py | 49 ++++++++++++++++------
update/requirements.txt | 2 +-
15 files changed, 298 insertions(+), 248 deletions(-)
create mode 100644 update/my_bp.crt
hooks/post-receive
--
python-stdnum