python-stdnum branch master updated. 1.11-54-g5b835bb
- From: Commits of the python-stdnum project <python-stdnum-commits [at] lists.arthurdejong.org>
- To: python-stdnum-commits [at] lists.arthurdejong.org
- Reply-to: python-stdnum-users [at] lists.arthurdejong.org, python-stdnum-commits [at] lists.arthurdejong.org
- Subject: python-stdnum branch master updated. 1.11-54-g5b835bb
- Date: Sun, 27 Oct 2019 21:42:25 +0100 (CET)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "python-stdnum".
The branch, master has been updated
via 5b835bb22b08386a43c704550ebc5abc6daf6397 (commit)
via 29de83e4f6bd3b3d86ca4f7e12181b7b6087cf66 (commit)
via 67b747ba43710cd7f929babc3eab0aff7f67d9a8 (commit)
via 0915b55c80a1bb328f3a1044e34934bf6b5fa04e (commit)
via 40961fc0a014c72c4981d3878b886f19ec3f2f9a (commit)
via c4ad714866b7082983686d0ad6ef4e7640488667 (commit)
via c9ad8d300bd88da12a4308ad08e4e9bd1b47c9d9 (commit)
from 7f3dcf05cfc0bf2a4deeb656c20929c9527ff95e (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
https://arthurdejong.org/git/python-stdnum/commit/?id=5b835bb22b08386a43c704550ebc5abc6daf6397
commit 5b835bb22b08386a43c704550ebc5abc6daf6397
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 20:02:47 2019 +0100
Parse multiple Wikipedia pages for full MCC/MNC list
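For context, a minimal sketch of what the new code in the diff below does: each page title is turned into a raw-wikitext URL, and table rows are matched field by field (the sample row is the one quoted in the diff; the regex here is shortened to the first two fields):

    import re

    # Build the raw-wikitext URL for one of the wikipedia_pages entries
    page = 'Mobile Network Codes in ITU region 2xx (Europe)'
    url = 'https://en.wikipedia.org/w/index.php?title=%s&action=raw' % page.replace(' ', '_')

    # A table row is first rewritten ('||' becomes two backslashes, as in the
    # script) and then matched; only the mcc/mnc groups are shown here.
    line = '| 232 || 02 || || A1 Telekom Austria || Reserved || ||'.replace('||', '\\\\')
    match = re.match(r'^\|\s*(?P<mcc>[0-9]+)\s*\\\\\s*(?P<mnc>[0-9]+)', line)
    print(url)
    print(match.group('mcc'), match.group('mnc'))  # 232 02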
diff --git a/update/imsi.py b/update/imsi.py
index f8a37dd..d2abb83 100755
--- a/update/imsi.py
+++ b/update/imsi.py
@@ -27,8 +27,19 @@ from collections import defaultdict
import requests
-# URLs that are downloaded
-mcc_list_url = 'https://en.wikipedia.org/w/index.php?title=Mobile_country_code&action=raw'
+# The wikipedia pages to download
+wikipedia_pages = (
+ 'Mobile country code',
+ 'Mobile Network Codes in ITU region 2xx (Europe)',
+ 'Mobile Network Codes in ITU region 3xx (North America)',
+ 'Mobile Network Codes in ITU region 4xx (Asia)',
+ 'Mobile Network Codes in ITU region 5xx (Oceania)',
+ 'Mobile Network Codes in ITU region 6xx (Africa)',
+ 'Mobile Network Codes in ITU region 7xx (South America)',
+)
+
+# Sadly the full list requires an account at ITU-T:
+# https://www.itu.int/net/ITU-T/inrdb/
cleanup_replacements = {
@@ -115,39 +126,51 @@ def update_mncs(data, mcc, mnc, **kwargs):
data[mcc][mnc].update(dict((k, cleanup_value(v)) for k, v in kwargs.items() if v))
+# This matches a heading on the Wikipedia page, e.g.
+# ==== [[Albania]] - AL ====
+_mnc_country_re = re.compile(
+ r'^[=]{2,4}\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+[=]{2,4}$')
+
+# This matches a line containing a MCC/MNC, e.g.
+# | 232 || 02 || || A1 Telekom Austria || Reserved || ||
+_mnc_line_re = re.compile(
+ r'^\|\s*(?P<mcc>[0-9]+)' +
+ r'\s*\\\\\s*(?P<mnc>[0-9]+)' +
+ r'(\s*\\\\\s*(?P<brand>[^\\]*)' +
+ r'(\s*\\\\\s*(?P<operator>[^\\]*)' +
+ r'(\s*\\\\\s*(?P<status>[^\\]*)' +
+ r'(\s*\\\\\s*(?P<bands>[^\\]*)' +
+ r'(\s*\\\\\s*(?P<notes>[^\\]*)' +
+ r')?)?)?)?)?')
+
+
def get_mncs_from_wikipedia(data):
"""Update the collection of Mobile Country Codes from Wikipedia.
This parses a Wikipedia page to extract the MCC and MNC, the first
part of any IMSI, and stores the results."""
- mnc_country_re = re.compile(r'^[=]{2,4}\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+[=]{2,4}$')
- mnc_line_re = re.compile(r'^\|\s*(?P<mcc>[0-9]+)' +
- r'\s*\\\\\s*(?P<mnc>[0-9]+)' +
- r'(\s*\\\\\s*(?P<brand>[^\\]*)' +
- r'(\s*\\\\\s*(?P<operator>[^\\]*)' +
- r'(\s*\\\\\s*(?P<status>[^\\]*)' +
- r'(\s*\\\\\s*(?P<bands>[^\\]*)' +
- r'(\s*\\\\\s*(?P<notes>[^\\]*)' +
- r')?)?)?)?)?')
- response = requests.get(mcc_list_url)
- response.raise_for_status()
- country = cc = ''
- for line in response.iter_lines(decode_unicode=True):
- line = line.strip()
- match = mnc_country_re.match(line)
- if match:
- country = match.group('country')
- cc = (match.group('cc') or '').lower()
- if '||' not in line:
- continue
- line = line.replace('||', '\\\\')
- match = mnc_line_re.match(line)
- if match:
- for mnc in str2range(match.group('mnc')):
- update_mncs(data, match.group('mcc'), mnc,
- country=country, cc=cc, brand=match.group('brand'),
- operator=match.group('operator'),
- status=match.group('status'),
- bands=match.group('bands'))
+ for page in wikipedia_pages:
+ url = 'https://en.wikipedia.org/w/index.php?title=%s&action=raw' % (
+ page.replace(' ', '_'))
+ response = requests.get(url)
+ response.raise_for_status()
+ country = cc = ''
+ for line in response.iter_lines(decode_unicode=True):
+ line = line.strip()
+ match = _mnc_country_re.match(line)
+ if match:
+ country = match.group('country')
+ cc = (match.group('cc') or '').lower()
+ if '||' not in line:
+ continue
+ line = line.replace('||', '\\\\')
+ match = _mnc_line_re.match(line)
+ if match:
+ for mnc in str2range(match.group('mnc')):
+ update_mncs(data, match.group('mcc'), mnc,
+ country=country, cc=cc, brand=match.group('brand'),
+ operator=match.group('operator'),
+ status=match.group('status'),
+ bands=match.group('bands'))
def str2range(x):
@@ -171,7 +194,7 @@ if __name__ == '__main__':
get_mncs_from_wikipedia(data)
# print header
print('# generated from various sources')
- print('# %s' % mcc_list_url)
+ print('# https://en.wikipedia.org/wiki/Mobile_country_code')
# build an ordered list of mccs
mcc_list = list(data.keys())
mcc_list.sort()
@@ -184,7 +207,7 @@ if __name__ == '__main__':
info = data[mcc][mnc]
infokeys = sorted(info.keys())
print(' %s%s' % (mnc, ''.join([' %s="%s"' % (k, info[k]) for k in infokeys if info[k]])))
- # try to get the length of mnc's
+ # try to get the length of mncs
try:
length = len(mnc_list[0])
if all(len(x) == length for x in mnc_list):
https://arthurdejong.org/git/python-stdnum/commit/?id=29de83e4f6bd3b3d86ca4f7e12181b7b6087cf66
commit 29de83e4f6bd3b3d86ca4f7e12181b7b6087cf66
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 18:26:29 2019 +0100
Make the IEEE OUI data more compact
This groups consecutive assignments into a range to make the dat file a
little more readable.
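A minimal standalone sketch of the same grouping idea (not the script's join_items() itself; the assignment values below are made up, not from the registry):

    def compact(items):
        """Collapse runs of consecutive hex values into 'first-last' ranges."""
        length = len(items[0])
        values = [int(v, 16) for v in items]
        runs = []
        for value in values:
            if runs and value == runs[-1][1] + 1:
                runs[-1][1] = value          # extend the current run
            else:
                runs.append([value, value])  # start a new run
        fmt = '%%0%dX' % length
        return ','.join(
            fmt % a if a == b else (fmt + '-' + fmt) % (a, b) for a, b in runs)

    print(compact(['08ED02', '08ED03', '08ED04', '0B5A9D']))
    # prints: 08ED02-08ED04,0B5A9D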
diff --git a/update/oui.py b/update/oui.py
index ed7f1e6..93a62ba 100755
--- a/update/oui.py
+++ b/update/oui.py
@@ -44,13 +44,35 @@ def download_csv(url):
response = requests.get(url)
response.raise_for_status()
for row in csv.DictReader(line.decode('utf-8') for line in response.iter_lines()):
- yield (
- row['Assignment'],
- row['Organization Name'].strip().replace('"', '%'))
+ o = row['Organization Name'].strip().replace('"', '%')
+ if o not in ('IEEE Registration Authority', 'Private'):
+ yield (row['Assignment'], o)
+
+
+def join_items(items):
+ """Join the list of items, combining consecutive numbers."""
+ length = len(items[0])
+ items = [int(b, 16) for b in items]
+ first = None
+ prev = None
+ res = ''
+ for item in items:
+ if first is not None and item == prev + 1:
+ # this item is consecutive to the previous: make a range
+ if prev > first:
+ # replace the previous value
+ res = res[:-length - 1]
+ res += '-%%0%dX' % length % item
+ prev = item
+ else:
+ # this is a new item, add a new one to the list
+ res += ',%%0%dX' % length % item
+ first = prev = item
+ return res.strip(',')
if __name__ == '__main__':
- # download the MAC Address Block Large (MA-L) list
+ # download the MAC Address Block Large (MA-L) list and group by org
toplevel = defaultdict(list)
for a, o in download_csv(mal_url):
toplevel[o].append(a)
@@ -63,11 +85,11 @@ if __name__ == '__main__':
print('# %s' % mal_url)
print('# %s' % mam_url)
print('# %s' % mas_url)
+ # output full-length assignments
for a, o in sorted((tuple(sorted(a)), o) for o, a in toplevel.items()):
- if o not in ('IEEE Registration Authority', 'Private'):
- print('%s o="%s"' % (','.join(a), o))
+ print('%s o="%s"' % (join_items(a), o))
+ # output assignments that are subdivided
for a in sorted(nested.keys()):
print('%s' % a)
for s, o in sorted(nested[a].items()):
- if o not in ('IEEE Registration Authority', 'Private'):
- print(' %s o="%s"' % (s, o))
+ print(' %s o="%s"' % (s, o))
https://arthurdejong.org/git/python-stdnum/commit/?id=67b747ba43710cd7f929babc3eab0aff7f67d9a8
commit 67b747ba43710cd7f929babc3eab0aff7f67d9a8
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 17:52:56 2019 +0100
Switch update scripts to Python 3
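Most of the diff below is mechanical (shebangs, bytes/str handling, dict methods); the recurring Python 3 incompatibilities it fixes look roughly like this (illustrative snippet, not taken from the repository):

    data = {'b': 2, 'a': 1}

    # Python 2 allowed:  keys = data.keys(); keys.sort()
    # In Python 3, keys() returns a view, so sort explicitly:
    keys = sorted(data.keys())

    # dict.iteritems() / dict.iterkeys() are gone; items() / keys() replace them:
    pairs = sorted(data.items())

    # print() handles unicode text directly, so the .encode('utf-8') calls go
    # away, and requests' iter_lines() yields bytes unless decode_unicode=True
    # is passed.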
diff --git a/update/at_postleitzahl.py b/update/at_postleitzahl.py
index 596f5d0..c36d8a4 100755
--- a/update/at_postleitzahl.py
+++ b/update/at_postleitzahl.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# coding: utf-8
# update/at_postleitzahl.py - download list of Austrian postal codes
@@ -99,5 +99,4 @@ if __name__ == '__main__':
print('# %s' % base_url)
# build an ordered list of postal codes
for code, location, region in sorted(get_postal_codes(download_url)):
- info = '%s location="%s" region="%s"' % (code, location, region)
- print(info.encode('utf-8'))
+ print('%s location="%s" region="%s"' % (code, location, region))
diff --git a/update/be_banks.py b/update/be_banks.py
index a0d6f17..890bfbe 100755
--- a/update/be_banks.py
+++ b/update/be_banks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# coding: utf-8
# update/be_banks.py - script to donwload Bank list from Belgian National Bank
@@ -89,4 +89,4 @@ if __name__ == '__main__':
info += ' bic="%s"' % bic
if bank:
info += ' bank="%s"' % bank
- print(info.encode('utf-8'))
+ print(info)
diff --git a/update/cn_loc.py b/update/cn_loc.py
index 10a33ed..7fc1f09 100755
--- a/update/cn_loc.py
+++ b/update/cn_loc.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/cn_loc.py - script to fetch data from the CN Open Data community
#
@@ -25,7 +25,6 @@ Github."""
from __future__ import print_function, unicode_literals
-import codecs
import sys
from collections import OrderedDict
from datetime import datetime
@@ -84,21 +83,11 @@ def group_data(data_collection):
yield code, name, prefecture_name, province_name
-def print_data_file(file):
+if __name__ == '__main__':
"""Output a data file in the right format."""
- print("# generated from National Bureau of Statistics of the People's",
- file=file)
- print('# Republic of China, downloaded from %s' % data_url, file=file)
- print('# %s' % datetime.utcnow(), file=file)
- print('Downloading...', file=sys.stderr)
+ print("# generated from National Bureau of Statistics of the People's")
+ print('# Republic of China, downloaded from %s' % data_url)
+ print('# %s' % datetime.utcnow())
data_collection = fetch_data()
- print('Generating...', file=sys.stderr)
for data in group_data(data_collection):
- print('%s county="%s" prefecture="%s" province="%s"' % data, file=file)
-
-
-if __name__ == '__main__':
- if sys.stdout.isatty():
- print_data_file(sys.stdout)
- else:
- print_data_file(codecs.getwriter('utf-8')(sys.stdout))
+ print('%s county="%s" prefecture="%s" province="%s"' % data)
diff --git a/update/do_whitelists.py b/update/do_whitelists.py
index 429fd7d..3aea8d3 100755
--- a/update/do_whitelists.py
+++ b/update/do_whitelists.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# coding: utf-8
# update/do_whitelists.py - script to update do.rnc and do.cedula whitelists
@@ -24,6 +24,7 @@
Internos (DGII) web site with lists of all RNC and Cedula values and outputs
new whitelists for these modules."""
+import io
import os.path
import sys
import tempfile
@@ -51,7 +52,7 @@ def handle_zipfile(f):
invalidcedula = set()
# read the information from the ZIP file
z = zipfile.ZipFile(f, 'r')
- for line in z.open('TMP/DGII_RNC.TXT'):
+ for line in io.TextIOWrapper(z.open('TMP/DGII_RNC.TXT'), encoding='iso8859-15'):
number = line.split('|', 1)[0].strip()
if number.isdigit():
if len(number) <= 9:
diff --git a/update/iban.py b/update/iban.py
index d563643..56a589a 100755
--- a/update/iban.py
+++ b/update/iban.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/iban.py - script to download and parse data from the IBAN registry
#
@@ -50,7 +50,7 @@ if __name__ == '__main__':
print('# downloaded from %s' % download_url)
values = defaultdict(dict)
# the file is CSV but the data is in columns instead of rows
- for row in csv.reader(response.iter_lines(), delimiter='\t', quotechar='"'):
+ for row in csv.reader(response.iter_lines(decode_unicode=True), delimiter='\t', quotechar='"'):
# skip first row
if row[0] != 'Data element':
# first column contains label
diff --git a/update/imsi.py b/update/imsi.py
index 034067e..f8a37dd 100755
--- a/update/imsi.py
+++ b/update/imsi.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/imsi.py - script to donwload from Wikipedia to build the database
#
@@ -131,7 +131,7 @@ def get_mncs_from_wikipedia(data):
response = requests.get(mcc_list_url)
response.raise_for_status()
country = cc = ''
- for line in response.iter_lines():
+ for line in response.iter_lines(decode_unicode=True):
line = line.strip()
match = mnc_country_re.match(line)
if match:
@@ -179,12 +179,10 @@ if __name__ == '__main__':
for mcc in mcc_list:
print('%s' % mcc)
# build an ordered list of mncs
- mnc_list = data[mcc].keys()
- mnc_list.sort()
+ mnc_list = sorted(data[mcc].keys())
for mnc in mnc_list:
info = data[mcc][mnc]
- infokeys = info.keys()
- infokeys.sort()
+ infokeys = sorted(info.keys())
print(' %s%s' % (mnc, ''.join([' %s="%s"' % (k, info[k]) for k in infokeys if info[k]])))
# try to get the length of mnc's
try:
diff --git a/update/isil.py b/update/isil.py
index 860e0ec..dedd307 100755
--- a/update/isil.py
+++ b/update/isil.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/isil.py - script to donwload ISIL agencies
#
@@ -38,7 +38,7 @@ download_url = 'https://english.slks.dk/libraries/library-standards/isil/'
def clean(td):
"""Clean up the element removing unneeded stuff from it."""
s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
- return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
+ return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip()
if __name__ == '__main__':
@@ -67,4 +67,4 @@ if __name__ == '__main__':
print(
'%s$ %s' % (
cc, ' '.join(
- ['%s="%s"' % (x, y) for x, y in props.iteritems()])))
+ '%s="%s"' % (x, y) for x, y in sorted(props.items()))))
diff --git a/update/my_bp.py b/update/my_bp.py
index 50f8b3a..672d3f8 100755
--- a/update/my_bp.py
+++ b/update/my_bp.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/my_bp.py - script to download data from Malaysian government site
#
@@ -44,7 +44,7 @@ spaces_re = re.compile(r'\s+', re.UNICODE)
def clean(td):
"""Clean up the element removing unneeded stuff from it."""
s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
- return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
+ return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip()
def parse(content):
@@ -82,7 +82,7 @@ if __name__ == '__main__':
print('# %s' % state_list_url)
print('# %s' % country_list_url)
print('')
- for bp in sorted(results.iterkeys()):
+ for bp in sorted(results.keys()):
res = bp
row = results[bp]
if 'state' in row:
diff --git a/update/numlist.py b/update/numlist.py
index 8d6d086..ec5c209 100755
--- a/update/numlist.py
+++ b/update/numlist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/numlist.py - script to get a list of number formats in stdnum
#
diff --git a/update/nz_banks.py b/update/nz_banks.py
index e3116e7..04a5463 100755
--- a/update/nz_banks.py
+++ b/update/nz_banks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# coding: utf-8
# update/nz_banks.py - script to download Bank list from Bank Branch Register
diff --git a/update/oui.py b/update/oui.py
index 8ff2e19..ed7f1e6 100755
--- a/update/oui.py
+++ b/update/oui.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# update/oui.py - script to download and parse data from the IEEE registry
#
@@ -43,7 +43,7 @@ def download_csv(url):
organisation names."""
response = requests.get(url)
response.raise_for_status()
- for row in csv.DictReader(response.iter_lines()):
+ for row in csv.DictReader(line.decode('utf-8') for line in response.iter_lines()):
yield (
row['Assignment'],
row['Organization Name'].strip().replace('"', '%'))
https://arthurdejong.org/git/python-stdnum/commit/?id=0915b55c80a1bb328f3a1044e34934bf6b5fa04e
commit 0915b55c80a1bb328f3a1044e34934bf6b5fa04e
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 16:25:32 2019 +0100
Switch update scripts to use requests
This makes the scripts more consistent.
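The same substitution recurs throughout the diff below; in outline (the URL is one of the registry URLs the scripts already use, shown here only as an example):

    import requests

    download_url = 'https://www.isbn-international.org/export_rangemessage.xml'

    # Previously:  f = urllib.urlopen(download_url); content = f.read()
    response = requests.get(download_url)
    response.raise_for_status()        # fail loudly on HTTP errors
    content = response.content         # raw bytes; response.text is decoded str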
diff --git a/update/at_postleitzahl.py b/update/at_postleitzahl.py
index 89308b7..596f5d0 100755
--- a/update/at_postleitzahl.py
+++ b/update/at_postleitzahl.py
@@ -26,9 +26,9 @@ from __future__ import print_function, unicode_literals
import os
import os.path
-import urllib
import lxml.html
+import requests
import xlrd
@@ -58,8 +58,9 @@ regions = {
def find_download_url():
"""Extract the spreadsheet URL from the Austrian Post website."""
- f = urllib.urlopen(base_url)
- document = lxml.html.parse(f)
+ response = requests.get(base_url)
+ response.raise_for_status()
+ document = lxml.html.document_fromstring(response.content)
url = [
a.get('href')
for a in document.findall('.//a[@href]')
@@ -69,9 +70,10 @@ def find_download_url():
def get_postal_codes(download_url):
"""Download the Austrian postal codes spreadsheet."""
- content = urllib.urlopen(download_url).read()
+ response = requests.get(download_url)
+ response.raise_for_status()
workbook = xlrd.open_workbook(
- file_contents=content, logfile=open(os.devnull, 'w'))
+ file_contents=response.content, logfile=open(os.devnull, 'w'))
sheet = workbook.sheet_by_index(0)
rows = sheet.get_rows()
# the first row contains the column names
@@ -92,7 +94,7 @@ if __name__ == '__main__':
# download/parse the information
download_url = find_download_url()
# print header
- print('# generated from %s downloaded from ' %
+ print('# generated from %s downloaded from' %
os.path.basename(download_url))
print('# %s' % base_url)
# build an ordered list of postal codes
diff --git a/update/be_banks.py b/update/be_banks.py
index 3c3a96b..a0d6f17 100755
--- a/update/be_banks.py
+++ b/update/be_banks.py
@@ -3,7 +3,7 @@
# update/be_banks.py - script to donwload Bank list from Belgian National Bank
#
-# Copyright (C) 2018 Arthur de Jong
+# Copyright (C) 2018-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -24,8 +24,8 @@
IBAN and BIC codes as published by the Belgian National Bank."""
import os.path
-import urllib
+import requests
import xlrd
@@ -74,8 +74,9 @@ def get_values(sheet):
if __name__ == '__main__':
- document = urllib.urlopen(download_url).read()
- workbook = xlrd.open_workbook(file_contents=document)
+ response = requests.get(download_url)
+ response.raise_for_status()
+ workbook = xlrd.open_workbook(file_contents=response.content)
sheet = workbook.sheet_by_index(0)
version = sheet.cell(0, 0).value
print('# generated from %s downloaded from' %
diff --git a/update/cn_loc.py b/update/cn_loc.py
index 96a13f4..10a33ed 100755
--- a/update/cn_loc.py
+++ b/update/cn_loc.py
@@ -3,7 +3,7 @@
# update/cn_loc.py - script to fetch data from the CN Open Data community
#
# Copyright (C) 2014-2015 Jiangge Zhang
-# Copyright (C) 2015-2018 Arthur de Jong
+# Copyright (C) 2015-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -60,6 +60,7 @@ def fetch_data():
data_collection = OrderedDict()
for revision in data_revisions:
response = requests.get('%s/raw/release/%s.txt' % (data_url, revision))
+ response.raise_for_status()
if response.ok:
print('%s is fetched' % revision, file=sys.stderr)
else:
diff --git a/update/do_whitelists.py b/update/do_whitelists.py
index f242c51..429fd7d 100755
--- a/update/do_whitelists.py
+++ b/update/do_whitelists.py
@@ -3,7 +3,7 @@
# update/do_whitelists.py - script to update do.rnc and do.cedula whitelists
#
-# Copyright (C) 2017 Arthur de Jong
+# Copyright (C) 2017-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -25,13 +25,13 @@ Internos (DGII) web site with lists of all RNC and Cedula values and outputs
new whitelists for these modules."""
import os.path
-import shutil
import sys
import tempfile
import textwrap
-import urllib
import zipfile
+import requests
+
# Ensure that our local stdnum implementation is used
sys.path.insert(0, os.path.normpath(
@@ -41,7 +41,7 @@ from stdnum.do import cedula, rnc # noqa, isort:skip
# The URL of the zip file with all valid numbers
-download_url = 'http://www.dgii.gov.do/app/WebApps/Consultas/rnc/DGII_RNC.zip'
+download_url = 'https://www.dgii.gov.do/app/WebApps/Consultas/rnc/DGII_RNC.zip'
def handle_zipfile(f):
@@ -53,13 +53,14 @@ def handle_zipfile(f):
z = zipfile.ZipFile(f, 'r')
for line in z.open('TMP/DGII_RNC.TXT'):
number = line.split('|', 1)[0].strip()
- if len(number) <= 9:
- if not rnc.is_valid(number):
- invalidrnc.add(number)
- else:
- if not cedula.is_valid(number):
- invalidcedula.add(number)
- # return invalid numbers
+ if number.isdigit():
+ if len(number) <= 9:
+ if not rnc.is_valid(number):
+ invalidrnc.add(number)
+ else:
+ if not cedula.is_valid(number):
+ invalidcedula.add(number)
+ # return known but invalid numbers
return invalidrnc, invalidcedula
@@ -68,11 +69,12 @@ if __name__ == '__main__':
# Download and read the ZIP file with valid data
with tempfile.TemporaryFile() as tmp:
# Download the zip file to a temporary file
- download = urllib.urlopen(download_url)
+ response = requests.get(download_url, stream=True)
+ response.raise_for_status()
print('%s: %s' % (
os.path.basename(download_url),
- download.info().get('Last-Modified')))
- shutil.copyfileobj(download, tmp)
+ response.headers.get('last-modified')))
+ tmp.write(response.content)
# Open the temporary file as a zip file and read contents
# (we cannot do this streaming because zipfile requires seek)
invalidrnc, invalidcedula = handle_zipfile(tmp)
diff --git a/update/eu_nace.py b/update/eu_nace.py
index b772cd4..ec53095 100755
--- a/update/eu_nace.py
+++ b/update/eu_nace.py
@@ -23,10 +23,10 @@
Metadata Server and extracts the information that is used for validating NACE
codes."""
-import cgi
-import urllib.request
+import re
import lxml.etree
+import requests
# the location of the Statistical Classification file
@@ -34,14 +34,15 @@ download_url = 'https://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?Targ
if __name__ == '__main__':
- f = urllib.request.urlopen(download_url)
- _, params = cgi.parse_header(f.info().get('Content-Disposition', ''))
- filename = params.get('filename', '?')
+ response = requests.get(download_url)
+ response.raise_for_status()
+ content_disposition = response.headers.get('content-disposition', '')
+ filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"')
print('# generated from %s, downloaded from' % filename)
print('# %s' % download_url)
# parse XML document
- document = lxml.etree.parse(f)
+ document = lxml.etree.fromstring(response.content)
# output header
print('# %s: %s' % (
diff --git a/update/iban.py b/update/iban.py
index d199c40..d563643 100755
--- a/update/iban.py
+++ b/update/iban.py
@@ -2,7 +2,7 @@
# update/iban.py - script to download and parse data from the IBAN registry
#
-# Copyright (C) 2011-2018 Arthur de Jong
+# Copyright (C) 2011-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -24,9 +24,10 @@ Financial Telecommunication which is the official IBAN registrar) to get
the data needed to correctly parse and validate IBANs."""
import csv
-import urllib
from collections import defaultdict
+import requests
+
# The place where the current version of
# swift_standards_infopaper_ibanregistry_1.txt can be downloaded.
@@ -42,13 +43,14 @@ def get_country_codes(line):
return [x.strip()[:2] for x in line['iban structure'].split(',')]
-def parse(f):
- """Parse the specified file."""
+if __name__ == '__main__':
+ response = requests.get(download_url)
+ response.raise_for_status()
print('# generated from swift_standards_infopaper_ibanregistry_1.txt,')
print('# downloaded from %s' % download_url)
values = defaultdict(dict)
# the file is CSV but the data is in columns instead of rows
- for row in csv.reader(f, delimiter='\t', quotechar='"'):
+ for row in csv.reader(response.iter_lines(), delimiter='\t',
quotechar='"'):
# skip first row
if row[0] != 'Data element':
# first column contains label
@@ -71,8 +73,3 @@ def parse(f):
# TODO: use "Bank identifier position within the BBAN" field
# to add labels to the ranges (Bank identifier and Branch
# Identifier)
-
-
-if __name__ == '__main__':
- f = urllib.urlopen(download_url)
- parse(f)
diff --git a/update/imsi.py b/update/imsi.py
index d91d377..034067e 100755
--- a/update/imsi.py
+++ b/update/imsi.py
@@ -2,7 +2,7 @@
# update/imsi.py - script to donwload from Wikipedia to build the database
#
-# Copyright (C) 2011-2018 Arthur de Jong
+# Copyright (C) 2011-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -22,9 +22,10 @@
"""This extracts a IMSI country and operator code from Wikipedia."""
import re
-import urllib
from collections import defaultdict
+import requests
+
# URLs that are downloaded
mcc_list_url = 'https://en.wikipedia.org/w/index.php?title=Mobile_country_code&action=raw'
@@ -127,9 +128,10 @@ def get_mncs_from_wikipedia(data):
r'(\s*\\\\\s*(?P<bands>[^\\]*)' +
r'(\s*\\\\\s*(?P<notes>[^\\]*)' +
r')?)?)?)?)?')
- f = urllib.urlopen(mcc_list_url)
+ response = requests.get(mcc_list_url)
+ response.raise_for_status()
country = cc = ''
- for line in f.readlines():
+ for line in response.iter_lines():
line = line.strip()
match = mnc_country_re.match(line)
if match:
diff --git a/update/isbn.py b/update/isbn.py
index c9203e1..658ba07 100755
--- a/update/isbn.py
+++ b/update/isbn.py
@@ -25,10 +25,8 @@ ranges for those prefixes suitable for the numdb module.
This data is needed
to correctly split ISBNs into an EAN.UCC prefix, a group prefix, a registrant,
an item number and a check-digit."""
-import ssl
-import urllib.request
-
import lxml.etree
+import requests
# the location of the ISBN Ranges XML file
@@ -58,11 +56,11 @@ def wrap(text):
if __name__ == '__main__':
print('# generated from RangeMessage.xml, downloaded from')
print('# %s' % download_url)
- ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
- f = urllib.request.urlopen(download_url, context=ctx)
+ response = requests.get(download_url)
+ response.raise_for_status()
# parse XML document
- document = lxml.etree.parse(f)
+ document = lxml.etree.fromstring(response.content)
# dump data from document
print('# file serial %s' % document.find('./MessageSerialNumber').text.strip())
diff --git a/update/isil.py b/update/isil.py
index aa51c55..860e0ec 100755
--- a/update/isil.py
+++ b/update/isil.py
@@ -24,9 +24,9 @@ and screen-scrapes the national and non-national ISIL agencies and
code prefixes."""
import re
-import urllib
import lxml.html
+import requests
spaces_re = re.compile(r'\s+', re.UNICODE)
@@ -41,12 +41,13 @@ def clean(td):
return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
-def parse(f):
- """Parse the specified file."""
+if __name__ == '__main__':
+ response = requests.get(download_url)
+ response.raise_for_status()
print('# generated from ISIL Registration Authority, downloaded from')
print('# %s' % download_url)
# We hack the HTML to insert missing <TR> elements
- content = f.read().replace('</TR>', '</TR><TR>')
+ content = response.text.replace('</TR>', '</TR><TR>')
document = lxml.html.document_fromstring(content)
# find all table rows
for tr in document.findall('.//tr'):
@@ -67,9 +68,3 @@ def parse(f):
'%s$ %s' % (
cc, ' '.join(
['%s="%s"' % (x, y) for x, y in props.iteritems()])))
-
-
-if __name__ == '__main__':
- # f = open('isil.html', 'r')
- f = urllib.urlopen(download_url)
- parse(f)
diff --git a/update/oui.py b/update/oui.py
index b3f808e..8ff2e19 100755
--- a/update/oui.py
+++ b/update/oui.py
@@ -2,7 +2,7 @@
# update/oui.py - script to download and parse data from the IEEE registry
#
-# Copyright (C) 2018 Arthur de Jong
+# Copyright (C) 2018-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -25,10 +25,11 @@ and produces data files that can be use by python-stdnum to look up
manufacturers by MAC address."""
import csv
-import urllib
from collections import defaultdict
from itertools import chain
+import requests
+
# The URLs of the MA-L, MA-M and MA-S registries that are downloaded to
# construct a full list of manufacturer prefixes.
@@ -40,7 +41,9 @@ mas_url = 'http://standards-oui.ieee.org/oui36/oui36.csv'
def download_csv(url):
"""Download the list from the site and provide assignment and
organisation names."""
- for row in csv.DictReader(urllib.urlopen(url)):
+ response = requests.get(url)
+ response.raise_for_status()
+ for row in csv.DictReader(response.iter_lines()):
yield (
row['Assignment'],
row['Organization Name'].strip().replace('"', '%'))
https://arthurdejong.org/git/python-stdnum/commit/?id=40961fc0a014c72c4981d3878b886f19ec3f2f9a
commit 40961fc0a014c72c4981d3878b886f19ec3f2f9a
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 15:05:13 2019 +0100
Switch update scripts to lxml
This avoids an extra dependency on Beautiful Soup and makes the scripts
more consistent.
This also includes a fix in the ISIL because of website changes.
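The BeautifulSoup-to-lxml change mostly swaps find()/findAll() calls for ElementPath expressions; a small self-contained sketch of the pattern used in the at_postleitzahl.py hunk below (the HTML snippet is made up):

    import lxml.html

    html = '<html><body><a href="/downloads/PLZ_Verzeichnis_2019.xls">PLZ</a></body></html>'
    document = lxml.html.document_fromstring(html)

    # BeautifulSoup: soup.find('a', attrs=dict(href=re.compile(r'.*/downloads/PLZ_Verzeichnis.*')))['href']
    url = [
        a.get('href')
        for a in document.findall('.//a[@href]')
        if '/downloads/PLZ_Verzeichnis' in a.get('href')][0]
    print(url)  # /downloads/PLZ_Verzeichnis_2019.xls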
diff --git a/update/at_postleitzahl.py b/update/at_postleitzahl.py
index 173ee95..89308b7 100755
--- a/update/at_postleitzahl.py
+++ b/update/at_postleitzahl.py
@@ -26,9 +26,9 @@ from __future__ import print_function, unicode_literals
import os
import os.path
-import re
import urllib
+import lxml.html
import xlrd
@@ -37,11 +37,6 @@ try:
except ImportError:
from urlparse import urljoin
-try:
- from bs4 import BeautifulSoup
-except ImportError:
- from BeautifulSoup import BeautifulSoup
-
# The page that contains a link to the downloadable spreadsheet with current
# Austrian postal codes
@@ -64,19 +59,19 @@ regions = {
def find_download_url():
"""Extract the spreadsheet URL from the Austrian Post website."""
f = urllib.urlopen(base_url)
- soup = BeautifulSoup(f)
- url = soup.find(
- 'a',
- attrs=dict(
- href=re.compile(r'.*/downloads/PLZ_Verzeichnis.*')))['href']
+ document = lxml.html.parse(f)
+ url = [
+ a.get('href')
+ for a in document.findall('.//a[@href]')
+ if '/downloads/PLZ_Verzeichnis' in a.get('href')][0]
return urljoin(base_url, url.split('?')[0])
def get_postal_codes(download_url):
"""Download the Austrian postal codes spreadsheet."""
- document = urllib.urlopen(download_url).read()
+ content = urllib.urlopen(download_url).read()
workbook = xlrd.open_workbook(
- file_contents=document, logfile=open(os.devnull, 'w'))
+ file_contents=content, logfile=open(os.devnull, 'w'))
sheet = workbook.sheet_by_index(0)
rows = sheet.get_rows()
# the first row contains the column names
diff --git a/update/eu_nace.py b/update/eu_nace.py
index af831f1..b772cd4 100755
--- a/update/eu_nace.py
+++ b/update/eu_nace.py
@@ -2,7 +2,7 @@
# update/eu_nace.py - script to get the NACE v2 catalogue
#
-# Copyright (C) 2017-2018 Arthur de Jong
+# Copyright (C) 2017-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -25,11 +25,12 @@ codes."""
import cgi
import urllib.request
-from xml.etree import ElementTree
+import lxml.etree
-# the location of the ISBN Ranges XML file
-download_url = 'http://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?TargetUrl=ACT_OTH_CLS_DLD&StrNom=NACE_REV2&StrFormat=XML&StrLanguageCode=EN'
+
+# the location of the Statistical Classification file
+download_url = 'https://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?TargetUrl=ACT_OTH_CLS_DLD&StrNom=NACE_REV2&StrFormat=XML&StrLanguageCode=EN'
if __name__ == '__main__':
@@ -40,20 +41,19 @@ if __name__ == '__main__':
print('# %s' % download_url)
# parse XML document
- doc = ElementTree.parse(f).getroot()
+ document = lxml.etree.parse(f)
# output header
print('# %s: %s' % (
- doc.find('Classification').get('id'),
- doc.find('Classification/Label/LabelText[@language="EN"]').text))
+ document.find('./Classification').get('id'),
+ document.find('./Classification/Label/LabelText[@language="EN"]').text))
- for item in doc.findall('Classification/Item'):
+ for item in document.findall('./Classification/Item'):
number = item.get('id')
level = int(item.get('idLevel', 0))
- label = item.find('Label/LabelText[@language="EN"]').text
+ label = item.find('./Label/LabelText[@language="EN"]').text
isic = item.find(
- 'Property[@genericName="ISIC4_REF"]/PropertyQualifier/' +
- 'PropertyText').text
+ './Property[@genericName="ISIC4_REF"]/PropertyQualifier/PropertyText').text
if level == 1:
section = number
print('%s label="%s" isic="%s"' % (number, label, isic))
diff --git a/update/isbn.py b/update/isbn.py
index 690457b..c9203e1 100755
--- a/update/isbn.py
+++ b/update/isbn.py
@@ -2,7 +2,7 @@
# update/isbn.py - script to get ISBN prefix data
#
-# Copyright (C) 2010-2018 Arthur de Jong
+# Copyright (C) 2010-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -27,7 +27,8 @@ an item number and a check-digit."""
import ssl
import urllib.request
-from xml.etree import ElementTree
+
+import lxml.etree
# the location of the ISBN Ranges XML file
@@ -36,12 +37,12 @@ download_url = 'https://www.isbn-international.org/export_rangemessage.xml'
def ranges(group):
"""Provide the ranges for the group."""
- for rule in group.find('Rules').findall('Rule'):
- length = int(rule.find('Length').text.strip())
+ for rule in group.findall('./Rules/Rule'):
+ length = int(rule.find('./Length').text.strip())
if length:
yield '-'.join(
x[:length]
- for x in rule.find('Range').text.strip().split('-'))
+ for x in rule.find('./Range').text.strip().split('-'))
def wrap(text):
@@ -61,20 +62,20 @@ if __name__ == '__main__':
f = urllib.request.urlopen(download_url, context=ctx)
# parse XML document
- msg = ElementTree.parse(f).getroot()
+ document = lxml.etree.parse(f)
# dump data from document
- print('# file serial %s' % msg.find('MessageSerialNumber').text.strip())
- print('# file date %s' % msg.find('MessageDate').text.strip())
+ print('# file serial %s' % document.find('./MessageSerialNumber').text.strip())
+ print('# file date %s' % document.find('./MessageDate').text.strip())
top_groups = dict(
- (x.find('Prefix').text.strip(), x)
- for x in msg.find('EAN.UCCPrefixes').findall('EAN.UCC'))
+ (x.find('./Prefix').text.strip(), x)
+ for x in document.findall('./EAN.UCCPrefixes/EAN.UCC'))
prevtop = None
- for group in msg.find('RegistrationGroups').findall('Group'):
- top, prefix = group.find('Prefix').text.strip().split('-')
- agency = group.find('Agency').text.strip()
+ for group in document.findall('./RegistrationGroups/Group'):
+ top, prefix = group.find('./Prefix').text.strip().split('-')
+ agency = group.find('./Agency').text.strip()
if top != prevtop:
print(top)
for line in wrap(','.join(ranges(top_groups[top]))):
diff --git a/update/isil.py b/update/isil.py
index efa4163..aa51c55 100755
--- a/update/isil.py
+++ b/update/isil.py
@@ -26,11 +26,7 @@ code prefixes."""
import re
import urllib
-
-try:
- from bs4 import BeautifulSoup
-except ImportError:
- from BeautifulSoup import BeautifulSoup
+import lxml.html
spaces_re = re.compile(r'\s+', re.UNICODE)
@@ -39,8 +35,9 @@ spaces_re = re.compile(r'\s+', re.UNICODE)
download_url = 'https://english.slks.dk/libraries/library-standards/isil/'
-def clean(s):
- """Clean up the string removing unneeded stuff from it."""
+def clean(td):
+ """Clean up the element removing unneeded stuff from it."""
+ s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
@@ -50,23 +47,22 @@ def parse(f):
print('# %s' % download_url)
# We hack the HTML to insert missing <TR> elements
content = f.read().replace('</TR>', '</TR><TR>')
- soup = BeautifulSoup(content)
+ document = lxml.html.document_fromstring(content)
# find all table rows
- for tr in soup.findAll('tr'):
+ for tr in document.findall('.//tr'):
# find the rows with four columns of text
- tds = tr.findAll('td', attrs={'class': 'text'}, recursive=False)
- if len(tds) == 4:
+ tds = tr.findall('td')
+ if len(tds) == 4 and clean(tds[0]).lower() != 'code':
props = {}
- cc = clean(tds[0].string)
- if tds[1].string:
- props['country'] = clean(tds[1].contents[0])
- ra_a = tds[2].find('a')
- if ra_a:
- props['ra'] = clean(ra_a.string)
- props['ra_url'] = clean(ra_a['href'])
- elif tds[2].string:
- props['ra'] = clean(tds[2].string)
- # we could also get the search urls from tds[3].findAll('a')
+ cc = clean(tds[0])
+ if tds[1].find('p') is not None:
+ props['country'] = clean(tds[1])
+ ra_a = tds[2].find('.//a')
+ if ra_a is not None:
+ props['ra'] = clean(tds[2])
+ props['ra_url'] = ra_a.get('href')
+ else:
+ props['ra'] = clean(tds[2])
print(
'%s$ %s' % (
cc, ' '.join(
diff --git a/update/my_bp.py b/update/my_bp.py
index c1dd017..50f8b3a 100755
--- a/update/my_bp.py
+++ b/update/my_bp.py
@@ -25,15 +25,10 @@ birthplace code from the National Registration Department of Malaysia."""
import re
from collections import defaultdict
+import lxml.html
import requests
-try:
- from bs4 import BeautifulSoup
-except ImportError:
- from BeautifulSoup import BeautifulSoup
-
-
# URLs that are downloaded
state_list_url = 'https://www.jpn.gov.my/kod-negeri/'
country_list_url = 'https://www.jpn.gov.my/en/kod-negara/'
@@ -46,21 +41,19 @@ user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdej
spaces_re = re.compile(r'\s+', re.UNICODE)
-def clean(s):
- """Clean up the string removing unneeded stuff from it."""
+def clean(td):
+ """Clean up the element removing unneeded stuff from it."""
+ s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip().encode('utf-8')
-def parse(f):
+def parse(content):
"""Parse the specified file."""
- soup = BeautifulSoup(f)
+ document = lxml.html.document_fromstring(content)
# find all table rows
- for tr in soup.find('div', {'class': 'box-content'}).findAll('tr'):
- # find the rows with four columns of text
- tds = [
- clean(''.join(x.string for x in td.findAll(text=True)))
- for td in tr.findAll('td')
- ]
+ for tr in document.findall('.//div[@class="box-content"]//tr'):
+ tds = [clean(td) for td in tr.findall('td')]
+ # table has two columns
if len(tds) >= 2 and tds[0] and tds[1]:
yield tds[0], tds[1]
if len(tds) >= 4 and tds[2] and tds[3]:
@@ -74,13 +67,15 @@ if __name__ == '__main__':
results = defaultdict(lambda: defaultdict(set))
# read the states
response = requests.get(state_list_url, headers=headers, verify='update/my_bp.crt')
- for state, bps in parse(response.text):
+ response.raise_for_status()
+ for state, bps in parse(response.content):
for bp in bps.split(','):
results[bp.strip()]['state'] = state
results[bp.strip()]['countries'].add('Malaysia')
# read the countries
response = requests.get(country_list_url, headers=headers, verify='update/my_bp.crt')
- for country, bp in parse(response.text):
+ response.raise_for_status()
+ for country, bp in parse(response.content):
results[bp]['countries'].add(country)
# print the results
print('# generated from National Registration Department of Malaysia, downloaded from')
diff --git a/update/requirements.txt b/update/requirements.txt
index c74ee0c..16e367c 100644
--- a/update/requirements.txt
+++ b/update/requirements.txt
@@ -1,3 +1,3 @@
-beautifulsoup4
+lxml
requests
xlrd
https://arthurdejong.org/git/python-stdnum/commit/?id=c4ad714866b7082983686d0ad6ef4e7640488667
commit c4ad714866b7082983686d0ad6ef4e7640488667
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 12:30:25 2019 +0100
Work around incorrect jpn.gov.my certificate chain
The intermediate certificate for jpn.gov.my is missing from the
certificate chain that is returned by the server since the server
switched to HTTPS.
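In practice the workaround just points requests at a local PEM bundle that carries the missing intermediate (plus its root); a sketch of the call the scripts now make (the real script also sends a custom User-Agent header, omitted here):

    import requests

    # update/my_bp.crt contains the Entrust intermediate and root certificates,
    # so TLS verification succeeds even though the server's chain is incomplete.
    response = requests.get('https://www.jpn.gov.my/kod-negeri/',
                            verify='update/my_bp.crt')
    response.raise_for_status()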
diff --git a/update/my_bp.crt b/update/my_bp.crt
new file mode 100644
index 0000000..fe5a24d
--- /dev/null
+++ b/update/my_bp.crt
@@ -0,0 +1,55 @@
+-----BEGIN CERTIFICATE-----
+MIIFDjCCA/agAwIBAgIMDulMwwAAAABR03eFMA0GCSqGSIb3DQEBCwUAMIG+MQsw
+CQYDVQQGEwJVUzEWMBQGA1UEChMNRW50cnVzdCwgSW5jLjEoMCYGA1UECxMfU2Vl
+IHd3dy5lbnRydXN0Lm5ldC9sZWdhbC10ZXJtczE5MDcGA1UECxMwKGMpIDIwMDkg
+RW50cnVzdCwgSW5jLiAtIGZvciBhdXRob3JpemVkIHVzZSBvbmx5MTIwMAYDVQQD
+EylFbnRydXN0IFJvb3QgQ2VydGlmaWNhdGlvbiBBdXRob3JpdHkgLSBHMjAeFw0x
+NTEwMDUxOTEzNTZaFw0zMDEyMDUxOTQzNTZaMIG6MQswCQYDVQQGEwJVUzEWMBQG
+A1UEChMNRW50cnVzdCwgSW5jLjEoMCYGA1UECxMfU2VlIHd3dy5lbnRydXN0Lm5l
+dC9sZWdhbC10ZXJtczE5MDcGA1UECxMwKGMpIDIwMTIgRW50cnVzdCwgSW5jLiAt
+IGZvciBhdXRob3JpemVkIHVzZSBvbmx5MS4wLAYDVQQDEyVFbnRydXN0IENlcnRp
+ZmljYXRpb24gQXV0aG9yaXR5IC0gTDFLMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8A
+MIIBCgKCAQEA2j+W0E25L0Tn2zlem1DuXKVh2kFnUwmqAJqOV38pa9vH4SEkqjrQ
+jUcj0u1yFvCRIdJdt7hLqIOPt5EyaM/OJZMssn2XyP7BtBe6CZ4DkJN7fEmDImiK
+m95HwzGYei59QAvS7z7Tsoyqj0ip/wDoKVgG97aTWpRzJiatWA7lQrjV6nN5ZGhT
+JbiEz5R6rgZFDKNrTdDGvuoYpDbwkrK6HIiPOlJ/915tgxyd8B/lw9bdpXiSPbBt
+LOrJz5RBGXFEaLpHPATpXbo+8DX3Fbae8i4VHj9HyMg4p3NFXU2wO7GOFyk36t0F
+ASK7lDYqjVs1/lMZLwhGwSqzGmIdTivZGwIDAQABo4IBDDCCAQgwDgYDVR0PAQH/
+BAQDAgEGMBIGA1UdEwEB/wQIMAYBAf8CAQAwMwYIKwYBBQUHAQEEJzAlMCMGCCsG
+AQUFBzABhhdodHRwOi8vb2NzcC5lbnRydXN0Lm5ldDAwBgNVHR8EKTAnMCWgI6Ah
+hh9odHRwOi8vY3JsLmVudHJ1c3QubmV0L2cyY2EuY3JsMDsGA1UdIAQ0MDIwMAYE
+VR0gADAoMCYGCCsGAQUFBwIBFhpodHRwOi8vd3d3LmVudHJ1c3QubmV0L3JwYTAd
+BgNVHQ4EFgQUgqJwdN28Uz/Pe9T3zX+nYMYKTL8wHwYDVR0jBBgwFoAUanImetAe
+733nO2lR1GyNn5ASZqswDQYJKoZIhvcNAQELBQADggEBADnVjpiDYcgsY9NwHRkw
+y/YJrMxp1cncN0HyMg/vdMNY9ngnCTQIlZIv19+4o/0OgemknNM/TWgrFTEKFcxS
+BJPok1DD2bHi4Wi3Ogl08TRYCj93mEC45mj/XeTIRsXsgdfJghhcg85x2Ly/rJkC
+k9uUmITSnKa1/ly78EqvIazCP0kkZ9Yujs+szGQVGHLlbHfTUqi53Y2sAEo1GdRv
+c6N172tkw+CNgxKhiucOhk3YtCAbvmqljEtoZuMrx1gL+1YQ1JH7HdMxWBCMRON1
+exCdtTix9qrKgWRs6PLigVWXUX/hwidQosk8WwBD9lu51aX8/wdQQGcHsFXwt35u
+Lcw=
+-----END CERTIFICATE-----
+-----BEGIN CERTIFICATE-----
+MIIEPjCCAyagAwIBAgIESlOMKDANBgkqhkiG9w0BAQsFADCBvjELMAkGA1UEBhMC
+VVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50
+cnVzdC5uZXQvbGVnYWwtdGVybXMxOTA3BgNVBAsTMChjKSAyMDA5IEVudHJ1c3Qs
+IEluYy4gLSBmb3IgYXV0aG9yaXplZCB1c2Ugb25seTEyMDAGA1UEAxMpRW50cnVz
+dCBSb290IENlcnRpZmljYXRpb24gQXV0aG9yaXR5IC0gRzIwHhcNMDkwNzA3MTcy
+NTU0WhcNMzAxMjA3MTc1NTU0WjCBvjELMAkGA1UEBhMCVVMxFjAUBgNVBAoTDUVu
+dHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50cnVzdC5uZXQvbGVnYWwt
+dGVybXMxOTA3BgNVBAsTMChjKSAyMDA5IEVudHJ1c3QsIEluYy4gLSBmb3IgYXV0
+aG9yaXplZCB1c2Ugb25seTEyMDAGA1UEAxMpRW50cnVzdCBSb290IENlcnRpZmlj
+YXRpb24gQXV0aG9yaXR5IC0gRzIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK
+AoIBAQC6hLZy254Ma+KZ6TABp3bqMriVQRrJ2mFOWHLP/vaCeb9zYQYKpSfYs1/T
+RU4cctZOMvJyig/3gxnQaoCAAEUesMfnmr8SVycco2gvCoe9amsOXmXzHHfV1IWN
+cCG0szLni6LVhjkCsbjSR87kyUnEO6fe+1R9V77w6G7CebI6C1XiUJgWMhNcL3hW
+wcKUs/Ja5CeanyTXxuzQmyWC48zCxEXFjJd6BmsqEZ+pCm5IO2/b1BEZQvePB7/1
+U1+cPvQXLOZprE4yTGJ36rfo5bs0vBmLrpxR57d+tVOxMyLlbc9wPBr64ptntoP0
+jaWvYkxN4FisZDQSA/i2jZRjJKRxAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAP
+BgNVHRMBAf8EBTADAQH/MB0GA1UdDgQWBBRqciZ60B7vfec7aVHUbI2fkBJmqzAN
+BgkqhkiG9w0BAQsFAAOCAQEAeZ8dlsa2eT8ijYfThwMEYGprmi5ZiXMRrEPR9RP/
+jTkrwPK9T3CMqS/qF8QLVJ7UG5aYMzyorWKiAHarWWluBh1+xLlEjZivEtRh2woZ
+Rkfz6/djwUAFQKXSt/S1mja/qYh2iARVBCuch38aNzx+LaUa2NSJXsq9rD1s2G2v
+1fN2D807iDginWyTmsQ9v4IbZT+mD12q/OWyFcq1rca8PdCE6OoGcrBNOTJ4vz4R
+nAuknZoh8/CbCzB428Hch0P+vGOaysXCHMnHjf87ElgI5rY97HosTvuDls4MPGmH
+VHOkc8KT/1EQrBVUAdj8BbGJoX90g5pJ19xOe4pIb4tF9g==
+-----END CERTIFICATE-----
diff --git a/update/my_bp.py b/update/my_bp.py
index 7337db3..c1dd017 100755
--- a/update/my_bp.py
+++ b/update/my_bp.py
@@ -35,8 +35,8 @@ except ImportError:
# URLs that are downloaded
-state_list_url = 'http://www.jpn.gov.my/informasi/kod-negeri/'
-country_list_url = 'http://www.jpn.gov.my/en/informasi/kod-negara/'
+state_list_url = 'https://www.jpn.gov.my/kod-negeri/'
+country_list_url = 'https://www.jpn.gov.my/en/kod-negara/'
# The user agent that will be passed in requests
@@ -73,13 +73,13 @@ if __name__ == '__main__':
}
results = defaultdict(lambda: defaultdict(set))
# read the states
- response = requests.get(state_list_url, headers=headers)
+ response = requests.get(state_list_url, headers=headers, verify='update/my_bp.crt')
for state, bps in parse(response.text):
for bp in bps.split(','):
results[bp.strip()]['state'] = state
results[bp.strip()]['countries'].add('Malaysia')
# read the countries
- response = requests.get(country_list_url, headers=headers)
+ response = requests.get(country_list_url, headers=headers, verify='update/my_bp.crt')
for country, bp in parse(response.text):
results[bp]['countries'].add(country)
# print the results
https://arthurdejong.org/git/python-stdnum/commit/?id=c9ad8d300bd88da12a4308ad08e4e9bd1b47c9d9
commit c9ad8d300bd88da12a4308ad08e4e9bd1b47c9d9
Author: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun Oct 27 12:02:03 2019 +0100
Fix New Zealand Bank Branch Register update script
There is now a direct URL for the XLS file and there is no longer a need
to search the page for a link.
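Since the XLS now comes from a fixed URL, the script recovers the served file name from the Content-Disposition header instead of from the link text; the parsing looks like this (the header value below is made up):

    import re

    # e.g. requests' response.headers.get('content-disposition', '')
    content_disposition = 'attachment; filename="Bank_Branch_Register_2019-10.xls"'
    filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"')
    print(filename)  # Bank_Branch_Register_2019-10.xls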
diff --git a/update/nz_banks.py b/update/nz_banks.py
index 61bc399..e3116e7 100755
--- a/update/nz_banks.py
+++ b/update/nz_banks.py
@@ -23,7 +23,6 @@
"""This script downloads the list of banks with bank codes as used in the
New Zealand bank account numbers."""
-import os.path
import re
from collections import OrderedDict, defaultdict
@@ -31,33 +30,8 @@ import requests
import xlrd
-try:
- from urllib.parse import urljoin
-except ImportError:
- from urlparse import urljoin
-
-
-try:
- from bs4 import BeautifulSoup
-except ImportError:
- from BeautifulSoup import BeautifulSoup
-
-
# The page that contains a link to the latest XLS version of the codes.
-download_url = (
- 'https://www.paymentsnz.co.nz' +
- '/resources/industry-registers/bank-branch-register/')
-
-
-def find_download_url():
- """Find the spreadsheet URL on the New Zealand Bank Branch Register."""
- response = requests.get(download_url)
- soup = BeautifulSoup(response.content)
- url = soup.find(
- 'a',
- attrs=dict(
- href=re.compile(r'/documents/.*/Bank_Branch_Register_.*.xls')))['href']
- return urljoin(download_url, url)
+download_url = 'https://www.paymentsnz.co.nz/resources/industry-registers/bank-branch-register/download/xls/'
def get_values(sheet):
@@ -91,15 +65,15 @@ def branch_list(branches):
if __name__ == '__main__':
- # download/parse the information
- url = find_download_url()
# parse the download as an XLS
- response = requests.get(url)
+ response = requests.get(download_url)
+ response.raise_for_status()
+ content_disposition = response.headers.get('content-disposition', '')
+ filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"')
workbook = xlrd.open_workbook(file_contents=response.content)
sheet = workbook.sheet_by_index(0)
# print header
- print('# generated from %s downloaded from ' %
- os.path.basename(url))
+ print('# generated from %s downloaded from ' % filename)
print('# %s' % download_url)
# build banks list from spreadsheet
banks = defaultdict(dict)
-----------------------------------------------------------------------
Summary of changes:
update/at_postleitzahl.py | 34 +++++++---------
update/be_banks.py | 13 +++---
update/cn_loc.py | 26 ++++--------
update/do_whitelists.py | 35 ++++++++--------
update/eu_nace.py | 33 +++++++--------
update/iban.py | 19 ++++-----
update/imsi.py | 101 ++++++++++++++++++++++++++++------------------
update/isbn.py | 33 ++++++++-------
update/isil.py | 59 ++++++++++++---------------
update/my_bp.crt | 55 +++++++++++++++++++++++++
update/my_bp.py | 45 +++++++++------------
update/numlist.py | 2 +-
update/nz_banks.py | 40 ++++--------------
update/oui.py | 49 ++++++++++++++++------
update/requirements.txt | 2 +-
15 files changed, 298 insertions(+), 248 deletions(-)
create mode 100644 update/my_bp.crt
hooks/post-receive
--
python-stdnum