diff --git a/README.md b/README.md index 3a4d653..4d17d73 100644 --- a/README.md +++ b/README.md @@ -150,19 +150,20 @@ Note that the [example systemd service file](https://github.com/chr0mag/geoipse Performance ----------- -The Python version is consistently over twice as fast as the Bash version when generating sets for both firewall types and both address families. +* The Python version is much faster than the Bash version so use this if you have the choice. +* Versions > v2.3.1 include a significant performance improvement when generating MaxMind data. (See [issue #16](https://github.com/chr0mag/geoipsets/issues/16) and [PR #24](https://github.com/chr0mag/geoipsets/pull/24).) ``` -# BASH (maxmind) -% time bash build-country-sets.sh -bash build-country-sets.sh 34.18s user 22.12s system 108% cpu 52.121 total +# All tests below generate both ipv4 and ipv6 sets for both ipset and nftables. +## Python +% time python -m geoipsets -c ~/geoipsets.conf --provider maxmind --output-dir ~/tests +1.80s user 0.07s system 56% cpu 3.315 total -#PYTHON (maxmind) -% time python -m geoipsets -python -m geoipsets 15.16s user 7.18s system 91% cpu 24.345 total +% time python -m geoipsets -c ~/geoipsets.conf --provider dbip --output-dir ~/tests +10.74s user 0.11s system 94% cpu 11.487 total -#PYTHON (dbip) -% time python -m geoipsets -python -m geoipsets 14.25s user 0.14s system 91% cpu 15.690 total +## Bash (maxmind only) +% ./build-country-sets.sh +34.62s user 31.62s system 107% cpu 1:01.68 total ``` Sources ------------ diff --git a/python/geoipsets/maxmind.py b/python/geoipsets/maxmind.py index d26e762..331a91e 100644 --- a/python/geoipsets/maxmind.py +++ b/python/geoipsets/maxmind.py @@ -3,7 +3,6 @@ import hashlib import os import shutil -from collections import Counter from csv import DictReader from io import TextIOWrapper from pathlib import Path @@ -20,8 +19,8 @@ class MaxMindProvider(utils.AbstractProvider): def __init__(self, firewall: set, address_family: set, checksum: bool, countries: set, output_dir: str, provider_options: dict): - """'provider_options' is a ConfigParser Section that can be treated as a dictionary. - Use this mechanism to introduce provider-specific options into the configuration file.""" + # 'provider_options' is a ConfigParser Section that can be treated as a dictionary. + # Use this mechanism to introduce provider-specific options into the configuration file. super().__init__(firewall, address_family, checksum, countries, output_dir) if not (license_key := provider_options.get('license-key')): @@ -37,29 +36,28 @@ def generate(self): self.check_checksum(zip_file) with ZipFile(Path(zip_file.name), 'r') as zip_ref: - # with ZipFile(Path("/tmp/tmp96kyeecw.zip"), 'r') as zip_ref: # replace line above with this for testing + # with ZipFile(Path("/tmp/tmp23pn2bw0.zip"), 'r') as zip_ref: # replace line above with this for testing zip_dir_prefix = os.path.commonprefix(zip_ref.namelist()) - cc_map = self.build_map(zip_ref, zip_dir_prefix) + id_cc_map = self.build_id_cc_map(zip_ref, zip_dir_prefix) # TODO: run each address-family concurrently? if self.ipv4: - self.build_sets(cc_map, zip_ref, zip_dir_prefix, utils.AddressFamily.IPV4) + self.build_sets(id_cc_map, zip_ref, zip_dir_prefix, utils.AddressFamily.IPV4) if self.ipv6: - self.build_sets(cc_map, zip_ref, zip_dir_prefix, utils.AddressFamily.IPV6) + self.build_sets(id_cc_map, zip_ref, zip_dir_prefix, utils.AddressFamily.IPV6) - def build_map(self, zip_ref: ZipFile, dir_prefix: str): - """ - Build dictionary mapping geoname_ids to ISO country codes - {6251999: 'CA', 1269750: 'IN'} - example row: 6251999,en,NA,"North America",CA,Canada,0 + def build_id_cc_map(self, zip_ref: ZipFile, dir_prefix: str): + # Build dictionary mapping geoname_ids to ISO country codes + # {6251999: 'CA', 1269750: 'IN'} + # example row: 6251999,en,NA,"North America",CA,Canada,0 + # + # field names: + # geoname_id, locale_code, continent_code, continent_name, country_iso_code, country_name, is_in_european_union - field names: - geoname_id, locale_code, continent_code, continent_name, country_iso_code, country_name, is_in_european_union - """ locations = 'GeoLite2-Country-Locations-en.csv' - country_code_map = dict() + id_country_code_map = dict() with ZipFile(Path(zip_ref.filename), 'r') as zip_file: with zip_file.open(dir_prefix + locations, 'r') as csv_file_bytes: rows = DictReader(TextIOWrapper(csv_file_bytes)) @@ -67,17 +65,15 @@ def build_map(self, zip_ref: ZipFile, dir_prefix: str): if cc := r['country_iso_code']: # configparser forces keys to lower case by default if self.countries == 'all' or cc.lower() in self.countries: - country_code_map[r['geoname_id']] = cc + id_country_code_map[r['geoname_id']] = cc - return country_code_map + return id_country_code_map + + def build_sets(self, id_country_code_map: dict, zip_ref: ZipFile, dir_prefix: str, addr_fam: utils.AddressFamily): + # Iterates through IP blocks and builds country-specific IP range lists. + # field names: + # network,geoname_id,registered_country_geoname_id,represented_country_geoname_id,is_anonymous_proxy,is_satellite_provider - def build_sets(self, country_code_map: dict, zip_ref: ZipFile, dir_prefix: str, addr_fam: utils.AddressFamily): - """ - Iterates through IP blocks and builds country-specific IP range lists. - field names: - network,geoname_id,registered_country_geoname_id,represented_country_geoname_id,is_anonymous_proxy,is_satellite_provider - """ - suffix = '.' + addr_fam.value ipset_dir = self.base_dir / 'maxmind/ipset' / addr_fam.value nftset_dir = self.base_dir / 'maxmind/nftset' / addr_fam.value if addr_fam == utils.AddressFamily.IPV4: @@ -87,29 +83,13 @@ def build_sets(self, country_code_map: dict, zip_ref: ZipFile, dir_prefix: str, ip_blocks = 'GeoLite2-Country-Blocks-IPv6.csv' inet_family = 'family inet6' - # remove old sets if they exist - if ipset_dir.is_dir(): - shutil.rmtree(ipset_dir) - - if nftset_dir.is_dir(): - shutil.rmtree(nftset_dir) - - if self.ip_tables: - ipset_dir.mkdir(parents=True) - if self.nf_tables: - nftset_dir.mkdir(parents=True) + # dictionary of subnet lists, indexed by filename + # filename is CC.address_family -- eg. CA.ipv4 + country_subnets = dict() with ZipFile(Path(zip_ref.filename), 'r') as zip_file: with zip_file.open(dir_prefix + ip_blocks, 'r') as csv_file_bytes: - stream = TextIOWrapper(csv_file_bytes) - - # count the number of entries for each country - cc_counter = Counter(country_code_map.get(r['geoname_id'] or r['registered_country_geoname_id']) - for r in DictReader(stream)) - - # return the stream to the start - stream.seek(0, 0) - rows = DictReader(stream) + rows = DictReader(TextIOWrapper(csv_file_bytes)) for r in rows: geo_id = r['geoname_id'] if not geo_id: @@ -118,48 +98,63 @@ def build_sets(self, country_code_map: dict, zip_ref: ZipFile, dir_prefix: str, continue try: - cc = country_code_map[geo_id] + cc = id_country_code_map[geo_id] except KeyError: continue # skip CC if not listed in the config file net = r['network'] - set_name = cc + suffix - - # - # iptables/ipsets - # - if self.ip_tables: - ipset_file = ipset_dir / set_name - if not ipset_file.is_file(): - with open(ipset_file, 'a') as f: - # round up to the next power of 2 - maxelem = max(131072, - 1 if cc_counter[cc] == 0 else (1 << (cc_counter[cc] - 1).bit_length())) - f.write("create {0} hash:net {1} maxelem {2} comment\n".format(set_name, - inet_family, - maxelem)) - - with open(ipset_file, 'a') as f: - f.write("add " + set_name + " " + net + " comment " + cc + "\n") - - # - # nftables set - # - if self.nf_tables: - nftset_file = nftset_dir / set_name - if not nftset_file.is_file(): - with open(nftset_file, 'a') as f: - f.write("define " + set_name + " = {\n") - - with open(nftset_file, 'a') as f: - f.write(net + ",\n") - - # this feels dirty + filename_key = cc + '.' + addr_fam.value + + if filename_key in country_subnets: # append + country_subnets[filename_key].append(net) + else: # create + country_subnets[filename_key] = [net] + + # remove old sets if they exist + if self.ip_tables: + if ipset_dir.is_dir(): + shutil.rmtree(ipset_dir) + ipset_dir.mkdir(parents=True) + if self.nf_tables: + if nftset_dir.is_dir(): + shutil.rmtree(nftset_dir) + nftset_dir.mkdir(parents=True) + + # + # write data to disk + # + for set_name, subnets in country_subnets.items(): + set_name_parts = set_name.split('.') + country_code = set_name_parts[0] + + # write file headers + # iptables/ipsets + if self.ip_tables: + ipset_file = open(ipset_dir / set_name, 'w') + maxelem = max(131072, 1 if len(subnets) == 0 else (1 << (len(subnets) - 1).bit_length())) + ipset_file.write("create {0} hash:net {1} maxelem {2} comment\n".format(set_name, + inet_family, + maxelem)) + + # nftables set + if self.nf_tables: + nftset_file = open(nftset_dir / set_name, 'w') + nftset_file.write("define " + set_name + " = {\n") + + # write ranges to file(s) + for subnet in subnets: + if self.ip_tables: + ipset_file.write("add " + set_name + " " + subnet + " comment " + country_code + "\n") + if self.nf_tables: - for nf_set_file in nftset_dir.iterdir(): - if nf_set_file.is_file(): # not strictly needed - with open(nf_set_file, 'a') as f: - f.write("}\n") + nftset_file.write(subnet + ",\n") + + if self.ip_tables: + ipset_file.close() + + if self.nf_tables: + nftset_file.write("}\n") + nftset_file.close() def download(self): # URL: https://download.maxmind.com/app/geoip_download