Skip to content

Commit

Permalink
Merge pull request #24 from chr0mag/maxmind-perf
Browse files Browse the repository at this point in the history
MaxMind performance improvement.

Closes #16
  • Loading branch information
chr0mag authored Jan 9, 2022
2 parents 3ca7b3e + 91cf3f3 commit 15d011b
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 92 deletions.
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,19 +150,20 @@ Note that the [example systemd service file](https://github.com/chr0mag/geoipse

Performance
-----------
The Python version is consistently over twice as fast as the Bash version when generating sets for both firewall types and both address families.
* The Python version is much faster than the Bash version, so use it if you have the choice.
* Versions later than v2.3.1 include a significant performance improvement when generating MaxMind data. (See [issue #16](https://github.com/chr0mag/geoipsets/issues/16) and [PR #24](https://github.com/chr0mag/geoipsets/pull/24).)
```
# BASH (maxmind)
% time bash build-country-sets.sh
bash build-country-sets.sh 34.18s user 22.12s system 108% cpu 52.121 total
# All tests below generate both ipv4 and ipv6 sets for both ipset and nftables.
## Python
% time python -m geoipsets -c ~/geoipsets.conf --provider maxmind --output-dir ~/tests
1.80s user 0.07s system 56% cpu 3.315 total
#PYTHON (maxmind)
% time python -m geoipsets
python -m geoipsets 15.16s user 7.18s system 91% cpu 24.345 total
% time python -m geoipsets -c ~/geoipsets.conf --provider dbip --output-dir ~/tests
10.74s user 0.11s system 94% cpu 11.487 total
#PYTHON (dbip)
% time python -m geoipsets
python -m geoipsets 14.25s user 0.14s system 91% cpu 15.690 total
## Bash (maxmind only)
% ./build-country-sets.sh
34.62s user 31.62s system 107% cpu 1:01.68 total
```
Sources
------------
Expand Down
159 changes: 77 additions & 82 deletions python/geoipsets/maxmind.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import hashlib
import os
import shutil
from collections import Counter
from csv import DictReader
from io import TextIOWrapper
from pathlib import Path
Expand All @@ -20,8 +19,8 @@ class MaxMindProvider(utils.AbstractProvider):

def __init__(self, firewall: set, address_family: set, checksum: bool, countries: set, output_dir: str,
provider_options: dict):
"""'provider_options' is a ConfigParser Section that can be treated as a dictionary.
Use this mechanism to introduce provider-specific options into the configuration file."""
# 'provider_options' is a ConfigParser Section that can be treated as a dictionary.
# Use this mechanism to introduce provider-specific options into the configuration file.
super().__init__(firewall, address_family, checksum, countries, output_dir)

if not (license_key := provider_options.get('license-key')):
Expand All @@ -37,47 +36,44 @@ def generate(self):
self.check_checksum(zip_file)

with ZipFile(Path(zip_file.name), 'r') as zip_ref:
# with ZipFile(Path("/tmp/tmp96kyeecw.zip"), 'r') as zip_ref: # replace line above with this for testing
# with ZipFile(Path("/tmp/tmp23pn2bw0.zip"), 'r') as zip_ref: # replace line above with this for testing

zip_dir_prefix = os.path.commonprefix(zip_ref.namelist())
cc_map = self.build_map(zip_ref, zip_dir_prefix)
id_cc_map = self.build_id_cc_map(zip_ref, zip_dir_prefix)

# TODO: run each address-family concurrently?
if self.ipv4:
self.build_sets(cc_map, zip_ref, zip_dir_prefix, utils.AddressFamily.IPV4)
self.build_sets(id_cc_map, zip_ref, zip_dir_prefix, utils.AddressFamily.IPV4)

if self.ipv6:
self.build_sets(cc_map, zip_ref, zip_dir_prefix, utils.AddressFamily.IPV6)
self.build_sets(id_cc_map, zip_ref, zip_dir_prefix, utils.AddressFamily.IPV6)

def build_map(self, zip_ref: ZipFile, dir_prefix: str):
"""
Build dictionary mapping geoname_ids to ISO country codes
{6251999: 'CA', 1269750: 'IN'}
example row: 6251999,en,NA,"North America",CA,Canada,0
def build_id_cc_map(self, zip_ref: ZipFile, dir_prefix: str):
# Build dictionary mapping geoname_ids to ISO country codes
# {6251999: 'CA', 1269750: 'IN'}
# example row: 6251999,en,NA,"North America",CA,Canada,0
#
# field names:
# geoname_id, locale_code, continent_code, continent_name, country_iso_code, country_name, is_in_european_union

field names:
geoname_id, locale_code, continent_code, continent_name, country_iso_code, country_name, is_in_european_union
"""
locations = 'GeoLite2-Country-Locations-en.csv'
country_code_map = dict()
id_country_code_map = dict()
with ZipFile(Path(zip_ref.filename), 'r') as zip_file:
with zip_file.open(dir_prefix + locations, 'r') as csv_file_bytes:
rows = DictReader(TextIOWrapper(csv_file_bytes))
for r in rows:
if cc := r['country_iso_code']:
# configparser forces keys to lower case by default
if self.countries == 'all' or cc.lower() in self.countries:
country_code_map[r['geoname_id']] = cc
id_country_code_map[r['geoname_id']] = cc

return country_code_map
return id_country_code_map

def build_sets(self, id_country_code_map: dict, zip_ref: ZipFile, dir_prefix: str, addr_fam: utils.AddressFamily):
# Iterates through IP blocks and builds country-specific IP range lists.
# field names:
# network,geoname_id,registered_country_geoname_id,represented_country_geoname_id,is_anonymous_proxy,is_satellite_provider

def build_sets(self, country_code_map: dict, zip_ref: ZipFile, dir_prefix: str, addr_fam: utils.AddressFamily):
"""
Iterates through IP blocks and builds country-specific IP range lists.
field names:
network,geoname_id,registered_country_geoname_id,represented_country_geoname_id,is_anonymous_proxy,is_satellite_provider
"""
suffix = '.' + addr_fam.value
ipset_dir = self.base_dir / 'maxmind/ipset' / addr_fam.value
nftset_dir = self.base_dir / 'maxmind/nftset' / addr_fam.value
if addr_fam == utils.AddressFamily.IPV4:
Expand All @@ -87,29 +83,13 @@ def build_sets(self, country_code_map: dict, zip_ref: ZipFile, dir_prefix: str,
ip_blocks = 'GeoLite2-Country-Blocks-IPv6.csv'
inet_family = 'family inet6'

# remove old sets if they exist
if ipset_dir.is_dir():
shutil.rmtree(ipset_dir)

if nftset_dir.is_dir():
shutil.rmtree(nftset_dir)

if self.ip_tables:
ipset_dir.mkdir(parents=True)
if self.nf_tables:
nftset_dir.mkdir(parents=True)
# dictionary of subnet lists, indexed by filename
# filename is CC.address_family -- eg. CA.ipv4
country_subnets = dict()

with ZipFile(Path(zip_ref.filename), 'r') as zip_file:
with zip_file.open(dir_prefix + ip_blocks, 'r') as csv_file_bytes:
stream = TextIOWrapper(csv_file_bytes)

# count the number of entries for each country
cc_counter = Counter(country_code_map.get(r['geoname_id'] or r['registered_country_geoname_id'])
for r in DictReader(stream))

# return the stream to the start
stream.seek(0, 0)
rows = DictReader(stream)
rows = DictReader(TextIOWrapper(csv_file_bytes))
for r in rows:
geo_id = r['geoname_id']
if not geo_id:
Expand All @@ -118,48 +98,63 @@ def build_sets(self, country_code_map: dict, zip_ref: ZipFile, dir_prefix: str,
continue

try:
cc = country_code_map[geo_id]
cc = id_country_code_map[geo_id]
except KeyError:
continue # skip CC if not listed in the config file

net = r['network']
set_name = cc + suffix

#
# iptables/ipsets
#
if self.ip_tables:
ipset_file = ipset_dir / set_name
if not ipset_file.is_file():
with open(ipset_file, 'a') as f:
# round up to the next power of 2
maxelem = max(131072,
1 if cc_counter[cc] == 0 else (1 << (cc_counter[cc] - 1).bit_length()))
f.write("create {0} hash:net {1} maxelem {2} comment\n".format(set_name,
inet_family,
maxelem))

with open(ipset_file, 'a') as f:
f.write("add " + set_name + " " + net + " comment " + cc + "\n")

#
# nftables set
#
if self.nf_tables:
nftset_file = nftset_dir / set_name
if not nftset_file.is_file():
with open(nftset_file, 'a') as f:
f.write("define " + set_name + " = {\n")

with open(nftset_file, 'a') as f:
f.write(net + ",\n")

# this feels dirty
filename_key = cc + '.' + addr_fam.value

if filename_key in country_subnets: # append
country_subnets[filename_key].append(net)
else: # create
country_subnets[filename_key] = [net]

# remove old sets if they exist
if self.ip_tables:
if ipset_dir.is_dir():
shutil.rmtree(ipset_dir)
ipset_dir.mkdir(parents=True)
if self.nf_tables:
if nftset_dir.is_dir():
shutil.rmtree(nftset_dir)
nftset_dir.mkdir(parents=True)

#
# write data to disk
#
for set_name, subnets in country_subnets.items():
set_name_parts = set_name.split('.')
country_code = set_name_parts[0]

# write file headers
# iptables/ipsets
if self.ip_tables:
ipset_file = open(ipset_dir / set_name, 'w')
maxelem = max(131072, 1 if len(subnets) == 0 else (1 << (len(subnets) - 1).bit_length()))
ipset_file.write("create {0} hash:net {1} maxelem {2} comment\n".format(set_name,
inet_family,
maxelem))

# nftables set
if self.nf_tables:
nftset_file = open(nftset_dir / set_name, 'w')
nftset_file.write("define " + set_name + " = {\n")

# write ranges to file(s)
for subnet in subnets:
if self.ip_tables:
ipset_file.write("add " + set_name + " " + subnet + " comment " + country_code + "\n")

if self.nf_tables:
for nf_set_file in nftset_dir.iterdir():
if nf_set_file.is_file(): # not strictly needed
with open(nf_set_file, 'a') as f:
f.write("}\n")
nftset_file.write(subnet + ",\n")

if self.ip_tables:
ipset_file.close()

if self.nf_tables:
nftset_file.write("}\n")
nftset_file.close()

def download(self):
# URL: https://download.maxmind.com/app/geoip_download
Expand Down

0 comments on commit 15d011b

Please sign in to comment.