From c3ae3e0eccd7f8ee25f7123308e57481399dcb34 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Mon, 4 May 2020 12:24:14 -0500 Subject: [PATCH 01/42] First sketch of a particle selection tester --- yt/testing.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/yt/testing.py b/yt/testing.py index 94abdceda26..6b09f0a60f4 100644 --- a/yt/testing.py +++ b/yt/testing.py @@ -1223,3 +1223,32 @@ def setUp(self): def tearDown(self): os.chdir(self.curdir) shutil.rmtree(self.tmpdir) + +# We make this a class with a setup so we can cache the particles one time +class ParticleSelectionComparison: + + def __init__(self, ds): + self.ds = ds + # Construct an index so that we get all the data_files + ds.index + particles = {} + for data_file in ds.index.data_files: + for ptype, pos_arr in ds.index.io._yield_coordinates(data_file): + particles.setdefault(ptype, []).append(pos_arr) + for ptype in particles: + particles[ptype] = np.concatenate(particles[ptype]) + self.particles = particles + + def compare_dobj_selection(self, dobj): + for ptype in sorted(self.particles): + x, y, z = self.particles[ptype].T + # Set our radii to zero for now, I guess? 
+ sel_index = dobj.selector.select_points(x, y, z, 0.0) + sel_pos = self.particles[ptype][sel_index, :] + + obj_results = [] + for chunk in dobj.chunks([], "io"): + obj_results.append(chunk[ptype, "particle_position"]) + obj_results = np.concatenate(obj_results, axis = 0) + + assert np.all(sel_pos == obj_results) From 0299776e2bc64d547fcac2604d3ec0230fda2fd6 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Mon, 4 May 2020 17:58:09 -0500 Subject: [PATCH 02/42] Add in the smoothing length calculations --- yt/testing.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/yt/testing.py b/yt/testing.py index 6b09f0a60f4..60b8e7f7bc1 100644 --- a/yt/testing.py +++ b/yt/testing.py @@ -1232,18 +1232,26 @@ def __init__(self, ds): # Construct an index so that we get all the data_files ds.index particles = {} + hsml = {} for data_file in ds.index.data_files: for ptype, pos_arr in ds.index.io._yield_coordinates(data_file): particles.setdefault(ptype, []).append(pos_arr) + if ptype in getattr(ds, '_sph_ptypes', ()): + hsml.setdefault(ptype, []).append(ds.index.io._get_smoothing_length( + data_file, pos_arr.dtype, pos_arr.shape)) for ptype in particles: particles[ptype] = np.concatenate(particles[ptype]) + if ptype in hsml: + hsml[ptype] = np.concatenate(hsml[ptype]) self.particles = particles + self.hsml = hsml def compare_dobj_selection(self, dobj): for ptype in sorted(self.particles): x, y, z = self.particles[ptype].T # Set our radii to zero for now, I guess? - sel_index = dobj.selector.select_points(x, y, z, 0.0) + radii = self.hsml.get(ptype, 0.0) + sel_index = dobj.selector.select_points(x, y, z, radii) sel_pos = self.particles[ptype][sel_index, :] obj_results = [] From 073fbf9ed1b8ed765df787d4390e25293815e41f Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Mon, 4 May 2020 17:58:45 -0500 Subject: [PATCH 03/42] Fix particle selection for sub-regions Thanks to Meagan Lang for debugging this with me. 
We discovered there was a corner case for filling subregions of mi1 and mi2, which showed up in #2574. This corrects that by choosing the region correctly. --- yt/geometry/particle_oct_container.pyx | 39 +++++++++++++++----------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 9aeec58dae5..f95369fe2dc 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -1532,11 +1532,11 @@ cdef class ParticleBitmapSelector: @cython.cdivision(True) @cython.initializedcheck(False) cdef void add_coarse(self, np.uint64_t mi1, int bbox = 2): - cdef bint flag_ref = self.is_refined(mi1) + cdef bint flag_ref self.coarse_select_bool[mi1] = 1 # Neighbors - if (self.ngz > 0) and (flag_ref == 0): - if (bbox == 2): + if (self.ngz > 0) and (bbox == 2): + if self.is_refined(mi1): self.add_neighbors_coarse(mi1) @cython.boundscheck(False) @@ -1563,9 +1563,8 @@ cdef class ParticleBitmapSelector: cdef int add_refined(self, np.uint64_t mi1, np.uint64_t mi2, int bbox = 2) except -1: self.refined_select_bool[mi2] = 1 # Neighbors - if (self.ngz > 0): - if (bbox == 2): - self.add_neighbors_refined(mi1, mi2) + if (self.ngz > 0) and (bbox == 2): + self.add_neighbors_refined(mi1, mi2) @cython.boundscheck(False) @cython.wraparound(False) @@ -1752,11 +1751,14 @@ cdef class ParticleBitmapSelector: np.uint64_t ind1[3]) except -1: cdef np.uint64_t imi, fmi cdef np.uint64_t mi - cdef np.uint64_t indexgap = 1 << (self.bitmap.index_order1 - nlevel) - imi = encode_morton_64bit(ind1[0], ind1[1], ind1[2]) - fmi = encode_morton_64bit( - ind1[0]+indexgap-1, ind1[1]+indexgap-1, ind1[2]+indexgap-1) - for mi in range(imi, fmi+1): + cdef np.uint64_t start_ind[3], end_ind[3] + cdef np.uint64_t shift_by = (self.bitmap.index_order1 - nlevel) + for i in range(3): + start_ind[i] = ind1[i] << shift_by + end_ind[i] = start_ind[i] + (1 << shift_by) - 1 + imi = 
encode_morton_64bit(start_ind[0], start_ind[1], start_ind[2]) + fmi = encode_morton_64bit(end_ind[0], end_ind[1], end_ind[2]) + for mi in range(imi, fmi): self.add_coarse(mi, 1) @cython.boundscheck(False) @@ -1767,12 +1769,15 @@ cdef class ParticleBitmapSelector: np.uint64_t mi1, np.uint64_t ind2[3]) except -1: cdef np.uint64_t imi, fmi - cdef np.uint64_t indexgap = 1 << ( - self.bitmap.index_order2 - (nlevel - self.bitmap.index_order1)) - imi = encode_morton_64bit(ind2[0], ind2[1], ind2[2]) - fmi = encode_morton_64bit( - ind2[0]+indexgap-1, ind2[1]+indexgap-1, ind2[2]+indexgap-1) - for mi2 in range(imi, fmi+1): + cdef np.uint64_t shift_by = (self.bitmap.index_order2 + + self.bitmap.index_order1) - nlevel + cdef np.uint64_t start_ind[3], end_ind[3] + for i in range(3): + start_ind[i] = ind2[i] << shift_by + end_ind[i] = start_ind[i] + (1 << shift_by) - 1 + imi = encode_morton_64bit(start_ind[0], start_ind[1], start_ind[2]) + fmi = encode_morton_64bit(end_ind[0], end_ind[1], end_ind[2]) + for mi2 in range(imi, fmi + 1): self.add_refined(mi1, mi2, 1) @cython.boundscheck(False) From f3dd24974168cb2c8215f9a5dc4ddf299d20485a Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 5 May 2020 09:38:13 -0500 Subject: [PATCH 04/42] Had the logic for is_refined backwards --- yt/geometry/particle_oct_container.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index f95369fe2dc..63b50e4f8e8 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -1536,7 +1536,7 @@ cdef class ParticleBitmapSelector: self.coarse_select_bool[mi1] = 1 # Neighbors if (self.ngz > 0) and (bbox == 2): - if self.is_refined(mi1): + if not self.is_refined(mi1): self.add_neighbors_coarse(mi1) @cython.boundscheck(False) From 7a997036d3805d0405c27fa700a5811bd36aea4f Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 5 May 2020 11:30:39 -0500 Subject: [PATCH 
05/42] Updating tests to use particle selection comparison --- yt/frontends/gadget/tests/test_outputs.py | 44 ++++++++++++++++++++++- yt/testing.py | 9 +++-- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/yt/frontends/gadget/tests/test_outputs.py b/yt/frontends/gadget/tests/test_outputs.py index b952bd2c4f9..1f14eb50fd4 100644 --- a/yt/frontends/gadget/tests/test_outputs.py +++ b/yt/frontends/gadget/tests/test_outputs.py @@ -5,7 +5,9 @@ import tempfile import yt -from yt.testing import requires_file +from yt.testing import requires_file, \ + ParticleSelectionComparison, \ + assert_equal from yt.utilities.answer_testing.framework import \ data_dir_load, \ requires_ds, \ @@ -107,6 +109,46 @@ def test_multifile_read(): assert isinstance(data_dir_load(snap_33), GadgetDataset) assert isinstance(data_dir_load(snap_33_dir), GadgetDataset) +@requires_file(snap_33) +def test_particle_subselection(): + """ + This checks that we correctly subselect from a dataset, first by making + sure we get all the particles, then by comparing manual selections against + them. 
+ """ + ds = data_dir_load(snap_33) + psc = ParticleSelectionComparison(ds) + + sp1 = ds.sphere("c", (0.1, "unitary")) + assert_equal(psc.compare_dobj_selection(sp1) , True) + + sp2 = ds.sphere("c", (0.1, "unitary")) + assert_equal(psc.compare_dobj_selection(sp2) , True) + + sp3 = ds.sphere((1.0, 1.0, 1.0), (0.05, "unitary")) + assert_equal(psc.compare_dobj_selection(sp3) , True) + + sp4 = ds.sphere("c", (0.5, "unitary")) + assert_equal(psc.compare_dobj_selection(sp4) , True) + + dd = ds.all_data() + assert_equal(psc.compare_dobj_selection(dd) , True) + + reg1 = ds.r[ (0.1, 'unitary'):(0.9, 'unitary'), + (0.1, 'unitary'):(0.9, 'unitary'), + (0.1, 'unitary'):(0.9, 'unitary')] + assert_equal(psc.compare_dobj_selection(reg1) , True) + + reg2 = ds.r[ (0.8, 'unitary'):(0.85, 'unitary'), + (0.8, 'unitary'):(0.85, 'unitary'), + (0.8, 'unitary'):(0.85, 'unitary')] + assert_equal(psc.compare_dobj_selection(reg2) , True) + + reg3 = ds.r[ (0.3, 'unitary'):(0.6, 'unitary'), + (0.2, 'unitary'):(0.8, 'unitary'), + (0.0, 'unitary'):(0.1, 'unitary')] + assert_equal(psc.compare_dobj_selection(reg3) , True) + @requires_ds(BE_Gadget) def test_bigendian_field_access(): ds = data_dir_load(BE_Gadget) diff --git a/yt/testing.py b/yt/testing.py index 60b8e7f7bc1..7d2bd098dc7 100644 --- a/yt/testing.py +++ b/yt/testing.py @@ -1224,8 +1224,13 @@ def tearDown(self): os.chdir(self.curdir) shutil.rmtree(self.tmpdir) -# We make this a class with a setup so we can cache the particles one time class ParticleSelectionComparison: + """ + This is a test helper class that takes a particle dataset, caches the + particles it has on disk (manually reading them using lower-level IO + routines) and then received a data object that it compares against manually + running the data object's selection routines. 
+ """ def __init__(self, ds): self.ds = ds @@ -1259,4 +1264,4 @@ def compare_dobj_selection(self, dobj): obj_results.append(chunk[ptype, "particle_position"]) obj_results = np.concatenate(obj_results, axis = 0) - assert np.all(sel_pos == obj_results) + return np.all(sel_pos == obj_results) From df4cf91acdfd114bc351b964e5891cb9621245bf Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 5 May 2020 12:24:36 -0500 Subject: [PATCH 06/42] Update yt/geometry/particle_oct_container.pyx Co-authored-by: Meagan Lang --- yt/geometry/particle_oct_container.pyx | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 63b50e4f8e8..9b6e649b9e5 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -1769,15 +1769,11 @@ cdef class ParticleBitmapSelector: np.uint64_t mi1, np.uint64_t ind2[3]) except -1: cdef np.uint64_t imi, fmi - cdef np.uint64_t shift_by = (self.bitmap.index_order2 + - self.bitmap.index_order1) - nlevel - cdef np.uint64_t start_ind[3], end_ind[3] - for i in range(3): - start_ind[i] = ind2[i] << shift_by - end_ind[i] = start_ind[i] + (1 << shift_by) - 1 - imi = encode_morton_64bit(start_ind[0], start_ind[1], start_ind[2]) - fmi = encode_morton_64bit(end_ind[0], end_ind[1], end_ind[2]) - for mi2 in range(imi, fmi + 1): + cdef np.uint64_t shift_by = 3 * ((self.bitmap.index_order2 + + self.bitmap.index_order1) - nlevel) + imi = encode_morton_64bit(ind2[0], ind2[1], ind2[2]) << shift_by + fmi = imi + (1 << shift_by) + for mi2 in range(imi, fmi): self.add_refined(mi1, mi2, 1) @cython.boundscheck(False) From 878e01816ecfeb8bdce43abe0b121a026fb63463 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 5 May 2020 12:24:44 -0500 Subject: [PATCH 07/42] Update yt/geometry/particle_oct_container.pyx Co-authored-by: Meagan Lang --- yt/geometry/particle_oct_container.pyx | 10 +++------- 1 file changed, 3 insertions(+), 7 
deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 9b6e649b9e5..a66095b7ac8 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -1751,13 +1751,9 @@ cdef class ParticleBitmapSelector: np.uint64_t ind1[3]) except -1: cdef np.uint64_t imi, fmi cdef np.uint64_t mi - cdef np.uint64_t start_ind[3], end_ind[3] - cdef np.uint64_t shift_by = (self.bitmap.index_order1 - nlevel) - for i in range(3): - start_ind[i] = ind1[i] << shift_by - end_ind[i] = start_ind[i] + (1 << shift_by) - 1 - imi = encode_morton_64bit(start_ind[0], start_ind[1], start_ind[2]) - fmi = encode_morton_64bit(end_ind[0], end_ind[1], end_ind[2]) + cdef np.uint64_t shift_by = 3 * (self.bitmap.index_order1 - nlevel) + imi = encode_morton_64bit(ind1[0], ind1[1], ind1[2]) << shift_by + fmi = imi + (1 << shift_by) for mi in range(imi, fmi): self.add_coarse(mi, 1) From cc93f1949d93fa4015b2063d1a6af05e96ca11bd Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 5 May 2020 12:25:06 -0500 Subject: [PATCH 08/42] Update yt/geometry/particle_oct_container.pyx Co-authored-by: Meagan Lang --- yt/geometry/particle_oct_container.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index a66095b7ac8..aab682f48a4 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -1532,7 +1532,6 @@ cdef class ParticleBitmapSelector: @cython.cdivision(True) @cython.initializedcheck(False) cdef void add_coarse(self, np.uint64_t mi1, int bbox = 2): - cdef bint flag_ref self.coarse_select_bool[mi1] = 1 # Neighbors if (self.ngz > 0) and (bbox == 2): From a8ef879e4f28a353d9e4f8fc65791cbe9bbf3da6 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 5 May 2020 13:39:17 -0500 Subject: [PATCH 09/42] Updating from comments --- yt/frontends/gadget/tests/test_outputs.py | 2 +- yt/testing.py | 4 +++- 2 files 
changed, 4 insertions(+), 2 deletions(-) diff --git a/yt/frontends/gadget/tests/test_outputs.py b/yt/frontends/gadget/tests/test_outputs.py index 1f14eb50fd4..c836e8505ce 100644 --- a/yt/frontends/gadget/tests/test_outputs.py +++ b/yt/frontends/gadget/tests/test_outputs.py @@ -122,7 +122,7 @@ def test_particle_subselection(): sp1 = ds.sphere("c", (0.1, "unitary")) assert_equal(psc.compare_dobj_selection(sp1) , True) - sp2 = ds.sphere("c", (0.1, "unitary")) + sp2 = ds.sphere("c", (0.2, "unitary")) assert_equal(psc.compare_dobj_selection(sp2) , True) sp3 = ds.sphere((1.0, 1.0, 1.0), (0.05, "unitary")) diff --git a/yt/testing.py b/yt/testing.py index 7d2bd098dc7..4ecdc693b38 100644 --- a/yt/testing.py +++ b/yt/testing.py @@ -1229,7 +1229,8 @@ class ParticleSelectionComparison: This is a test helper class that takes a particle dataset, caches the particles it has on disk (manually reading them using lower-level IO routines) and then received a data object that it compares against manually - running the data object's selection routines. + running the data object's selection routines. All supplied data objects + must be created from the input dataset. 
""" def __init__(self, ds): @@ -1237,6 +1238,7 @@ def __init__(self, ds): # Construct an index so that we get all the data_files ds.index particles = {} + # hsml is the smoothing length we use for radial selection hsml = {} for data_file in ds.index.data_files: for ptype, pos_arr in ds.index.io._yield_coordinates(data_file): From d22947ec058647e1722120a30493a9dc8603c7f5 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Thu, 7 May 2020 09:51:01 -0500 Subject: [PATCH 10/42] Refine tests a bit --- yt/frontends/gadget/tests/test_outputs.py | 41 ++++++++++++++--------- yt/testing.py | 8 +++-- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/yt/frontends/gadget/tests/test_outputs.py b/yt/frontends/gadget/tests/test_outputs.py index c836e8505ce..168dd10c1fe 100644 --- a/yt/frontends/gadget/tests/test_outputs.py +++ b/yt/frontends/gadget/tests/test_outputs.py @@ -111,43 +111,54 @@ def test_multifile_read(): @requires_file(snap_33) def test_particle_subselection(): - """ - This checks that we correctly subselect from a dataset, first by making - sure we get all the particles, then by comparing manual selections against - them. - """ + #This checks that we correctly subselect from a dataset, first by making + #sure we get all the particles, then by comparing manual selections against + #them. 
ds = data_dir_load(snap_33) psc = ParticleSelectionComparison(ds) - + sp1 = ds.sphere("c", (0.1, "unitary")) - assert_equal(psc.compare_dobj_selection(sp1) , True) + psc.compare_dobj_selection(sp1) sp2 = ds.sphere("c", (0.2, "unitary")) - assert_equal(psc.compare_dobj_selection(sp2) , True) + psc.compare_dobj_selection(sp2) + + # Test wrapping around each axis individually: x + sp3_x = ds.sphere((1.0, 12.5, 12.5), (2.0, "code_length")) + psc.compare_dobj_selection(sp3_x) + + # Test wrapping around each axis individually: y + sp3_y = ds.sphere((12.5, 1.0, 12.5), (2.0, "code_length")) + psc.compare_dobj_selection(sp3_y) + + # Test wrapping around each axis individually: z + sp3_z = ds.sphere((12.5, 12.5, 1.0), (2.0, "code_length")) + psc.compare_dobj_selection(sp3_z) - sp3 = ds.sphere((1.0, 1.0, 1.0), (0.05, "unitary")) - assert_equal(psc.compare_dobj_selection(sp3) , True) + # Test wrapping around all three axes simultaneously + sp3_all = ds.sphere((1.0, 1.0, 1.0), (2.0, "code_length")) + psc.compare_dobj_selection(sp3_all) sp4 = ds.sphere("c", (0.5, "unitary")) - assert_equal(psc.compare_dobj_selection(sp4) , True) + psc.compare_dobj_selection(sp4) dd = ds.all_data() - assert_equal(psc.compare_dobj_selection(dd) , True) + psc.compare_dobj_selection(dd) reg1 = ds.r[ (0.1, 'unitary'):(0.9, 'unitary'), (0.1, 'unitary'):(0.9, 'unitary'), (0.1, 'unitary'):(0.9, 'unitary')] - assert_equal(psc.compare_dobj_selection(reg1) , True) + psc.compare_dobj_selection(reg1) reg2 = ds.r[ (0.8, 'unitary'):(0.85, 'unitary'), (0.8, 'unitary'):(0.85, 'unitary'), (0.8, 'unitary'):(0.85, 'unitary')] - assert_equal(psc.compare_dobj_selection(reg2) , True) + psc.compare_dobj_selection(reg2) reg3 = ds.r[ (0.3, 'unitary'):(0.6, 'unitary'), (0.2, 'unitary'):(0.8, 'unitary'), (0.0, 'unitary'):(0.1, 'unitary')] - assert_equal(psc.compare_dobj_selection(reg3) , True) + psc.compare_dobj_selection(reg3) @requires_ds(BE_Gadget) def test_bigendian_field_access(): diff --git a/yt/testing.py 
b/yt/testing.py index 4ecdc693b38..d2d9d1bc286 100644 --- a/yt/testing.py +++ b/yt/testing.py @@ -1259,11 +1259,13 @@ def compare_dobj_selection(self, dobj): # Set our radii to zero for now, I guess? radii = self.hsml.get(ptype, 0.0) sel_index = dobj.selector.select_points(x, y, z, radii) - sel_pos = self.particles[ptype][sel_index, :] + if sel_index is None: + sel_pos = np.empty((0, 3)) + else: + sel_pos = self.particles[ptype][sel_index, :] obj_results = [] for chunk in dobj.chunks([], "io"): obj_results.append(chunk[ptype, "particle_position"]) obj_results = np.concatenate(obj_results, axis = 0) - - return np.all(sel_pos == obj_results) + assert_equal(sel_pos, obj_results) From 934a4ebd077941f73e038dba9363d20f4cde7dfe Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Thu, 7 May 2020 10:42:00 -0500 Subject: [PATCH 11/42] Add tests for wrapping on right --- yt/frontends/gadget/tests/test_outputs.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/yt/frontends/gadget/tests/test_outputs.py b/yt/frontends/gadget/tests/test_outputs.py index 168dd10c1fe..d7fd3fbdfc4 100644 --- a/yt/frontends/gadget/tests/test_outputs.py +++ b/yt/frontends/gadget/tests/test_outputs.py @@ -135,12 +135,28 @@ def test_particle_subselection(): sp3_z = ds.sphere((12.5, 12.5, 1.0), (2.0, "code_length")) psc.compare_dobj_selection(sp3_z) - # Test wrapping around all three axes simultaneously + # Test wrapping around all three axes simultaneously on left sp3_all = ds.sphere((1.0, 1.0, 1.0), (2.0, "code_length")) psc.compare_dobj_selection(sp3_all) - sp4 = ds.sphere("c", (0.5, "unitary")) - psc.compare_dobj_selection(sp4) + # Test wrapping around each axis individually on right: x + sp4_x = ds.sphere((24.0, 12.5, 12.5), (2.0, "code_length")) + psc.compare_dobj_selection(sp4_x) + + # Test wrapping around each axis individually on right: y + sp4_y = ds.sphere((12.5, 24.0, 12.5), (2.0, "code_length")) + psc.compare_dobj_selection(sp4_y) + + # Test wrapping 
around each axis individually on right: z + sp4_z = ds.sphere((12.5, 12.5, 24.0), (2.0, "code_length")) + psc.compare_dobj_selection(sp4_z) + + # Test wrapping around all three axes simultaneously on right + sp4_all = ds.sphere((24.0, 24.0, 24.0), (2.0, "code_length")) + psc.compare_dobj_selection(sp4_all) + + sp5 = ds.sphere("c", (0.5, "unitary")) + psc.compare_dobj_selection(sp5) dd = ds.all_data() psc.compare_dobj_selection(dd) From 402afa0a1989db239575ef9a669069558420f540 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Thu, 7 May 2020 11:54:31 -0500 Subject: [PATCH 12/42] Rework periodic smoothing length calculations --- yt/geometry/particle_oct_container.pyx | 126 +++++++++---------------- 1 file changed, 42 insertions(+), 84 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index aab682f48a4..99752821470 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -484,22 +484,19 @@ cdef class ParticleBitmap: cdef np.uint64_t mi, miex, mi_max cdef np.uint64_t mi_split[3] cdef np.float64_t ppos[3] - cdef int skip, Nex - cdef int Nex_min[3] - cdef int Nex_max[3] - cdef np.float64_t rpos_min, rpos_max - cdef np.uint64_t xex_min, xex_max, yex_min, yex_max, zex_min, zex_max + cdef np.float64_t s_ppos[3] # shifted ppos + cdef int skip + cdef np.uint64_t bounds[3][2] cdef np.uint64_t xex, yex, zex - cdef int ix, iy, iz, ixe, iye, ize - cdef np.ndarray[np.uint64_t, ndim=1] xex_range = np.empty(7, 'uint64') - cdef np.ndarray[np.uint64_t, ndim=1] yex_range = np.empty(7, 'uint64') - cdef np.ndarray[np.uint64_t, ndim=1] zex_range = np.empty(7, 'uint64') cdef np.float64_t LE[3] cdef np.float64_t RE[3] + cdef np.float64_t DW[3] cdef np.uint8_t PER[3] cdef np.float64_t dds[3] cdef np.uint8_t[:] mask = self.masks[:, file_id] cdef np.int64_t msize = (1 << (self.index_order1 * 3)) + cdef int axiter[3][2] + cdef np.float64_t axiterv[3][2] mi_max = (1 << self.index_order1) - 1 # Copy 
over things for this file (type cast necessary?) for i in range(3): @@ -507,10 +504,14 @@ cdef class ParticleBitmap: RE[i] = self.right_edge[i] PER[i] = self.periodicity[i] dds[i] = self.dds_mi1[i] + DW[i] = RE[i] - LE[i] + axiter[i][0] = 0 # We always do an offset of 0 + axiterv[i][0] = 0.0 # Mark index of particles that are in this file for p in range(pos.shape[0]): skip = 0 for i in range(3): + axiter[i][1] = 999 # Skip particles outside the domain if pos[p,i] >= RE[i] or pos[p,i] < LE[i]: skip = 1 @@ -526,82 +527,39 @@ cdef class ParticleBitmap: raise RuntimeError( "Smoothing length for particle %s is negative with " "value \"%s\"" % p, hsml[p]) - Nex = 1 + # We first check if we're bounded within the domain; this follows the logic in the + # pixelize_cartesian routine. We assume that no smoothing + # length can wrap around both directions. for i in range(3): - Nex_min[i] = 0 - Nex_max[i] = 0 - rpos_min = ppos[i] - (dds[i]*mi_split[i] + LE[i]) - rpos_max = dds[i] - rpos_min - if rpos_min > hsml[p]: - Nex_min[i] = ((rpos_min-hsml[p])/dds[i]) + 1 - if rpos_max > hsml[p]: - Nex_max[i] = ((rpos_max-hsml[p])/dds[i]) + 1 - Nex *= (Nex_max[i] + Nex_min[i] + 1) - if Nex > 1: - # Ensure that min/max values for x,y,z indexes are obeyed - if (Nex_max[0] + Nex_min[0] + 1) > xex_range.shape[0]: - xex_range = np.empty(Nex_max[0] + Nex_min[0] + 1, 'uint64') - if (Nex_max[1] + Nex_min[1] + 1) > yex_range.shape[0]: - yex_range = np.empty(Nex_max[1] + Nex_min[1] + 1, 'uint64') - if (Nex_max[2] + Nex_min[2] + 1) > zex_range.shape[0]: - zex_range = np.empty(Nex_max[2] + Nex_min[2] + 1, 'uint64') - xex_min = mi_split[0] - min(Nex_min[0], mi_split[0]) - xex_max = mi_split[0] + min(Nex_max[0], (mi_max - mi_split[0])) + 1 - yex_min = mi_split[1] - min(Nex_min[1], mi_split[1]) - yex_max = mi_split[1] + min(Nex_max[1], (mi_max - mi_split[1])) + 1 - zex_min = mi_split[2] - min(Nex_min[2], mi_split[2]) - zex_max = mi_split[2] + min(Nex_max[2], (mi_max - mi_split[2])) + 1 - ixe = iye = ize 
= 0 - for xex in range(xex_min, xex_max): - xex_range[ixe] = xex - ixe += 1 - for yex in range(yex_min, yex_max): - yex_range[iye] = yex - iye += 1 - for zex in range(zex_min, zex_max): - zex_range[ize] = zex - ize += 1 - # Add periodic wrapping - if PER[0]: - if Nex_min[0] > mi_split[0]: - for xex in range(mi_max + 1 - (Nex_min[0] - mi_split[0]), mi_max + 1): - xex_range[ixe] = xex - ixe += 1 - if Nex_max[0] > (mi_max-mi_split[0]): - for xex in range(0, Nex_max[0] - (mi_max-mi_split[0])): - xex_range[ixe] = xex - ixe += 1 - if PER[1]: - if Nex_min[1] > mi_split[1]: - for yex in range(mi_max + 1 - (Nex_min[1] - mi_split[1]), mi_max + 1): - yex_range[iye] = yex - iye += 1 - if Nex_max[1] > (mi_max-mi_split[1]): - for yex in range(0, Nex_max[1] - (mi_max-mi_split[1])): - yex_range[iye] = yex - iye += 1 - if PER[2]: - if Nex_min[2] > mi_split[2]: - for zex in range(mi_max + 1 - (Nex_min[2] - mi_split[2]), mi_max + 1): - zex_range[ize] = zex - ize += 1 - if Nex_max[2] > (mi_max-mi_split[2]): - for zex in range(0, Nex_max[2] - (mi_max-mi_split[2])): - zex_range[ize] = zex - ize += 1 - for ix in range(ixe): - xex = xex_range[ix] - for iy in range(iye): - yex = yex_range[iy] - for iz in range(ize): - zex = zex_range[iz] - miex = encode_morton_64bit(xex, yex, zex) - if miex >= msize: - raise IndexError( - "Index for a softening region " + - "({}) exceeds ".format(miex) + - "max ({})".format(msize)) - mask[miex] = 1 + if PER[i] and ppos[i] - hsml[p] < LE[i]: + axiter[i][1] = +1 + axiterv[i][1] = DW[i] + elif PER[i] and ppos[i] + hsml[p] > RE[i]: + axiter[i][1] = -1 + axiterv[i][1] = -DW[i] + for xi in range(2): + if axiter[0][xi] == 999: continue + s_ppos[0] = ppos[0] + axiterv[0][xi] + for yi in range(2): + if axiter[1][yi] == 999: continue + s_ppos[1] = ppos[1] + axiterv[1][yi] + for zi in range(2): + if axiter[2][zi] == 999: continue + s_ppos[2] = ppos[2] + axiterv[2][zi] + # OK, now we compute the left and right edges for this shift. 
+ for i in range(3): + bounds[i][0] = i64max(((s_ppos[i] - LE[i] - hsml[p])/dds[i]), 0) + bounds[i][1] = i64min(((s_ppos[i] - LE[i] + hsml[p])/dds[i]), mi_max) + for xex in range(bounds[0][0], bounds[0][1]): + for yex in range(bounds[1][0], bounds[1][1]): + for zex in range(bounds[2][0], bounds[2][1]): + miex = encode_morton_64bit(xex, yex, zex) + mask[miex] = 1 + if miex >= msize: + raise IndexError( + "Index for a softening region " + + "({}) exceeds ".format(miex) + + "max ({})".format(msize)) @cython.boundscheck(False) @cython.wraparound(False) From 2e39ef15ff207a87c689d8cccb003f10d3374068 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Thu, 7 May 2020 11:54:31 -0500 Subject: [PATCH 13/42] Rework periodic smoothing length calculations --- yt/geometry/particle_oct_container.pyx | 141 +++++++++---------------- 1 file changed, 51 insertions(+), 90 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index aab682f48a4..7b766121bfe 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -484,22 +484,20 @@ cdef class ParticleBitmap: cdef np.uint64_t mi, miex, mi_max cdef np.uint64_t mi_split[3] cdef np.float64_t ppos[3] - cdef int skip, Nex - cdef int Nex_min[3] - cdef int Nex_max[3] - cdef np.float64_t rpos_min, rpos_max - cdef np.uint64_t xex_min, xex_max, yex_min, yex_max, zex_min, zex_max + cdef np.float64_t s_ppos[3] # shifted ppos + cdef int skip + cdef np.uint64_t bounds[3][2] cdef np.uint64_t xex, yex, zex - cdef int ix, iy, iz, ixe, iye, ize - cdef np.ndarray[np.uint64_t, ndim=1] xex_range = np.empty(7, 'uint64') - cdef np.ndarray[np.uint64_t, ndim=1] yex_range = np.empty(7, 'uint64') - cdef np.ndarray[np.uint64_t, ndim=1] zex_range = np.empty(7, 'uint64') cdef np.float64_t LE[3] cdef np.float64_t RE[3] + cdef np.float64_t DW[3] cdef np.uint8_t PER[3] cdef np.float64_t dds[3] + cdef np.float64_t radius cdef np.uint8_t[:] mask = self.masks[:, file_id] cdef 
np.int64_t msize = (1 << (self.index_order1 * 3)) + cdef int axiter[3][2] + cdef np.float64_t axiterv[3][2] mi_max = (1 << self.index_order1) - 1 # Copy over things for this file (type cast necessary?) for i in range(3): @@ -507,10 +505,14 @@ cdef class ParticleBitmap: RE[i] = self.right_edge[i] PER[i] = self.periodicity[i] dds[i] = self.dds_mi1[i] + DW[i] = RE[i] - LE[i] + axiter[i][0] = 0 # We always do an offset of 0 + axiterv[i][0] = 0.0 # Mark index of particles that are in this file for p in range(pos.shape[0]): skip = 0 for i in range(3): + axiter[i][1] = 999 # Skip particles outside the domain if pos[p,i] >= RE[i] or pos[p,i] < LE[i]: skip = 1 @@ -521,87 +523,46 @@ cdef class ParticleBitmap: dds, mi_split) mask[mi] = 1 # Expand mask by softening - if hsml is not None: - if hsml[p] < 0: - raise RuntimeError( - "Smoothing length for particle %s is negative with " - "value \"%s\"" % p, hsml[p]) - Nex = 1 - for i in range(3): - Nex_min[i] = 0 - Nex_max[i] = 0 - rpos_min = ppos[i] - (dds[i]*mi_split[i] + LE[i]) - rpos_max = dds[i] - rpos_min - if rpos_min > hsml[p]: - Nex_min[i] = ((rpos_min-hsml[p])/dds[i]) + 1 - if rpos_max > hsml[p]: - Nex_max[i] = ((rpos_max-hsml[p])/dds[i]) + 1 - Nex *= (Nex_max[i] + Nex_min[i] + 1) - if Nex > 1: - # Ensure that min/max values for x,y,z indexes are obeyed - if (Nex_max[0] + Nex_min[0] + 1) > xex_range.shape[0]: - xex_range = np.empty(Nex_max[0] + Nex_min[0] + 1, 'uint64') - if (Nex_max[1] + Nex_min[1] + 1) > yex_range.shape[0]: - yex_range = np.empty(Nex_max[1] + Nex_min[1] + 1, 'uint64') - if (Nex_max[2] + Nex_min[2] + 1) > zex_range.shape[0]: - zex_range = np.empty(Nex_max[2] + Nex_min[2] + 1, 'uint64') - xex_min = mi_split[0] - min(Nex_min[0], mi_split[0]) - xex_max = mi_split[0] + min(Nex_max[0], (mi_max - mi_split[0])) + 1 - yex_min = mi_split[1] - min(Nex_min[1], mi_split[1]) - yex_max = mi_split[1] + min(Nex_max[1], (mi_max - mi_split[1])) + 1 - zex_min = mi_split[2] - min(Nex_min[2], mi_split[2]) - zex_max = 
mi_split[2] + min(Nex_max[2], (mi_max - mi_split[2])) + 1 - ixe = iye = ize = 0 - for xex in range(xex_min, xex_max): - xex_range[ixe] = xex - ixe += 1 - for yex in range(yex_min, yex_max): - yex_range[iye] = yex - iye += 1 - for zex in range(zex_min, zex_max): - zex_range[ize] = zex - ize += 1 - # Add periodic wrapping - if PER[0]: - if Nex_min[0] > mi_split[0]: - for xex in range(mi_max + 1 - (Nex_min[0] - mi_split[0]), mi_max + 1): - xex_range[ixe] = xex - ixe += 1 - if Nex_max[0] > (mi_max-mi_split[0]): - for xex in range(0, Nex_max[0] - (mi_max-mi_split[0])): - xex_range[ixe] = xex - ixe += 1 - if PER[1]: - if Nex_min[1] > mi_split[1]: - for yex in range(mi_max + 1 - (Nex_min[1] - mi_split[1]), mi_max + 1): - yex_range[iye] = yex - iye += 1 - if Nex_max[1] > (mi_max-mi_split[1]): - for yex in range(0, Nex_max[1] - (mi_max-mi_split[1])): - yex_range[iye] = yex - iye += 1 - if PER[2]: - if Nex_min[2] > mi_split[2]: - for zex in range(mi_max + 1 - (Nex_min[2] - mi_split[2]), mi_max + 1): - zex_range[ize] = zex - ize += 1 - if Nex_max[2] > (mi_max-mi_split[2]): - for zex in range(0, Nex_max[2] - (mi_max-mi_split[2])): - zex_range[ize] = zex - ize += 1 - for ix in range(ixe): - xex = xex_range[ix] - for iy in range(iye): - yex = yex_range[iy] - for iz in range(ize): - zex = zex_range[iz] - miex = encode_morton_64bit(xex, yex, zex) - if miex >= msize: - raise IndexError( - "Index for a softening region " + - "({}) exceeds ".format(miex) + - "max ({})".format(msize)) - mask[miex] = 1 + if hsml is None: + continue + if hsml[p] < 0: + raise RuntimeError( + "Smoothing length for particle %s is negative with " + "value \"%s\"" % p, hsml[p]) + radius = hsml[p] + # We first check if we're bounded within the domain; this follows the logic in the + # pixelize_cartesian routine. We assume that no smoothing + # length can wrap around both directions. 
+ for i in range(3): + if PER[i] and ppos[i] - radius < LE[i]: + axiter[i][1] = +1 + axiterv[i][1] = DW[i] + elif PER[i] and ppos[i] + radius > RE[i]: + axiter[i][1] = -1 + axiterv[i][1] = -DW[i] + for xi in range(2): + if axiter[0][xi] == 999: continue + s_ppos[0] = ppos[0] + axiterv[0][xi] + for yi in range(2): + if axiter[1][yi] == 999: continue + s_ppos[1] = ppos[1] + axiterv[1][yi] + for zi in range(2): + if axiter[2][zi] == 999: continue + s_ppos[2] = ppos[2] + axiterv[2][zi] + # OK, now we compute the left and right edges for this shift. + for i in range(3): + bounds[i][0] = i64max(((s_ppos[i] - LE[i] - radius)/dds[i]), 0) + bounds[i][1] = i64min(((s_ppos[i] - LE[i] + radius)/dds[i]), mi_max) + for xex in range(bounds[0][0], bounds[0][1]): + for yex in range(bounds[1][0], bounds[1][1]): + for zex in range(bounds[2][0], bounds[2][1]): + miex = encode_morton_64bit(xex, yex, zex) + mask[miex] = 1 + if miex >= msize: + raise IndexError( + "Index for a softening region " + + "({}) exceeds ".format(miex) + + "max ({})".format(msize)) @cython.boundscheck(False) @cython.wraparound(False) From 0b68faff7cc1463fdd7f7efe42bd5f76e1e2f8b9 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Thu, 7 May 2020 14:34:22 -0500 Subject: [PATCH 14/42] We need bounds[i][1] + 1 for inclusive loops --- yt/frontends/gadget/tests/test_outputs.py | 71 ++++++++++++++++++++++- yt/geometry/particle_oct_container.pyx | 5 +- yt/testing.py | 46 +++++++++++++++ 3 files changed, 120 insertions(+), 2 deletions(-) diff --git a/yt/frontends/gadget/tests/test_outputs.py b/yt/frontends/gadget/tests/test_outputs.py index b952bd2c4f9..d7fd3fbdfc4 100644 --- a/yt/frontends/gadget/tests/test_outputs.py +++ b/yt/frontends/gadget/tests/test_outputs.py @@ -5,7 +5,9 @@ import tempfile import yt -from yt.testing import requires_file +from yt.testing import requires_file, \ + ParticleSelectionComparison, \ + assert_equal from yt.utilities.answer_testing.framework import \ data_dir_load, \ requires_ds, \ @@ 
-107,6 +109,73 @@ def test_multifile_read(): assert isinstance(data_dir_load(snap_33), GadgetDataset) assert isinstance(data_dir_load(snap_33_dir), GadgetDataset) +@requires_file(snap_33) +def test_particle_subselection(): + #This checks that we correctly subselect from a dataset, first by making + #sure we get all the particles, then by comparing manual selections against + #them. + ds = data_dir_load(snap_33) + psc = ParticleSelectionComparison(ds) + + sp1 = ds.sphere("c", (0.1, "unitary")) + psc.compare_dobj_selection(sp1) + + sp2 = ds.sphere("c", (0.2, "unitary")) + psc.compare_dobj_selection(sp2) + + # Test wrapping around each axis individually: x + sp3_x = ds.sphere((1.0, 12.5, 12.5), (2.0, "code_length")) + psc.compare_dobj_selection(sp3_x) + + # Test wrapping around each axis individually: y + sp3_y = ds.sphere((12.5, 1.0, 12.5), (2.0, "code_length")) + psc.compare_dobj_selection(sp3_y) + + # Test wrapping around each axis individually: z + sp3_z = ds.sphere((12.5, 12.5, 1.0), (2.0, "code_length")) + psc.compare_dobj_selection(sp3_z) + + # Test wrapping around all three axes simultaneously on left + sp3_all = ds.sphere((1.0, 1.0, 1.0), (2.0, "code_length")) + psc.compare_dobj_selection(sp3_all) + + # Test wrapping around each axis individually on right: x + sp4_x = ds.sphere((24.0, 12.5, 12.5), (2.0, "code_length")) + psc.compare_dobj_selection(sp4_x) + + # Test wrapping around each axis individually on right: y + sp4_y = ds.sphere((12.5, 24.0, 12.5), (2.0, "code_length")) + psc.compare_dobj_selection(sp4_y) + + # Test wrapping around each axis individually on right: z + sp4_z = ds.sphere((12.5, 12.5, 24.0), (2.0, "code_length")) + psc.compare_dobj_selection(sp4_z) + + # Test wrapping around all three axes simultaneously on right + sp4_all = ds.sphere((24.0, 24.0, 24.0), (2.0, "code_length")) + psc.compare_dobj_selection(sp4_all) + + sp5 = ds.sphere("c", (0.5, "unitary")) + psc.compare_dobj_selection(sp5) + + dd = ds.all_data() + 
psc.compare_dobj_selection(dd) + + reg1 = ds.r[ (0.1, 'unitary'):(0.9, 'unitary'), + (0.1, 'unitary'):(0.9, 'unitary'), + (0.1, 'unitary'):(0.9, 'unitary')] + psc.compare_dobj_selection(reg1) + + reg2 = ds.r[ (0.8, 'unitary'):(0.85, 'unitary'), + (0.8, 'unitary'):(0.85, 'unitary'), + (0.8, 'unitary'):(0.85, 'unitary')] + psc.compare_dobj_selection(reg2) + + reg3 = ds.r[ (0.3, 'unitary'):(0.6, 'unitary'), + (0.2, 'unitary'):(0.8, 'unitary'), + (0.0, 'unitary'):(0.1, 'unitary')] + psc.compare_dobj_selection(reg3) + @requires_ds(BE_Gadget) def test_bigendian_field_access(): ds = data_dir_load(BE_Gadget) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 7b766121bfe..dc08ab70ed5 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -552,7 +552,10 @@ cdef class ParticleBitmap: # OK, now we compute the left and right edges for this shift. for i in range(3): bounds[i][0] = i64max(((s_ppos[i] - LE[i] - radius)/dds[i]), 0) - bounds[i][1] = i64min(((s_ppos[i] - LE[i] + radius)/dds[i]), mi_max) + bounds[i][1] = i64min(((s_ppos[i] - LE[i] + radius)/dds[i]), mi_max) + 1 + # We go to the upper bound plus one so that we have *inclusive* loops -- the upper bound + # is the cell *index*, so we want to make sure we include that cell. This is also why + # we don't need to worry about mi_max being the max index rather than the cell count. 
for xex in range(bounds[0][0], bounds[0][1]): for yex in range(bounds[1][0], bounds[1][1]): for zex in range(bounds[2][0], bounds[2][1]): diff --git a/yt/testing.py b/yt/testing.py index 94abdceda26..d2d9d1bc286 100644 --- a/yt/testing.py +++ b/yt/testing.py @@ -1223,3 +1223,49 @@ def setUp(self): def tearDown(self): os.chdir(self.curdir) shutil.rmtree(self.tmpdir) + +class ParticleSelectionComparison: + """ + This is a test helper class that takes a particle dataset, caches the + particles it has on disk (manually reading them using lower-level IO + routines) and then received a data object that it compares against manually + running the data object's selection routines. All supplied data objects + must be created from the input dataset. + """ + + def __init__(self, ds): + self.ds = ds + # Construct an index so that we get all the data_files + ds.index + particles = {} + # hsml is the smoothing length we use for radial selection + hsml = {} + for data_file in ds.index.data_files: + for ptype, pos_arr in ds.index.io._yield_coordinates(data_file): + particles.setdefault(ptype, []).append(pos_arr) + if ptype in getattr(ds, '_sph_ptypes', ()): + hsml.setdefault(ptype, []).append(ds.index.io._get_smoothing_length( + data_file, pos_arr.dtype, pos_arr.shape)) + for ptype in particles: + particles[ptype] = np.concatenate(particles[ptype]) + if ptype in hsml: + hsml[ptype] = np.concatenate(hsml[ptype]) + self.particles = particles + self.hsml = hsml + + def compare_dobj_selection(self, dobj): + for ptype in sorted(self.particles): + x, y, z = self.particles[ptype].T + # Set our radii to zero for now, I guess? 
+ radii = self.hsml.get(ptype, 0.0) + sel_index = dobj.selector.select_points(x, y, z, radii) + if sel_index is None: + sel_pos = np.empty((0, 3)) + else: + sel_pos = self.particles[ptype][sel_index, :] + + obj_results = [] + for chunk in dobj.chunks([], "io"): + obj_results.append(chunk[ptype, "particle_position"]) + obj_results = np.concatenate(obj_results, axis = 0) + assert_equal(sel_pos, obj_results) From b20c3edbbc5b057077d12dc456974f797ce726f6 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 8 May 2020 07:11:34 -0500 Subject: [PATCH 15/42] remove unused import --- yt/frontends/gadget/tests/test_outputs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt/frontends/gadget/tests/test_outputs.py b/yt/frontends/gadget/tests/test_outputs.py index d7fd3fbdfc4..bdba0994e55 100644 --- a/yt/frontends/gadget/tests/test_outputs.py +++ b/yt/frontends/gadget/tests/test_outputs.py @@ -6,8 +6,7 @@ import yt from yt.testing import requires_file, \ - ParticleSelectionComparison, \ - assert_equal + ParticleSelectionComparison from yt.utilities.answer_testing.framework import \ data_dir_load, \ requires_ds, \ From 8cdf546033b93f1bd39111e56f3587d18a534ffd Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 8 May 2020 21:02:05 -0500 Subject: [PATCH 16/42] First, not-quite-working, pass at refined stuff. 
--- yt/geometry/particle_oct_container.pyx | 399 ++++++++++++++++--------- 1 file changed, 253 insertions(+), 146 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index f8639c399db..9808c6fb3f8 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -30,12 +30,14 @@ from yt.funcs import get_pbar from particle_deposit cimport gind from yt.utilities.lib.ewah_bool_array cimport \ - ewah_bool_array, ewah_bool_iterator + ewah_bool_array, ewah_bool_iterator, ewah_map #from yt.utilities.lib.ewah_bool_wrap cimport \ from ..utilities.lib.ewah_bool_wrap cimport BoolArrayCollection +from libcpp cimport bool from libcpp.map cimport map from libcpp.vector cimport vector from libcpp.pair cimport pair +from libcpp.unordered_set cimport unordered_set as uset from cython.operator cimport dereference, preincrement import struct import os @@ -53,6 +55,8 @@ from ..utilities.lib.ewah_bool_wrap cimport SparseUnorderedRefinedBitmaskSet as from ..utilities.lib.ewah_bool_wrap cimport BoolArrayCollectionUncompressed as BoolArrayColl from ..utilities.lib.ewah_bool_wrap cimport FileBitmasks +ctypedef map[np.uint64_t, vector[bool]] CoarseRefinedSets + cdef class ParticleOctreeContainer(OctreeContainer): cdef Oct** oct_list #The starting oct index of each domain @@ -551,8 +555,9 @@ cdef class ParticleBitmap: s_ppos[2] = ppos[2] + axiterv[2][zi] # OK, now we compute the left and right edges for this shift. 
for i in range(3): - bounds[i][0] = i64max(((s_ppos[i] - LE[i] - radius)/dds[i]), 0) - bounds[i][1] = i64min(((s_ppos[i] - LE[i] + radius)/dds[i]), mi_max) + 1 + # Note that we cast here to int64_t because this could be negative + bounds[i][0] = i64max(((s_ppos[i] - LE[i] - radius)/dds[i]), 0) + bounds[i][1] = i64min(((s_ppos[i] - LE[i] + radius)/dds[i]), mi_max) + 1 # We go to the upper bound plus one so that we have *inclusive* loops -- the upper bound # is the cell *index*, so we want to make sure we include that cell. This is also why # we don't need to worry about mi_max being the max index rather than the cell count. @@ -617,12 +622,16 @@ cdef class ParticleBitmap: cdef np.int64_t i, p cdef np.uint64_t mi1, mi2 cdef np.float64_t ppos[3] + cdef np.float64_t s_ppos[3] # shifted ppos cdef int skip, Nex + cdef np.uint64_t bounds[3][2] cdef np.float64_t LE[3] cdef np.float64_t RE[3] + cdef np.float64_t DW[3] cdef np.uint8_t PER[3] cdef np.float64_t dds1[3] cdef np.float64_t dds2[3] + cdef np.float64_t radius cdef np.uint64_t mi_split1[3] cdef np.uint64_t mi_split2[3] cdef np.uint64_t miex1, miex2, mi1_max, mi2_max @@ -640,8 +649,12 @@ cdef class ParticleBitmap: cdef np.ndarray[np.uint64_t, ndim=1] yex2_range = np.empty(7, 'uint64') cdef np.ndarray[np.uint64_t, ndim=1] zex2_range = np.empty(7, 'uint64') cdef np.int64_t msize = sub_mi1.shape[0] + cdef int axiter[3][2] + cdef np.float64_t axiterv[3][2] + cdef CoarseRefinedSets coarse_refined_map mi1_max = (1 << self.index_order1) - 1 mi2_max = (1 << self.index_order2) - 1 + cdef np.uint64_t max_mi2_elements = 1 << (3*self.index_order2) # Copy things from structure (type cast) for i in range(3): LE[i] = self.left_edge[i] @@ -649,10 +662,14 @@ cdef class ParticleBitmap: PER[i] = self.periodicity[i] dds1[i] = self.dds_mi1[i] dds2[i] = self.dds_mi2[i] + DW[i] = RE[i] - LE[i] + axiter[i][0] = 0 # We always do an offset of 0 + axiterv[i][0] = 0.0 # Loop over positions skipping those outside the domain for p in 
range(pos.shape[0]): skip = 0 for i in range(3): + axiter[i][1] = 999 if pos[p,i] >= RE[i] or pos[p,i] < LE[i]: skip = 1 break @@ -661,162 +678,252 @@ cdef class ParticleBitmap: # Only look if collision at coarse index mi1 = bounded_morton_split_dds(ppos[0], ppos[1], ppos[2], LE, dds1, mi_split1) - if mask[mi1] > 1: + if hsml is None: + if mask[mi1] <= 1: # only one thing in this area + continue # Determine sub index within cell of primary index - if nsub_mi >= msize: - raise IndexError("Refined index exceeded estimate.") mi2 = bounded_morton_split_relative_dds( ppos[0], ppos[1], ppos[2], LE, dds1, dds2, mi_split2) + if coarse_refined_map.count(mi1) == 0: + coarse_refined_map[mi1] = vector[bool](max_mi2_elements, False) + coarse_refined_map[mi1][mi2] = True + else: # only hit if we have smoothing lengths. + # We have to do essentially the identical process to in the coarse indexing, + # except here we need to fill in all the subranges as well as the coarse ranges + # Note that we are also doing the null case, where we do no shifting + radius = hsml[p] + for i in range(3): + if PER[i] and ppos[i] - radius < LE[i]: + axiter[i][1] = +1 + axiterv[i][1] = DW[i] + elif PER[i] and ppos[i] + radius > RE[i]: + axiter[i][1] = -1 + axiterv[i][1] = -DW[i] + for xi in range(2): + if axiter[0][xi] == 999: continue + s_ppos[0] = ppos[0] + axiterv[0][xi] + for yi in range(2): + if axiter[1][yi] == 999: continue + s_ppos[1] = ppos[1] + axiterv[1][yi] + for zi in range(2): + if axiter[2][zi] == 999: continue + s_ppos[2] = ppos[2] + axiterv[2][zi] + # OK, now we compute the left and right edges for this shift. 
+ for i in range(3): + # casting to int64 is not nice but is so we can have negative values we clip + bounds[i][0] = i64max(((s_ppos[i] - LE[i] - radius)/dds1[i]), 0) + bounds[i][1] = i64min(((s_ppos[i] - LE[i] + radius)/dds1[i]), mi1_max) + 1 + # We go to the upper bound plus one so that we have *inclusive* loops -- the upper bound + # is the cell *index*, so we want to make sure we include that cell. This is also why + # we don't need to worry about mi_max being the max index rather than the cell count. + for xex in range(bounds[0][0], bounds[0][1]): + for yex in range(bounds[1][0], bounds[1][1]): + for zex in range(bounds[2][0], bounds[2][1]): + miex = encode_morton_64bit(xex, yex, zex) + if mask[miex] <= 1: + continue + # Now we need to fill our sub-range + if coarse_refined_map.count(miex) == 0: + coarse_refined_map[miex] = vector[bool](max_mi2_elements, False) + self.__fill_refined_ranges(s_ppos, radius, LE, RE, + dds1, xex, yex, zex, + dds2, mi1_max, mi2_max, miex, + coarse_refined_map[miex]) + print("THIS MANY COARSE CELLS", coarse_refined_map.size()) + cdef np.uint64_t count + for it1 in coarse_refined_map: + mi1 = it1.first + count = 0 + for it2 in it1.second: + if it2 == True: + count += 1 sub_mi1[nsub_mi] = mi1 - sub_mi2[nsub_mi] = mi2 - nsub_mi += 1 - # Expand for smoothing - if hsml is not None: - Nex = 1 - for i in range(3): - Nex_min[i] = 0 - Nex_max[i] = 0 - rpos_min = ppos[i] - (dds2[i]*mi_split2[i] + dds1[i]*mi_split1[i] + LE[i]) - rpos_max = dds2[i] - rpos_min - if rpos_min > hsml[p]: - Nex_min[i] = ((rpos_min-hsml[p])/dds2[i]) + 1 - if rpos_max > hsml[p]: - Nex_max[i] = ((rpos_max-hsml[p])/dds2[i]) + 1 - Nex *= (Nex_max[i] + Nex_min[i] + 1) - if Nex > 1: - # Ensure that min/max values for x,y,z indexes are obeyed - if (Nex_max[0] + Nex_min[0] + 1) > xex1_range.shape[0]: - xex1_range = np.empty(Nex_max[0] + Nex_min[0] + 1, 'uint64') - xex2_range = np.empty(Nex_max[0] + Nex_min[0] + 1, 'uint64') - if (Nex_max[1] + Nex_min[1] + 1) > 
yex1_range.shape[0]: - yex1_range = np.empty(Nex_max[1] + Nex_min[1] + 1, 'uint64') - yex2_range = np.empty(Nex_max[1] + Nex_min[1] + 1, 'uint64') - if (Nex_max[2] + Nex_min[2] + 1) > zex1_range.shape[0]: - zex1_range = np.empty(Nex_max[2] + Nex_min[2] + 1, 'uint64') - zex2_range = np.empty(Nex_max[2] + Nex_min[2] + 1, 'uint64') - xex2_min = mi_split2[0] - min(Nex_min[0], mi_split2[0]) - xex2_max = mi_split2[0] + min(Nex_max[0], (mi2_max - mi_split2[0])) + 1 - yex2_min = mi_split2[1] - min(Nex_min[1], mi_split2[1]) - yex2_max = mi_split2[1] + min(Nex_max[1], (mi2_max - mi_split2[1])) + 1 - zex2_min = mi_split2[2] - min(Nex_min[2], mi_split2[2]) - zex2_max = mi_split2[2] + min(Nex_max[2], (mi2_max - mi_split2[2])) + 1 - ixe = iye = ize = 0 - for xex2 in range(xex2_min, xex2_max): - xex1_range[ixe] = mi_split1[0] + sub_mi2[nsub_mi] = it2 + #nsub_mi += 1 + print("IN ", mi1, "THIS MANY REFINED CELLS", count) + return nsub_mi + + if 0: + # Expand for smoothing + Nex = 1 + for i in range(3): + Nex_min[i] = 0 + Nex_max[i] = 0 + rpos_min = ppos[i] - (dds2[i]*mi_split2[i] + dds1[i]*mi_split1[i] + LE[i]) + rpos_max = dds2[i] - rpos_min + if rpos_min > hsml[p]: + Nex_min[i] = ((rpos_min-hsml[p])/dds2[i]) + 1 + if rpos_max > hsml[p]: + Nex_max[i] = ((rpos_max-hsml[p])/dds2[i]) + 1 + Nex *= (Nex_max[i] + Nex_min[i] + 1) + if Nex > 1: + # Ensure that min/max values for x,y,z indexes are obeyed + if (Nex_max[0] + Nex_min[0] + 1) > xex1_range.shape[0]: + xex1_range = np.empty(Nex_max[0] + Nex_min[0] + 1, 'uint64') + xex2_range = np.empty(Nex_max[0] + Nex_min[0] + 1, 'uint64') + if (Nex_max[1] + Nex_min[1] + 1) > yex1_range.shape[0]: + yex1_range = np.empty(Nex_max[1] + Nex_min[1] + 1, 'uint64') + yex2_range = np.empty(Nex_max[1] + Nex_min[1] + 1, 'uint64') + if (Nex_max[2] + Nex_min[2] + 1) > zex1_range.shape[0]: + zex1_range = np.empty(Nex_max[2] + Nex_min[2] + 1, 'uint64') + zex2_range = np.empty(Nex_max[2] + Nex_min[2] + 1, 'uint64') + xex2_min = mi_split2[0] - min(Nex_min[0], 
mi_split2[0]) + xex2_max = mi_split2[0] + min(Nex_max[0], (mi2_max - mi_split2[0])) + 1 + yex2_min = mi_split2[1] - min(Nex_min[1], mi_split2[1]) + yex2_max = mi_split2[1] + min(Nex_max[1], (mi2_max - mi_split2[1])) + 1 + zex2_min = mi_split2[2] - min(Nex_min[2], mi_split2[2]) + zex2_max = mi_split2[2] + min(Nex_max[2], (mi2_max - mi_split2[2])) + 1 + ixe = iye = ize = 0 + for xex2 in range(xex2_min, xex2_max): + xex1_range[ixe] = mi_split1[0] + xex2_range[ixe] = xex2 + ixe += 1 + for yex2 in range(yex2_min, yex2_max): + yex1_range[iye] = mi_split1[1] + yex2_range[iye] = yex2 + iye += 1 + for zex2 in range(zex2_min, zex2_max): + zex1_range[ize] = mi_split1[2] + zex2_range[ize] = zex2 + ize += 1 + # Expand to adjacent coarse cells, wrapping periodically + # if need be + # x + if Nex_min[0] > mi_split2[0]: + if mi_split1[0] > 0: + for xex2 in range(mi2_max + 1 - (Nex_min[0] - mi_split2[0]), mi2_max + 1): + xex1_range[ixe] = mi_split1[0] - 1 + xex2_range[ixe] = xex2 + ixe += 1 + elif PER[0]: + for xex2 in range(mi2_max + 1 - (Nex_min[0] - mi_split2[0]), mi2_max + 1): + xex1_range[ixe] = mi1_max + xex2_range[ixe] = xex2 + ixe += 1 + if Nex_max[0] > (mi2_max-mi_split2[0]): + if mi_split1[0] < mi1_max: + for xex2 in range(0, Nex_max[0] - (mi2_max-mi_split2[0])): + xex1_range[ixe] = mi_split1[0] + 1 + xex2_range[ixe] = xex2 + ixe += 1 + elif PER[0]: + for xex2 in range(0, Nex_max[0] - (mi2_max-mi_split2[0])): + xex1_range[ixe] = 0 xex2_range[ixe] = xex2 ixe += 1 - for yex2 in range(yex2_min, yex2_max): - yex1_range[iye] = mi_split1[1] + # y + if Nex_min[1] > mi_split2[1]: + if mi_split1[1] > 0: + for yex2 in range(mi2_max + 1 - (Nex_min[1] - mi_split2[1]), mi2_max + 1): + yex1_range[iye] = mi_split1[1] - 1 + yex2_range[iye] = yex2 + iye += 1 + elif PER[1]: + for yex2 in range(mi2_max + 1 - (Nex_min[1] - mi_split2[1]), mi2_max + 1): + yex1_range[iye] = mi1_max yex2_range[iye] = yex2 iye += 1 - for zex2 in range(zex2_min, zex2_max): - zex1_range[ize] = mi_split1[2] + if 
Nex_max[1] > (mi2_max-mi_split2[1]): + if mi_split1[1] < mi1_max: + for yex2 in range(0, Nex_max[1] - (mi2_max-mi_split2[1])): + yex1_range[iye] = mi_split1[1] + 1 + yex2_range[iye] = yex2 + iye += 1 + elif PER[1]: + for yex2 in range(0, Nex_max[1] - (mi2_max-mi_split2[1])): + yex1_range[iye] = 0 + yex2_range[iye] = yex2 + iye += 1 + # z + if Nex_min[2] > mi_split2[2]: + if mi_split1[2] > 0: + for zex2 in range(mi2_max + 1 - (Nex_min[2] - mi_split2[2]), mi2_max + 1): + zex1_range[ize] = mi_split1[2] - 1 zex2_range[ize] = zex2 ize += 1 - # Expand to adjacent coarse cells, wrapping periodically - # if need be - # x - if Nex_min[0] > mi_split2[0]: - if mi_split1[0] > 0: - for xex2 in range(mi2_max + 1 - (Nex_min[0] - mi_split2[0]), mi2_max + 1): - xex1_range[ixe] = mi_split1[0] - 1 - xex2_range[ixe] = xex2 - ixe += 1 - elif PER[0]: - for xex2 in range(mi2_max + 1 - (Nex_min[0] - mi_split2[0]), mi2_max + 1): - xex1_range[ixe] = mi1_max - xex2_range[ixe] = xex2 - ixe += 1 - if Nex_max[0] > (mi2_max-mi_split2[0]): - if mi_split1[0] < mi1_max: - for xex2 in range(0, Nex_max[0] - (mi2_max-mi_split2[0])): - xex1_range[ixe] = mi_split1[0] + 1 - xex2_range[ixe] = xex2 - ixe += 1 - elif PER[0]: - for xex2 in range(0, Nex_max[0] - (mi2_max-mi_split2[0])): - xex1_range[ixe] = 0 - xex2_range[ixe] = xex2 - ixe += 1 - # y - if Nex_min[1] > mi_split2[1]: - if mi_split1[1] > 0: - for yex2 in range(mi2_max + 1 - (Nex_min[1] - mi_split2[1]), mi2_max + 1): - yex1_range[iye] = mi_split1[1] - 1 - yex2_range[iye] = yex2 - iye += 1 - elif PER[1]: - for yex2 in range(mi2_max + 1 - (Nex_min[1] - mi_split2[1]), mi2_max + 1): - yex1_range[iye] = mi1_max - yex2_range[iye] = yex2 - iye += 1 - if Nex_max[1] > (mi2_max-mi_split2[1]): - if mi_split1[1] < mi1_max: - for yex2 in range(0, Nex_max[1] - (mi2_max-mi_split2[1])): - yex1_range[iye] = mi_split1[1] + 1 - yex2_range[iye] = yex2 - iye += 1 - elif PER[1]: - for yex2 in range(0, Nex_max[1] - (mi2_max-mi_split2[1])): - yex1_range[iye] = 0 - 
yex2_range[iye] = yex2 - iye += 1 - # z - if Nex_min[2] > mi_split2[2]: - if mi_split1[2] > 0: - for zex2 in range(mi2_max + 1 - (Nex_min[2] - mi_split2[2]), mi2_max + 1): - zex1_range[ize] = mi_split1[2] - 1 - zex2_range[ize] = zex2 - ize += 1 - elif PER[2]: - for zex2 in range(mi2_max + 1 - (Nex_min[2] - mi_split2[2]), mi2_max + 1): - zex1_range[ize] = mi1_max - zex2_range[ize] = zex2 - ize += 1 - if Nex_max[2] > (mi2_max-mi_split2[2]): - if mi_split1[2] < mi1_max: - for zex2 in range(0, Nex_max[2] - (mi2_max-mi_split2[2])): - zex1_range[ize] = mi_split1[2] + 1 - zex2_range[ize] = zex2 - ize += 1 - elif PER[2]: - for zex2 in range(0, Nex_max[2] - (mi2_max-mi_split2[2])): - zex1_range[ize] = 0 - zex2_range[ize] = zex2 - ize += 1 - for ix in range(ixe): - xex1 = xex1_range[ix] - xex2 = xex2_range[ix] - for iy in range(iye): - yex1 = yex1_range[iy] - yex2 = yex2_range[iy] - for iz in range(ize): - zex1 = zex1_range[iz] - zex2 = zex2_range[iz] - if (xex1 == mi_split1[0] and xex2 == mi_split2[0] and - yex1 == mi_split1[1] and yex2 == mi_split2[1] and - zex1 == mi_split1[2] and zex2 == mi_split2[2]): - continue - miex1 = encode_morton_64bit(xex1, yex1, zex1) - miex2 = encode_morton_64bit(xex2, yex2, zex2) - if nsub_mi >= msize: - # Uncomment these lines to allow periodic - # caching of refined indices - # self.bitmasks._set_refined_index_array( - # file_id, nsub_mi, sub_mi1, sub_mi2) - # nsub_mi = 0 - raise IndexError( - "Refined index exceeded original " - "estimate.\n" - "nsub_mi = %s, " - "sub_mi1.shape[0] = %s" - % (nsub_mi, sub_mi1.shape[0])) - sub_mi1[nsub_mi] = miex1 - sub_mi2[nsub_mi] = miex2 - nsub_mi += 1 + elif PER[2]: + for zex2 in range(mi2_max + 1 - (Nex_min[2] - mi_split2[2]), mi2_max + 1): + zex1_range[ize] = mi1_max + zex2_range[ize] = zex2 + ize += 1 + if Nex_max[2] > (mi2_max-mi_split2[2]): + if mi_split1[2] < mi1_max: + for zex2 in range(0, Nex_max[2] - (mi2_max-mi_split2[2])): + zex1_range[ize] = mi_split1[2] + 1 + zex2_range[ize] = zex2 + ize += 1 
+ elif PER[2]: + for zex2 in range(0, Nex_max[2] - (mi2_max-mi_split2[2])): + zex1_range[ize] = 0 + zex2_range[ize] = zex2 + ize += 1 + for ix in range(ixe): + xex1 = xex1_range[ix] + xex2 = xex2_range[ix] + for iy in range(iye): + yex1 = yex1_range[iy] + yex2 = yex2_range[iy] + for iz in range(ize): + zex1 = zex1_range[iz] + zex2 = zex2_range[iz] + if (xex1 == mi_split1[0] and xex2 == mi_split2[0] and + yex1 == mi_split1[1] and yex2 == mi_split2[1] and + zex1 == mi_split1[2] and zex2 == mi_split2[2]): + continue + miex1 = encode_morton_64bit(xex1, yex1, zex1) + miex2 = encode_morton_64bit(xex2, yex2, zex2) + if nsub_mi >= msize: + # Uncomment these lines to allow periodic + # caching of refined indices + # self.bitmasks._set_refined_index_array( + # file_id, nsub_mi, sub_mi1, sub_mi2) + # nsub_mi = 0 + raise IndexError( + "Refined index exceeded original " + "estimate.\n" + "nsub_mi = %s, " + "sub_mi1.shape[0] = %s" + % (nsub_mi, sub_mi1.shape[0])) + sub_mi1[nsub_mi] = miex1 + sub_mi2[nsub_mi] = miex2 + nsub_mi += 1 # Only subs of particles in the mask return nsub_mi + cdef np.uint64_t __fill_refined_ranges(self, np.float64_t s_ppos[3], np.float64_t radius, + np.float64_t LE[3], np.float64_t RE[3], + np.float64_t dds1[3], np.uint64_t xex, np.uint64_t yex, np.uint64_t zex, + np.float64_t dds2[3], + np.uint64_t mi1_max, np.uint64_t mi2_max, np.uint64_t miex1, + vector[bool] &refined_set) except *: + cdef int i + cdef np.uint64_t new_nsub = 0 + cdef np.uint64_t bounds_l[3], bounds_r[3] + cdef np.uint64_t miex2, mi2 + cdef np.float64_t clip_pos_l[3], clip_pos_r[3], cell_edge_l, cell_edge_r + cdef np.uint64_t ex1[3] + ex1[0] = xex; ex1[1] = yex; ex1[2] = zex + for i in range(3): + # Figure out our bounds inside our cell + cell_edge_l = ex1[i] * dds1[i] + LE[i] + cell_edge_r = (ex1[i] + 1) * dds1[i] + LE[i] + clip_pos_l[i] = fmax(s_ppos[i] - radius, cell_edge_l + dds2[i]/2.0) + clip_pos_r[i] = fmin(s_ppos[i] + radius, cell_edge_r - dds2[i]/2.0) + mi2 = 
bounded_morton_split_relative_dds(clip_pos_l[0], clip_pos_l[1], clip_pos_l[2], + LE, dds1, dds2, bounds_l) + mi2 = bounded_morton_split_relative_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], + LE, dds1, dds2, bounds_r) + if bounds_r[0] < bounds_l[0] or bounds_r[1] < bounds_l[1] or bounds_r[2] < bounds_l[2]: + print(bounds_r[0] - bounds_l[0], bounds_r[1] - bounds_l[1], bounds_r[2] - bounds_l[2]) + raise RuntimeError + for xex2 in range(bounds_l[0], bounds_r[0] + 1): + for yex2 in range(bounds_l[1], bounds_r[1] + 1): + for zex2 in range(bounds_l[2], bounds_r[2] + 1): + miex2 = encode_morton_64bit(xex2, yex2, zex2) + refined_set[miex2] = True + new_nsub += 1 + return new_nsub + @cython.boundscheck(False) @cython.wraparound(False) @cython.cdivision(True) From af135de3df1bae34b7b0d43e801862fdf3627d87 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Wed, 13 May 2020 15:09:48 -0500 Subject: [PATCH 17/42] Temporary commit, still not working --- yt/geometry/particle_oct_container.pyx | 39 ++++++++++++++++++-------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 9808c6fb3f8..9962ba844c7 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -684,8 +684,8 @@ cdef class ParticleBitmap: # Determine sub index within cell of primary index mi2 = bounded_morton_split_relative_dds( ppos[0], ppos[1], ppos[2], LE, dds1, dds2, mi_split2) - if coarse_refined_map.count(mi1) == 0: - coarse_refined_map[mi1] = vector[bool](max_mi2_elements, False) + if coarse_refined_map[mi1].size() == 0: + coarse_refined_map[mi1].resize(max_mi2_elements, False) coarse_refined_map[mi1][mi2] = True else: # only hit if we have smoothing lengths. 
# We have to do essentially the identical process to in the coarse indexing, @@ -723,12 +723,13 @@ cdef class ParticleBitmap: if mask[miex] <= 1: continue # Now we need to fill our sub-range - if coarse_refined_map.count(miex) == 0: - coarse_refined_map[miex] = vector[bool](max_mi2_elements, False) + if coarse_refined_map[miex].size() == 0: + coarse_refined_map[miex].resize(max_mi2_elements, False) self.__fill_refined_ranges(s_ppos, radius, LE, RE, dds1, xex, yex, zex, dds2, mi1_max, mi2_max, miex, - coarse_refined_map[miex]) + coarse_refined_map[miex], ppos, mask[miex], + max_mi2_elements) print("THIS MANY COARSE CELLS", coarse_refined_map.size()) cdef np.uint64_t count for it1 in coarse_refined_map: @@ -740,7 +741,7 @@ cdef class ParticleBitmap: sub_mi1[nsub_mi] = mi1 sub_mi2[nsub_mi] = it2 #nsub_mi += 1 - print("IN ", mi1, "THIS MANY REFINED CELLS", count) + #print("IN ", mi1, "THIS MANY REFINED CELLS", count) return nsub_mi if 0: @@ -890,12 +891,17 @@ cdef class ParticleBitmap: # Only subs of particles in the mask return nsub_mi - cdef np.uint64_t __fill_refined_ranges(self, np.float64_t s_ppos[3], np.float64_t radius, + @cython.boundscheck(False) + @cython.wraparound(False) + @cython.cdivision(True) + @cython.initializedcheck(False) + cdef np.int64_t __fill_refined_ranges(self, np.float64_t s_ppos[3], np.float64_t radius, np.float64_t LE[3], np.float64_t RE[3], np.float64_t dds1[3], np.uint64_t xex, np.uint64_t yex, np.uint64_t zex, np.float64_t dds2[3], np.uint64_t mi1_max, np.uint64_t mi2_max, np.uint64_t miex1, - vector[bool] &refined_set) except *: + vector[bool] &refined_set, np.float64_t ppos[3], np.uint64_t mcount, + np.uint64_t max_mi2_elements) except -1: cdef int i cdef np.uint64_t new_nsub = 0 cdef np.uint64_t bounds_l[3], bounds_r[3] @@ -903,19 +909,30 @@ cdef class ParticleBitmap: cdef np.float64_t clip_pos_l[3], clip_pos_r[3], cell_edge_l, cell_edge_r cdef np.uint64_t ex1[3] ex1[0] = xex; ex1[1] = yex; ex1[2] = zex + # Check a few special cases 
for i in range(3): - # Figure out our bounds inside our cell + # Figure out our bounds inside our coarse cell, in the space of the + # full domain cell_edge_l = ex1[i] * dds1[i] + LE[i] - cell_edge_r = (ex1[i] + 1) * dds1[i] + LE[i] + cell_edge_r = cell_edge_l + dds1[i] clip_pos_l[i] = fmax(s_ppos[i] - radius, cell_edge_l + dds2[i]/2.0) clip_pos_r[i] = fmin(s_ppos[i] + radius, cell_edge_r - dds2[i]/2.0) mi2 = bounded_morton_split_relative_dds(clip_pos_l[0], clip_pos_l[1], clip_pos_l[2], LE, dds1, dds2, bounds_l) mi2 = bounded_morton_split_relative_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], LE, dds1, dds2, bounds_r) + if bounds_l[0] == bounds_r[0] and bounds_l[1] == bounds_r[1] and bounds_l[2] == bounds_r[2]: + miex2 = encode_morton_64bit(bounds_l[0], bounds_l[1], bounds_l[2]) + refined_set[miex2] = True + return 1 if bounds_r[0] < bounds_l[0] or bounds_r[1] < bounds_l[1] or bounds_r[2] < bounds_l[2]: print(bounds_r[0] - bounds_l[0], bounds_r[1] - bounds_l[1], bounds_r[2] - bounds_l[2]) - raise RuntimeError + return -1 + if (bounds_l[0] == bounds_l[1] == bounds_l[2] == 0) and \ + (bounds_r[0] == bounds_r[1] == bounds_r[2] == mi2_max): + for miex2 in range(max_mi2_elements): + refined_set[miex2] = True + return max_mi2_elements for xex2 in range(bounds_l[0], bounds_r[0] + 1): for yex2 in range(bounds_l[1], bounds_r[1] + 1): for zex2 in range(bounds_l[2], bounds_r[2] + 1): From e626e259890309eedaae44131e6543e48817e5d0 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 15 May 2020 13:49:26 -0500 Subject: [PATCH 18/42] another pass --- yt/geometry/particle_oct_container.pyx | 65 +++++++++++++------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 9962ba844c7..bb981b342b7 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -624,7 +624,7 @@ cdef class ParticleBitmap: cdef np.float64_t ppos[3] cdef np.float64_t 
s_ppos[3] # shifted ppos cdef int skip, Nex - cdef np.uint64_t bounds[3][2] + cdef np.uint64_t bounds[2][3] cdef np.float64_t LE[3] cdef np.float64_t RE[3] cdef np.float64_t DW[3] @@ -648,10 +648,12 @@ cdef class ParticleBitmap: cdef np.ndarray[np.uint64_t, ndim=1] xex2_range = np.empty(7, 'uint64') cdef np.ndarray[np.uint64_t, ndim=1] yex2_range = np.empty(7, 'uint64') cdef np.ndarray[np.uint64_t, ndim=1] zex2_range = np.empty(7, 'uint64') + cdef np.float64_t clip_pos_l[3], clip_pos_r[3] cdef np.int64_t msize = sub_mi1.shape[0] cdef int axiter[3][2] cdef np.float64_t axiterv[3][2] cdef CoarseRefinedSets coarse_refined_map + cdef np.uint64_t nset = 0 mi1_max = (1 << self.index_order1) - 1 mi2_max = (1 << self.index_order2) - 1 cdef np.uint64_t max_mi2_elements = 1 << (3*self.index_order2) @@ -692,6 +694,8 @@ cdef class ParticleBitmap: # except here we need to fill in all the subranges as well as the coarse ranges # Note that we are also doing the null case, where we do no shifting radius = hsml[p] + if mask[mi1] <= 1: # only one thing in this area + continue for i in range(3): if PER[i] and ppos[i] - radius < LE[i]: axiter[i][1] = +1 @@ -711,37 +715,43 @@ cdef class ParticleBitmap: # OK, now we compute the left and right edges for this shift. for i in range(3): # casting to int64 is not nice but is so we can have negative values we clip - bounds[i][0] = i64max(((s_ppos[i] - LE[i] - radius)/dds1[i]), 0) - bounds[i][1] = i64min(((s_ppos[i] - LE[i] + radius)/dds1[i]), mi1_max) + 1 + clip_pos_l[i] = fmax(s_ppos[i] - radius, LE[i] + dds1[i]/2) + clip_pos_r[i] = fmin(s_ppos[i] + radius, RE[i] - dds1[i]/2) + bounded_morton_split_dds(clip_pos_l[0], clip_pos_l[1], clip_pos_l[2], LE, dds1, bounds[0]) + bounded_morton_split_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], LE, dds1, bounds[1]) # We go to the upper bound plus one so that we have *inclusive* loops -- the upper bound # is the cell *index*, so we want to make sure we include that cell. 
This is also why # we don't need to worry about mi_max being the max index rather than the cell count. - for xex in range(bounds[0][0], bounds[0][1]): - for yex in range(bounds[1][0], bounds[1][1]): - for zex in range(bounds[2][0], bounds[2][1]): + for xex in range(bounds[0][0], bounds[1][0] + 1): + for yex in range(bounds[0][1], bounds[1][1] + 1): + for zex in range(bounds[0][2], bounds[1][2] + 1): miex = encode_morton_64bit(xex, yex, zex) if mask[miex] <= 1: continue # Now we need to fill our sub-range if coarse_refined_map[miex].size() == 0: coarse_refined_map[miex].resize(max_mi2_elements, False) - self.__fill_refined_ranges(s_ppos, radius, LE, RE, + nset += self.__fill_refined_ranges(s_ppos, radius, LE, RE, dds1, xex, yex, zex, dds2, mi1_max, mi2_max, miex, coarse_refined_map[miex], ppos, mask[miex], max_mi2_elements) print("THIS MANY COARSE CELLS", coarse_refined_map.size()) - cdef np.uint64_t count + cdef np.uint64_t count, vec_i + cdef total_count = 0 for it1 in coarse_refined_map: mi1 = it1.first count = 0 + vec_i = 0 for it2 in it1.second: if it2 == True: count += 1 - sub_mi1[nsub_mi] = mi1 - sub_mi2[nsub_mi] = it2 - #nsub_mi += 1 + #sub_mi1[nsub_mi] = mi1 + #sub_mi2[nsub_mi] = vec_i + nsub_mi += 1 + vec_i += 1 #print("IN ", mi1, "THIS MANY REFINED CELLS", count) + total_count += count return nsub_mi if 0: @@ -905,9 +915,9 @@ cdef class ParticleBitmap: cdef int i cdef np.uint64_t new_nsub = 0 cdef np.uint64_t bounds_l[3], bounds_r[3] - cdef np.uint64_t miex2, mi2 + cdef np.uint64_t miex2, mi2, miex2_min, miex2_max cdef np.float64_t clip_pos_l[3], clip_pos_r[3], cell_edge_l, cell_edge_r - cdef np.uint64_t ex1[3] + cdef np.uint64_t ex1[3], ex2[3] ex1[0] = xex; ex1[1] = yex; ex1[2] = zex # Check a few special cases for i in range(3): @@ -917,28 +927,19 @@ cdef class ParticleBitmap: cell_edge_r = cell_edge_l + dds1[i] clip_pos_l[i] = fmax(s_ppos[i] - radius, cell_edge_l + dds2[i]/2.0) clip_pos_r[i] = fmin(s_ppos[i] + radius, cell_edge_r - dds2[i]/2.0) - mi2 = 
bounded_morton_split_relative_dds(clip_pos_l[0], clip_pos_l[1], clip_pos_l[2], + miex2_min = bounded_morton_split_relative_dds(clip_pos_l[0], clip_pos_l[1], clip_pos_l[2], LE, dds1, dds2, bounds_l) - mi2 = bounded_morton_split_relative_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], + miex2_max = bounded_morton_split_relative_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], LE, dds1, dds2, bounds_r) - if bounds_l[0] == bounds_r[0] and bounds_l[1] == bounds_r[1] and bounds_l[2] == bounds_r[2]: - miex2 = encode_morton_64bit(bounds_l[0], bounds_l[1], bounds_l[2]) + for miex2 in range(miex2_min, miex2_max + 1): + #miex2 = encode_morton_64bit(xex2, yex2, zex2) + decode_morton_64bit(miex2, ex2) + if ex2[0] < bounds_l[0] or ex2[0] > bounds_r[0] or \ + ex2[1] < bounds_l[1] or ex2[1] > bounds_r[1] or \ + ex2[2] < bounds_l[2] or ex2[2] > bounds_r[2]: + continue refined_set[miex2] = True - return 1 - if bounds_r[0] < bounds_l[0] or bounds_r[1] < bounds_l[1] or bounds_r[2] < bounds_l[2]: - print(bounds_r[0] - bounds_l[0], bounds_r[1] - bounds_l[1], bounds_r[2] - bounds_l[2]) - return -1 - if (bounds_l[0] == bounds_l[1] == bounds_l[2] == 0) and \ - (bounds_r[0] == bounds_r[1] == bounds_r[2] == mi2_max): - for miex2 in range(max_mi2_elements): - refined_set[miex2] = True - return max_mi2_elements - for xex2 in range(bounds_l[0], bounds_r[0] + 1): - for yex2 in range(bounds_l[1], bounds_r[1] + 1): - for zex2 in range(bounds_l[2], bounds_r[2] + 1): - miex2 = encode_morton_64bit(xex2, yex2, zex2) - refined_set[miex2] = True - new_nsub += 1 + new_nsub += 1 return new_nsub @cython.boundscheck(False) From b1c0d74e382893f61c7700a22fcda4427d8fe8a6 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 15 May 2020 15:03:01 -0500 Subject: [PATCH 19/42] try to short circuit, and fix cython bugs --- yt/geometry/particle_oct_container.pyx | 33 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx 
b/yt/geometry/particle_oct_container.pyx index bb981b342b7..9e1bcaf4728 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -639,6 +639,7 @@ cdef class ParticleBitmap: cdef int Nex_max[3] cdef np.float64_t rpos_min, rpos_max cdef np.uint64_t xex2_min, xex2_max, yex2_min, yex2_max, zex2_min, zex2_max + cdef np.uint64_t xex, yex, zex cdef np.uint64_t xex1, yex1, zex1 cdef np.uint64_t xex2, yex2, zex2 cdef int ix, iy, iz, ixe, iye, ize @@ -653,6 +654,7 @@ cdef class ParticleBitmap: cdef int axiter[3][2] cdef np.float64_t axiterv[3][2] cdef CoarseRefinedSets coarse_refined_map + cdef map[np.uint64_t, np.uint64_t] refined_count cdef np.uint64_t nset = 0 mi1_max = (1 << self.index_order1) - 1 mi2_max = (1 << self.index_order2) - 1 @@ -688,7 +690,10 @@ cdef class ParticleBitmap: ppos[0], ppos[1], ppos[2], LE, dds1, dds2, mi_split2) if coarse_refined_map[mi1].size() == 0: coarse_refined_map[mi1].resize(max_mi2_elements, False) - coarse_refined_map[mi1][mi2] = True + refined_count[mi1] = 0 + if coarse_refined_map[mi1][mi2] == False: + coarse_refined_map[mi1][mi2] = True + refined_count[mi1] += 1 else: # only hit if we have smoothing lengths. 
# We have to do essentially the identical process to in the coarse indexing, # except here we need to fill in all the subranges as well as the coarse ranges @@ -725,18 +730,22 @@ cdef class ParticleBitmap: for xex in range(bounds[0][0], bounds[1][0] + 1): for yex in range(bounds[0][1], bounds[1][1] + 1): for zex in range(bounds[0][2], bounds[1][2] + 1): - miex = encode_morton_64bit(xex, yex, zex) - if mask[miex] <= 1: + miex1 = encode_morton_64bit(xex, yex, zex) + if mask[miex1] <= 1: continue # Now we need to fill our sub-range - if coarse_refined_map[miex].size() == 0: - coarse_refined_map[miex].resize(max_mi2_elements, False) - nset += self.__fill_refined_ranges(s_ppos, radius, LE, RE, + if coarse_refined_map[miex1].size() == 0: + coarse_refined_map[miex1].resize(max_mi2_elements, False) + refined_count[miex1] = 0 + if refined_count[miex1] >= max_mi2_elements: + continue + refined_count[miex1] += self.__fill_refined_ranges(s_ppos, radius, LE, RE, dds1, xex, yex, zex, - dds2, mi1_max, mi2_max, miex, - coarse_refined_map[miex], ppos, mask[miex], + dds2, mi1_max, mi2_max, miex1, + coarse_refined_map[miex1], ppos, mask[miex1], max_mi2_elements) print("THIS MANY COARSE CELLS", coarse_refined_map.size()) + print("THIS MANY NSET", nset, nset / pos.shape[0], nsub_mi) cdef np.uint64_t count, vec_i cdef total_count = 0 for it1 in coarse_refined_map: @@ -750,8 +759,11 @@ cdef class ParticleBitmap: #sub_mi2[nsub_mi] = vec_i nsub_mi += 1 vec_i += 1 + if count != refined_count[mi1]: + print("WHY IS THIS WRONG", count, refined_count[mi1]) #print("IN ", mi1, "THIS MANY REFINED CELLS", count) total_count += count + print("NSUB_MI NOW", total_count, total_count / (coarse_refined_map.size() * max_mi2_elements), nsub_mi, sub_mi1.shape[0], sub_mi2.shape[0]) return nsub_mi if 0: @@ -938,8 +950,9 @@ cdef class ParticleBitmap: ex2[1] < bounds_l[1] or ex2[1] > bounds_r[1] or \ ex2[2] < bounds_l[2] or ex2[2] > bounds_r[2]: continue - refined_set[miex2] = True - new_nsub += 1 + if 
refined_set[miex2] == False: + refined_set[miex2] = True + new_nsub += 1 return new_nsub @cython.boundscheck(False) From bc7121caef2c94a276dc580b74488f837b278c0a Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 19 May 2020 15:17:27 -0500 Subject: [PATCH 20/42] Use expanded morton for faster BIGMAX selection --- yt/geometry/particle_oct_container.pyx | 35 +++++++++++++++----------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 9e1bcaf4728..b8409d64b0d 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -747,23 +747,23 @@ cdef class ParticleBitmap: print("THIS MANY COARSE CELLS", coarse_refined_map.size()) print("THIS MANY NSET", nset, nset / pos.shape[0], nsub_mi) cdef np.uint64_t count, vec_i - cdef total_count = 0 + cdef np.uint64_t total_count = 0 for it1 in coarse_refined_map: mi1 = it1.first count = 0 vec_i = 0 - for it2 in it1.second: - if it2 == True: + for vec_i in range(it1.second.size()): + if it1.second[vec_i] == True: count += 1 #sub_mi1[nsub_mi] = mi1 #sub_mi2[nsub_mi] = vec_i nsub_mi += 1 - vec_i += 1 if count != refined_count[mi1]: print("WHY IS THIS WRONG", count, refined_count[mi1]) #print("IN ", mi1, "THIS MANY REFINED CELLS", count) total_count += count - print("NSUB_MI NOW", total_count, total_count / (coarse_refined_map.size() * max_mi2_elements), nsub_mi, sub_mi1.shape[0], sub_mi2.shape[0]) + if coarse_refined_map.size() > 0: + print("NSUB_MI NOW", total_count, total_count / (coarse_refined_map.size() * max_mi2_elements), nsub_mi, sub_mi1.shape[0], sub_mi2.shape[0]) return nsub_mi if 0: @@ -929,7 +929,8 @@ cdef class ParticleBitmap: cdef np.uint64_t bounds_l[3], bounds_r[3] cdef np.uint64_t miex2, mi2, miex2_min, miex2_max cdef np.float64_t clip_pos_l[3], clip_pos_r[3], cell_edge_l, cell_edge_r - cdef np.uint64_t ex1[3], ex2[3] + cdef np.uint64_t ex1[3], ex2[3], ex3[3] + cdef np.uint64_t 
xex_max, yex_max, zex_max ex1[0] = xex; ex1[1] = yex; ex1[2] = zex # Check a few special cases for i in range(3): @@ -943,16 +944,22 @@ cdef class ParticleBitmap: LE, dds1, dds2, bounds_l) miex2_max = bounded_morton_split_relative_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], LE, dds1, dds2, bounds_r) + xex_max = encode_morton_64bit(mi2_max, 0, 0) + yex_max = encode_morton_64bit(0, mi2_max, 0) + zex_max = encode_morton_64bit(0, 0, mi2_max) for miex2 in range(miex2_min, miex2_max + 1): #miex2 = encode_morton_64bit(xex2, yex2, zex2) - decode_morton_64bit(miex2, ex2) - if ex2[0] < bounds_l[0] or ex2[0] > bounds_r[0] or \ - ex2[1] < bounds_l[1] or ex2[1] > bounds_r[1] or \ - ex2[2] < bounds_l[2] or ex2[2] > bounds_r[2]: - continue - if refined_set[miex2] == False: - refined_set[miex2] = True - new_nsub += 1 + #decode_morton_64bit(miex2, ex2) + # Let's check all our cases here + if refined_set[miex2] == True: continue + if (miex2 & xex_max) < (miex2_min & xex_max): continue + if (miex2 & yex_max) < (miex2_min & yex_max): continue + if (miex2 & zex_max) < (miex2_min & zex_max): continue + if (miex2 & xex_max) > (miex2_max & xex_max): continue + if (miex2 & yex_max) > (miex2_max & yex_max): continue + if (miex2 & zex_max) > (miex2_max & zex_max): continue + refined_set[miex2] = True + new_nsub += 1 return new_nsub @cython.boundscheck(False) From 300fc4afd49379d2e1333d7381bbbcb4e103d48d Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 19 May 2020 15:12:53 -0500 Subject: [PATCH 21/42] Give up on hiding the C++ in EWAH --- yt/geometry/selection_routines.pxd | 2 - yt/geometry/selection_routines.pyx | 3 - yt/utilities/lib/ewah_bool_wrap.pxd | 37 +-- yt/utilities/lib/ewah_bool_wrap.pyx | 334 ++++++++-------------------- 4 files changed, 119 insertions(+), 257 deletions(-) diff --git a/yt/geometry/selection_routines.pxd b/yt/geometry/selection_routines.pxd index 81d02dbb6cd..8ad6c687d63 100644 --- a/yt/geometry/selection_routines.pxd +++ 
b/yt/geometry/selection_routines.pxd @@ -12,8 +12,6 @@ from oct_visitors cimport Oct, OctVisitor from oct_container cimport OctreeContainer from grid_visitors cimport GridTreeNode, GridVisitorData, \ grid_visitor_function, check_child_masked -from yt.utilities.lib.ewah_bool_wrap cimport \ - BoolArrayCollection from yt.utilities.lib.geometry_utils cimport decode_morton_64bit from yt.utilities.lib.fp_utils cimport _ensure_code diff --git a/yt/geometry/selection_routines.pyx b/yt/geometry/selection_routines.pyx index 694ceb9448a..f4ba17b2365 100644 --- a/yt/geometry/selection_routines.pyx +++ b/yt/geometry/selection_routines.pyx @@ -23,9 +23,6 @@ from yt.utilities.lib.volume_container cimport \ from yt.utilities.lib.grid_traversal cimport \ sampler_function, walk_volume from yt.utilities.lib.bitarray cimport ba_get_value, ba_set_value -from yt.utilities.lib.ewah_bool_wrap cimport BoolArrayCollection -# from yt.utilities.lib.ewah_bool_wrap cimport SparseUnorderedBitmaskSet #as SparseUnorderedBitmask -# from yt.utilities.lib.ewah_bool_wrap cimport SparseUnorderedRefinedBitmaskSet #as SparseUnorderedRefinedBitmask from yt.utilities.lib.geometry_utils cimport encode_morton_64bit, decode_morton_64bit, \ bounded_morton_dds, morton_neighbors_coarse, morton_neighbors_refined diff --git a/yt/utilities/lib/ewah_bool_wrap.pxd b/yt/utilities/lib/ewah_bool_wrap.pxd index aa239d299d8..589d56a028c 100644 --- a/yt/utilities/lib/ewah_bool_wrap.pxd +++ b/yt/utilities/lib/ewah_bool_wrap.pxd @@ -1,10 +1,19 @@ cimport numpy as np +from libcpp.vector cimport vector +from libcpp.set cimport set as cset +from libcpp.pair cimport pair + +from yt.utilities.lib.ewah_bool_array cimport \ + sstream, ewah_map, ewah_bool_array, ewah_bool_iterator + +ctypedef bint bitarrtype +ctypedef pair[np.uint64_t, np.uint64_t] ind_pair cdef class FileBitmasks: cdef np.uint32_t nfiles - cdef void** ewah_coll - cdef void** ewah_keys - cdef void** ewah_refn + cdef ewah_map** ewah_coll + cdef ewah_bool_array** 
ewah_keys + cdef ewah_bool_array** ewah_refn cdef void _reset(self) cdef bint _iseq(self, FileBitmasks solf) @@ -43,10 +52,10 @@ cdef class FileBitmasks: cdef bint _check(self) cdef class BoolArrayCollection: - cdef void* ewah_coll - cdef void* ewah_keys - cdef void* ewah_refn - cdef void* ewah_coar + cdef ewah_map* ewah_coll + cdef ewah_bool_array* ewah_keys + cdef ewah_bool_array* ewah_refn + cdef ewah_bool_array* ewah_coar cdef void _reset(self) cdef int _richcmp(self, BoolArrayCollection solf, int op) except -1 @@ -85,9 +94,9 @@ cdef class BoolArrayCollection: cdef class BoolArrayCollectionUncompressed: cdef int nele1 cdef int nele2 - cdef void* ewah_coll - cdef void* ewah_keys - cdef void* ewah_refn + cdef ewah_map* ewah_coll + cdef bitarrtype* ewah_keys + cdef bitarrtype* ewah_refn cdef void _set(self, np.uint64_t i1, np.uint64_t i2=*) cdef void _set_coarse(self, np.uint64_t i1) @@ -108,7 +117,7 @@ cdef class BoolArrayCollectionUncompressed: cdef void _compress(self, BoolArrayCollection solf) cdef class SparseUnorderedBitmaskSet: - cdef void* entries + cdef cset[np.uint64_t] entries cdef void _set(self, np.uint64_t ind) cdef void _fill(self, np.uint8_t[:] mask) cdef void _fill_ewah(self, BoolArrayCollection mm) @@ -118,7 +127,7 @@ cdef class SparseUnorderedBitmaskSet: cdef class SparseUnorderedBitmaskVector: cdef int total - cdef void* entries + cdef vector[np.uint64_t] entries cdef void _set(self, np.uint64_t ind) cdef void _fill(self, np.uint8_t[:] mask) cdef void _fill_ewah(self, BoolArrayCollection mm) @@ -129,7 +138,7 @@ cdef class SparseUnorderedBitmaskVector: cdef void _prune(self) cdef class SparseUnorderedRefinedBitmaskSet: - cdef void* entries + cdef cset[ind_pair] entries cdef void _set(self, np.uint64_t ind1, np.uint64_t ind2) cdef void _fill(self, np.uint8_t[:] mask1, np.uint8_t[:]) cdef void _fill_ewah(self, BoolArrayCollection mm) @@ -139,7 +148,7 @@ cdef class SparseUnorderedRefinedBitmaskSet: cdef class SparseUnorderedRefinedBitmaskVector: 
cdef int total - cdef void* entries + cdef vector[ind_pair] entries cdef void _set(self, np.uint64_t ind1, np.uint64_t ind2) cdef void _fill(self, np.uint8_t[:] mask1, np.uint8_t[:]) cdef void _fill_ewah(self, BoolArrayCollection mm) diff --git a/yt/utilities/lib/ewah_bool_wrap.pyx b/yt/utilities/lib/ewah_bool_wrap.pyx index b4ced2c87bd..f25a785386e 100644 --- a/yt/utilities/lib/ewah_bool_wrap.pyx +++ b/yt/utilities/lib/ewah_bool_wrap.pyx @@ -8,14 +8,9 @@ Wrapper for EWAH Bool Array: https://github.com/lemire/EWAHBoolArray import struct from libcpp.map cimport map as cmap -from libcpp.vector cimport vector -from libcpp.pair cimport pair -from libcpp.set cimport set as cset from libcpp.map cimport map from libcpp.algorithm cimport sort from libc.stdlib cimport malloc, free, qsort -from yt.utilities.lib.ewah_bool_array cimport \ - sstream, ewah_map, ewah_bool_array, ewah_bool_iterator from cython.operator cimport dereference, preincrement import numpy as np cimport numpy as np @@ -29,12 +24,6 @@ cdef extern from "" namespace "std" nogil: cdef np.uint64_t FLAG = ~(0) cdef np.uint64_t MAX_VECTOR_SIZE = 1e7 -DEF UncompressedFormat = 'Pointer' - -#ctypedef np.uint8_t bitarrtype -ctypedef bint bitarrtype - -ctypedef pair[np.uint64_t, np.uint64_t] ind_pair ctypedef cmap[np.uint64_t, ewah_bool_array] ewahmap ctypedef cmap[np.uint64_t, ewah_bool_array].iterator ewahmap_it ctypedef pair[np.uint64_t, ewah_bool_array] ewahmap_p @@ -44,26 +33,20 @@ cdef class FileBitmasks: def __cinit__(self, np.uint32_t nfiles): cdef int i self.nfiles = nfiles - cdef ewah_bool_array **ewah_keys = malloc(nfiles*sizeof(ewah_bool_array*)) - cdef ewah_bool_array **ewah_refn = malloc(nfiles*sizeof(ewah_bool_array*)) - cdef ewah_map **ewah_coll = malloc(nfiles*sizeof(ewah_map*)) + self.ewah_keys = malloc(nfiles*sizeof(ewah_bool_array*)) + self.ewah_refn = malloc(nfiles*sizeof(ewah_bool_array*)) + self.ewah_coll = malloc(nfiles*sizeof(ewah_map*)) for i in range(nfiles): - ewah_keys[i] = new 
ewah_bool_array() - ewah_refn[i] = new ewah_bool_array() - ewah_coll[i] = new ewah_map() - self.ewah_keys = ewah_keys - self.ewah_refn = ewah_refn - self.ewah_coll = ewah_coll + self.ewah_keys[i] = new ewah_bool_array() + self.ewah_refn[i] = new ewah_bool_array() + self.ewah_coll[i] = new ewah_map() cdef void _reset(self): - cdef ewah_bool_array **ewah_keys = self.ewah_keys - cdef ewah_bool_array **ewah_refn = self.ewah_refn - cdef ewah_map **ewah_coll = self.ewah_coll cdef np.int32_t ifile for ifile in range(self.nfiles): - ewah_keys[ifile][0].reset() - ewah_refn[ifile][0].reset() - ewah_coll[ifile][0].clear() + self.ewah_keys[ifile].reset() + self.ewah_refn[ifile].reset() + self.ewah_coll[ifile].clear() cdef bint _iseq(self, FileBitmasks solf): cdef np.int32_t ifile @@ -644,24 +627,16 @@ cdef class FileBitmasks: cdef class BoolArrayCollection: def __cinit__(self): - cdef ewah_bool_array *ewah_keys = new ewah_bool_array() - cdef ewah_bool_array *ewah_refn = new ewah_bool_array() - cdef ewah_bool_array *ewah_coar = new ewah_bool_array() - cdef ewah_map *ewah_coll = new ewah_map() - self.ewah_keys = ewah_keys - self.ewah_refn = ewah_refn - self.ewah_coar = ewah_coar - self.ewah_coll = ewah_coll + self.ewah_keys = new ewah_bool_array() + self.ewah_refn = new ewah_bool_array() + self.ewah_coar = new ewah_bool_array() + self.ewah_coll = new ewah_map() cdef void _reset(self): - cdef ewah_bool_array *ewah_keys = self.ewah_keys - cdef ewah_bool_array *ewah_refn = self.ewah_refn - cdef ewah_bool_array *ewah_coar = self.ewah_coar - cdef ewah_map *ewah_coll = self.ewah_coll - ewah_keys[0].reset() - ewah_refn[0].reset() - ewah_coar[0].reset() - ewah_coll[0].clear() + self.ewah_keys[0].reset() + self.ewah_refn[0].reset() + self.ewah_coar[0].reset() + self.ewah_coll[0].clear() cdef int _richcmp(self, BoolArrayCollection solf, int op) except -1: @@ -1322,25 +1297,13 @@ cdef class BoolArrayCollectionUncompressed: def __cinit__(self, np.uint64_t nele1, np.uint64_t nele2): 
self.nele1 = nele1 self.nele2 = nele2 - cdef ewah_map *ewah_coll = new ewah_map() - self.ewah_coll = ewah_coll + self.ewah_coll = new ewah_map() cdef np.uint64_t i - IF UncompressedFormat == 'MemoryView': - self.ewah_keys = malloc(sizeof(bitarrtype)*nele1) - self.ewah_refn = malloc(sizeof(bitarrtype)*nele1) - cdef bitarrtype[:] ewah_keys = self.ewah_keys - cdef bitarrtype[:] ewah_refn = self.ewah_refn - for i in range(nele1): - ewah_keys[i] = 0 - ewah_refn[i] = 0 - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys = malloc(sizeof(bitarrtype)*nele1) - cdef bitarrtype *ewah_refn = malloc(sizeof(bitarrtype)*nele1) - for i in range(nele1): - ewah_keys[i] = 0 - ewah_refn[i] = 0 - self.ewah_keys = ewah_keys - self.ewah_refn = ewah_refn + self.ewah_keys = malloc(sizeof(bitarrtype)*nele1) + self.ewah_refn = malloc(sizeof(bitarrtype)*nele1) + for i in range(nele1): + self.ewah_keys[i] = 0 + self.ewah_refn[i] = 0 def reset(self): self.__dealloc__() @@ -1350,12 +1313,8 @@ cdef class BoolArrayCollectionUncompressed: cdef np.uint64_t i cdef ewah_bool_array *ewah_keys = solf.ewah_keys cdef ewah_bool_array *ewah_refn = solf.ewah_refn - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] bool_keys = self.ewah_keys - cdef bitarrtype[:] bool_refn = self.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *bool_keys = self.ewah_keys - cdef bitarrtype *bool_refn = self.ewah_refn + cdef bitarrtype *bool_keys = self.ewah_keys + cdef bitarrtype *bool_refn = self.ewah_refn for i in range(self.nele1): if bool_keys[i] == 1: ewah_keys[0].set(i) @@ -1366,12 +1325,8 @@ cdef class BoolArrayCollectionUncompressed: ewah_coll2[0] = ewah_coll1[0] cdef void _set(self, np.uint64_t i1, np.uint64_t i2 = FLAG): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_keys = self.ewah_keys - cdef bitarrtype[:] ewah_refn = self.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys = self.ewah_keys - cdef bitarrtype *ewah_refn = 
self.ewah_refn + cdef bitarrtype *ewah_keys = self.ewah_keys + cdef bitarrtype *ewah_refn = self.ewah_refn cdef ewah_map *ewah_coll = self.ewah_coll ewah_keys[i1] = 1 # Note the 0 here, for dereferencing @@ -1380,17 +1335,11 @@ cdef class BoolArrayCollectionUncompressed: ewah_coll[0][i1].set(i2) cdef void _set_coarse(self, np.uint64_t i1): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_keys = self.ewah_keys - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys = self.ewah_keys + cdef bitarrtype *ewah_keys = self.ewah_keys ewah_keys[i1] = 1 cdef void _set_refined(self, np.uint64_t i1, np.uint64_t i2): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_refn = self.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_refn = self.ewah_refn + cdef bitarrtype *ewah_refn = self.ewah_refn cdef ewah_map *ewah_coll = self.ewah_coll # Note the 0 here, for dereferencing ewah_refn[i1] = 1 @@ -1401,10 +1350,7 @@ cdef class BoolArrayCollectionUncompressed: @cython.cdivision(True) @cython.initializedcheck(False) cdef void _set_coarse_array(self, np.uint8_t[:] arr): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_keys = self.ewah_keys - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys = self.ewah_keys + cdef bitarrtype *ewah_keys = self.ewah_keys cdef np.uint64_t i1 for i1 in range(arr.shape[0]): if arr[i1] == 1: @@ -1415,11 +1361,8 @@ cdef class BoolArrayCollectionUncompressed: @cython.cdivision(True) @cython.initializedcheck(False) cdef void _set_coarse_array_ptr(self, np.uint8_t *arr): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_keys = self.ewah_keys - ELIF UncompressedFormat == 'Pointer': - # TODO: memcpy? - cdef bitarrtype *ewah_keys = self.ewah_keys + # TODO: memcpy? 
+ cdef bitarrtype *ewah_keys = self.ewah_keys cdef np.uint64_t i1 for i1 in range(self.nele1): if arr[i1] == 1: @@ -1430,10 +1373,7 @@ cdef class BoolArrayCollectionUncompressed: @cython.cdivision(True) @cython.initializedcheck(False) cdef void _set_refined_array(self, np.uint64_t i1, np.uint8_t[:] arr): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_refn = self.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_refn = self.ewah_refn + cdef bitarrtype *ewah_refn = self.ewah_refn cdef ewah_map *ewah_coll = self.ewah_coll cdef np.uint64_t i2 for i2 in range(arr.shape[0]): @@ -1446,10 +1386,7 @@ cdef class BoolArrayCollectionUncompressed: @cython.cdivision(True) @cython.initializedcheck(False) cdef void _set_refined_array_ptr(self, np.uint64_t i1, np.uint8_t *arr): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_refn = self.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_refn = self.ewah_refn + cdef bitarrtype *ewah_refn = self.ewah_refn cdef ewah_map *ewah_coll = self.ewah_coll cdef np.uint64_t i2 cdef ewah_bool_array *barr = &ewah_coll[0][i1] @@ -1463,19 +1400,12 @@ cdef class BoolArrayCollectionUncompressed: ewah_coll[0][i1].set(i2) cdef void _set_refn(self, np.uint64_t i1): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_refn = self.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_refn = self.ewah_refn + cdef bitarrtype *ewah_refn = self.ewah_refn ewah_refn[i1] = 1 cdef bint _get(self, np.uint64_t i1, np.uint64_t i2 = FLAG): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_keys = self.ewah_keys - cdef bitarrtype[:] ewah_refn = self.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys = self.ewah_keys - cdef bitarrtype *ewah_refn = self.ewah_refn + cdef bitarrtype *ewah_keys = self.ewah_keys + cdef bitarrtype *ewah_refn = self.ewah_refn cdef ewah_map *ewah_coll = self.ewah_coll # Note the 0 
here, for dereferencing if ewah_keys[i1] == 0: return 0 @@ -1484,26 +1414,17 @@ cdef class BoolArrayCollectionUncompressed: return ewah_coll[0][i1].get(i2) cdef bint _get_coarse(self, np.uint64_t i1): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_keys = self.ewah_keys - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys = self.ewah_keys + cdef bitarrtype *ewah_keys = self.ewah_keys return ewah_keys[i1] # if (ewah_keys[i1] == 0): return 0 # return 1 cdef bint _isref(self, np.uint64_t i): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_refn = self.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_refn = self.ewah_refn + cdef bitarrtype *ewah_refn = self.ewah_refn return ewah_refn[i] cdef int _count_total(self): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_keys = self.ewah_keys - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys = self.ewah_keys + cdef bitarrtype *ewah_keys = self.ewah_keys cdef np.uint64_t i cdef int out = 0 for i in range(self.nele1): @@ -1511,10 +1432,7 @@ cdef class BoolArrayCollectionUncompressed: return out cdef int _count_refined(self): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_refn = self.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_refn = self.ewah_refn + cdef bitarrtype *ewah_refn = self.ewah_refn cdef np.uint64_t i cdef int out = 0 for i in range(self.nele1): @@ -1522,16 +1440,10 @@ cdef class BoolArrayCollectionUncompressed: return out cdef void _append(self, BoolArrayCollectionUncompressed solf): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_keys1 = self.ewah_keys - cdef bitarrtype[:] ewah_refn1 = self.ewah_refn - cdef bitarrtype[:] ewah_keys2 = solf.ewah_keys - cdef bitarrtype[:] ewah_refn2 = solf.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys1 = self.ewah_keys - cdef bitarrtype *ewah_refn1 = self.ewah_refn - cdef 
bitarrtype *ewah_keys2 = solf.ewah_keys - cdef bitarrtype *ewah_refn2 = solf.ewah_refn + cdef bitarrtype *ewah_keys1 = self.ewah_keys + cdef bitarrtype *ewah_refn1 = self.ewah_refn + cdef bitarrtype *ewah_keys2 = solf.ewah_keys + cdef bitarrtype *ewah_refn2 = solf.ewah_refn cdef ewahmap *ewah_coll1 = self.ewah_coll cdef ewahmap *ewah_coll2 = solf.ewah_coll cdef ewahmap_it it_map1, it_map2 @@ -1561,16 +1473,10 @@ cdef class BoolArrayCollectionUncompressed: preincrement(it_map2) cdef bint _intersects(self, BoolArrayCollectionUncompressed solf): - IF UncompressedFormat == 'MemoryView': - cdef bitarrtype[:] ewah_keys1 = self.ewah_keys - cdef bitarrtype[:] ewah_refn1 = self.ewah_refn - cdef bitarrtype[:] ewah_keys2 = solf.ewah_keys - cdef bitarrtype[:] ewah_refn2 = solf.ewah_refn - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys1 = self.ewah_keys - cdef bitarrtype *ewah_refn1 = self.ewah_refn - cdef bitarrtype *ewah_keys2 = solf.ewah_keys - cdef bitarrtype *ewah_refn2 = solf.ewah_refn + cdef bitarrtype *ewah_keys1 = self.ewah_keys + cdef bitarrtype *ewah_refn1 = self.ewah_refn + cdef bitarrtype *ewah_keys2 = solf.ewah_keys + cdef bitarrtype *ewah_refn2 = solf.ewah_refn cdef ewahmap *ewah_coll1 = self.ewah_coll cdef ewahmap *ewah_coll2 = solf.ewah_coll cdef ewahmap_it it_map1, it_map2 @@ -1602,14 +1508,10 @@ cdef class BoolArrayCollectionUncompressed: return 0 def __dealloc__(self): - IF UncompressedFormat == 'MemoryView': - free(self.ewah_keys) - free(self.ewah_refn) - ELIF UncompressedFormat == 'Pointer': - cdef bitarrtype *ewah_keys = self.ewah_keys - cdef bitarrtype *ewah_refn = self.ewah_refn - free(ewah_keys) - free(ewah_refn) + cdef bitarrtype *ewah_keys = self.ewah_keys + cdef bitarrtype *ewah_refn = self.ewah_refn + free(ewah_keys) + free(ewah_refn) cdef ewah_map *ewah_coll = self.ewah_coll del ewah_coll @@ -1626,13 +1528,10 @@ cdef class BoolArrayCollectionUncompressed: # Vector version cdef class SparseUnorderedBitmaskVector: def 
__cinit__(self): - cdef vector[np.uint64_t] *entries = new vector[np.uint64_t]() - self.entries = entries self.total = 0 cdef void _set(self, np.uint64_t ind): - cdef vector[np.uint64_t] *entries = self.entries - entries[0].push_back(ind) + self.entries.push_back(ind) self.total += 1 def set(self, ind): @@ -1640,47 +1539,41 @@ cdef class SparseUnorderedBitmaskVector: cdef void _fill(self, np.uint8_t[:] mask): cdef np.uint64_t i, ind - cdef vector[np.uint64_t] *entries = self.entries - for i in range(entries[0].size()): - ind = entries[0][i] + for i in range(self.entries.size()): + ind = self.entries[i] mask[ind] = 1 cdef void _fill_ewah(self, BoolArrayCollection mm): self._remove_duplicates() cdef np.uint64_t i, ind - cdef vector[np.uint64_t] *entries = self.entries - for i in range(entries[0].size()): - ind = entries[0][i] + for i in range(self.entries.size()): + ind = self.entries[i] mm._set_coarse(ind) cdef void _fill_bool(self, BoolArrayCollectionUncompressed mm): self._remove_duplicates() cdef np.uint64_t i, ind - cdef vector[np.uint64_t] *entries = self.entries - for i in range(entries[0].size()): - ind = entries[0][i] + for i in range(self.entries.size()): + ind = self.entries[i] mm._set_coarse(ind) cdef void _reset(self): - cdef vector[np.uint64_t] *entries = self.entries - entries[0].erase(entries[0].begin(), entries[0].end()) + self.entries.erase(self.entries.begin(), self.entries.end()) self.total = 0 cdef to_array(self): self._remove_duplicates() cdef np.ndarray[np.uint64_t, ndim=1] rv - cdef vector[np.uint64_t] *entries = self.entries - rv = np.empty(entries[0].size(), dtype='uint64') - for i in range(entries[0].size()): - rv[i] = entries[0][i] + rv = np.empty(self.entries.size(), dtype='uint64') + for i in range(self.entries.size()): + rv[i] = self.entries[i] return rv cdef void _remove_duplicates(self): - cdef vector[np.uint64_t] *entries = self.entries cdef vector[np.uint64_t].iterator last - sort(entries[0].begin(), entries[0].end()) - last = 
unique(entries[0].begin(), entries[0].end()) - entries[0].erase(last, entries[0].end()) + sort(self.entries.begin(), self.entries.end()) + last = unique(self.entries.begin(), self.entries.end()) + self.entries.erase(last, self.entries.end()) cdef void _prune(self): if self.total > MAX_VECTOR_SIZE: @@ -1688,50 +1581,39 @@ cdef class SparseUnorderedBitmaskVector: self.total = 0 def __dealloc__(self): - cdef vector[np.uint64_t] *entries = self.entries - del entries + self.entries.clear() # Set version cdef class SparseUnorderedBitmaskSet: - def __cinit__(self): - cdef cset[np.uint64_t] *entries = new cset[np.uint64_t]() - self.entries = entries - cdef void _set(self, np.uint64_t ind): - cdef cset[np.uint64_t] *entries = self.entries - entries[0].insert(ind) + self.entries.insert(ind) def set(self, ind): self._set(ind) cdef void _fill(self, np.uint8_t[:] mask): - cdef cset[np.uint64_t] *entries = self.entries - for it in entries[0]: + for it in self.entries: mask[it] = 1 cdef void _fill_ewah(self, BoolArrayCollection mm): - cdef cset[np.uint64_t] *entries = self.entries - for it in entries[0]: + for it in self.entries: mm._set_coarse(it) cdef void _fill_bool(self, BoolArrayCollectionUncompressed mm): - cdef cset[np.uint64_t] *entries = self.entries - for it in entries[0]: + for it in self.entries: mm._set_coarse(it) cdef void _reset(self): - cdef cset[np.uint64_t] *entries = self.entries - entries[0].clear() + self.entries.clear() cdef to_array(self): cdef np.uint64_t ind cdef np.ndarray[np.uint64_t, ndim=1] rv - cdef cset[np.uint64_t] *entries = self.entries cdef cset[np.uint64_t].iterator it - rv = np.empty(entries[0].size(), dtype='uint64') - it = entries[0].begin() + rv = np.empty(self.entries.size(), dtype='uint64') + it = self.entries.begin() i = 0 - while it != entries[0].end(): + while it != self.entries.end(): ind = dereference(it) rv[i] = ind preincrement(it) @@ -1739,69 +1621,58 @@ cdef class SparseUnorderedBitmaskSet: return rv def __dealloc__(self): - cdef 
cset[np.uint64_t] *entries = self.entries - del entries + self.entries.clear() # vector version cdef class SparseUnorderedRefinedBitmaskVector: def __cinit__(self): - cdef vector[ind_pair] *entries = new vector[ind_pair]() - self.entries = entries self.total = 0 cdef void _set(self, np.uint64_t ind1, np.uint64_t ind2): cdef ind_pair ind - cdef vector[ind_pair] *entries = self.entries ind.first = ind1 ind.second = ind2 - entries[0].push_back(ind) + self.entries.push_back(ind) self.total += 1 - def set(self, ind1, ind2): self._set(ind1, ind2) cdef void _fill(self, np.uint8_t[:] mask1, np.uint8_t[:] mask2): - cdef vector[ind_pair] *entries = self.entries - for it in entries[0]: + for it in self.entries: mask1[it.first] = mask2[it.second] = 1 cdef void _fill_ewah(self, BoolArrayCollection mm): self._remove_duplicates() - cdef vector[ind_pair] *entries = self.entries - for it in entries[0]: + for it in self.entries: mm._set_refined(it.first, it.second) cdef void _fill_bool(self, BoolArrayCollectionUncompressed mm): self._remove_duplicates() - cdef vector[ind_pair] *entries = self.entries - for it in entries[0]: + for it in self.entries: mm._set_refined(it.first, it.second) cdef void _reset(self): - cdef vector[ind_pair] *entries = self.entries - entries[0].erase(entries[0].begin(), entries[0].end()) + self.entries.erase(self.entries.begin(), self.entries.end()) self.total = 0 cdef to_array(self): cdef int i cdef np.ndarray[np.uint64_t, ndim=2] rv self._remove_duplicates() - cdef vector[ind_pair] *entries = self.entries - rv = np.empty((entries[0].size(),2),dtype='uint64') + rv = np.empty((self.entries.size(),2),dtype='uint64') i = 0 - for it in entries[0]: + for it in self.entries: rv[i,0] = it.first rv[i,1] = it.second i += 1 return rv cdef void _remove_duplicates(self): - cdef vector[ind_pair] *entries = self.entries cdef vector[ind_pair].iterator last - sort(entries[0].begin(), entries[0].end()) - last = unique(entries[0].begin(), entries[0].end()) - 
entries[0].erase(last, entries[0].end()) + sort(self.entries.begin(), self.entries.end()) + last = unique(self.entries.begin(), self.entries.end()) + self.entries.erase(last, self.entries.end()) # http://stackoverflow.com/questions/16970982/find-unique-rows-in-numpy-array # cdef np.ndarray[np.uint64_t, ndim=2] rv # cdef np.ndarray[np.uint64_t, ndim=2] rv_uni @@ -1830,57 +1701,44 @@ cdef class SparseUnorderedRefinedBitmaskVector: self.total = 0 def __dealloc__(self): - cdef vector[ind_pair] *entries = self.entries - del entries + self.entries.clear() # Set version cdef class SparseUnorderedRefinedBitmaskSet: - def __cinit__(self): - cdef cset[ind_pair] *entries = new cset[ind_pair]() - self.entries = entries - cdef void _set(self, np.uint64_t ind1, np.uint64_t ind2): cdef ind_pair ind - cdef cset[ind_pair] *entries = self.entries ind.first = ind1 ind.second = ind2 - entries[0].insert(ind) + self.entries.insert(ind) def set(self, ind1, ind2): self._set(ind1, ind2) cdef void _fill(self, np.uint8_t[:] mask1, np.uint8_t[:] mask2): - cdef cset[ind_pair] *entries = self.entries - for p in entries[0]: + for p in self.entries: mask1[p.first] = mask2[p.second] = 1 cdef void _fill_ewah(self, BoolArrayCollection mm): - cdef cset[ind_pair] *entries = self.entries - for it in entries[0]: + for it in self.entries: mm._set_refined(it.first, it.second) cdef void _fill_bool(self, BoolArrayCollectionUncompressed mm): - cdef cset[ind_pair] *entries = self.entries - for it in entries[0]: + for it in self.entries: mm._set_refined(it.first, it.second) cdef void _reset(self): - cdef cset[ind_pair] *entries = self.entries - entries[0].clear() + self.entries.clear() cdef to_array(self): cdef int i cdef np.ndarray[np.uint64_t, ndim=2] rv - cdef cset[ind_pair] *entries = self.entries - rv = np.empty((entries[0].size(),2),dtype='uint64') + rv = np.empty((self.entries.size(),2),dtype='uint64') i = 0 - for it in entries[0]: + for it in self.entries: rv[i,0] = it.first rv[i,1] = it.second i += 1 
return rv def __dealloc__(self): - cdef cset[ind_pair] *entries = self.entries - del entries - + self.entries.clear() From 2164393663e61547b10e413a5be70ed77d638a46 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Thu, 21 May 2020 15:50:03 -0500 Subject: [PATCH 22/42] Intermediate commit on way to working --- yt/geometry/particle_geometry_handler.py | 22 ++++- yt/geometry/particle_oct_container.pyx | 101 +++++++++++++++++------ yt/utilities/lib/ewah_bool_wrap.pyx | 12 +-- 3 files changed, 99 insertions(+), 36 deletions(-) diff --git a/yt/geometry/particle_geometry_handler.py b/yt/geometry/particle_geometry_handler.py index dd12a47e7af..ae2830ecde3 100644 --- a/yt/geometry/particle_geometry_handler.py +++ b/yt/geometry/particle_geometry_handler.py @@ -166,24 +166,42 @@ def _initialize_refined_index(self): for d in self.data_files) * 28 sub_mi1 = np.zeros(max_npart, "uint64") sub_mi2 = np.zeros(max_npart, "uint64") + mi1_dds = self.ds.domain_width.max() / (1 << self.regions.index_order1) + mi2_dds = mi1_dds / (1 << self.regions.index_order2) pb = get_pbar("Initializing refined index", len(self.data_files)) + count_threshold = getattr(self, '_index_count_threshold', + (1 << (3*self.regions.index_order2))/512) + total_refined = 0 + total_coarse_refined = ((mask >= 2) & (self.regions.particle_counts > count_threshold)).sum() + print("Total coarse refined zones: {} out of {} for {}%".format( + total_coarse_refined, mask.size, 100 * total_coarse_refined / mask.size)) for i, data_file in enumerate(self.data_files): pb.update(i) nsub_mi = 0 for ptype, pos in self.io._yield_coordinates(data_file): + print(i, ptype, pos.shape) + if pos.size == 0: continue if hasattr(self.ds, '_sph_ptypes') and ptype == self.ds._sph_ptypes[0]: hsml = self.io._get_smoothing_length( data_file, pos.dtype, pos.shape) + print("Has smoothing length: max coverage of %0.3e %0.3e and min coverage of %0.3e %0.3e" % ( + hsml.max() / mi1_dds, hsml.max() / mi2_dds, + hsml.min() / mi1_dds, hsml.min() / 
mi2_dds)) else: hsml = None + #hsml = None nsub_mi = self.regions._refined_index_data_file( pos, hsml, mask, sub_mi1, sub_mi2, - data_file.file_id, nsub_mi) + data_file.file_id, nsub_mi, count_threshold = count_threshold, + mask_threshold = 2) + total_refined += nsub_mi + continue self.regions._set_refined_index_data_file( sub_mi1, sub_mi2, data_file.file_id, nsub_mi) pb.finish() - self.regions.find_collisions_refined() + print("TOTAL REFINED", total_refined) + #self.regions.find_collisions_refined() def _detect_output_fields(self): # TODO: Add additional fields diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index b8409d64b0d..c12f334c0a6 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -12,6 +12,7 @@ from oct_container cimport OctreeContainer, Oct, OctInfo, ORDER_MAX, \ cimport oct_visitors from oct_visitors cimport cind, OctVisitor from libc.stdlib cimport malloc, free, qsort +from libc.string cimport memset from libc.math cimport floor, ceil, fmod from yt.utilities.lib.fp_utils cimport * from yt.utilities.lib.geometry_utils cimport bounded_morton, \ @@ -55,7 +56,7 @@ from ..utilities.lib.ewah_bool_wrap cimport SparseUnorderedRefinedBitmaskSet as from ..utilities.lib.ewah_bool_wrap cimport BoolArrayCollectionUncompressed as BoolArrayColl from ..utilities.lib.ewah_bool_wrap cimport FileBitmasks -ctypedef map[np.uint64_t, vector[bool]] CoarseRefinedSets +ctypedef map[np.uint64_t, np.uint8_t*] CoarseRefinedSets cdef class ParticleOctreeContainer(OctreeContainer): cdef Oct** oct_list @@ -414,6 +415,7 @@ cdef class ParticleBitmap: cdef public np.int32_t index_order1 cdef public np.int32_t index_order2 cdef public object masks + cdef public object particle_counts cdef public object counts cdef public object max_count cdef public object _last_selector @@ -458,6 +460,7 @@ cdef class ParticleBitmap: # by particles. # This is the simple way, for now. 
self.masks = np.zeros((1 << (index_order1 * 3), nfiles), dtype="uint8") + self.particle_counts = np.zeros(1 << (index_order1 * 3), dtype="uint64") self.bitmasks = FileBitmasks(self.nfiles) self.collisions = BoolArrayCollection() @@ -499,6 +502,7 @@ cdef class ParticleBitmap: cdef np.float64_t dds[3] cdef np.float64_t radius cdef np.uint8_t[:] mask = self.masks[:, file_id] + cdef np.uint64_t[:] particle_counts = self.particle_counts cdef np.int64_t msize = (1 << (self.index_order1 * 3)) cdef int axiter[3][2] cdef np.float64_t axiterv[3][2] @@ -526,6 +530,7 @@ cdef class ParticleBitmap: mi = bounded_morton_split_dds(ppos[0], ppos[1], ppos[2], LE, dds, mi_split) mask[mi] = 1 + particle_counts[mi] += 1 # Expand mask by softening if hsml is None: continue @@ -566,6 +571,7 @@ cdef class ParticleBitmap: for zex in range(bounds[2][0], bounds[2][1]): miex = encode_morton_64bit(xex, yex, zex) mask[miex] = 1 + particle_counts[miex] += 1 if miex >= msize: raise IndexError( "Index for a softening region " + @@ -600,10 +606,13 @@ cdef class ParticleBitmap: np.ndarray[np.uint8_t, ndim=1] mask, np.ndarray[np.uint64_t, ndim=1] sub_mi1, np.ndarray[np.uint64_t, ndim=1] sub_mi2, - np.uint64_t file_id, np.int64_t nsub_mi): + np.uint64_t file_id, np.int64_t nsub_mi, + np.uint64_t count_threshold = 128, + np.uint8_t mask_threshold = 2): return self.__refined_index_data_file(pos, hsml, mask, sub_mi1, sub_mi2, - file_id, nsub_mi) + file_id, nsub_mi, + count_threshold, mask_threshold) @cython.boundscheck(False) @cython.wraparound(False) @@ -616,15 +625,17 @@ cdef class ParticleBitmap: np.ndarray[np.uint8_t, ndim=1] mask, np.ndarray[np.uint64_t, ndim=1] sub_mi1, np.ndarray[np.uint64_t, ndim=1] sub_mi2, - np.uint64_t file_id, np.int64_t nsub_mi + np.uint64_t file_id, np.int64_t nsub_mi, + np.uint64_t count_threshold, np.uint8_t mask_threshold ) except -1: # Initialize - cdef np.int64_t i, p + cdef np.int64_t i, p, sorted_ind cdef np.uint64_t mi1, mi2 cdef np.float64_t ppos[3] cdef 
np.float64_t s_ppos[3] # shifted ppos cdef int skip, Nex cdef np.uint64_t bounds[2][3] + cdef np.uint8_t fully_enclosed cdef np.float64_t LE[3] cdef np.float64_t RE[3] cdef np.float64_t DW[3] @@ -635,6 +646,7 @@ cdef class ParticleBitmap: cdef np.uint64_t mi_split1[3] cdef np.uint64_t mi_split2[3] cdef np.uint64_t miex1, miex2, mi1_max, mi2_max + cdef np.uint64_t[:] particle_counts = self.particle_counts cdef int Nex_min[3] cdef int Nex_max[3] cdef np.float64_t rpos_min, rpos_max @@ -655,7 +667,7 @@ cdef class ParticleBitmap: cdef np.float64_t axiterv[3][2] cdef CoarseRefinedSets coarse_refined_map cdef map[np.uint64_t, np.uint64_t] refined_count - cdef np.uint64_t nset = 0 + cdef np.uint64_t nset = 0, nfully_enclosed = 0, n_calls = 0 mi1_max = (1 << self.index_order1) - 1 mi2_max = (1 << self.index_order2) - 1 cdef np.uint64_t max_mi2_elements = 1 << (3*self.index_order2) @@ -669,8 +681,18 @@ cdef class ParticleBitmap: DW[i] = RE[i] - LE[i] axiter[i][0] = 0 # We always do an offset of 0 axiterv[i][0] = 0.0 - # Loop over positions skipping those outside the domain + cdef np.ndarray[np.uint64_t, ndim=1] morton_indices = np.empty(pos.shape[0], dtype="u8") for p in range(pos.shape[0]): + morton_indices[p] = bounded_morton(pos[p, 0], pos[p, 1], pos[p, 2], + LE, RE, self.index_order1) + # Loop over positions skipping those outside the domain + cdef np.ndarray[np.uint64_t, ndim=1, cast=True] sorted_order + if hsml is None: + sorted_order = np.argsort(morton_indices) + else: + sorted_order = np.argsort(hsml)[::-1] + for sorted_ind in range(sorted_order.shape[0]): + p = sorted_order[sorted_ind] skip = 0 for i in range(3): axiter[i][1] = 999 @@ -683,14 +705,16 @@ cdef class ParticleBitmap: mi1 = bounded_morton_split_dds(ppos[0], ppos[1], ppos[2], LE, dds1, mi_split1) if hsml is None: - if mask[mi1] <= 1: # only one thing in this area + if mask[mi1] < mask_threshold \ + or particle_counts[mi1] < count_threshold: continue # Determine sub index within cell of primary index mi2 
= bounded_morton_split_relative_dds( ppos[0], ppos[1], ppos[2], LE, dds1, dds2, mi_split2) - if coarse_refined_map[mi1].size() == 0: - coarse_refined_map[mi1].resize(max_mi2_elements, False) - refined_count[mi1] = 0 + if refined_count[mi1] == 0: + coarse_refined_map[mi1] = malloc( + sizeof(np.uint8_t) * max_mi2_elements) + memset(coarse_refined_map[mi1], 0, max_mi2_elements) if coarse_refined_map[mi1][mi2] == False: coarse_refined_map[mi1][mi2] = True refined_count[mi1] += 1 @@ -699,8 +723,8 @@ cdef class ParticleBitmap: # except here we need to fill in all the subranges as well as the coarse ranges # Note that we are also doing the null case, where we do no shifting radius = hsml[p] - if mask[mi1] <= 1: # only one thing in this area - continue + #if mask[mi1] <= 4: # only one thing in this area + # continue for i in range(3): if PER[i] and ppos[i] - radius < LE[i]: axiter[i][1] = +1 @@ -720,25 +744,47 @@ cdef class ParticleBitmap: # OK, now we compute the left and right edges for this shift. for i in range(3): # casting to int64 is not nice but is so we can have negative values we clip - clip_pos_l[i] = fmax(s_ppos[i] - radius, LE[i] + dds1[i]/2) - clip_pos_r[i] = fmin(s_ppos[i] + radius, RE[i] - dds1[i]/2) + clip_pos_l[i] = fmax(s_ppos[i] - radius, LE[i] + dds1[i]/10) + clip_pos_r[i] = fmin(s_ppos[i] + radius, RE[i] - dds1[i]/10) + bounded_morton_split_dds(clip_pos_l[0], clip_pos_l[1], clip_pos_l[2], LE, dds1, bounds[0]) bounded_morton_split_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], LE, dds1, bounds[1]) + # We go to the upper bound plus one so that we have *inclusive* loops -- the upper bound # is the cell *index*, so we want to make sure we include that cell. This is also why # we don't need to worry about mi_max being the max index rather than the cell count. + # One additional thing to note is that for all of + # the *internal* cells, i.e., those that are both + # greater than the left edge and less than the + # right edge, we are fully enclosed. 
for xex in range(bounds[0][0], bounds[1][0] + 1): for yex in range(bounds[0][1], bounds[1][1] + 1): for zex in range(bounds[0][2], bounds[1][2] + 1): miex1 = encode_morton_64bit(xex, yex, zex) - if mask[miex1] <= 1: + if mask[miex1] < mask_threshold or \ + particle_counts[miex1] < count_threshold: continue + # this explicitly requires that it be *between* + # them, not overlapping + if xex > bounds[0][0] and xex < bounds[1][0] and \ + yex > bounds[0][1] and yex < bounds[1][1] and \ + zex > bounds[0][2] and zex < bounds[1][2]: + fully_enclosed = 1 + else: + fully_enclosed = 0 # Now we need to fill our sub-range - if coarse_refined_map[miex1].size() == 0: - coarse_refined_map[miex1].resize(max_mi2_elements, False) - refined_count[miex1] = 0 - if refined_count[miex1] >= max_mi2_elements: + if refined_count[miex1] == 0: + coarse_refined_map[miex1] = malloc( + sizeof(np.uint8_t) * max_mi2_elements) + memset(coarse_refined_map[miex1], 0, max_mi2_elements) + elif refined_count[miex1] >= max_mi2_elements: + continue + if fully_enclosed == 1: + nfully_enclosed += 1 + memset(coarse_refined_map[miex1], 0xFF, max_mi2_elements) + refined_count[miex1] = max_mi2_elements continue + n_calls += 1 refined_count[miex1] += self.__fill_refined_ranges(s_ppos, radius, LE, RE, dds1, xex, yex, zex, dds2, mi1_max, mi2_max, miex1, @@ -746,18 +792,21 @@ cdef class ParticleBitmap: max_mi2_elements) print("THIS MANY COARSE CELLS", coarse_refined_map.size()) print("THIS MANY NSET", nset, nset / pos.shape[0], nsub_mi) + if n_calls > 0: + print("THIS MANY TERMINATIONS AND THIS MANY CALLS", nfully_enclosed, n_calls, nfully_enclosed / n_calls) cdef np.uint64_t count, vec_i cdef np.uint64_t total_count = 0 for it1 in coarse_refined_map: mi1 = it1.first count = 0 vec_i = 0 - for vec_i in range(it1.second.size()): - if it1.second[vec_i] == True: + for vec_i in range(max_mi2_elements): + if it1.second[vec_i] > 0: count += 1 #sub_mi1[nsub_mi] = mi1 #sub_mi2[nsub_mi] = vec_i nsub_mi += 1 + 
free(coarse_refined_map[mi1]) if count != refined_count[mi1]: print("WHY IS THIS WRONG", count, refined_count[mi1]) #print("IN ", mi1, "THIS MANY REFINED CELLS", count) @@ -922,7 +971,7 @@ cdef class ParticleBitmap: np.float64_t dds1[3], np.uint64_t xex, np.uint64_t yex, np.uint64_t zex, np.float64_t dds2[3], np.uint64_t mi1_max, np.uint64_t mi2_max, np.uint64_t miex1, - vector[bool] &refined_set, np.float64_t ppos[3], np.uint64_t mcount, + np.uint8_t *refined_set, np.float64_t ppos[3], np.uint64_t mcount, np.uint64_t max_mi2_elements) except -1: cdef int i cdef np.uint64_t new_nsub = 0 @@ -938,6 +987,8 @@ cdef class ParticleBitmap: # full domain cell_edge_l = ex1[i] * dds1[i] + LE[i] cell_edge_r = cell_edge_l + dds1[i] + if s_ppos[i] + radius < cell_edge_l or s_ppos[i] - radius > cell_edge_r: + return 0 clip_pos_l[i] = fmax(s_ppos[i] - radius, cell_edge_l + dds2[i]/2.0) clip_pos_r[i] = fmin(s_ppos[i] + radius, cell_edge_r - dds2[i]/2.0) miex2_min = bounded_morton_split_relative_dds(clip_pos_l[0], clip_pos_l[1], clip_pos_l[2], @@ -951,14 +1002,14 @@ cdef class ParticleBitmap: #miex2 = encode_morton_64bit(xex2, yex2, zex2) #decode_morton_64bit(miex2, ex2) # Let's check all our cases here - if refined_set[miex2] == True: continue + if refined_set[miex2] > 0: continue if (miex2 & xex_max) < (miex2_min & xex_max): continue if (miex2 & yex_max) < (miex2_min & yex_max): continue if (miex2 & zex_max) < (miex2_min & zex_max): continue if (miex2 & xex_max) > (miex2_max & xex_max): continue if (miex2 & yex_max) > (miex2_max & yex_max): continue if (miex2 & zex_max) > (miex2_max & zex_max): continue - refined_set[miex2] = True + refined_set[miex2] = 1 new_nsub += 1 return new_nsub diff --git a/yt/utilities/lib/ewah_bool_wrap.pyx b/yt/utilities/lib/ewah_bool_wrap.pyx index f25a785386e..cf96862718f 100644 --- a/yt/utilities/lib/ewah_bool_wrap.pyx +++ b/yt/utilities/lib/ewah_bool_wrap.pyx @@ -606,16 +606,10 @@ cdef class FileBitmasks: return self._check() def __dealloc__(self): 
- cdef ewah_bool_array *ewah_keys - cdef ewah_bool_array *ewah_refn - cdef ewah_map *ewah_coll for ifile in range(self.nfiles): - ewah_keys = ( self.ewah_keys)[ifile] - ewah_refn = ( self.ewah_refn)[ifile] - ewah_coll = ( self.ewah_coll)[ifile] - del ewah_keys - del ewah_refn - del ewah_coll + del self.ewah_keys[ifile] + del self.ewah_refn[ifile] + del self.ewah_coll[ifile] def print_info(self, ifile, prefix=''): print("{}{: 8d} coarse, {: 8d} refined, {: 8d} total".format( From e89bd594d228665bb1d40d9d346047dae2f114e4 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Thu, 21 May 2020 16:30:13 -0500 Subject: [PATCH 23/42] Keep a semi-running tally of bool array collections --- yt/geometry/particle_geometry_handler.py | 23 +-- yt/geometry/particle_oct_container.pyx | 189 +++-------------------- yt/utilities/lib/ewah_bool_wrap.pyx | 3 + 3 files changed, 39 insertions(+), 176 deletions(-) diff --git a/yt/geometry/particle_geometry_handler.py b/yt/geometry/particle_geometry_handler.py index ae2830ecde3..61aaf6d9e4a 100644 --- a/yt/geometry/particle_geometry_handler.py +++ b/yt/geometry/particle_geometry_handler.py @@ -169,13 +169,16 @@ def _initialize_refined_index(self): mi1_dds = self.ds.domain_width.max() / (1 << self.regions.index_order1) mi2_dds = mi1_dds / (1 << self.regions.index_order2) pb = get_pbar("Initializing refined index", len(self.data_files)) + mask_threshold = getattr(self, '_index_mask_threshold', 2) count_threshold = getattr(self, '_index_count_threshold', - (1 << (3*self.regions.index_order2))/512) + (1 << (3*self.regions.index_order2))/128) + print("Count threshold ", count_threshold) total_refined = 0 total_coarse_refined = ((mask >= 2) & (self.regions.particle_counts > count_threshold)).sum() print("Total coarse refined zones: {} out of {} for {}%".format( total_coarse_refined, mask.size, 100 * total_coarse_refined / mask.size)) for i, data_file in enumerate(self.data_files): + coll = None pb.update(i) nsub_mi = 0 for ptype, pos in 
self.io._yield_coordinates(data_file): @@ -190,18 +193,18 @@ def _initialize_refined_index(self): else: hsml = None #hsml = None - nsub_mi = self.regions._refined_index_data_file( - pos, hsml, mask, sub_mi1, sub_mi2, + nsub_mi, coll = self.regions._refined_index_data_file( + coll, pos, hsml, mask, sub_mi1, sub_mi2, data_file.file_id, nsub_mi, count_threshold = count_threshold, - mask_threshold = 2) + mask_threshold = mask_threshold) total_refined += nsub_mi - continue - self.regions._set_refined_index_data_file( - sub_mi1, sub_mi2, - data_file.file_id, nsub_mi) + self.regions.bitmasks.append(data_file.file_id, coll) + #self.regions._set_refined_index_data_file( + # sub_mi1, sub_mi2, + # data_file.file_id, nsub_mi) pb.finish() - print("TOTAL REFINED", total_refined) - #self.regions.find_collisions_refined() + #print("TOTAL REFINED", total_refined) + self.regions.find_collisions_refined() def _detect_output_fields(self): # TODO: Add additional fields diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index c12f334c0a6..a760e6450c0 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -428,7 +428,7 @@ cdef class ParticleBitmap: cdef np.uint32_t *file_markers cdef np.uint64_t n_file_markers cdef np.uint64_t file_marker_i - cdef FileBitmasks bitmasks + cdef public FileBitmasks bitmasks cdef public BoolArrayCollection collisions def __init__(self, left_edge, right_edge, periodicity, file_hash, nfiles, @@ -601,6 +601,7 @@ cdef class ParticleBitmap: @cython.cdivision(True) @cython.initializedcheck(False) def _refined_index_data_file(self, + BoolArrayCollection in_collection, np.ndarray[floating, ndim=2] pos, np.ndarray[floating, ndim=1] hsml, np.ndarray[np.uint8_t, ndim=1] mask, @@ -609,31 +610,38 @@ cdef class ParticleBitmap: np.uint64_t file_id, np.int64_t nsub_mi, np.uint64_t count_threshold = 128, np.uint8_t mask_threshold = 2): - return self.__refined_index_data_file(pos, hsml, 
mask, + if in_collection is None: + in_collection = BoolArrayCollection() + cdef BoolArrayCollection _in_coll = in_collection + cdef np.int64_t nsub + out_collection = self.__refined_index_data_file(_in_coll, pos, hsml, mask, sub_mi1, sub_mi2, - file_id, nsub_mi, + file_id, &nsub, count_threshold, mask_threshold) + return nsub, out_collection @cython.boundscheck(False) @cython.wraparound(False) @cython.cdivision(True) @cython.initializedcheck(False) - cdef np.int64_t __refined_index_data_file( + cdef BoolArrayCollection __refined_index_data_file( self, + BoolArrayCollection in_collection, np.ndarray[floating, ndim=2] pos, np.ndarray[floating, ndim=1] hsml, np.ndarray[np.uint8_t, ndim=1] mask, np.ndarray[np.uint64_t, ndim=1] sub_mi1, np.ndarray[np.uint64_t, ndim=1] sub_mi2, - np.uint64_t file_id, np.int64_t nsub_mi, + np.uint64_t file_id, np.int64_t *nsub_mi, np.uint64_t count_threshold, np.uint8_t mask_threshold - ) except -1: + ): # Initialize cdef np.int64_t i, p, sorted_ind cdef np.uint64_t mi1, mi2 cdef np.float64_t ppos[3] cdef np.float64_t s_ppos[3] # shifted ppos cdef int skip, Nex + cdef BoolArrayCollection this_collection, out_collection cdef np.uint64_t bounds[2][3] cdef np.uint8_t fully_enclosed cdef np.float64_t LE[3] @@ -790,12 +798,10 @@ cdef class ParticleBitmap: dds2, mi1_max, mi2_max, miex1, coarse_refined_map[miex1], ppos, mask[miex1], max_mi2_elements) - print("THIS MANY COARSE CELLS", coarse_refined_map.size()) - print("THIS MANY NSET", nset, nset / pos.shape[0], nsub_mi) - if n_calls > 0: - print("THIS MANY TERMINATIONS AND THIS MANY CALLS", nfully_enclosed, n_calls, nfully_enclosed / n_calls) cdef np.uint64_t count, vec_i cdef np.uint64_t total_count = 0 + this_collection = BoolArrayCollection() + print("Appending to the new BoolArrayCollection") for it1 in coarse_refined_map: mi1 = it1.first count = 0 @@ -803,164 +809,15 @@ cdef class ParticleBitmap: for vec_i in range(max_mi2_elements): if it1.second[vec_i] > 0: count += 1 - 
#sub_mi1[nsub_mi] = mi1 - #sub_mi2[nsub_mi] = vec_i - nsub_mi += 1 + nsub_mi[0] += 1 + this_collection._set(mi1, vec_i) free(coarse_refined_map[mi1]) - if count != refined_count[mi1]: - print("WHY IS THIS WRONG", count, refined_count[mi1]) - #print("IN ", mi1, "THIS MANY REFINED CELLS", count) total_count += count - if coarse_refined_map.size() > 0: - print("NSUB_MI NOW", total_count, total_count / (coarse_refined_map.size() * max_mi2_elements), nsub_mi, sub_mi1.shape[0], sub_mi2.shape[0]) - return nsub_mi - - if 0: - # Expand for smoothing - Nex = 1 - for i in range(3): - Nex_min[i] = 0 - Nex_max[i] = 0 - rpos_min = ppos[i] - (dds2[i]*mi_split2[i] + dds1[i]*mi_split1[i] + LE[i]) - rpos_max = dds2[i] - rpos_min - if rpos_min > hsml[p]: - Nex_min[i] = ((rpos_min-hsml[p])/dds2[i]) + 1 - if rpos_max > hsml[p]: - Nex_max[i] = ((rpos_max-hsml[p])/dds2[i]) + 1 - Nex *= (Nex_max[i] + Nex_min[i] + 1) - if Nex > 1: - # Ensure that min/max values for x,y,z indexes are obeyed - if (Nex_max[0] + Nex_min[0] + 1) > xex1_range.shape[0]: - xex1_range = np.empty(Nex_max[0] + Nex_min[0] + 1, 'uint64') - xex2_range = np.empty(Nex_max[0] + Nex_min[0] + 1, 'uint64') - if (Nex_max[1] + Nex_min[1] + 1) > yex1_range.shape[0]: - yex1_range = np.empty(Nex_max[1] + Nex_min[1] + 1, 'uint64') - yex2_range = np.empty(Nex_max[1] + Nex_min[1] + 1, 'uint64') - if (Nex_max[2] + Nex_min[2] + 1) > zex1_range.shape[0]: - zex1_range = np.empty(Nex_max[2] + Nex_min[2] + 1, 'uint64') - zex2_range = np.empty(Nex_max[2] + Nex_min[2] + 1, 'uint64') - xex2_min = mi_split2[0] - min(Nex_min[0], mi_split2[0]) - xex2_max = mi_split2[0] + min(Nex_max[0], (mi2_max - mi_split2[0])) + 1 - yex2_min = mi_split2[1] - min(Nex_min[1], mi_split2[1]) - yex2_max = mi_split2[1] + min(Nex_max[1], (mi2_max - mi_split2[1])) + 1 - zex2_min = mi_split2[2] - min(Nex_min[2], mi_split2[2]) - zex2_max = mi_split2[2] + min(Nex_max[2], (mi2_max - mi_split2[2])) + 1 - ixe = iye = ize = 0 - for xex2 in range(xex2_min, xex2_max): - 
xex1_range[ixe] = mi_split1[0] - xex2_range[ixe] = xex2 - ixe += 1 - for yex2 in range(yex2_min, yex2_max): - yex1_range[iye] = mi_split1[1] - yex2_range[iye] = yex2 - iye += 1 - for zex2 in range(zex2_min, zex2_max): - zex1_range[ize] = mi_split1[2] - zex2_range[ize] = zex2 - ize += 1 - # Expand to adjacent coarse cells, wrapping periodically - # if need be - # x - if Nex_min[0] > mi_split2[0]: - if mi_split1[0] > 0: - for xex2 in range(mi2_max + 1 - (Nex_min[0] - mi_split2[0]), mi2_max + 1): - xex1_range[ixe] = mi_split1[0] - 1 - xex2_range[ixe] = xex2 - ixe += 1 - elif PER[0]: - for xex2 in range(mi2_max + 1 - (Nex_min[0] - mi_split2[0]), mi2_max + 1): - xex1_range[ixe] = mi1_max - xex2_range[ixe] = xex2 - ixe += 1 - if Nex_max[0] > (mi2_max-mi_split2[0]): - if mi_split1[0] < mi1_max: - for xex2 in range(0, Nex_max[0] - (mi2_max-mi_split2[0])): - xex1_range[ixe] = mi_split1[0] + 1 - xex2_range[ixe] = xex2 - ixe += 1 - elif PER[0]: - for xex2 in range(0, Nex_max[0] - (mi2_max-mi_split2[0])): - xex1_range[ixe] = 0 - xex2_range[ixe] = xex2 - ixe += 1 - # y - if Nex_min[1] > mi_split2[1]: - if mi_split1[1] > 0: - for yex2 in range(mi2_max + 1 - (Nex_min[1] - mi_split2[1]), mi2_max + 1): - yex1_range[iye] = mi_split1[1] - 1 - yex2_range[iye] = yex2 - iye += 1 - elif PER[1]: - for yex2 in range(mi2_max + 1 - (Nex_min[1] - mi_split2[1]), mi2_max + 1): - yex1_range[iye] = mi1_max - yex2_range[iye] = yex2 - iye += 1 - if Nex_max[1] > (mi2_max-mi_split2[1]): - if mi_split1[1] < mi1_max: - for yex2 in range(0, Nex_max[1] - (mi2_max-mi_split2[1])): - yex1_range[iye] = mi_split1[1] + 1 - yex2_range[iye] = yex2 - iye += 1 - elif PER[1]: - for yex2 in range(0, Nex_max[1] - (mi2_max-mi_split2[1])): - yex1_range[iye] = 0 - yex2_range[iye] = yex2 - iye += 1 - # z - if Nex_min[2] > mi_split2[2]: - if mi_split1[2] > 0: - for zex2 in range(mi2_max + 1 - (Nex_min[2] - mi_split2[2]), mi2_max + 1): - zex1_range[ize] = mi_split1[2] - 1 - zex2_range[ize] = zex2 - ize += 1 - elif PER[2]: 
- for zex2 in range(mi2_max + 1 - (Nex_min[2] - mi_split2[2]), mi2_max + 1): - zex1_range[ize] = mi1_max - zex2_range[ize] = zex2 - ize += 1 - if Nex_max[2] > (mi2_max-mi_split2[2]): - if mi_split1[2] < mi1_max: - for zex2 in range(0, Nex_max[2] - (mi2_max-mi_split2[2])): - zex1_range[ize] = mi_split1[2] + 1 - zex2_range[ize] = zex2 - ize += 1 - elif PER[2]: - for zex2 in range(0, Nex_max[2] - (mi2_max-mi_split2[2])): - zex1_range[ize] = 0 - zex2_range[ize] = zex2 - ize += 1 - for ix in range(ixe): - xex1 = xex1_range[ix] - xex2 = xex2_range[ix] - for iy in range(iye): - yex1 = yex1_range[iy] - yex2 = yex2_range[iy] - for iz in range(ize): - zex1 = zex1_range[iz] - zex2 = zex2_range[iz] - if (xex1 == mi_split1[0] and xex2 == mi_split2[0] and - yex1 == mi_split1[1] and yex2 == mi_split2[1] and - zex1 == mi_split1[2] and zex2 == mi_split2[2]): - continue - miex1 = encode_morton_64bit(xex1, yex1, zex1) - miex2 = encode_morton_64bit(xex2, yex2, zex2) - if nsub_mi >= msize: - # Uncomment these lines to allow periodic - # caching of refined indices - # self.bitmasks._set_refined_index_array( - # file_id, nsub_mi, sub_mi1, sub_mi2) - # nsub_mi = 0 - raise IndexError( - "Refined index exceeded original " - "estimate.\n" - "nsub_mi = %s, " - "sub_mi1.shape[0] = %s" - % (nsub_mi, sub_mi1.shape[0])) - sub_mi1[nsub_mi] = miex1 - sub_mi2[nsub_mi] = miex2 - nsub_mi += 1 - # Only subs of particles in the mask - return nsub_mi + out_collection = BoolArrayCollection() + print("Logical or-ing") + in_collection._logicalor(this_collection, out_collection) + print("Completed") + return out_collection @cython.boundscheck(False) @cython.wraparound(False) diff --git a/yt/utilities/lib/ewah_bool_wrap.pyx b/yt/utilities/lib/ewah_bool_wrap.pyx index cf96862718f..9cfc5cfd2d6 100644 --- a/yt/utilities/lib/ewah_bool_wrap.pyx +++ b/yt/utilities/lib/ewah_bool_wrap.pyx @@ -321,6 +321,9 @@ cdef class FileBitmasks: out = ewah_refn[0].numberOfOnes() return out + def append(self, np.uint32_t ifile, 
BoolArrayCollection solf): + self._append(ifile, solf) + cdef void _append(self, np.uint32_t ifile, BoolArrayCollection solf): cdef ewah_bool_array *ewah_keys1 = ( self.ewah_keys)[ifile] cdef ewah_bool_array *ewah_refn1 = ( self.ewah_refn)[ifile] From 09aacb9afcccff3156c126cfbe624d87dd18432b Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 22 May 2020 11:04:16 -0500 Subject: [PATCH 24/42] Switch to using BoolArray --- yt/geometry/particle_oct_container.pyx | 43 +++++++++++++++----------- yt/utilities/lib/ewah_bool_array.pxd | 19 ++++++++++-- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index a760e6450c0..5e5b586d182 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -31,7 +31,7 @@ from yt.funcs import get_pbar from particle_deposit cimport gind from yt.utilities.lib.ewah_bool_array cimport \ - ewah_bool_array, ewah_bool_iterator, ewah_map + ewah_bool_array, ewah_bool_iterator, ewah_map, bool_array #from yt.utilities.lib.ewah_bool_wrap cimport \ from ..utilities.lib.ewah_bool_wrap cimport BoolArrayCollection from libcpp cimport bool @@ -56,7 +56,7 @@ from ..utilities.lib.ewah_bool_wrap cimport SparseUnorderedRefinedBitmaskSet as from ..utilities.lib.ewah_bool_wrap cimport BoolArrayCollectionUncompressed as BoolArrayColl from ..utilities.lib.ewah_bool_wrap cimport FileBitmasks -ctypedef map[np.uint64_t, np.uint8_t*] CoarseRefinedSets +ctypedef map[np.uint64_t, bool_array] CoarseRefinedSets cdef class ParticleOctreeContainer(OctreeContainer): cdef Oct** oct_list @@ -720,11 +720,9 @@ cdef class ParticleBitmap: mi2 = bounded_morton_split_relative_dds( ppos[0], ppos[1], ppos[2], LE, dds1, dds2, mi_split2) if refined_count[mi1] == 0: - coarse_refined_map[mi1] = malloc( - sizeof(np.uint8_t) * max_mi2_elements) - memset(coarse_refined_map[mi1], 0, max_mi2_elements) - if coarse_refined_map[mi1][mi2] == False: - 
coarse_refined_map[mi1][mi2] = True + coarse_refined_map[mi1].padWithZeroes(max_mi2_elements) + if coarse_refined_map[mi1].get(mi2) == False: + coarse_refined_map[mi1].set(mi2) refined_count[mi1] += 1 else: # only hit if we have smoothing lengths. # We have to do essentially the identical process to in the coarse indexing, @@ -782,14 +780,14 @@ cdef class ParticleBitmap: fully_enclosed = 0 # Now we need to fill our sub-range if refined_count[miex1] == 0: - coarse_refined_map[miex1] = malloc( - sizeof(np.uint8_t) * max_mi2_elements) - memset(coarse_refined_map[miex1], 0, max_mi2_elements) + coarse_refined_map[miex1].padWithZeroes(max_mi2_elements) elif refined_count[miex1] >= max_mi2_elements: continue if fully_enclosed == 1: nfully_enclosed += 1 - memset(coarse_refined_map[miex1], 0xFF, max_mi2_elements) + coarse_refined_map[miex1].inplace_logicalxor( + coarse_refined_map[miex1]) + coarse_refined_map[miex1].inplace_logicalnot() refined_count[miex1] = max_mi2_elements continue n_calls += 1 @@ -800,18 +798,27 @@ cdef class ParticleBitmap: max_mi2_elements) cdef np.uint64_t count, vec_i cdef np.uint64_t total_count = 0 + cdef bool_array *buf = NULL this_collection = BoolArrayCollection() - print("Appending to the new BoolArrayCollection") + cdef ewah_bool_array *refined_arr = NULL + print("Appending to the new BoolArrayCollection", coarse_refined_map.size()) + cdef np.uint64_t ncrm = 0 for it1 in coarse_refined_map: + if ncrm % 1000 == 0: + print(ncrm) + ncrm += 1 mi1 = it1.first + refined_arr = &this_collection.ewah_coll[0][mi1] + this_collection.ewah_keys[0].set(mi1) + this_collection.ewah_refn[0].set(mi1) count = 0 vec_i = 0 + buf = &it1.second for vec_i in range(max_mi2_elements): - if it1.second[vec_i] > 0: + if buf.get(vec_i) > 0: count += 1 + refined_arr.set(vec_i) nsub_mi[0] += 1 - this_collection._set(mi1, vec_i) - free(coarse_refined_map[mi1]) total_count += count out_collection = BoolArrayCollection() print("Logical or-ing") @@ -828,7 +835,7 @@ cdef class 
ParticleBitmap: np.float64_t dds1[3], np.uint64_t xex, np.uint64_t yex, np.uint64_t zex, np.float64_t dds2[3], np.uint64_t mi1_max, np.uint64_t mi2_max, np.uint64_t miex1, - np.uint8_t *refined_set, np.float64_t ppos[3], np.uint64_t mcount, + bool_array &refined_set, np.float64_t ppos[3], np.uint64_t mcount, np.uint64_t max_mi2_elements) except -1: cdef int i cdef np.uint64_t new_nsub = 0 @@ -859,14 +866,14 @@ cdef class ParticleBitmap: #miex2 = encode_morton_64bit(xex2, yex2, zex2) #decode_morton_64bit(miex2, ex2) # Let's check all our cases here - if refined_set[miex2] > 0: continue + if refined_set.get(miex2): continue if (miex2 & xex_max) < (miex2_min & xex_max): continue if (miex2 & yex_max) < (miex2_min & yex_max): continue if (miex2 & zex_max) < (miex2_min & zex_max): continue if (miex2 & xex_max) > (miex2_max & xex_max): continue if (miex2 & yex_max) > (miex2_max & yex_max): continue if (miex2 & zex_max) > (miex2_max & zex_max): continue - refined_set[miex2] = 1 + refined_set.set(miex2) new_nsub += 1 return new_nsub diff --git a/yt/utilities/lib/ewah_bool_array.pxd b/yt/utilities/lib/ewah_bool_array.pxd index 59d8db03328..797a681134b 100644 --- a/yt/utilities/lib/ewah_bool_array.pxd +++ b/yt/utilities/lib/ewah_bool_array.pxd @@ -11,6 +11,7 @@ cimport cython from libcpp.vector cimport vector from libcpp.map cimport map from libcpp.string cimport string +from libcpp cimport bool from libc.stdint cimport uint64_t # Streams req for c++ IO @@ -71,8 +72,22 @@ cdef extern from "ewah.h": EWAHBoolArraySetBitForwardIterator begin() EWAHBoolArraySetBitForwardIterator end() -ctypedef EWAHBoolArray[uint64_t] ewah_bool_array -ctypedef EWAHBoolArraySetBitForwardIterator[uint64_t] ewah_bool_iterator +cdef extern from "boolarray.h": + cppclass BoolArray[uword]: + void setSizeInBits(size_t sizeib) + void set(size_t pos) + void unset(size_t pos) + bool get(size_t pos) + void reset() + size_t sizeInBits() + size_t numberOfOnes() + void inplace_logicalxor(BoolArray &other) + 
void inplace_logicalnot() + size_t padWithZeroes(size_t totalbits) + +ctypedef EWAHBoolArray[np.uint64_t] ewah_bool_array +ctypedef EWAHBoolArraySetBitForwardIterator[np.uint64_t] ewah_bool_iterator ctypedef vector[size_t] bitset_array ctypedef map[np.uint64_t, ewah_bool_array] ewah_map ctypedef stringstream sstream +ctypedef BoolArray[np.uint64_t] bool_array From c82a015a838cb72fa32975296674273d9ff1730b Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 22 May 2020 15:58:36 -0500 Subject: [PATCH 25/42] Switch to word adding for refined EWAH. --- yt/geometry/particle_geometry_handler.py | 18 ++------ yt/geometry/particle_oct_container.pyx | 56 ++++++++++++------------ yt/utilities/lib/ewah_bool_array.pxd | 11 +++-- 3 files changed, 41 insertions(+), 44 deletions(-) diff --git a/yt/geometry/particle_geometry_handler.py b/yt/geometry/particle_geometry_handler.py index 61aaf6d9e4a..6950203dc86 100644 --- a/yt/geometry/particle_geometry_handler.py +++ b/yt/geometry/particle_geometry_handler.py @@ -170,40 +170,30 @@ def _initialize_refined_index(self): mi2_dds = mi1_dds / (1 << self.regions.index_order2) pb = get_pbar("Initializing refined index", len(self.data_files)) mask_threshold = getattr(self, '_index_mask_threshold', 2) - count_threshold = getattr(self, '_index_count_threshold', - (1 << (3*self.regions.index_order2))/128) - print("Count threshold ", count_threshold) + count_threshold = getattr(self, '_index_count_threshold', 256) + mylog.debug("Using estimated thresholds of %s and %s for refinement", mask_threshold, count_threshold) total_refined = 0 total_coarse_refined = ((mask >= 2) & (self.regions.particle_counts > count_threshold)).sum() - print("Total coarse refined zones: {} out of {} for {}%".format( - total_coarse_refined, mask.size, 100 * total_coarse_refined / mask.size)) + mylog.debug("This should produce roughly %s zones, for %s of the domain", + total_coarse_refined, 100 * total_coarse_refined / mask.size) for i, data_file in 
enumerate(self.data_files): coll = None pb.update(i) nsub_mi = 0 for ptype, pos in self.io._yield_coordinates(data_file): - print(i, ptype, pos.shape) if pos.size == 0: continue if hasattr(self.ds, '_sph_ptypes') and ptype == self.ds._sph_ptypes[0]: hsml = self.io._get_smoothing_length( data_file, pos.dtype, pos.shape) - print("Has smoothing length: max coverage of %0.3e %0.3e and min coverage of %0.3e %0.3e" % ( - hsml.max() / mi1_dds, hsml.max() / mi2_dds, - hsml.min() / mi1_dds, hsml.min() / mi2_dds)) else: hsml = None - #hsml = None nsub_mi, coll = self.regions._refined_index_data_file( coll, pos, hsml, mask, sub_mi1, sub_mi2, data_file.file_id, nsub_mi, count_threshold = count_threshold, mask_threshold = mask_threshold) total_refined += nsub_mi self.regions.bitmasks.append(data_file.file_id, coll) - #self.regions._set_refined_index_data_file( - # sub_mi1, sub_mi2, - # data_file.file_id, nsub_mi) pb.finish() - #print("TOTAL REFINED", total_refined) self.regions.find_collisions_refined() def _detect_output_fields(self): diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 5e5b586d182..8641b8e677c 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -31,7 +31,7 @@ from yt.funcs import get_pbar from particle_deposit cimport gind from yt.utilities.lib.ewah_bool_array cimport \ - ewah_bool_array, ewah_bool_iterator, ewah_map, bool_array + ewah_bool_array, ewah_bool_iterator, ewah_map, bool_array, ewah_word_type #from yt.utilities.lib.ewah_bool_wrap cimport \ from ..utilities.lib.ewah_bool_wrap cimport BoolArrayCollection from libcpp cimport bool @@ -411,6 +411,7 @@ cdef class ParticleBitmap: cdef np.float64_t idds[3] cdef np.int32_t dims[3] cdef np.int64_t file_hash + cdef np.uint64_t directional_max2[3] cdef public np.uint64_t nfiles cdef public np.int32_t index_order1 cdef public np.int32_t index_order2 @@ -456,6 +457,10 @@ cdef class ParticleBitmap: # We use 64-bit masks 
self.index_order1 = index_order1 self.index_order2 = index_order2 + mi2_max = (1 << self.index_order2) - 1 + self.directional_max2[0] = encode_morton_64bit(mi2_max, 0, 0) + self.directional_max2[1] = encode_morton_64bit(0, mi2_max, 0) + self.directional_max2[2] = encode_morton_64bit(0, 0, mi2_max) # This will be an on/off flag for which morton index values are touched # by particles. # This is the simple way, for now. @@ -799,31 +804,20 @@ cdef class ParticleBitmap: cdef np.uint64_t count, vec_i cdef np.uint64_t total_count = 0 cdef bool_array *buf = NULL + cdef ewah_word_type w this_collection = BoolArrayCollection() cdef ewah_bool_array *refined_arr = NULL - print("Appending to the new BoolArrayCollection", coarse_refined_map.size()) - cdef np.uint64_t ncrm = 0 for it1 in coarse_refined_map: - if ncrm % 1000 == 0: - print(ncrm) - ncrm += 1 mi1 = it1.first refined_arr = &this_collection.ewah_coll[0][mi1] this_collection.ewah_keys[0].set(mi1) this_collection.ewah_refn[0].set(mi1) - count = 0 - vec_i = 0 buf = &it1.second - for vec_i in range(max_mi2_elements): - if buf.get(vec_i) > 0: - count += 1 - refined_arr.set(vec_i) - nsub_mi[0] += 1 - total_count += count + for vec_i in range(buf.sizeInBytes() / sizeof(ewah_word_type)): + w = buf.getWord(vec_i) + refined_arr.addWord(w) out_collection = BoolArrayCollection() - print("Logical or-ing") in_collection._logicalor(this_collection, out_collection) - print("Completed") return out_collection @cython.boundscheck(False) @@ -844,6 +838,8 @@ cdef class ParticleBitmap: cdef np.float64_t clip_pos_l[3], clip_pos_r[3], cell_edge_l, cell_edge_r cdef np.uint64_t ex1[3], ex2[3], ex3[3] cdef np.uint64_t xex_max, yex_max, zex_max + cdef np.uint64_t xiex_min, yiex_min, ziex_min + cdef np.uint64_t xiex_max, yiex_max, ziex_max ex1[0] = xex; ex1[1] = yex; ex1[2] = zex # Check a few special cases for i in range(3): @@ -859,23 +855,29 @@ cdef class ParticleBitmap: LE, dds1, dds2, bounds_l) miex2_max = 
bounded_morton_split_relative_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], LE, dds1, dds2, bounds_r) - xex_max = encode_morton_64bit(mi2_max, 0, 0) - yex_max = encode_morton_64bit(0, mi2_max, 0) - zex_max = encode_morton_64bit(0, 0, mi2_max) + xex_max = self.directional_max2[0] + yex_max = self.directional_max2[1] + zex_max = self.directional_max2[2] + xiex_min = miex2_min & xex_max + yiex_min = miex2_min & yex_max + ziex_min = miex2_min & zex_max + xiex_max = miex2_max & xex_max + yiex_max = miex2_max & yex_max + ziex_max = miex2_max & zex_max + # This could *probably* be sped up by iterating over words. for miex2 in range(miex2_min, miex2_max + 1): #miex2 = encode_morton_64bit(xex2, yex2, zex2) #decode_morton_64bit(miex2, ex2) # Let's check all our cases here - if refined_set.get(miex2): continue - if (miex2 & xex_max) < (miex2_min & xex_max): continue - if (miex2 & yex_max) < (miex2_min & yex_max): continue - if (miex2 & zex_max) < (miex2_min & zex_max): continue - if (miex2 & xex_max) > (miex2_max & xex_max): continue - if (miex2 & yex_max) > (miex2_max & yex_max): continue - if (miex2 & zex_max) > (miex2_max & zex_max): continue + if (miex2 & xex_max) < (xiex_min): continue + if (miex2 & xex_max) > (xiex_max): continue + if (miex2 & yex_max) < (yiex_min): continue + if (miex2 & yex_max) > (yiex_max): continue + if (miex2 & zex_max) < (ziex_min): continue + if (miex2 & zex_max) > (ziex_max): continue refined_set.set(miex2) new_nsub += 1 - return new_nsub + return refined_set.numberOfOnes() @cython.boundscheck(False) @cython.wraparound(False) diff --git a/yt/utilities/lib/ewah_bool_array.pxd b/yt/utilities/lib/ewah_bool_array.pxd index 797a681134b..856b6e8d6d7 100644 --- a/yt/utilities/lib/ewah_bool_array.pxd +++ b/yt/utilities/lib/ewah_bool_array.pxd @@ -66,6 +66,7 @@ cdef extern from "ewah.h": void readBuffer(stringstream &incoming, const size_t buffersize) void write(stringstream &out, bint savesizeinbits) void writeBuffer(stringstream &out) + size_t 
addWord(uword newdata) vector[uword] &getBuffer() # const_iterator begin() # const_iterator end() @@ -80,14 +81,18 @@ cdef extern from "boolarray.h": bool get(size_t pos) void reset() size_t sizeInBits() + size_t sizeInBytes() size_t numberOfOnes() void inplace_logicalxor(BoolArray &other) void inplace_logicalnot() size_t padWithZeroes(size_t totalbits) + uword getWord(size_t pos) + size_t wordinbits -ctypedef EWAHBoolArray[np.uint64_t] ewah_bool_array -ctypedef EWAHBoolArraySetBitForwardIterator[np.uint64_t] ewah_bool_iterator +ctypedef np.uint64_t ewah_word_type +ctypedef EWAHBoolArray[ewah_word_type] ewah_bool_array +ctypedef EWAHBoolArraySetBitForwardIterator[ewah_word_type] ewah_bool_iterator ctypedef vector[size_t] bitset_array ctypedef map[np.uint64_t, ewah_bool_array] ewah_map ctypedef stringstream sstream -ctypedef BoolArray[np.uint64_t] bool_array +ctypedef BoolArray[ewah_word_type] bool_array From 5c30b9a6cf8152c5540825ab0b7c32fa57591c14 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 22 May 2020 16:15:42 -0500 Subject: [PATCH 26/42] Fixing a flake8 error --- yt/geometry/particle_geometry_handler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/yt/geometry/particle_geometry_handler.py b/yt/geometry/particle_geometry_handler.py index 6950203dc86..8722ad58e25 100644 --- a/yt/geometry/particle_geometry_handler.py +++ b/yt/geometry/particle_geometry_handler.py @@ -166,8 +166,6 @@ def _initialize_refined_index(self): for d in self.data_files) * 28 sub_mi1 = np.zeros(max_npart, "uint64") sub_mi2 = np.zeros(max_npart, "uint64") - mi1_dds = self.ds.domain_width.max() / (1 << self.regions.index_order1) - mi2_dds = mi1_dds / (1 << self.regions.index_order2) pb = get_pbar("Initializing refined index", len(self.data_files)) mask_threshold = getattr(self, '_index_mask_threshold', 2) count_threshold = getattr(self, '_index_count_threshold', 256) From a66be30c7938ebd7ab4687fccf2c18c18db38ad2 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 22 May 2020 
16:48:49 -0500 Subject: [PATCH 27/42] remove unused unordered_set import --- yt/geometry/particle_oct_container.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 8641b8e677c..96a4756151d 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -38,7 +38,6 @@ from libcpp cimport bool from libcpp.map cimport map from libcpp.vector cimport vector from libcpp.pair cimport pair -from libcpp.unordered_set cimport unordered_set as uset from cython.operator cimport dereference, preincrement import struct import os From 017768bcb79e0ae689934db38447b46e84b336be Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 22 May 2020 20:17:30 -0500 Subject: [PATCH 28/42] Fix testing calls; not working yet. --- yt/geometry/tests/test_particle_octree.py | 26 ++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/yt/geometry/tests/test_particle_octree.py b/yt/geometry/tests/test_particle_octree.py index ab320d2725c..0e193c845b5 100644 --- a/yt/geometry/tests/test_particle_octree.py +++ b/yt/geometry/tests/test_particle_octree.py @@ -133,11 +133,12 @@ def FakeBitmap(npart, nfiles, order1, order2, posgen = yield_fake_decomp(decomp, npart, nfiles, left_edge, right_edge, buff=buff, distrib=distrib) + coll = None for i, (pos, hsml) in enumerate(posgen): - nsub_mi = reg._refined_index_data_file( - pos, hsml, mask, sub_mi1, sub_mi2, i, 0) - reg._set_refined_index_data_file( - sub_mi1, sub_mi2, i, nsub_mi) + nsub_mi, coll = reg._refined_index_data_file( + coll, pos, hsml, mask, sub_mi1, sub_mi2, i, + 0, count_threshold = 1, mask_threshold = 2) + reg.bitmasks.append(i, coll) # Save if file name provided if isinstance(fname, str): reg.save_bitmasks(fname) @@ -175,11 +176,12 @@ def test_bitmap_no_collisions(): sub_mi2 = np.zeros(max_npart, "uint64") posgen = yield_fake_decomp('sliced', npart, nfiles, left_edge, right_edge) + coll = 
None for i, (pos, hsml) in enumerate(posgen): - nsub_mi = reg._refined_index_data_file( - pos, hsml, mask, sub_mi1, sub_mi2, i, 0) - reg._set_refined_index_data_file( - sub_mi1, sub_mi2, i, nsub_mi) + nsub_mi, coll = reg._refined_index_data_file( + coll, pos, hsml, mask, sub_mi1, sub_mi2, i, + 0, count_threshold = 1, mask_threshold = 2) + reg.bitmasks.append(i, coll) assert_equal(reg.count_refined(i), 0) nr, nm = reg.find_collisions_refined() assert_equal(nr, 0, "%d collisions" % nr) @@ -214,10 +216,10 @@ def test_bitmap_collisions(): sub_mi1 = np.zeros(max_npart, "uint64") sub_mi2 = np.zeros(max_npart, "uint64") for i in range(nfiles): - nsub_mi = reg._refined_index_data_file( - pos, hsml, mask, sub_mi1, sub_mi2, i, 0) - reg._set_refined_index_data_file( - sub_mi1, sub_mi2, i, nsub_mi) + nsub_mi, coll = reg._refined_index_data_file( + None, pos, hsml, mask, sub_mi1, sub_mi2, i, + 0, count_threshold = 1, mask_threshold = 2) + reg.bitmasks.append(i, coll) assert_equal(reg.count_refined(i), ncoll) nr, nm = reg.find_collisions_refined() assert_equal(nr, 2**(3*(order1+order2)), "%d collisions" % nr) From d99d87f02a4f2fdf6b93c5dd95ff6c056f092410 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Mon, 25 May 2020 17:07:17 -0500 Subject: [PATCH 29/42] Missed a logic check --- yt/geometry/particle_oct_container.pyx | 3 +++ yt/utilities/lib/ewah_bool_wrap.pyx | 20 +++++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 96a4756151d..8b1db6c3e05 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -682,7 +682,10 @@ cdef class ParticleBitmap: cdef np.uint64_t nset = 0, nfully_enclosed = 0, n_calls = 0 mi1_max = (1 << self.index_order1) - 1 mi2_max = (1 << self.index_order2) - 1 + cdef np.uint64_t max_mi1_elements = 1 << (3*self.index_order1) cdef np.uint64_t max_mi2_elements = 1 << (3*self.index_order2) + for i in 
range(max_mi1_elements): + refined_count[i] = 0 # Copy things from structure (type cast) for i in range(3): LE[i] = self.left_edge[i] diff --git a/yt/utilities/lib/ewah_bool_wrap.pyx b/yt/utilities/lib/ewah_bool_wrap.pyx index 9cfc5cfd2d6..742478ab210 100644 --- a/yt/utilities/lib/ewah_bool_wrap.pyx +++ b/yt/utilities/lib/ewah_bool_wrap.pyx @@ -870,15 +870,15 @@ cdef class BoolArrayCollection: return self._count_coarse() cdef void _logicalor(self, BoolArrayCollection solf, BoolArrayCollection out): - cdef ewah_bool_array *ewah_keys1 = self.ewah_keys - cdef ewah_bool_array *ewah_refn1 = self.ewah_refn - cdef ewahmap *ewah_coll1 = self.ewah_coll - cdef ewah_bool_array *ewah_keys2 = solf.ewah_keys - cdef ewah_bool_array *ewah_refn2 = solf.ewah_refn - cdef ewahmap *ewah_coll2 = solf.ewah_coll - cdef ewah_bool_array *ewah_keys3 = out.ewah_keys - cdef ewah_bool_array *ewah_refn3 = out.ewah_refn - cdef ewahmap *ewah_coll3 = out.ewah_coll + cdef ewah_bool_array *ewah_keys1 = self.ewah_keys + cdef ewah_bool_array *ewah_refn1 = self.ewah_refn + cdef ewahmap *ewah_coll1 = self.ewah_coll + cdef ewah_bool_array *ewah_keys2 = solf.ewah_keys + cdef ewah_bool_array *ewah_refn2 = solf.ewah_refn + cdef ewahmap *ewah_coll2 = solf.ewah_coll + cdef ewah_bool_array *ewah_keys3 = out.ewah_keys + cdef ewah_bool_array *ewah_refn3 = out.ewah_refn + cdef ewahmap *ewah_coll3 = out.ewah_coll cdef ewahmap_it it_map1, it_map2 cdef ewah_bool_array mi1_ewah1, mi1_ewah2 cdef np.uint64_t mi1 @@ -901,6 +901,8 @@ cdef class BoolArrayCollection: if it_map1 != ewah_coll1[0].end(): mi1_ewah1 = dereference(it_map1).second mi1_ewah1.logicalor(mi1_ewah2, ewah_coll3[0][mi1]) + else: + ewah_coll3[0][mi1] = mi1_ewah2 preincrement(it_map2) cdef void _append(self, BoolArrayCollection solf): From c5da9113d52437ece316d433d3dd466950898620 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 26 May 2020 10:40:12 -0500 Subject: [PATCH 30/42] Use bounded_morton_split_dds in coarse indexing --- 
yt/geometry/particle_oct_container.pyx | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 8b1db6c3e05..2236996a463 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -496,8 +496,10 @@ cdef class ParticleBitmap: cdef np.uint64_t mi_split[3] cdef np.float64_t ppos[3] cdef np.float64_t s_ppos[3] # shifted ppos + cdef np.float64_t clip_pos_l[3] + cdef np.float64_t clip_pos_r[3] cdef int skip - cdef np.uint64_t bounds[3][2] + cdef np.uint64_t bounds[2][3] cdef np.uint64_t xex, yex, zex cdef np.float64_t LE[3] cdef np.float64_t RE[3] @@ -564,15 +566,17 @@ cdef class ParticleBitmap: s_ppos[2] = ppos[2] + axiterv[2][zi] # OK, now we compute the left and right edges for this shift. for i in range(3): - # Note that we cast here to int64_t because this could be negative - bounds[i][0] = i64max(((s_ppos[i] - LE[i] - radius)/dds[i]), 0) - bounds[i][1] = i64min(((s_ppos[i] - LE[i] + radius)/dds[i]), mi_max) + 1 + clip_pos_l[i] = fmax(s_ppos[i] - radius, LE[i] + dds[i]/10) + clip_pos_r[i] = fmin(s_ppos[i] + radius, RE[i] - dds[i]/10) + + bounded_morton_split_dds(clip_pos_l[0], clip_pos_l[1], clip_pos_l[2], LE, dds, bounds[0]) + bounded_morton_split_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], LE, dds, bounds[1]) # We go to the upper bound plus one so that we have *inclusive* loops -- the upper bound # is the cell *index*, so we want to make sure we include that cell. This is also why # we don't need to worry about mi_max being the max index rather than the cell count. 
- for xex in range(bounds[0][0], bounds[0][1]): - for yex in range(bounds[1][0], bounds[1][1]): - for zex in range(bounds[2][0], bounds[2][1]): + for xex in range(bounds[0][0], bounds[1][0]): + for yex in range(bounds[0][1], bounds[1][1]): + for zex in range(bounds[0][2], bounds[1][2]): miex = encode_morton_64bit(xex, yex, zex) mask[miex] = 1 particle_counts[miex] += 1 From ac32bb0cb2dd3b486274ecd11ccfd3255759bfe5 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 26 May 2020 11:33:14 -0500 Subject: [PATCH 31/42] Fencepost error --- yt/geometry/particle_oct_container.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 2236996a463..64e8855971d 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -568,15 +568,14 @@ cdef class ParticleBitmap: for i in range(3): clip_pos_l[i] = fmax(s_ppos[i] - radius, LE[i] + dds[i]/10) clip_pos_r[i] = fmin(s_ppos[i] + radius, RE[i] - dds[i]/10) - bounded_morton_split_dds(clip_pos_l[0], clip_pos_l[1], clip_pos_l[2], LE, dds, bounds[0]) bounded_morton_split_dds(clip_pos_r[0], clip_pos_r[1], clip_pos_r[2], LE, dds, bounds[1]) # We go to the upper bound plus one so that we have *inclusive* loops -- the upper bound # is the cell *index*, so we want to make sure we include that cell. This is also why # we don't need to worry about mi_max being the max index rather than the cell count. 
- for xex in range(bounds[0][0], bounds[1][0]): - for yex in range(bounds[0][1], bounds[1][1]): - for zex in range(bounds[0][2], bounds[1][2]): + for xex in range(bounds[0][0], bounds[1][0] + 1): + for yex in range(bounds[0][1], bounds[1][1] + 1): + for zex in range(bounds[0][2], bounds[1][2] + 1): miex = encode_morton_64bit(xex, yex, zex) mask[miex] = 1 particle_counts[miex] += 1 From e8ce92b2e4b252e12c8a616e79c3448139fe03db Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 26 May 2020 12:55:41 -0500 Subject: [PATCH 32/42] Check for None in append() --- yt/utilities/lib/ewah_bool_wrap.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/yt/utilities/lib/ewah_bool_wrap.pyx b/yt/utilities/lib/ewah_bool_wrap.pyx index 742478ab210..1c782f4b878 100644 --- a/yt/utilities/lib/ewah_bool_wrap.pyx +++ b/yt/utilities/lib/ewah_bool_wrap.pyx @@ -322,6 +322,7 @@ cdef class FileBitmasks: return out def append(self, np.uint32_t ifile, BoolArrayCollection solf): + if solf is None: return self._append(ifile, solf) cdef void _append(self, np.uint32_t ifile, BoolArrayCollection solf): From 588b50e527880707b4f4439ab0bce15dfa97c7ee Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 26 May 2020 13:36:21 -0500 Subject: [PATCH 33/42] Changing to uint32_t for Clang --- yt/utilities/lib/ewah_bool_array.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt/utilities/lib/ewah_bool_array.pxd b/yt/utilities/lib/ewah_bool_array.pxd index 856b6e8d6d7..9b55e626d9a 100644 --- a/yt/utilities/lib/ewah_bool_array.pxd +++ b/yt/utilities/lib/ewah_bool_array.pxd @@ -89,7 +89,7 @@ cdef extern from "boolarray.h": uword getWord(size_t pos) size_t wordinbits -ctypedef np.uint64_t ewah_word_type +ctypedef np.uint32_t ewah_word_type ctypedef EWAHBoolArray[ewah_word_type] ewah_bool_array ctypedef EWAHBoolArraySetBitForwardIterator[ewah_word_type] ewah_bool_iterator ctypedef vector[size_t] bitset_array From 6488e10072d54192a6860d79e6fddb941fd65876 Mon Sep 17 00:00:00 2001 From: 
Matthew Turk Date: Tue, 26 May 2020 15:07:29 -0500 Subject: [PATCH 34/42] Try to be more careful with uint/int distinctions. --- yt/geometry/particle_oct_container.pyx | 44 ++++++++++++++------------ yt/utilities/lib/ewah_bool_wrap.pxd | 16 +++++----- yt/utilities/lib/ewah_bool_wrap.pyx | 44 ++++++++++++-------------- 3 files changed, 52 insertions(+), 52 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 64e8855971d..cfef5378d95 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -509,7 +509,7 @@ cdef class ParticleBitmap: cdef np.float64_t radius cdef np.uint8_t[:] mask = self.masks[:, file_id] cdef np.uint64_t[:] particle_counts = self.particle_counts - cdef np.int64_t msize = (1 << (self.index_order1 * 3)) + cdef np.uint64_t msize = (1 << (self.index_order1 * 3)) cdef int axiter[3][2] cdef np.float64_t axiterv[3][2] mi_max = (1 << self.index_order1) - 1 @@ -643,7 +643,8 @@ cdef class ParticleBitmap: np.uint64_t count_threshold, np.uint8_t mask_threshold ): # Initialize - cdef np.int64_t i, p, sorted_ind + cdef np.int64_t p, sorted_ind + cdef np.uint64_t i cdef np.uint64_t mi1, mi2 cdef np.float64_t ppos[3] cdef np.float64_t s_ppos[3] # shifted ppos @@ -961,7 +962,7 @@ cdef class ParticleBitmap: def calcsize_bitmasks(self): # TODO: All cython cdef bytes serial_BAC - cdef int ifile + cdef np.uint64_t ifile cdef int out = 0 out += struct.calcsize('Q') # Bitmaps for each file @@ -983,7 +984,7 @@ cdef class ParticleBitmap: def save_bitmasks(self, fname): cdef bytes serial_BAC - cdef int ifile + cdef np.uint64_t ifile f = open(fname,'wb') # Header f.write(struct.pack('Q', _bitmask_version)) @@ -1057,7 +1058,7 @@ cdef class ParticleBitmap: return read_flag def print_info(self): - cdef int ifile + cdef np.uint64_t ifile for ifile in range(self.nfiles): self.bitmasks.print_info(ifile, "File: %03d" % ifile) @@ -1080,7 +1081,8 @@ cdef class ParticleBitmap: cdef 
vector[size_t] vec_totref cdef vector[size_t].iterator it_mi1 cdef int nm = 0, nc = 0 - cdef int ifile + cdef np.uint64_t ifile, nbitmasks + nbitmasks = len(self.bitmasks) # Locate all indices with second level refinement for ifile in range(self.nfiles): arr = ( self.bitmasks.ewah_refn)[ifile][0] @@ -1092,7 +1094,7 @@ cdef class ParticleBitmap: mi1 = dereference(it_mi1) arr_any.reset() arr_two.reset() - for ifile in range(len(self.bitmasks)): + for ifile in range(nbitmasks): if self.bitmasks._isref(ifile, mi1) == 1: arr = ( self.bitmasks.ewah_coll)[ifile][0][mi1] arr_any.logicaland(arr, arr_two) # Indices in previous files @@ -1249,7 +1251,7 @@ cdef class ParticleBitmap: def mask_to_files(self, BoolArrayCollection mm_s): cdef FileBitmasks mm_d = self.bitmasks - cdef np.int32_t ifile + cdef np.uint32_t ifile cdef np.ndarray[np.uint8_t, ndim=1] file_mask_p file_mask_p = np.zeros(self.nfiles, dtype="uint8") # Compare with mask of particles @@ -1264,7 +1266,7 @@ cdef class ParticleBitmap: def masks_to_files(self, BoolArrayCollection mm_s, BoolArrayCollection mm_g): cdef FileBitmasks mm_d = self.bitmasks - cdef np.int32_t ifile + cdef np.uint32_t ifile cdef np.ndarray[np.uint8_t, ndim=1] file_mask_p cdef np.ndarray[np.uint8_t, ndim=1] file_mask_g file_mask_p = np.zeros(self.nfiles, dtype="uint8") @@ -1327,13 +1329,15 @@ cdef class ParticleBitmap: cdef ewah_bool_array *ewah_base if base_mask is not None: ewah_base = base_mask.ewah_keys + else: + ewah_base = NULL cdef ewah_bool_iterator *iter_set = new ewah_bool_iterator(ewah_slct[0].begin()) cdef ewah_bool_iterator *iter_end = new ewah_bool_iterator(ewah_slct[0].end()) cdef np.ndarray[np.uint8_t, ndim=1] slct_arr slct_arr = np.zeros((1 << (self.index_order1 * 3)),'uint8') while iter_set[0] != iter_end[0]: mi = dereference(iter_set[0]) - if base_mask is not None and ewah_base[0].get(mi) == 0: + if ewah_base != NULL and ewah_base[0].get(mi) == 0: octree._index_base_roots[croot] = 0 slct_arr[mi] = 2 else: @@ -1345,7 +1349,7 
@@ cdef class ParticleBitmap: croot += 1 preincrement(iter_set[0]) assert(croot == nroot) - if base_mask is not None: + if ewah_base != NULL: assert(np.sum(octree._index_base_roots) == ewah_base[0].numberOfOnes()) # Get morton indices for all particles in this file and those # contaminating cells it has majority control of. @@ -1496,7 +1500,7 @@ cdef class ParticleBitmapSelector: rpos[i] = self.DRE[i] - self.bitmap.dds_mi2[i]/2.0 sbbox = self.selector.select_bbox_edge(pos, rpos) if sbbox == 1: - for mi1 in range(self.s1): + for mi1 in range(self.s1): mm_s0._set_coarse(mi1) mm_s0._compress(mm_s) return @@ -1513,7 +1517,7 @@ cdef class ParticleBitmapSelector: def find_files(self, np.ndarray[np.uint8_t, ndim=1] file_mask_p, np.ndarray[np.uint8_t, ndim=1] file_mask_g): - cdef int i + cdef np.uint64_t i cdef np.int32_t level = 0 cdef np.uint64_t mi1 mi1 = ~(0) @@ -1547,7 +1551,7 @@ cdef class ParticleBitmapSelector: @cython.wraparound(False) @cython.cdivision(True) cdef bint is_refined_files(self, np.uint64_t mi1): - cdef int i + cdef np.uint64_t i if self.bitmap.collisions._isref(mi1): # Don't refine if files all selected already for i in range(self.nfiles): @@ -1574,7 +1578,7 @@ cdef class ParticleBitmapSelector: @cython.cdivision(True) @cython.initializedcheck(False) cdef void set_files_coarse(self, np.uint64_t mi1): - cdef int i + cdef np.uint64_t i cdef bint flag_ref = self.is_refined(mi1) # Flag files at coarse level if flag_ref == 0: @@ -1601,7 +1605,7 @@ cdef class ParticleBitmapSelector: @cython.cdivision(True) @cython.initializedcheck(False) cdef void set_files_refined(self, np.uint64_t mi1, np.uint64_t mi2): - cdef int i + cdef np.uint64_t i # Flag files for i in range(self.nfiles): if self.file_mask_p[i] == 0: @@ -1616,14 +1620,14 @@ cdef class ParticleBitmapSelector: @cython.cdivision(True) @cython.initializedcheck(False) cdef void add_neighbors_coarse(self, np.uint64_t mi1): - cdef int m + cdef np.uint64_t m cdef np.uint32_t ntot cdef np.uint64_t mi1_n 
ntot = morton_neighbors_coarse(mi1, self.max_index1, self.periodicity, self.ngz, self.neighbors, self.ind1_n, self.neighbor_list1) - for m in range(ntot): + for m in range(ntot): mi1_n = self.neighbor_list1[m] self.coarse_ghosts_bool[mi1_n] = 1 @@ -1632,14 +1636,14 @@ cdef class ParticleBitmapSelector: @cython.cdivision(True) @cython.initializedcheck(False) cdef void set_files_neighbors_coarse(self, np.uint64_t mi1): - cdef int i, m + cdef np.uint64_t i, m cdef np.uint32_t ntot cdef np.uint64_t mi1_n ntot = morton_neighbors_coarse(mi1, self.max_index1, self.periodicity, self.ngz, self.neighbors, self.ind1_n, self.neighbor_list1) - for m in range(ntot): + for m in range(ntot): mi1_n = self.neighbor_list1[m] for i in range(self.nfiles): if self.file_mask_g[i] == 0: diff --git a/yt/utilities/lib/ewah_bool_wrap.pxd b/yt/utilities/lib/ewah_bool_wrap.pxd index 589d56a028c..4feeaf31e4f 100644 --- a/yt/utilities/lib/ewah_bool_wrap.pxd +++ b/yt/utilities/lib/ewah_bool_wrap.pxd @@ -36,9 +36,9 @@ cdef class FileBitmasks: cdef bint _get_coarse(self, np.uint32_t ifile, np.uint64_t i1) cdef void _get_coarse_array(self, np.uint32_t ifile, np.uint64_t imax, np.uint8_t[:] arr) except * cdef bint _isref(self, np.uint32_t ifile, np.uint64_t i) - cdef int _count_total(self, np.uint32_t ifile) - cdef int _count_refined(self, np.uint32_t ifile) - cdef int _count_coarse(self, np.uint32_t ifile) + cdef np.uint64_t _count_total(self, np.uint32_t ifile) + cdef np.uint64_t _count_refined(self, np.uint32_t ifile) + cdef np.uint64_t _count_coarse(self, np.uint32_t ifile) cdef void _append(self, np.uint32_t ifile, BoolArrayCollection solf) cdef bint _intersects(self, np.uint32_t ifile, BoolArrayCollection solf) cdef void _logicalxor(self, np.uint32_t ifile, BoolArrayCollection solf, BoolArrayCollection out) @@ -72,9 +72,9 @@ cdef class BoolArrayCollection: cdef bint _contains(self, np.uint64_t i) cdef bint _isref(self, np.uint64_t i) cdef void _ewah_coarse(self) - cdef int _count_total(self) - 
cdef int _count_refined(self) - cdef int _count_coarse(self) + cdef np.uint64_t _count_total(self) + cdef np.uint64_t _count_refined(self) + cdef np.uint64_t _count_coarse(self) cdef void _append(self, BoolArrayCollection solf) cdef void _logicalor(self, BoolArrayCollection solf, BoolArrayCollection out) cdef bint _intersects(self, BoolArrayCollection solf) @@ -110,8 +110,8 @@ cdef class BoolArrayCollectionUncompressed: cdef bint _get(self, np.uint64_t i1, np.uint64_t i2=*) cdef bint _get_coarse(self, np.uint64_t i1) cdef bint _isref(self, np.uint64_t i) - cdef int _count_total(self) - cdef int _count_refined(self) + cdef np.uint64_t _count_total(self) + cdef np.uint64_t _count_refined(self) cdef void _append(self, BoolArrayCollectionUncompressed solf) cdef bint _intersects(self, BoolArrayCollectionUncompressed solf) cdef void _compress(self, BoolArrayCollection solf) diff --git a/yt/utilities/lib/ewah_bool_wrap.pyx b/yt/utilities/lib/ewah_bool_wrap.pyx index 1c782f4b878..05cbe86ce4b 100644 --- a/yt/utilities/lib/ewah_bool_wrap.pyx +++ b/yt/utilities/lib/ewah_bool_wrap.pyx @@ -306,19 +306,17 @@ cdef class FileBitmasks: def count_refined(self, ifile): return self._count_refined(ifile) - cdef int _count_coarse(self, np.uint32_t ifile): + cdef np.uint64_t _count_coarse(self, np.uint32_t ifile): return self._count_total(ifile) - self._count_refined(ifile) - cdef int _count_total(self, np.uint32_t ifile): + cdef np.uint64_t _count_total(self, np.uint32_t ifile): cdef ewah_bool_array *ewah_keys = ( self.ewah_keys)[ifile] - cdef int out - out = ewah_keys[0].numberOfOnes() + cdef np.uint64_t out = ewah_keys[0].numberOfOnes() return out - cdef int _count_refined(self, np.uint32_t ifile): + cdef np.uint64_t _count_refined(self, np.uint32_t ifile): cdef ewah_bool_array *ewah_refn = ( self.ewah_refn)[ifile] - cdef int out - out = ewah_refn[0].numberOfOnes() + cdef np.uint64_t out = ewah_refn[0].numberOfOnes() return out def append(self, np.uint32_t ifile, BoolArrayCollection 
solf): @@ -842,29 +840,26 @@ cdef class BoolArrayCollection: def ewah_coarse(self): return self._ewah_coarse() - cdef int _count_total(self): + cdef np.uint64_t _count_total(self): cdef ewah_bool_array *ewah_keys = self.ewah_keys - cdef int out - out = ewah_keys.numberOfOnes() + cdef np.uint64_t out = ewah_keys.numberOfOnes() return out def count_total(self): return self._count_total() - cdef int _count_refined(self): + cdef np.uint64_t _count_refined(self): cdef ewah_bool_array *ewah_refn = self.ewah_refn - cdef int out - out = ewah_refn.numberOfOnes() + cdef np.uint64_t out = ewah_refn.numberOfOnes() return out def count_refined(self): return self._count_refined() - cdef int _count_coarse(self): + cdef np.uint64_t _count_coarse(self): self._ewah_coarse() cdef ewah_bool_array *ewah_coar = self.ewah_coar - cdef int out - out = ewah_coar.numberOfOnes() + cdef np.uint64_t out = ewah_coar.numberOfOnes() return out def count_coarse(self): @@ -1423,18 +1418,18 @@ cdef class BoolArrayCollectionUncompressed: cdef bitarrtype *ewah_refn = self.ewah_refn return ewah_refn[i] - cdef int _count_total(self): + cdef np.uint64_t _count_total(self): cdef bitarrtype *ewah_keys = self.ewah_keys cdef np.uint64_t i - cdef int out = 0 + cdef np.uint64_t out = 0 for i in range(self.nele1): out += ewah_keys[i] return out - cdef int _count_refined(self): + cdef np.uint64_t _count_refined(self): cdef bitarrtype *ewah_refn = self.ewah_refn cdef np.uint64_t i - cdef int out = 0 + cdef np.uint64_t out = 0 for i in range(self.nele1): out += ewah_refn[i] return out @@ -1488,6 +1483,7 @@ cdef class BoolArrayCollectionUncompressed: break if (mi1 < self.nele1): return 0 + mi1 = self.nele1 # This is to get rid of a warning # Intersection at refined level for mi1 in range(self.nele1): if (ewah_refn1[mi1] == 1) and (ewah_refn2[mi1] == 1): @@ -1516,8 +1512,8 @@ cdef class BoolArrayCollectionUncompressed: del ewah_coll def print_info(self, prefix=''): - cdef int nrefn = self._count_refined() - cdef int 
nkeys = self._count_total() + cdef np.uint64_t nrefn = self._count_refined() + cdef np.uint64_t nkeys = self._count_total() print("{}{: 8d} coarse, {: 8d} refined, {: 8d} total".format(prefix, nkeys - nrefn, nrefn, @@ -1657,7 +1653,7 @@ cdef class SparseUnorderedRefinedBitmaskVector: self.total = 0 cdef to_array(self): - cdef int i + cdef np.uint64_t i cdef np.ndarray[np.uint64_t, ndim=2] rv self._remove_duplicates() rv = np.empty((self.entries.size(),2),dtype='uint64') @@ -1730,7 +1726,7 @@ cdef class SparseUnorderedRefinedBitmaskSet: self.entries.clear() cdef to_array(self): - cdef int i + cdef np.uint64_t i cdef np.ndarray[np.uint64_t, ndim=2] rv rv = np.empty((self.entries.size(),2),dtype='uint64') i = 0 From 26a4ed4df1e062d133aaebb9c2e52e6bd05d1182 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 26 May 2020 15:35:38 -0500 Subject: [PATCH 35/42] Explicitly cast to uword --- yt/utilities/lib/ewahboolarray/boolarray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt/utilities/lib/ewahboolarray/boolarray.h b/yt/utilities/lib/ewahboolarray/boolarray.h index 44fdbd6b8a9..fa7da1c1ecf 100644 --- a/yt/utilities/lib/ewahboolarray/boolarray.h +++ b/yt/utilities/lib/ewahboolarray/boolarray.h @@ -322,7 +322,7 @@ class BoolArray { size_t numberOfOnes() const { size_t count = 0; for (size_t i = 0; i < buffer.size(); ++i) { - count += countOnes(buffer[i]); + count += countOnes((uword) buffer[i]); } return count; } From e29599d6e09c318539dd39f8521a360e138fdd22 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Wed, 27 May 2020 10:36:15 -0500 Subject: [PATCH 36/42] Update EWAH to 88b25a3345b82353ccd97a7de6064e6c179a7cc2 --- yt/utilities/lib/ewahboolarray/README | 2 +- yt/utilities/lib/ewahboolarray/boolarray.h | 823 ++-- yt/utilities/lib/ewahboolarray/ewah.h | 3832 +++++++++-------- yt/utilities/lib/ewahboolarray/ewahutil.h | 262 +- .../lib/ewahboolarray/runninglengthword.h | 950 ++-- 5 files changed, 3186 insertions(+), 2683 deletions(-) diff --git 
a/yt/utilities/lib/ewahboolarray/README b/yt/utilities/lib/ewahboolarray/README index 7f8803852bc..b86d316c9ad 100644 --- a/yt/utilities/lib/ewahboolarray/README +++ b/yt/utilities/lib/ewahboolarray/README @@ -3,6 +3,6 @@ available at: https://github.com/lemire/EWAHBoolArray -Currently this is at revision 80881379f8a582f45dda1be9edfc84d244846427. +Currently this is at revision 88b25a3345b82353ccd97a7de6064e6c179a7cc2 This code is available under the Apache2.0 license. diff --git a/yt/utilities/lib/ewahboolarray/boolarray.h b/yt/utilities/lib/ewahboolarray/boolarray.h index fa7da1c1ecf..4a607adf7d4 100644 --- a/yt/utilities/lib/ewahboolarray/boolarray.h +++ b/yt/utilities/lib/ewahboolarray/boolarray.h @@ -15,423 +15,474 @@ #include #include -using namespace std; - - // uncomment this for debugging //#define EWAHASSERT /** * A dynamic bitset implementation. (without compression). */ -template -class BoolArray { +template class BoolArray { public: - BoolArray(const size_t n, const uword initval = 0) : - buffer(n / wordinbits + (n % wordinbits == 0 ? 0 : 1), initval), - sizeinbits(n) { - } - - BoolArray() : - buffer(), sizeinbits(0) { - } - - BoolArray(const BoolArray & ba) : - buffer(ba.buffer), sizeinbits(ba.sizeinbits) { - } - static BoolArray bitmapOf(size_t n, ...) { - BoolArray ans; - va_list vl; - va_start(vl, n); - for (size_t i = 0; i < n; i++) { - ans.set(static_cast(va_arg(vl, int))); - } - va_end(vl); - return ans; - } - size_t sizeInBytes() const { - return buffer.size() * sizeof(uword); - } - - void read(istream & in) { - sizeinbits = 0; - in.read(reinterpret_cast (&sizeinbits), sizeof(sizeinbits)); - buffer.resize( - sizeinbits / wordinbits - + (sizeinbits % wordinbits == 0 ? 
0 : 1)); - if(buffer.size() == 0) return; - in.read(reinterpret_cast (&buffer[0]), - static_cast(buffer.size() * sizeof(uword))); - } - - void readBuffer(istream & in, const size_t size) { - buffer.resize(size); - sizeinbits = size * sizeof(uword) * 8; - if(buffer.empty()) return; - in.read(reinterpret_cast (&buffer[0]), - buffer.size() * sizeof(uword)); - } - - void setSizeInBits(const size_t sizeib) { - sizeinbits = sizeib; - } - - void write(ostream & out) { - write(out, sizeinbits); - } - - void write(ostream & out, const size_t numberofbits) const { - const size_t size = numberofbits / wordinbits + (numberofbits - % wordinbits == 0 ? 0 : 1); - out.write(reinterpret_cast (&numberofbits), - sizeof(numberofbits)); - if(numberofbits == 0) return; - out.write(reinterpret_cast (&buffer[0]), - static_cast(size * sizeof(uword))); - } - - void writeBuffer(ostream & out, const size_t numberofbits) const { - const size_t size = numberofbits / wordinbits + (numberofbits - % wordinbits == 0 ? 0 : 1); - if(size == 0) return; + BoolArray(const size_t n, const uword initval = 0) + : buffer(n / wordinbits + (n % wordinbits == 0 ? 0 : 1), initval), + sizeinbits(n) {} + + BoolArray() : buffer(), sizeinbits(0) {} + + BoolArray(const BoolArray &ba) + : buffer(ba.buffer), sizeinbits(ba.sizeinbits) {} + static BoolArray bitmapOf(size_t n, ...) { + BoolArray ans; + va_list vl; + va_start(vl, n); + for (size_t i = 0; i < n; i++) { + ans.set(static_cast(va_arg(vl, int))); + } + va_end(vl); + return ans; + } + size_t sizeInBytes() const { return buffer.size() * sizeof(uword); } + + void read(std::istream &in) { + sizeinbits = 0; + in.read(reinterpret_cast(&sizeinbits), sizeof(sizeinbits)); + buffer.resize(sizeinbits / wordinbits + + (sizeinbits % wordinbits == 0 ? 
0 : 1)); + if (buffer.size() == 0) + return; + in.read(reinterpret_cast(&buffer[0]), + static_cast(buffer.size() * sizeof(uword))); + } + + void readBuffer(std::istream &in, const size_t size) { + buffer.resize(size); + sizeinbits = size * sizeof(uword) * 8; + if (buffer.empty()) + return; + in.read(reinterpret_cast(&buffer[0]), + buffer.size() * sizeof(uword)); + } + + void setSizeInBits(const size_t sizeib) { sizeinbits = sizeib; } + + void write(std::ostream &out) { write(out, sizeinbits); } + + void write(std::ostream &out, const size_t numberofbits) const { + const size_t size = + numberofbits / wordinbits + (numberofbits % wordinbits == 0 ? 0 : 1); + out.write(reinterpret_cast(&numberofbits), + sizeof(numberofbits)); + if (numberofbits == 0) + return; + out.write(reinterpret_cast(&buffer[0]), + static_cast(size * sizeof(uword))); + } + + void writeBuffer(std::ostream &out, const size_t numberofbits) const { + const size_t size = + numberofbits / wordinbits + (numberofbits % wordinbits == 0 ? 0 : 1); + if (size == 0) + return; #ifdef EWAHASSERT - assert(buffer.size() >= size); + assert(buffer.size() >= size); #endif - out.write(reinterpret_cast (&buffer[0]), - size * sizeof(uword)); - } - - size_t sizeOnDisk() const { - size_t size = sizeinbits / wordinbits - + (sizeinbits % wordinbits == 0 ? 
0 : 1); - return sizeof(sizeinbits) + size * sizeof(uword); - } - - BoolArray& operator=(const BoolArray & x) { - this->buffer = x.buffer; - this->sizeinbits = x.sizeinbits; - return *this; - } - - bool operator==(const BoolArray & x) const { - if (sizeinbits != x.sizeinbits) - return false; - for (size_t k = 0; k < buffer.size(); ++k) - if (buffer[k] != x.buffer[k]) - return false; - return true; - } - - bool operator!=(const BoolArray & x) const { - return !operator==(x); - } - - void setWord(const size_t pos, const uword val) { + out.write(reinterpret_cast(&buffer[0]), size * sizeof(uword)); + } + + size_t sizeOnDisk() const { + size_t size = + sizeinbits / wordinbits + (sizeinbits % wordinbits == 0 ? 0 : 1); + return sizeof(sizeinbits) + size * sizeof(uword); + } + + BoolArray &operator=(const BoolArray &x) { + this->buffer = x.buffer; + this->sizeinbits = x.sizeinbits; + return *this; + } + + bool operator==(const BoolArray &x) const { + if (sizeinbits != x.sizeinbits) + return false; + for (size_t k = 0; k < buffer.size(); ++k) + if (buffer[k] != x.buffer[k]) + return false; + return true; + } + + bool operator!=(const BoolArray &x) const { return !operator==(x); } + + void setWord(const size_t pos, const uword val) { #ifdef EWAHASSERT - assert(pos < buffer.size()); + assert(pos < buffer.size()); #endif - buffer[pos] = val; - } + buffer[pos] = val; + } - void addWord(const uword val) { - if (sizeinbits % wordinbits != 0) - throw invalid_argument("you probably didn't want to do this"); - sizeinbits += wordinbits; - buffer.push_back(val); - } + void addWord(const uword val) { + if (sizeinbits % wordinbits != 0) + throw std::invalid_argument("you probably didn't want to do this"); + sizeinbits += wordinbits; + buffer.push_back(val); + } - uword getWord(const size_t pos) const { + uword getWord(const size_t pos) const { #ifdef EWAHASSERT - assert(pos < buffer.size()); + assert(pos < buffer.size()); #endif - return buffer[pos]; - } + return buffer[pos]; + } - /** 
- * set to true (whether it was already set to true or not) - * - * This is an expensive (random access) API, you really ought to - * prepare a new word and then append it. - */ - void set(const size_t pos) { - if(pos >= sizeinbits) padWithZeroes(pos+1); - buffer[pos / wordinbits] |= (static_cast (1) << (pos - % wordinbits)); - } - - /** - * set to false (whether it was already set to false or not) - * - * This is an expensive (random access) API, you really ought to - * prepare a new word and then append it. - */ - void unset(const size_t pos) { - if(pos < sizeinbits) - buffer[pos / wordinbits] |= ~(static_cast (1) << (pos - % wordinbits)); - } - - /** - * true of false? (set or unset) - */ - bool get(const size_t pos) const { + /** + * set to true (whether it was already set to true or not) + */ + void set(const size_t pos) { + if (pos >= sizeinbits) + padWithZeroes(pos + 1); + buffer[pos / wordinbits] |= (static_cast(1) << (pos % wordinbits)); + } + + /** + * set to false (whether it was already set to false or not) + * + */ + void unset(const size_t pos) { + if (pos < sizeinbits) + buffer[pos / wordinbits] &= + ~(static_cast(1) << (pos % wordinbits)); + } + + /** + * true of false? (set or unset) + */ + bool get(const size_t pos) const { #ifdef EWAHASSERT - assert(pos / wordinbits < buffer.size()); + assert(pos / wordinbits < buffer.size()); #endif - return (buffer[pos / wordinbits] & (static_cast (1) << (pos - % wordinbits))) != 0; - } - - - - /** - * set all bits to 0 - */ - void reset() { - if(buffer.size() > 0) memset(&buffer[0], 0, sizeof(uword) * buffer.size()); - sizeinbits = 0; - } - - size_t sizeInBits() const { - return sizeinbits; - } - - ~BoolArray() { - } - - /** - * Computes the logical and and writes to the provided BoolArray (out). - * The current bitmaps is unchanged. 
- */ - void logicaland(const BoolArray & ba, BoolArray & out) { - if(ba.buffer.size() < buffer.size()) - out.setToSize(ba); - else - out.setToSize(*this); - for (size_t i = 0; i < out.buffer.size(); ++i) - out.buffer[i] = buffer[i] & ba.buffer[i]; - } - - void inplace_logicaland(const BoolArray & ba) { - if(ba.buffer.size() < buffer.size()) - setToSize(ba); - for (size_t i = 0; i < buffer.size(); ++i) - buffer[i] = buffer[i] & ba.buffer[i]; - } - - /** - * Computes the logical andnot and writes to the provided BoolArray (out). - * The current bitmaps is unchanged. - */ - void logicalandnot(const BoolArray & ba, BoolArray & out) { - out.setToSize(*this); - size_t upto = out.buffer.size() < ba.buffer.size() ? out.buffer.size() : ba.buffer.size(); - for (size_t i = 0; i < upto; ++i) - out.buffer[i] = buffer[i] & (~ba.buffer[i]); - for (size_t i = upto; i < out.buffer.size(); ++i) - out.buffer[i] = buffer[i]; - out.clearBogusBits(); - } - - void inplace_logicalandnot(const BoolArray & ba) { - size_t upto = buffer.size() < ba.buffer.size() ? buffer.size() : ba.buffer.size(); - for (size_t i = 0; i < upto; ++i) - buffer[i] = buffer[i] & (~ba.buffer[i]); - clearBogusBits(); - } - - /** - * Computes the logical or and writes to the provided BoolArray (out). - * The current bitmaps is unchanged. - */ - void logicalor(const BoolArray & ba, BoolArray & out) { - const BoolArray * smallest; - const BoolArray * largest; - if(ba.buffer.size() > buffer.size()) { - smallest = this; - largest = &ba; - out.setToSize(ba); - } else { - smallest = &ba; - largest = this; - out.setToSize(*this); - } - for (size_t i = 0; i < smallest->buffer.size(); ++i) - out.buffer[i] = buffer[i] | ba.buffer[i]; - for (size_t i = smallest->buffer.size(); i < largest->buffer.size(); ++i) - out.buffer[i] = largest->buffer[i]; - } - - - void inplace_logicalor(const BoolArray & ba) { - logicalor(ba,*this); - } - - /** - * Computes the logical xor and writes to the provided BoolArray (out). 
- * The current bitmaps is unchanged. - */ - void logicalxor(const BoolArray & ba, BoolArray & out) { - const BoolArray * smallest; - const BoolArray * largest; - if(ba.buffer.size() > buffer.size()) { - smallest = this; - largest = &ba; - out.setToSize(ba); - } else { - smallest = &ba; - largest = this; - out.setToSize(*this); - } - for (size_t i = 0; i < smallest->buffer.size(); ++i) - out.buffer[i] = buffer[i] ^ ba.buffer[i]; - for (size_t i = smallest->buffer.size(); i < largest->buffer.size(); ++i) - out.buffer[i] = largest->buffer[i]; - } - - void inplace_logicalxor(const BoolArray & ba) { - logicalxor(ba,*this); - } - - /** - * Computes the logical not and writes to the provided BoolArray (out). - * The current bitmaps is unchanged. - */ - void logicalnot(BoolArray & out) { - out.setToSize(*this); - for (size_t i = 0; i < buffer.size(); ++i) - out.buffer[i] = ~buffer[i]; - out.clearBogusBits(); - } - + return (buffer[pos / wordinbits] & + (static_cast(1) << (pos % wordinbits))) != 0; + } - void inplace_logicalnot() { - for (size_t i = 0; i < buffer.size(); ++i) - buffer[i] = ~buffer[i]; - clearBogusBits(); - } + /** + * set all bits to 0 + */ + void reset() { + if (buffer.size() > 0) + memset(&buffer[0], 0, sizeof(uword) * buffer.size()); + sizeinbits = 0; + } + size_t sizeInBits() const { return sizeinbits; } - /** - * Returns the number of bits set to the value 1. - * The running time complexity is proportional to the - * size of the bitmap. - * - * This is sometimes called the cardinality. - */ - size_t numberOfOnes() const { - size_t count = 0; - for (size_t i = 0; i < buffer.size(); ++i) { - count += countOnes((uword) buffer[i]); - } - return count; - } + ~BoolArray() {} - inline void printout(ostream &o = cout) { - for (size_t k = 0; k < sizeinbits; ++k) - o << get(k) << " "; - o << endl; - } - - /** - * Make sure the two bitmaps have the same size (padding with zeroes - * if necessary). It has constant running time complexity. 
- */ - void makeSameSize(BoolArray & a) { - if (a.sizeinbits < sizeinbits) - a.padWithZeroes(sizeinbits); - else if (sizeinbits < a.sizeinbits) - padWithZeroes(a.sizeinbits); - } - /** - * Make sure the current bitmap has the size of the provided bitmap. + /** + * Computes the logical and and writes to the provided BoolArray (out). + * The current bitmaps is unchanged. + */ + void logicaland(const BoolArray &ba, BoolArray &out) const { + if (ba.buffer.size() < buffer.size()) + out.setToSize(ba); + else + out.setToSize(*this); + for (size_t i = 0; i < out.buffer.size(); ++i) + out.buffer[i] = buffer[i] & ba.buffer[i]; + } + + /** + * Computes the logical and and return the result. + * The current bitmaps is unchanged. + */ + BoolArray logicaland(const BoolArray &a) const { + BoolArray answer; + logicaland(a, answer); + return answer; + } + + void inplace_logicaland(const BoolArray &ba) { + if (ba.buffer.size() < buffer.size()) + setToSize(ba); + for (size_t i = 0; i < buffer.size(); ++i) + buffer[i] = buffer[i] & ba.buffer[i]; + } + + /** + * Computes the logical andnot and writes to the provided BoolArray (out). + * The current bitmaps is unchanged. + */ + void logicalandnot(const BoolArray &ba, BoolArray &out) const { + out.setToSize(*this); + size_t upto = out.buffer.size() < ba.buffer.size() ? out.buffer.size() + : ba.buffer.size(); + for (size_t i = 0; i < upto; ++i) + out.buffer[i] = buffer[i] & (~ba.buffer[i]); + for (size_t i = upto; i < out.buffer.size(); ++i) + out.buffer[i] = buffer[i]; + out.clearBogusBits(); + } + + /** + * Computes the logical andnot and return the result. + * The current bitmaps is unchanged. + */ + BoolArray logicalandnot(const BoolArray &a) const { + BoolArray answer; + logicalandnot(a, answer); + return answer; + } + + void inplace_logicalandnot(const BoolArray &ba) { + size_t upto = + buffer.size() < ba.buffer.size() ? 
buffer.size() : ba.buffer.size(); + for (size_t i = 0; i < upto; ++i) + buffer[i] = buffer[i] & (~ba.buffer[i]); + clearBogusBits(); + } + + /** + * Computes the logical or and writes to the provided BoolArray (out). + * The current bitmaps is unchanged. + */ + void logicalor(const BoolArray &ba, BoolArray &out) const { + const BoolArray *smallest; + const BoolArray *largest; + if (ba.buffer.size() > buffer.size()) { + smallest = this; + largest = &ba; + out.setToSize(ba); + } else { + smallest = &ba; + largest = this; + out.setToSize(*this); + } + for (size_t i = 0; i < smallest->buffer.size(); ++i) + out.buffer[i] = buffer[i] | ba.buffer[i]; + for (size_t i = smallest->buffer.size(); i < largest->buffer.size(); ++i) + out.buffer[i] = largest->buffer[i]; + } + + /** + * Computes the logical or and return the result. + * The current bitmaps is unchanged. + */ + BoolArray logicalor(const BoolArray &a) const { + BoolArray answer; + logicalor(a, answer); + return answer; + } + + void inplace_logicalor(const BoolArray &ba) { logicalor(ba, *this); } + + /** + * Computes the logical xor and writes to the provided BoolArray (out). + * The current bitmaps is unchanged. + */ + void logicalxor(const BoolArray &ba, BoolArray &out) const { + const BoolArray *smallest; + const BoolArray *largest; + if (ba.buffer.size() > buffer.size()) { + smallest = this; + largest = &ba; + out.setToSize(ba); + } else { + smallest = &ba; + largest = this; + out.setToSize(*this); + } + for (size_t i = 0; i < smallest->buffer.size(); ++i) + out.buffer[i] = buffer[i] ^ ba.buffer[i]; + for (size_t i = smallest->buffer.size(); i < largest->buffer.size(); ++i) + out.buffer[i] = largest->buffer[i]; + } + + /** + * Computes the logical xor and return the result. + * The current bitmaps is unchanged. 
+ */ + BoolArray logicalxor(const BoolArray &a) const { + BoolArray answer; + logicalxor(a, answer); + return answer; + } + + void inplace_logicalxor(const BoolArray &ba) { logicalxor(ba, *this); } + + /** + * Computes the logical not and writes to the provided BoolArray (out). + * The current bitmaps is unchanged. + */ + void logicalnot(BoolArray &out) const { + out.setToSize(*this); + for (size_t i = 0; i < buffer.size(); ++i) + out.buffer[i] = ~buffer[i]; + out.clearBogusBits(); + } + + /** + * Computes the logical not and return the result. + * The current bitmaps is unchanged. + */ + BoolArray logicalandnot() const { + BoolArray answer; + logicalnot(answer); + return answer; + } + + void inplace_logicalnot() { + for (size_t i = 0; i < buffer.size(); ++i) + buffer[i] = ~buffer[i]; + clearBogusBits(); + } + + /** + * Returns the number of bits set to the value 1. + * The running time complexity is proportional to the + * size of the bitmap. + * + * This is sometimes called the cardinality. */ - void setToSize(const BoolArray & a) { - sizeinbits = a.sizeinbits; - buffer.resize(a.buffer.size()); - } - - /** - * make sure the size of the array is totalbits bits by padding with zeroes. - * returns the number of words added (storage cost increase) - */ - size_t padWithZeroes(const size_t totalbits) { - size_t currentwordsize = (sizeinbits + wordinbits - 1) / wordinbits; - size_t neededwordsize = (totalbits + wordinbits - 1) / wordinbits; + size_t numberOfOnes() const { + size_t count = 0; + for (size_t i = 0; i < buffer.size(); ++i) { + count += countOnes(buffer[i]); + } + return count; + } + + inline void printout(std::ostream &o = std::cout) { + for (size_t k = 0; k < sizeinbits; ++k) + o << get(k) << " "; + o << std::endl; + } + + /** + * Make sure the two bitmaps have the same size (padding with zeroes + * if necessary). It has constant running time complexity. 
+ */ + void makeSameSize(BoolArray &a) { + if (a.sizeinbits < sizeinbits) + a.padWithZeroes(sizeinbits); + else if (sizeinbits < a.sizeinbits) + padWithZeroes(a.sizeinbits); + } + /** + * Make sure the current bitmap has the size of the provided bitmap. + */ + void setToSize(const BoolArray &a) { + sizeinbits = a.sizeinbits; + buffer.resize(a.buffer.size()); + } + + /** + * make sure the size of the array is totalbits bits by padding with zeroes. + * returns the number of words added (storage cost increase) + */ + size_t padWithZeroes(const size_t totalbits) { + size_t currentwordsize = (sizeinbits + wordinbits - 1) / wordinbits; + size_t neededwordsize = (totalbits + wordinbits - 1) / wordinbits; #ifdef EWAHASSERT - assert(neededwordsize >= currentwordsize); + assert(neededwordsize >= currentwordsize); #endif - buffer.resize(neededwordsize); - sizeinbits = totalbits; - return static_cast(neededwordsize - currentwordsize); - - } - - void append(const BoolArray & a); - - enum { - wordinbits = sizeof(uword) * 8 - }; - - vector toArray() const { - vector ans; - for (size_t k = 0; k < buffer.size(); ++k) { - uword myword = buffer[k]; - while (myword != 0) { - uint32_t ntz = numberOfTrailingZeros (myword); - ans.push_back(sizeof(uword) * 8 * k + ntz); - myword ^= (static_cast(1) << ntz); - } - } - return ans; - } - - /** - * Transform into a string that presents a list of set bits. - * The running time is linear in the size of the bitmap. 
- */ - operator string() const { - stringstream ss; - ss << *this; - return ss.str(); - - } + buffer.resize(neededwordsize); + sizeinbits = totalbits; + return static_cast(neededwordsize - currentwordsize); + } + + void append(const BoolArray &a); + + enum { wordinbits = sizeof(uword) * 8 }; + + std::vector toArray() const { + std::vector ans; + for (size_t k = 0; k < buffer.size(); ++k) { + uword myword = buffer[k]; + while (myword != 0) { + uint32_t ntz = numberOfTrailingZeros(myword); + ans.push_back(sizeof(uword) * 8 * k + ntz); + myword ^= (static_cast(1) << ntz); + } + } + return ans; + } + + /** + * Transform into a string that presents a list of set bits. + * The running time is linear in the size of the bitmap. + */ + operator std::string() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + friend std::ostream &operator<<(std::ostream &out, const BoolArray &a) { + std::vector v = a.toArray(); + out << "{"; + for (std::vector::const_iterator i = v.begin(); i != v.end();) { + out << *i; + ++i; + if (i != v.end()) + out << ","; + } + out << "}"; + return out; + + return (out << static_cast(a)); + } - friend ostream& operator<< (ostream &out, const BoolArray &a) { - vector v = a.toArray(); - out <<"{"; - for (vector::const_iterator i = v.begin(); i != v.end(); ) { - out << *i; - ++i; - if( i != v.end()) - out << ","; - } - out <<"}"; - return out; - - return (out << static_cast(a)); - } private: - - void clearBogusBits() { - if((sizeinbits % wordinbits) != 0) { - const uword maskbogus = (static_cast(1) << (sizeinbits % wordinbits)) - 1; - buffer[buffer.size() - 1] &= maskbogus; - } + void clearBogusBits() { + if ((sizeinbits % wordinbits) != 0) { + const uword maskbogus = + (static_cast(1) << (sizeinbits % wordinbits)) - 1; + buffer[buffer.size() - 1] &= maskbogus; } + } - vector buffer; - size_t sizeinbits; + std::vector buffer; + size_t sizeinbits; }; -template -void BoolArray::append(const BoolArray & a) { - if (sizeinbits % 
wordinbits == 0) { - buffer.insert(buffer.end(), a.buffer.begin(), a.buffer.end()); - } else { - throw invalid_argument("Cannot append if parent does not meet boundary"); - } - sizeinbits += a.sizeinbits; +/** + * computes the logical or (union) between "n" bitmaps (referenced by a + * pointer). + * The answer gets written out in container. This might be faster than calling + * logicalor n-1 times. + */ +template +void fast_logicalor_tocontainer(size_t n, const BoolArray **inputs, + BoolArray &container) { + if (n == 0) { + container.reset(); + return; + } + container = *inputs[0]; + for (size_t i = 0; i < n; i++) { + container.inplace_logicalor(*inputs[i]); + } +} + +/** + * computes the logical or (union) between "n" bitmaps (referenced by a + * pointer). + * Returns the answer. This might be faster than calling + * logicalor n-1 times. + */ +template +BoolArray fast_logicalor(size_t n, const BoolArray **inputs) { + BoolArray answer; + fast_logicalor_tocontainer(n, inputs, answer); + return answer; +} + +template void BoolArray::append(const BoolArray &a) { + if (sizeinbits % wordinbits == 0) { + buffer.insert(buffer.end(), a.buffer.begin(), a.buffer.end()); + } else { + throw std::invalid_argument( + "Cannot append if parent does not meet boundary"); + } + sizeinbits += a.sizeinbits; } #endif diff --git a/yt/utilities/lib/ewahboolarray/ewah.h b/yt/utilities/lib/ewahboolarray/ewah.h index 96d780cf3cd..2f733cc0bf2 100644 --- a/yt/utilities/lib/ewahboolarray/ewah.h +++ b/yt/utilities/lib/ewahboolarray/ewah.h @@ -3,6 +3,7 @@ * Apache License Version 2.0 http://www.apache.org/licenses/. * * (c) Daniel Lemire, http://lemire.me/en/ + * with contributions from Zarian Waheed and others. 
*/ #ifndef EWAH_H @@ -10,24 +11,20 @@ #include #include +#include + #include "ewahutil.h" #include "boolarray.h" #include "runninglengthword.h" -using namespace std; - -template -class EWAHBoolArrayIterator; +template class EWAHBoolArrayIterator; -template -class EWAHBoolArraySetBitForwardIterator; +template class EWAHBoolArraySetBitForwardIterator; class BitmapStatistics; -template -class EWAHBoolArrayRawIterator; - +template class EWAHBoolArrayRawIterator; /** * This class is a compressed bitmap. @@ -35,739 +32,916 @@ class EWAHBoolArrayRawIterator; * happens. * The underlying data structure is an STL vector. */ -template -class EWAHBoolArray { +template class EWAHBoolArray { public: - EWAHBoolArray() : - buffer(1, 0), sizeinbits(0), lastRLW(0) { - } - - static EWAHBoolArray bitmapOf(size_t n, ...) { - EWAHBoolArray ans; - va_list vl; - va_start(vl, n); - for (size_t i = 0; i < n; i++) { - ans.set(static_cast(va_arg(vl, int))); - } - va_end(vl); - return ans; - } - - /** - * Query the value of bit i. This runs in time proportional to - * the size of the bitmap. This is not meant to be use in - * a performance-sensitive context. - * - * (This implementation is based on zhenjl's Go version of JavaEWAH.) - * - */ - bool get(const size_t pos) const { - if ( pos >= static_cast(sizeinbits) ) - return false; - const size_t wordpos = pos / wordinbits; - size_t WordChecked = 0; - EWAHBoolArrayRawIterator j = raw_iterator(); - while(j.hasNext()) { - BufferedRunningLengthWord & rle = j.next(); - WordChecked += static_cast( rle.getRunningLength()); - if(wordpos < WordChecked) - return rle.getRunningBit(); - if(wordpos < WordChecked + rle.getNumberOfLiteralWords() ) { - const uword w = j.dirtyWords()[wordpos - WordChecked]; - return (w & (static_cast(1) << (pos % wordinbits))) != 0; - } - WordChecked += static_cast(rle.getNumberOfLiteralWords()); - } - return false; - } - - - /** - * Set the ith bit to true (starting at zero). - * Auto-expands the bitmap. 
It has constant running time complexity. - * Note that you must set the bits in increasing order: - * set(1), set(2) is ok; set(2), set(1) is not ok. - * set(100), set(100) is also not ok. - * - * Note: by design EWAH is not an updatable data structure in - * the sense that once bit 1000 is set, you cannot change the value - * of bits 0 to 1000. - * - * Returns true if the value of the bit was changed, and false otherwise. - * (In practice, if you set the bits in strictly increasing order, it - * should always return true.) - */ - bool set(size_t i); - - /** - * Transform into a string that presents a list of set bits. - * The running time is linear in the compressed size of the bitmap. - */ - operator string() const { - stringstream ss; - ss << *this; - return ss.str(); - } - friend ostream& operator<< (ostream &out, const EWAHBoolArray &a) { - - out<<"{"; - for (EWAHBoolArray::const_iterator i = a.begin(); i != a.end(); ) { - out<<*i; - ++i; - if( i != a.end()) - out << ","; - } - out <<"}"; - - return out; - } - /** - * Make sure the two bitmaps have the same size (padding with zeroes - * if necessary). It has constant running time complexity. - * - * This is useful when calling "logicalnot" functions. - * - * This can an adverse effect of performance, especially when computing - * intersections. - */ - void makeSameSize(EWAHBoolArray & a) { - if (a.sizeinbits < sizeinbits) - a.padWithZeroes(sizeinbits); - else if (sizeinbits < a.sizeinbits) - padWithZeroes(a.sizeinbits); - } - - enum { - RESERVEMEMORY = true - }; // for speed - - typedef EWAHBoolArraySetBitForwardIterator const_iterator; - - /** - * Returns an iterator that can be used to access the position of the - * set bits. The running time complexity of a full scan is proportional to the number - * of set bits: be aware that if you have long strings of 1s, this can be - * very inefficient. - * - * It can be much faster to use the toArray method if you want to - * retrieve the set bits. 
- */ - const_iterator begin() const { - return EWAHBoolArraySetBitForwardIterator (buffer); - } + EWAHBoolArray() : buffer(1, 0), sizeinbits(0), lastRLW(0) {} - /** - * Basically a bogus iterator that can be used together with begin() - * for constructions such as for(EWAHBoolArray::iterator i = b.begin(); i!=b.end(); ++i) {} - */ - const_iterator end() const { - return EWAHBoolArraySetBitForwardIterator (buffer, buffer.size()); + static EWAHBoolArray bitmapOf(size_t n, ...) { + EWAHBoolArray ans; + va_list vl; + va_start(vl, n); + for (size_t i = 0; i < n; i++) { + ans.set(static_cast(va_arg(vl, int))); } - - /** - * Retrieve the set bits. Can be much faster than iterating through - * the set bits with an iterator. - */ - vector toArray() const; - - /** - * computes the logical and with another compressed bitmap - * answer goes into container - * Running time complexity is proportional to the sum of the compressed - * bitmap sizes. - */ - void logicaland(const EWAHBoolArray &a, EWAHBoolArray &container) const; - - /** - * tests whether the bitmaps "intersect" (have at least one 1-bit at the same - * position). This function does not modify the existing bitmaps. - * It is faster than calling logicaland. - */ - bool intersects(const EWAHBoolArray &a) const; - - /** - * computes the logical or with another compressed bitmap - * answer goes into container - * Running time complexity is proportional to the sum of the compressed - * bitmap sizes. - */ - void logicalor(const EWAHBoolArray &a, EWAHBoolArray &container) const; - - - /** - * computes the logical xor with another compressed bitmap - * answer goes into container - * Running time complexity is proportional to the sum of the compressed - * bitmap sizes. - */ - void logicalxor(const EWAHBoolArray &a, EWAHBoolArray &container) const; - - /** - * clear the content of the bitmap. It does not - * release the memory. 
- */ - void reset() { - buffer.clear(); - buffer.push_back(0); - sizeinbits = 0; - lastRLW = 0; - } - - /** - * convenience method. - * - * returns the number of words added (storage cost increase) - */ - inline size_t addWord(const uword newdata, - const uint32_t bitsthatmatter = 8 * sizeof(uword)); - - inline void printout(ostream &o = cout) { - toBoolArray().printout(o); - } - - /** - * Prints a verbose description of the content of the compressed bitmap. - */ - void debugprintout() const; - - /** - * Return the size in bits of this bitmap (this refers - * to the uncompressed size in bits). - * - * You can increase it with padWithZeroes() - */ - inline size_t sizeInBits() const { - return sizeinbits; - } - - - /** - * Return the size of the buffer in bytes. This - * is equivalent to the storage cost, minus some overhead. - */ - inline size_t sizeInBytes() const { - return buffer.size() * sizeof(uword); - } - - /** - * same as addEmptyWord, but you can do several in one shot! - * returns the number of words added (storage cost increase) - */ - size_t addStreamOfEmptyWords(const bool v, size_t number); - - /** - * add a stream of dirty words, returns the number of words added - * (storage cost increase) - */ - size_t addStreamOfDirtyWords(const uword * v, const size_t number); - - /** - * add a stream of dirty words, each one negated, returns the number of words added - * (storage cost increase) - */ - size_t addStreamOfNegatedDirtyWords(const uword * v, const size_t number); - - - /** - * make sure the size of the array is totalbits bits by padding with zeroes. - * returns the number of words added (storage cost increase). - * - * This is useful when calling "logicalnot" functions. - * - * This can an adverse effect of performance, especially when computing - * intersections. - * - */ - size_t padWithZeroes(const size_t totalbits); - - /** - * Compute the size on disk assuming that it was saved using - * the method "save". 
- */ - size_t sizeOnDisk() const; - - /** - * Save this bitmap to a stream. The file format is - * | sizeinbits | buffer lenth | buffer content| - * the sizeinbits part can be omitted if "savesizeinbits=false". - * Both sizeinbits and buffer length are saved using the size_t data - * type which is typically a 32-bit unsigned integer for 32-bit CPUs - * and a 64-bit unsigned integer for 64-bit CPUs. - * Note that this format is machine-specific. Note also - * that the word size is not saved. For robust persistent - * storage, you need to save this extra information elsewhere. - */ - void write(ostream & out, const bool savesizeinbits = true) const; - - /** - * This only writes the content of the buffer (see write()) method. - * It is for advanced users. - */ - void writeBuffer(ostream & out) const; - - /** - * size (in words) of the underlying STL vector. - */ - size_t bufferSize() const { - return buffer.size(); - } - - /** - * this is the counterpart to the write method. - * if you set savesizeinbits=false, then you are responsible - * for setting the value fo the attribute sizeinbits (see method setSizeInBits). - */ - void read(istream & in, const bool savesizeinbits = true); - - /** - * read the buffer from a stream, see method writeBuffer. - * this is for advanced users. - */ - void readBuffer(istream & in, const size_t buffersize); - - - /** - * We define two EWAHBoolArray as being equal if they have the same set bits. - * Alternatively, B1==B2 if and only if cardinality(B1 XOR B2) ==0. - */ - bool operator==(const EWAHBoolArray & x) const; - - /** - * We define two EWAHBoolArray as being different if they do not have the same set bits. - * Alternatively, B1!=B2 if and only if cardinality(B1 XOR B2) >0. - */ - bool operator!=(const EWAHBoolArray & x) const; - - bool operator==(const BoolArray & x) const; - - bool operator!=(const BoolArray & x) const; - - /** - * Iterate over the uncompressed words. - * Can be considerably faster than begin()/end(). 
- * Running time complexity of a full scan is proportional to the - * uncompressed size of the bitmap. - */ - EWAHBoolArrayIterator uncompress() const ; - - /** - * To iterate over the compressed data. - * Can be faster than any other iterator. - * Running time complexity of a full scan is proportional to the - * compressed size of the bitmap. - */ - EWAHBoolArrayRawIterator raw_iterator() const ; - - /** - * Appends the content of some other compressed bitmap - * at the end of the current bitmap. - */ - void append(const EWAHBoolArray & x); - - /** - * For research purposes. This computes the number of - * dirty words and the number of compressed words. - */ - BitmapStatistics computeStatistics() const; - - /** - * For convenience, this fully uncompresses the bitmap. - * Not fast! - */ - BoolArray toBoolArray() const; - - /** - * Convert to a list of positions of "set" bits. - * The recommended container is vector. - * - * See also toVector(). - */ - template - void appendRowIDs(container & out, const size_t offset = 0) const; - - /** - * Convert to a list of positions of "set" bits. - * The recommended container is vector. - * (alias for appendRowIDs). - * - * See also toVector(). - */ - template - void appendSetBits(container & out, const size_t offset = 0) const { - return appendRowIDs(out, offset); - } - - /** - * Returns a vector containing the position of the set - * bits in increasing order. - */ - vector toVector() { - vector answer; - appendSetBits(answer); - return answer; - } - - /** - * Returns the number of bits set to the value 1. - * The running time complexity is proportional to the - * compressed size of the bitmap. - * - * This is sometimes called the cardinality. - */ - size_t numberOfOnes() const; - - /** - * Swap the content of this bitmap with another bitmap. - * No copying is done. (Running time complexity is constant.) 
- */ - void swap(EWAHBoolArray & x); - - const vector & getBuffer() const { - return buffer; - } - ; - enum { - wordinbits = sizeof(uword) * 8 - }; - - /** - *Please don't copy your bitmaps! The running time - * complexity of a copy is the size of the compressed bitmap. - **/ - EWAHBoolArray(const EWAHBoolArray& other) : - buffer(other.buffer), sizeinbits(other.sizeinbits), - lastRLW(other.lastRLW) { - } - - /** - * Copies the content of one bitmap onto another. Running time complexity - * is proportional to the size of the compressed bitmap. - * please, never hard-copy this object. Use the swap method if you must. - */ - EWAHBoolArray & operator=(const EWAHBoolArray & x) { - buffer = x.buffer; - sizeinbits = x.sizeinbits; - lastRLW = x.lastRLW; - return *this; - } - - /** - * This is equivalent to the operator =. It is used - * to keep in mind that assignment can be expensive. - * - *if you don't care to copy the bitmap (performance-wise), use this! - */ - void expensive_copy(const EWAHBoolArray & x) { - buffer = x.buffer; - sizeinbits = x.sizeinbits; - lastRLW = x.lastRLW; + va_end(vl); + return ans; + } + + /** + * Recover wasted memory usage. Fit buffers to the actual data. + */ + void trim() { buffer.shrink_to_fit(); } + + /** + * Query the value of bit i. This runs in time proportional to + * the size of the bitmap. This is not meant to be use in + * a performance-sensitive context. + * + * (This implementation is based on zhenjl's Go version of JavaEWAH.) 
+ * + */ + bool get(const size_t pos) const { + if (pos >= static_cast(sizeinbits)) + return false; + const size_t wordpos = pos / wordinbits; + size_t WordChecked = 0; + EWAHBoolArrayRawIterator j = raw_iterator(); + while (j.hasNext()) { + BufferedRunningLengthWord &rle = j.next(); + WordChecked += static_cast(rle.getRunningLength()); + if (wordpos < WordChecked) + return rle.getRunningBit(); + if (wordpos < WordChecked + rle.getNumberOfLiteralWords()) { + const uword w = j.dirtyWords()[wordpos - WordChecked]; + return (w & (static_cast(1) << (pos % wordinbits))) != 0; + } + WordChecked += static_cast(rle.getNumberOfLiteralWords()); } + return false; + } + + /** + * Returns true if no bit is set. + */ + bool empty() const { + size_t pointer(0); + while (pointer < buffer.size()) { + ConstRunningLengthWord rlw(buffer[pointer]); + if (rlw.getRunningBit()) { + if(rlw.getRunningLength() > 0) return false; + } + ++pointer; + for (size_t k = 0; k < rlw.getNumberOfLiteralWords(); ++k) { + if(buffer[pointer] != 0) return false; + ++pointer; + } + } + return true; + } + + + /** + * Set the ith bit to true (starting at zero). + * Auto-expands the bitmap. It has constant running time complexity. + * Note that you must set the bits in increasing order: + * set(1), set(2) is ok; set(2), set(1) is not ok. + * set(100), set(100) is also not ok. + * + * Note: by design EWAH is not an updatable data structure in + * the sense that once bit 1000 is set, you cannot change the value + * of bits 0 to 1000. + * + * Returns true if the value of the bit was changed, and false otherwise. + * (In practice, if you set the bits in strictly increasing order, it + * should always return true.) + */ + bool set(size_t i); + + /** + * Transform into a string that presents a list of set bits. + * The running time is linear in the compressed size of the bitmap. 
+ */
+ operator std::string() const {
+ std::stringstream ss;
+ ss << *this;
+ return ss.str();
+ }
+ friend std::ostream &operator<<(std::ostream &out, const EWAHBoolArray &a) {
+
+ out << "{";
+ for (EWAHBoolArray::const_iterator i = a.begin(); i != a.end();) {
+ out << *i;
+ ++i;
+ if (i != a.end())
+ out << ",";
+ }
+ out << "}";
+
+ return out;
+ }
+ /**
+ * Make sure the two bitmaps have the same size (padding with zeroes
+ * if necessary). It has constant running time complexity.
+ *
+ * This is useful when calling "logicalnot" functions.
+ *
+ * This can have an adverse effect on performance, especially when computing
+ * intersections.
+ */
+ void makeSameSize(EWAHBoolArray &a) {
+ if (a.sizeinbits < sizeinbits)
+ a.padWithZeroes(sizeinbits);
+ else if (sizeinbits < a.sizeinbits)
+ padWithZeroes(a.sizeinbits);
+ }
+
+ enum { RESERVEMEMORY = true }; // for speed
+
+ typedef EWAHBoolArraySetBitForwardIterator const_iterator;
+
+ /**
+ * Returns an iterator that can be used to access the position of the
+ * set bits. The running time complexity of a full scan is proportional to the
+ * number
+ * of set bits: be aware that if you have long strings of 1s, this can be
+ * very inefficient.
+ *
+ * It can be much faster to use the toArray method if you want to
+ * retrieve the set bits.
+ */
+ const_iterator begin() const {
+ return EWAHBoolArraySetBitForwardIterator(&buffer);
+ }
+
+ /**
+ * Basically a bogus iterator that can be used together with begin()
+ * for constructions such as for(EWAHBoolArray::iterator i = b.begin();
+ * i!=b.end(); ++i) {}
+ */
+ const_iterator &end() const {
+ return EWAHBoolArraySetBitForwardIterator::end();
+ }
+
+ /**
+ * Retrieve the set bits. Can be much faster than iterating through
+ * the set bits with an iterator. 
+ */ + std::vector toArray() const; + + /** + * computes the logical and with another compressed bitmap + * answer goes into container + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + * + * The sizeInBits() of the result is equal to the maximum that of the current + * bitmap's sizeInBits() and that of a.sizeInBits(). + */ + void logicaland(const EWAHBoolArray &a, EWAHBoolArray &container) const; + + /** + * computes the logical and with another compressed bitmap + * Return the answer + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + * + * The sizeInBits() of the result is equal to the maximum that of the current + * bitmap's sizeInBits() and that of a.sizeInBits(). + */ + EWAHBoolArray logicaland(const EWAHBoolArray &a) const { + EWAHBoolArray answer; + logicaland(a, answer); + return answer; + } + + /** + * calls logicaland + */ + EWAHBoolArray operator&(const EWAHBoolArray &a) const { + return logicaland(a); + } + + /** + * computes the logical and with another compressed bitmap + * answer goes into container + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + * + * The sizeInBits() of the result should be equal to that of the current + * bitmap irrespective of a.sizeInBits(). + * + */ + void logicalandnot(const EWAHBoolArray &a, EWAHBoolArray &container) const; + + /** + * calls logicalandnot + */ + EWAHBoolArray operator-(const EWAHBoolArray &a) const { + return logicalandnot(a); + } + + /** + * computes the logical and not with another compressed bitmap + * Return the answer + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + * + * The sizeInBits() of the result should be equal to that of the current + * bitmap irrespective of a.sizeInBits(). 
+ * + */ + EWAHBoolArray logicalandnot(const EWAHBoolArray &a) const { + EWAHBoolArray answer; + logicalandnot(a, answer); + return answer; + } + + /** + * tests whether the bitmaps "intersect" (have at least one 1-bit at the same + * position). This function does not modify the existing bitmaps. + * It is faster than calling logicaland. + */ + bool intersects(const EWAHBoolArray &a) const; + + /** + * computes the logical or with another compressed bitmap + * answer goes into container + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + * + * If you have many bitmaps, see fast_logicalor_tocontainer. + * + * The sizeInBits() of the result is equal to the maximum that of the current + * bitmap's sizeInBits() and that of a.sizeInBits(). + */ + void logicalor(const EWAHBoolArray &a, EWAHBoolArray &container) const; + + /** + * computes the size (in number of set bits) of the logical or with another + * compressed bitmap + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + */ + size_t logicalorcount(const EWAHBoolArray &a) const; + + /** + * computes the size (in number of set bits) of the logical and with another + * compressed bitmap + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + */ + size_t logicalandcount(const EWAHBoolArray &a) const; + + /** + * computes the size (in number of set bits) of the logical and not with + * another compressed bitmap + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + */ + size_t logicalandnotcount(const EWAHBoolArray &a) const; + + /** + * computes the size (in number of set bits) of the logical xor with another + * compressed bitmap + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. 
+ */ + size_t logicalxorcount(const EWAHBoolArray &a) const; + + /** + * computes the logical or with another compressed bitmap + * Return the answer + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + * + * If you have many bitmaps, see fast_logicalor. + * + * The sizeInBits() of the result is equal to the maximum that of the current + * bitmap's sizeInBits() and that of a.sizeInBits(). + */ + EWAHBoolArray logicalor(const EWAHBoolArray &a) const { + EWAHBoolArray answer; + logicalor(a, answer); + return answer; + } + + /** + * calls logicalor + */ + EWAHBoolArray operator|(const EWAHBoolArray &a) const { return logicalor(a); } + + /** + * computes the logical xor with another compressed bitmap + * answer goes into container + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + * + * The sizeInBits() of the result is equal to the maximum that of the current + * bitmap's sizeInBits() and that of a.sizeInBits(). + */ + void logicalxor(const EWAHBoolArray &a, EWAHBoolArray &container) const; + + /** + * computes the logical xor with another compressed bitmap + * Return the answer + * Running time complexity is proportional to the sum of the compressed + * bitmap sizes. + * + * The sizeInBits() of the result is equal to the maximum that of the current + * bitmap's sizeInBits() and that of a.sizeInBits(). + */ + EWAHBoolArray logicalxor(const EWAHBoolArray &a) const { + EWAHBoolArray answer; + logicalxor(a, answer); + return answer; + } + + /** + * calls logicalxor + */ + EWAHBoolArray operator^(const EWAHBoolArray &a) const { + return logicalxor(a); + } + /** + * clear the content of the bitmap. It does not + * release the memory. + */ + void reset() { + buffer.clear(); + buffer.push_back(0); + sizeinbits = 0; + lastRLW = 0; + } + + /** + * convenience method. 
+ *
+ * returns the number of words added (storage cost increase)
+ */
+ inline size_t addWord(const uword newdata,
+ const uint32_t bitsthatmatter = 8 * sizeof(uword));
+
+ inline void printout(std::ostream &o = std::cout) {
+ toBoolArray().printout(o);
+ }
+
+ /**
+ * Prints a verbose description of the content of the compressed bitmap.
+ */
+ void debugprintout() const;
+
+ /**
+ * Return the size in bits of this bitmap (this refers
+ * to the uncompressed size in bits).
+ *
+ * You can increase it with padWithZeroes()
+ */
+ inline size_t sizeInBits() const { return sizeinbits; }
+
+ /**
+ * Return the size of the buffer in bytes. This
+ * is equivalent to the storage cost, minus some overhead.
+ * See sizeOnDisk to get the actual storage cost with overhead.
+ */
+ inline size_t sizeInBytes() const { return buffer.size() * sizeof(uword); }
+
+ /**
+ * same as addEmptyWord, but you can do several in one shot!
+ * returns the number of words added (storage cost increase)
+ */
+ size_t addStreamOfEmptyWords(const bool v, size_t number);
+
+ /**
+ * add a stream of dirty words, returns the number of words added
+ * (storage cost increase)
+ */
+ size_t addStreamOfDirtyWords(const uword *v, const size_t number);
+
+ /**
+ * add a stream of dirty words, each one negated, returns the number of words
+ * added
+ * (storage cost increase)
+ */
+ size_t addStreamOfNegatedDirtyWords(const uword *v, const size_t number);
+
+ /**
+ * make sure the size of the array is totalbits bits by padding with zeroes.
+ * returns the number of words added (storage cost increase).
+ *
+ * This is useful when calling "logicalnot" functions.
+ *
+ * This can have an adverse effect on performance, especially when computing
+ * intersections.
+ *
+ */
+ size_t padWithZeroes(const size_t totalbits);
+
+ /**
+ * Compute the size on disk assuming that it was saved using
+ * the method "write". 
+ */
+ size_t sizeOnDisk(const bool savesizeinbits = true) const;
+
+ /**
+ * Save this bitmap to a stream. The file format is
+ * | sizeinbits | buffer length | buffer content|
+ * the sizeinbits part can be omitted if "savesizeinbits=false".
+ * Both sizeinbits and buffer length are saved using the size_t data
+ * type which is typically a 32-bit unsigned integer for 32-bit CPUs
+ * and a 64-bit unsigned integer for 64-bit CPUs.
+ * Note that this format is machine-specific. Note also
+ * that the word size is not saved. For robust persistent
+ * storage, you need to save this extra information elsewhere.
+ *
+ * Returns how many bytes were handed out to the stream.
+ */
+ size_t write(std::ostream &out, const bool savesizeinbits = true) const;
+
+ /**
+ * same as write(std::ostream...), except that you provide a char pointer
+ * and a "capacity" (in bytes). The function never writes at or beyond "out+capacity".
+ * If the storage needed exceeds the
+ * given capacity, the value zero is returned: it should be considered an error.
+ * Otherwise, the number of bytes copied is returned.
+ */
+ size_t write(char * out, size_t capacity, const bool savesizeinbits = true) const;
+
+ /**
+ * This only writes the content of the buffer (see write()) method.
+ * It is for advanced users.
+ */
+ void writeBuffer(std::ostream &out) const;
+
+ /**
+ * size (in words) of the underlying STL vector.
+ */
+ size_t bufferSize() const { return buffer.size(); }
+
+ /**
+ * this is the counterpart to the write method.
+ * if you set savesizeinbits=false, then you are responsible
+ * for setting the value of the attribute sizeinbits (see method
+ * setSizeInBits).
+ *
+ * Returns how many bytes were queried from the stream.
+ */
+ size_t read(std::istream &in, const bool savesizeinbits = true);
+
+
+ /**
+ * same as read(std::istream...), except that you provide a char pointer
+ * and a "capacity" (in bytes). The function never reads at or beyond "in+capacity". 
+ * If the detected storage exceeds the given capacity, the value zero is returned: + * it should be considered an error. + * Otherwise, the number of bytes read is returned. + */ + size_t read(const char * in, size_t capacity, const bool savesizeinbits = true); + + /** + * read the buffer from a stream, see method writeBuffer. + * this is for advanced users. + */ + void readBuffer(std::istream &in, const size_t buffersize); + + /** + * We define two EWAHBoolArray as being equal if they have the same set bits. + * Alternatively, B1==B2 if and only if cardinality(B1 XOR B2) ==0. + */ + bool operator==(const EWAHBoolArray &x) const; + + /** + * We define two EWAHBoolArray as being different if they do not have the same + * set bits. + * Alternatively, B1!=B2 if and only if cardinality(B1 XOR B2) >0. + */ + bool operator!=(const EWAHBoolArray &x) const; + + bool operator==(const BoolArray &x) const; + + bool operator!=(const BoolArray &x) const; + + /** + * Iterate over the uncompressed words. + * Can be considerably faster than begin()/end(). + * Running time complexity of a full scan is proportional to the + * uncompressed size of the bitmap. + */ + EWAHBoolArrayIterator uncompress() const; + + /** + * To iterate over the compressed data. + * Can be faster than any other iterator. + * Running time complexity of a full scan is proportional to the + * compressed size of the bitmap. + */ + EWAHBoolArrayRawIterator raw_iterator() const; + + /** + * Appends the content of some other compressed bitmap + * at the end of the current bitmap. + */ + void append(const EWAHBoolArray &x); + + /** + * For research purposes. This computes the number of + * dirty words and the number of compressed words. + */ + BitmapStatistics computeStatistics() const; + + /** + * For convenience, this fully uncompresses the bitmap. + * Not fast! + */ + BoolArray toBoolArray() const; + + /** + * Convert to a list of positions of "set" bits. + * The recommended container is vector. 
+ * + * See also toArray(). + */ + template + void appendRowIDs(container &out, const size_t offset = 0) const; + + /** + * Convert to a list of positions of "set" bits. + * The recommended container is vector. + * (alias for appendRowIDs). + * + * See also toArray(). + */ + template + void appendSetBits(container &out, const size_t offset = 0) const { + return appendRowIDs(out, offset); + } + + /** + * Returns a vector containing the position of the set + * bits in increasing order. This just calls "toArray". + */ + std::vector toVector() const { return toArray(); } + + /** + * Returns the number of bits set to the value 1. + * The running time complexity is proportional to the + * compressed size of the bitmap. + * + * This is sometimes called the cardinality. + */ + size_t numberOfOnes() const; + + /** + * Swap the content of this bitmap with another bitmap. + * No copying is done. (Running time complexity is constant.) + */ + void swap(EWAHBoolArray &x); + + const std::vector &getBuffer() const { return buffer; } + + enum { wordinbits = sizeof(uword) * 8 }; + + /** + * Please don't copy your bitmaps! The running time + * complexity of a copy is the size of the compressed bitmap. + **/ + EWAHBoolArray(const EWAHBoolArray &other) + : buffer(other.buffer), sizeinbits(other.sizeinbits), + lastRLW(other.lastRLW) {} + + /** + * Copies the content of one bitmap onto another. Running time complexity + * is proportional to the size of the compressed bitmap. + * please, never hard-copy this object. Use the swap method if you must. + */ + EWAHBoolArray &operator=(const EWAHBoolArray &x) { + buffer = x.buffer; + sizeinbits = x.sizeinbits; + lastRLW = x.lastRLW; + return *this; + } + + /** + * Move constructor. + */ + EWAHBoolArray(EWAHBoolArray &&other) + : buffer(std::move(other.buffer)), sizeinbits(other.sizeinbits), + lastRLW(other.lastRLW) {} + + /** + * Move assignment operator. 
+ */
+ EWAHBoolArray &operator=(EWAHBoolArray &&x) {
+ buffer = std::move(x.buffer);
+ sizeinbits = x.sizeinbits;
+ lastRLW = x.lastRLW;
+ return *this;
+ }
+
+ /**
+ * This is equivalent to the operator =. It is used
+ * to keep in mind that assignment can be expensive.
+ *
+ *if you don't care to copy the bitmap (performance-wise), use this!
+ */
+ void expensive_copy(const EWAHBoolArray &x) {
+ buffer = x.buffer;
+ sizeinbits = x.sizeinbits;
+ lastRLW = x.lastRLW;
+ }
+
+ /**
+ * Write the logical not of this bitmap in the provided container.
+ *
+ * This function takes into account the sizeInBits value.
+ * You may need to call "padWithZeroes" to adjust the sizeInBits.
+ */
+ void logicalnot(EWAHBoolArray &x) const;
+
+ /**
+ * Write the logical not of this bitmap in the provided container.
+ *
+ * This function takes into account the sizeInBits value.
+ * You may need to call "padWithZeroes" to adjust the sizeInBits.
+ */
+ EWAHBoolArray logicalnot() const {
+ EWAHBoolArray answer;
+ logicalnot(answer);
+ return answer;
+ }
+
+ /**
+ * Apply the logical not operation on this bitmap.
+ * Running time complexity is proportional to the compressed size of the
+ *bitmap.
+ * The current bitmap is not modified.
+ *
+ * This function takes into account the sizeInBits value.
+ * You may need to call "padWithZeroes" to adjust the sizeInBits.
+ **/
+ void inplace_logicalnot();
+
+ /**
+ * set size in bits. This does not affect the compressed size. It
+ * runs in constant time. This should not normally be used, except
+ * as part of a deserialization process.
+ */
+ inline void setSizeInBits(const size_t size) { sizeinbits = size; }
+
+ /**
+ * Like addStreamOfEmptyWords but
+ * does not return the cost increase,
+ * does not update sizeinbits
+ */
+ inline void fastaddStreamOfEmptyWords(const bool v, size_t number);
+ /**
+ * Like addStreamOfDirtyWords but does not return the cost increase,
+ * does not update sizeinbits. 
+ */ + inline void fastaddStreamOfDirtyWords(const uword *v, const size_t number); - /** - * Write the logical not of this bitmap in the provided container. - * - * This function takes into account the sizeInBits value. - * You may need to call "padWithZeroes" to adjust the sizeInBits. - */ - void logicalnot(EWAHBoolArray & x) const; - - /** - * Apply the logical not operation on this bitmap. - * Running time complexity is proportional to the compressed size of the bitmap. - * The current bitmap is not modified. - * - * This function takes into account the sizeInBits value. - * You may need to call "padWithZeroes" to adjust the sizeInBits. - **/ - void inplace_logicalnot(); - - /** - * set size in bits. This does not affect the compressed size. It - * runs in constant time. This should not normally be used, except - * as part of a deserialization process. - */ - inline void setSizeInBits(const size_t size) { - sizeinbits = size; - } private: - - // addStreamOfEmptyWords but does not return the cost increase, - // does not update sizeinbits and does not check that number>0 - void fastaddStreamOfEmptyWords(const bool v, size_t number); - - // private because does not increment the size in bits - // returns the number of words added (storage cost increase) - inline size_t addLiteralWord(const uword newdata); - - // private because does not increment the size in bits - // returns the number of words added (storage cost increase) - size_t addEmptyWord(const bool v); - // this second version "might" be faster if you hate OOP. - // in my tests, it turned out to be slower! 
- // private because does not increment the size in bits - //inline void addEmptyWordStaticCalls(bool v); - - vector buffer; - size_t sizeinbits; - size_t lastRLW; + // private because does not increment the size in bits + // returns the number of words added (storage cost increase) + inline size_t addLiteralWord(const uword newdata); + + // private because does not increment the size in bits + // returns the number of words added (storage cost increase) + size_t addEmptyWord(const bool v); + // this second version "might" be faster if you hate OOP. + // in my tests, it turned out to be slower! + // private because does not increment the size in bits + // inline void addEmptyWordStaticCalls(bool v); + + std::vector buffer; + size_t sizeinbits; + size_t lastRLW; }; +/** + * computes the logical or (union) between "n" bitmaps (referenced by a + * pointer). + * The answer gets written out in container. This might be faster than calling + * logicalor n-1 times. + */ +template +void fast_logicalor_tocontainer(size_t n, const EWAHBoolArray **inputs, + EWAHBoolArray &container); + +/** + * computes the logical or (union) between "n" bitmaps (referenced by a + * pointer). + * Returns the answer. This might be faster than calling + * logicalor n-1 times. + */ +template +EWAHBoolArray fast_logicalor(size_t n, + const EWAHBoolArray **inputs) { + EWAHBoolArray answer; + fast_logicalor_tocontainer(n, inputs, answer); + return answer; +} + /** * Iterate over words of bits from a compressed bitmap. */ -template -class EWAHBoolArrayIterator { +template class EWAHBoolArrayIterator { public: - /** - * is there a new word? - */ - bool hasNext() const { - return pointer < myparent.size(); + /** + * is there a new word? + */ + bool hasNext() const { return pointer < myparent.size(); } + + /** + * return next word. 
+ */ + uword next() { + uword returnvalue; + if (compressedwords < rl) { + ++compressedwords; + if (b) + returnvalue = notzero; + else + returnvalue = zero; + } else { + ++literalwords; + ++pointer; + returnvalue = myparent[pointer]; } - - /** - * return next word. - */ - uword next() { - uword returnvalue; - if (compressedwords < rl) { - ++compressedwords; - if (b) - returnvalue = notzero; - else - returnvalue = zero; - } else { -#ifdef EWAHASSERT - assert(literalwords < lw); -#endif - ++literalwords; - ++pointer; -#ifdef EWAHASSERT - assert(pointer < myparent.size()); -#endif - returnvalue = myparent[pointer]; - } - if ((compressedwords == rl) && (literalwords == lw)) { - ++pointer; - if (pointer < myparent.size()) - readNewRunningLengthWord(); - } - return returnvalue; + if ((compressedwords == rl) && (literalwords == lw)) { + ++pointer; + if (pointer < myparent.size()) + readNewRunningLengthWord(); } + return returnvalue; + } - EWAHBoolArrayIterator(const EWAHBoolArrayIterator & other) : - pointer(other.pointer), myparent(other.myparent), - compressedwords(other.compressedwords), - literalwords(other.literalwords), rl(other.rl), lw(other.lw), - b(other.b) { - } + EWAHBoolArrayIterator(const EWAHBoolArrayIterator &other) + : pointer(other.pointer), myparent(other.myparent), + compressedwords(other.compressedwords), + literalwords(other.literalwords), rl(other.rl), lw(other.lw), + b(other.b) {} + + static const uword zero = 0; + static const uword notzero = static_cast(~zero); - static const uword zero = 0; - static const uword notzero = static_cast (~zero); private: - EWAHBoolArrayIterator(const vector & parent); - void readNewRunningLengthWord(); - friend class EWAHBoolArray ; - size_t pointer; - const vector & myparent; - uword compressedwords; - uword literalwords; - uword rl, lw; - bool b; + EWAHBoolArrayIterator(const std::vector &parent); + void readNewRunningLengthWord(); + friend class EWAHBoolArray; + size_t pointer; + const std::vector &myparent; + 
uword compressedwords; + uword literalwords; + uword rl, lw; + bool b; }; /** * Used to go through the set bits. Not optimally fast, but convenient. */ -template -class EWAHBoolArraySetBitForwardIterator { +template class EWAHBoolArraySetBitForwardIterator { public: - enum { - wordinbits = sizeof(uword) * 8 - }; - typedef forward_iterator_tag iterator_category; - typedef size_t * pointer; - typedef size_t & reference_type; - typedef size_t value_type; - typedef ptrdiff_t difference_type; - typedef EWAHBoolArraySetBitForwardIterator type_of_iterator; - - /** - * Provides the location of the set bit. - */ - size_t operator*() const { - return currentrunoffset + offsetofpreviousrun; - } - - // this can be expensive - difference_type operator-(const type_of_iterator& o) { - type_of_iterator& smaller = *this < o ? *this : o; - type_of_iterator& bigger = *this >= o ? *this : o; - if (smaller.mpointer == smaller.buffer.size()) - return 0; - difference_type absdiff = static_cast (0); - EWAHBoolArraySetBitForwardIterator buf(smaller); - while (buf != bigger) { - ++absdiff; - ++buf; - } - if (*this < o) - return absdiff; - else - return -absdiff; - } - - bool operator<(const type_of_iterator& o) { - if (buffer != o.buffer) - return false; - if (mpointer == buffer.size()) - return false; - if (o.mpointer == o.buffer.size()) - return true; - if (offsetofpreviousrun < o.offsetofpreviousrun) - return true; - if (offsetofpreviousrun > o.offsetofpreviousrun) - return false; - if (currentrunoffset < o.currentrunoffset) - return true; - return false; - } - bool operator<=(const type_of_iterator& o) { - return ((*this) < o) || ((*this) == o); - } - - bool operator>(const type_of_iterator& o) { - return !((*this) <= o); - } - - bool operator>=(const type_of_iterator& o) { - return !((*this) < o); - } - - EWAHBoolArraySetBitForwardIterator & operator++() { - ++currentrunoffset; - advanceToNextSetBit(); - return *this; - } - EWAHBoolArraySetBitForwardIterator operator++(int) { - 
EWAHBoolArraySetBitForwardIterator old(*this); - ++currentrunoffset; - advanceToNextSetBit(); - return old; - } - bool operator==(const EWAHBoolArraySetBitForwardIterator & o) { - // if they are both over, return true - if ((mpointer == buffer.size()) && (o.mpointer == o.buffer.size())) - return true; - return (&buffer == &o.buffer) && (mpointer == o.mpointer) - && (offsetofpreviousrun == o.offsetofpreviousrun) - && (currentrunoffset == o.currentrunoffset); - } - bool operator!=(const EWAHBoolArraySetBitForwardIterator & o) { - // if they are both over, return false - if ((mpointer == buffer.size()) && (o.mpointer == o.buffer.size())) - return false; - return (&buffer != &o.buffer) || (mpointer != o.mpointer) - || (offsetofpreviousrun != o.offsetofpreviousrun) - || (currentrunoffset != o.currentrunoffset); + typedef std::forward_iterator_tag iterator_category; + typedef size_t *pointer; + typedef size_t &reference_type; + typedef size_t value_type; + typedef ptrdiff_t difference_type; + typedef EWAHBoolArraySetBitForwardIterator type_of_iterator; + /** + * Provides the location of the set bit. 
+ */ + inline size_t operator*() const { return answer; } + + bool operator<(const type_of_iterator &o) const { + if (!o.hasValue) + return true; + if (!hasValue) + return false; + return answer < o.answer; + } + + bool operator<=(const type_of_iterator &o) const { + if (!o.hasValue) + return true; + if (!hasValue) + return false; + return answer <= o.answer; + } + + bool operator>(const type_of_iterator &o) const { return !((*this) <= o); } + + bool operator>=(const type_of_iterator &o) const { return !((*this) < o); } + + EWAHBoolArraySetBitForwardIterator &operator++() { //++i + if (hasNext) + next(); + else + hasValue = false; + return *this; + } + + EWAHBoolArraySetBitForwardIterator operator++(int) { // i++ + EWAHBoolArraySetBitForwardIterator old(*this); + if (hasNext) + next(); + else + hasValue = false; + return old; + } + + bool operator==(const EWAHBoolArraySetBitForwardIterator &o) const { + if ((!hasValue) && (!o.hasValue)) + return true; + return (hasValue == o.hasValue) && (answer == o.answer); + } + + bool operator!=(const EWAHBoolArraySetBitForwardIterator &o) const { + return !(*this == o); + } + + static EWAHBoolArraySetBitForwardIterator &end() { + static EWAHBoolArraySetBitForwardIterator e; + return e; + } + + EWAHBoolArraySetBitForwardIterator(const std::vector *parent, + size_t startpointer = 0) + : word(0), position(0), runningLength(0), literalPosition(0), + wordPosition(startpointer), wordLength(0), buffer(parent), + hasNext(false), hasValue(false), answer(0) { + if (wordPosition < buffer->size()) { + setRunningLengthWord(); + hasNext = moveToNext(); + if (hasNext) { + next(); + hasValue = true; + } } + } - EWAHBoolArraySetBitForwardIterator( - const EWAHBoolArraySetBitForwardIterator & o) : - buffer(o.buffer), mpointer(o.mpointer), - offsetofpreviousrun(o.offsetofpreviousrun), - currentrunoffset(o.currentrunoffset), rlw(o.rlw) { - } + EWAHBoolArraySetBitForwardIterator() + : word(0), position(0), runningLength(0), literalPosition(0), + 
wordPosition(0), wordLength(0), buffer(NULL), hasNext(false), + hasValue(false), answer(0) {} -private: + inline bool runningHasNext() const { return position < runningLength; } - bool advanceToNextSetBit() { - if (mpointer == buffer.size()) - return false; - if (currentrunoffset < static_cast (rlw.getRunningLength() - * wordinbits)) { - if (rlw.getRunningBit()) - return true;// nothing to do - currentrunoffset = static_cast (rlw.getRunningLength() - * wordinbits);//skipping - } - while (true) { - const size_t - indexoflitword = - static_cast ((currentrunoffset - - rlw.getRunningLength() * wordinbits) - / wordinbits); - if (indexoflitword >= rlw.getNumberOfLiteralWords()) { - if (advanceToNextRun()) - return advanceToNextSetBit(); - else { - return false; - } - } - - if (usetrailingzeros) { - - const uint32_t tinwordpointer = - static_cast ((currentrunoffset - - rlw.getRunningLength() * wordinbits) - % wordinbits); - const uword modcurrentword = - static_cast (buffer[mpointer + 1 - + indexoflitword] >> tinwordpointer); - if (modcurrentword != 0) { - currentrunoffset - += static_cast (numberOfTrailingZeros( - modcurrentword)); - return true; - } else { - currentrunoffset += wordinbits - tinwordpointer; - } - } else { - const uword currentword = buffer[mpointer + 1 + indexoflitword]; - for (uint32_t inwordpointer = - static_cast ((currentrunoffset - - rlw.getRunningLength() * wordinbits) - % wordinbits); inwordpointer < wordinbits; ++inwordpointer, ++currentrunoffset) { - if ((currentword - & (static_cast (1) << inwordpointer)) != 0) - return true; - } - } - } + inline bool literalHasNext() { + while (word == 0 && wordPosition < wordLength) { + word = (*buffer)[wordPosition++]; + literalPosition = position; + position += WORD_IN_BITS; } + return word != 0; + } - enum { - usetrailingzeros = true - };// optimization option - - bool advanceToNextRun() { - offsetofpreviousrun += currentrunoffset; - currentrunoffset = 0; - mpointer += static_cast (1 + 
rlw.getNumberOfLiteralWords()); - if (mpointer < buffer.size()) { - rlw.mydata = buffer[mpointer]; - } else { - return false; - } - return true; + inline void setRunningLengthWord() { + uword rlw = (*buffer)[wordPosition]; + runningLength = + (size_t)WORD_IN_BITS * RunningLengthWord::getRunningLength(rlw) + + position; + if (!RunningLengthWord::getRunningBit(rlw)) { + position = runningLength; } + wordPosition++; // point to first literal word + wordLength = + wordPosition + RunningLengthWord::getNumberOfLiteralWords(rlw); + } - EWAHBoolArraySetBitForwardIterator(const vector & parent, - size_t startpointer = 0) : - buffer(parent), mpointer(startpointer), offsetofpreviousrun(0), - currentrunoffset(0), rlw(0) { - if (mpointer < buffer.size()) { - rlw.mydata = buffer[mpointer]; - advanceToNextSetBit(); - } + inline bool moveToNext() { + while (!runningHasNext() && !literalHasNext()) { + if (wordPosition >= buffer->size()) { + return false; + } + setRunningLengthWord(); } + return true; + } - const vector & buffer; - size_t mpointer; - size_t offsetofpreviousrun; - size_t currentrunoffset; - friend class EWAHBoolArray ; - ConstRunningLengthWord rlw; + void next() { // update answer + if (runningHasNext()) { + answer = position++; + if (runningHasNext()) + return; + } else { + uword t = word & (~word + 1); + answer = literalPosition + countOnes((uword)(t - 1)); + word ^= t; + } + hasNext = moveToNext(); + } + + enum { WORD_IN_BITS = sizeof(uword) * 8 }; + uword word; // lit word + size_t position; + size_t runningLength; + size_t literalPosition; + size_t wordPosition; // points to word in buffer + uword wordLength; + const std::vector *buffer; + bool hasNext; + bool hasValue; + size_t answer; }; /** @@ -776,1137 +950,1357 @@ class EWAHBoolArraySetBitForwardIterator { */ class BitmapStatistics { public: - BitmapStatistics() : - totalliteral(0), totalcompressed(0), runningwordmarker(0), - maximumofrunningcounterreached(0) { - } - size_t getCompressedSize() const { - 
return totalliteral + runningwordmarker; - } - size_t getUncompressedSize() const { - return totalliteral + totalcompressed; - } - size_t getNumberOfDirtyWords() const { - return totalliteral; - } - size_t getNumberOfCleanWords() const { - return totalcompressed; - } - size_t getNumberOfMarkers() const { - return runningwordmarker; - } - size_t getOverRuns() const { - return maximumofrunningcounterreached; - } - size_t totalliteral; - size_t totalcompressed; - size_t runningwordmarker; - size_t maximumofrunningcounterreached; + BitmapStatistics() + : totalliteral(0), totalcompressed(0), runningwordmarker(0), + maximumofrunningcounterreached(0) {} + size_t getCompressedSize() const { return totalliteral + runningwordmarker; } + size_t getUncompressedSize() const { return totalliteral + totalcompressed; } + size_t getNumberOfDirtyWords() const { return totalliteral; } + size_t getNumberOfCleanWords() const { return totalcompressed; } + size_t getNumberOfMarkers() const { return runningwordmarker; } + size_t getOverRuns() const { return maximumofrunningcounterreached; } + size_t totalliteral; + size_t totalcompressed; + size_t runningwordmarker; + size_t maximumofrunningcounterreached; }; -template -bool EWAHBoolArray::set(size_t i) { - if(i < sizeinbits) return false; - const size_t dist = (i + wordinbits) / wordinbits - (sizeinbits - + wordinbits - 1) / wordinbits; - sizeinbits = i + 1; - if (dist > 0) {// easy - if(dist>1) fastaddStreamOfEmptyWords(false, dist - 1); - addLiteralWord( - static_cast (static_cast (1) << (i % wordinbits))); - return true; - } - RunningLengthWord lastRunningLengthWord(buffer[lastRLW]); - if (lastRunningLengthWord.getNumberOfLiteralWords() == 0) { - lastRunningLengthWord.setRunningLength( - static_cast (lastRunningLengthWord.getRunningLength() - - 1)); - addLiteralWord( - static_cast (static_cast (1) << (i % wordinbits))); - return true; - } - buffer[buffer.size() - 1] |= static_cast (static_cast (1) - << (i % wordinbits)); - // check if 
we just completed a stream of 1s - if (buffer[buffer.size() - 1] == static_cast (~0)) { - // we remove the last dirty word - buffer[buffer.size() - 1] = 0; - buffer.resize(buffer.size() - 1); - lastRunningLengthWord.setNumberOfLiteralWords( - static_cast (lastRunningLengthWord.getNumberOfLiteralWords() - - 1)); - // next we add one clean word - addEmptyWord(true); - } +template bool EWAHBoolArray::set(size_t i) { + if (i < sizeinbits) + return false; + const size_t dist = (i + wordinbits) / wordinbits - + (sizeinbits + wordinbits - 1) / wordinbits; + sizeinbits = i + 1; + if (dist > 0) { // easy + if (dist > 1) + fastaddStreamOfEmptyWords(false, dist - 1); + addLiteralWord( + static_cast(static_cast(1) << (i % wordinbits))); + return true; + } + RunningLengthWord lastRunningLengthWord(buffer[lastRLW]); + if (lastRunningLengthWord.getNumberOfLiteralWords() == 0) { + lastRunningLengthWord.setRunningLength( + static_cast(lastRunningLengthWord.getRunningLength() - 1)); + addLiteralWord( + static_cast(static_cast(1) << (i % wordinbits))); return true; + } + buffer[buffer.size() - 1] |= + static_cast(static_cast(1) << (i % wordinbits)); + // check if we just completed a stream of 1s + if (buffer[buffer.size() - 1] == static_cast(~0)) { + // we remove the last dirty word + buffer[buffer.size() - 1] = 0; + buffer.resize(buffer.size() - 1); + lastRunningLengthWord.setNumberOfLiteralWords(static_cast( + lastRunningLengthWord.getNumberOfLiteralWords() - 1)); + // next we add one clean word + addEmptyWord(true); + } + return true; } -template -void EWAHBoolArray::inplace_logicalnot() { - size_t pointer(0), lastrlw(0); - while (pointer < buffer.size()) { - RunningLengthWord rlw(buffer[pointer]); - lastrlw = pointer;// we save this up - if (rlw.getRunningBit()) - rlw.setRunningBit(false); - else - rlw.setRunningBit(true); - ++pointer; - for (size_t k = 0; k < rlw.getNumberOfLiteralWords(); ++k) { - buffer[pointer] = static_cast(~buffer[pointer]); - ++pointer; - } - } - 
if(sizeinbits % wordinbits != 0){ - RunningLengthWord rlw(buffer[lastrlw]); -#ifdef EWAHASSERT - assert(rlw.getNumberOfLiteralWords() + rlw.getRunningLength() > 0); -#endif - const uword maskbogus = (static_cast(1) << (sizeinbits % wordinbits)) - 1; - if(rlw.getNumberOfLiteralWords()>0) {// easy case - buffer[lastrlw + 1 + rlw.getNumberOfLiteralWords() - 1 ] &= maskbogus; - } else if(rlw.getRunningBit()) { -#ifdef EWAHASSERT - assert(rlw.getNumberOfLiteralWords() > 0); -#endif - rlw.setNumberOfLiteralWords(rlw.getNumberOfLiteralWords() - 1); - addLiteralWord(maskbogus); - } +template void EWAHBoolArray::inplace_logicalnot() { + size_t pointer(0), lastrlw(0); + while (pointer < buffer.size()) { + RunningLengthWord rlw(buffer[pointer]); + lastrlw = pointer; // we save this up + if (rlw.getRunningBit()) + rlw.setRunningBit(false); + else + rlw.setRunningBit(true); + ++pointer; + for (size_t k = 0; k < rlw.getNumberOfLiteralWords(); ++k) { + buffer[pointer] = static_cast(~buffer[pointer]); + ++pointer; + } + } + if (sizeinbits % wordinbits != 0) { + RunningLengthWord rlw(buffer[lastrlw]); + const uword maskbogus = + (static_cast(1) << (sizeinbits % wordinbits)) - 1; + if (rlw.getNumberOfLiteralWords() > 0) { // easy case + buffer[lastrlw + 1 + rlw.getNumberOfLiteralWords() - 1] &= maskbogus; + } else { + rlw.setRunningLength(rlw.getRunningLength() - 1); + addLiteralWord(maskbogus); } + } } -template -size_t EWAHBoolArray::numberOfOnes() const { - size_t tot(0); - size_t pointer(0); - while (pointer < buffer.size()) { - ConstRunningLengthWord rlw(buffer[pointer]); - if (rlw.getRunningBit()) { - tot += static_cast(rlw.getRunningLength() * wordinbits); - } - ++pointer; - for (size_t k = 0; k < rlw.getNumberOfLiteralWords(); ++k) { -#ifdef EWAHASSERT - assert(countOnes(buffer[pointer]) < 64); -#endif - tot += countOnes(buffer[pointer]); - ++pointer; - } - } - return tot; +template size_t EWAHBoolArray::numberOfOnes() const { + size_t tot(0); + size_t pointer(0); + while 
(pointer < buffer.size()) { + ConstRunningLengthWord rlw(buffer[pointer]); + if (rlw.getRunningBit()) { + tot += static_cast(rlw.getRunningLength() * wordinbits); + } + ++pointer; + for (size_t k = 0; k < rlw.getNumberOfLiteralWords(); ++k) { + tot += countOnes((uword)buffer[pointer]); + ++pointer; + } + } + return tot; } -template -vector EWAHBoolArray::toArray() const { - vector < size_t > ans; - size_t pos(0); - size_t pointer(0); - while (pointer < buffer.size()) { - ConstRunningLengthWord rlw(buffer[pointer]); - if (rlw.getRunningBit()) { - for (size_t k = 0; k < rlw.getRunningLength() * wordinbits; ++k, ++pos) { - ans.push_back(pos); - } - } else { - pos += static_cast(rlw.getRunningLength() * wordinbits); - } - ++pointer; - const bool usetrailing = true; //optimization - for (size_t k = 0; k < rlw.getNumberOfLiteralWords(); ++k) { - if (usetrailing) { - uword myword = buffer[pointer]; - while (myword != 0) { - uint32_t ntz = numberOfTrailingZeros (myword); - ans.push_back(pos + ntz); - myword ^= (static_cast(1) << ntz); - } - pos += wordinbits; - } else { - for (int c = 0; c < wordinbits; ++c, ++pos) - if ((buffer[pointer] & (static_cast (1) << c)) != 0) { - ans.push_back(pos); - } - } - ++pointer; - } +template +std::vector EWAHBoolArray::toArray() const { + std::vector ans; + size_t pos(0); + size_t pointer(0); + const size_t buffersize = buffer.size(); + while (pointer < buffersize) { + ConstRunningLengthWord rlw(buffer[pointer]); + const size_t productofrl = + static_cast(rlw.getRunningLength() * wordinbits); + if (rlw.getRunningBit()) { + size_t upper_limit = pos + productofrl; + for (; pos < upper_limit; ++pos) { + ans.push_back(pos); + } + } else { + pos += productofrl; + } + ++pointer; + const size_t rlwlw = rlw.getNumberOfLiteralWords(); + for (size_t k = 0; k < rlwlw; ++k) { + uword myword = buffer[pointer]; + while (myword != 0) { + uint64_t t = myword & (~myword + 1); + uint32_t r = numberOfTrailingZeros(t); + ans.push_back(pos + r); + myword ^= 
t; + } + pos += wordinbits; + ++pointer; } - return ans; + } + return ans; } -template -void EWAHBoolArray::logicalnot(EWAHBoolArray & x) const { - x.reset(); - x.buffer.reserve(buffer.size()); - EWAHBoolArrayRawIterator i = this->raw_iterator(); - if(!i.hasNext()) return;// nothing to do - while (true) { - BufferedRunningLengthWord & rlw = i.next(); - if (i.hasNext()) { - if( rlw.getRunningLength()>0) x.fastaddStreamOfEmptyWords(!rlw.getRunningBit(), - rlw.getRunningLength()); - if (rlw.getNumberOfLiteralWords() > 0) { - const uword * dw = i.dirtyWords(); - for (size_t k = 0; k < rlw.getNumberOfLiteralWords(); ++k) { - x.addLiteralWord(~dw[k]); - } - } +template +void EWAHBoolArray::logicalnot(EWAHBoolArray &x) const { + x.reset(); + x.buffer.reserve(buffer.size()); + EWAHBoolArrayRawIterator i = this->raw_iterator(); + if (!i.hasNext()) + return; // nothing to do + while (true) { + BufferedRunningLengthWord &rlw = i.next(); + if (i.hasNext()) { + if (rlw.getRunningLength() > 0) + x.fastaddStreamOfEmptyWords(!rlw.getRunningBit(), + rlw.getRunningLength()); + if (rlw.getNumberOfLiteralWords() > 0) { + const uword *dw = i.dirtyWords(); + for (size_t k = 0; k < rlw.getNumberOfLiteralWords(); ++k) { + x.addLiteralWord(~dw[k]); + } + } + } else { + if (rlw.getNumberOfLiteralWords() == 0) { + if ((this->sizeinbits % wordinbits != 0) && !rlw.getRunningBit()) { + if (rlw.getRunningLength() > 1) + x.fastaddStreamOfEmptyWords(!rlw.getRunningBit(), + rlw.getRunningLength() - 1); + const uword maskbogus = + (static_cast(1) << (this->sizeinbits % wordinbits)) - 1; + x.addLiteralWord(maskbogus); + break; } else { -#ifdef EWAHASSERT - assert(rlw.getNumberOfLiteralWords() + rlw.getRunningLength() > 0); -#endif - if(rlw.getNumberOfLiteralWords() == 0) { - if((this->sizeinbits % wordinbits != 0) && !rlw.getRunningBit()) { - if(rlw.getRunningLength()>1) x.fastaddStreamOfEmptyWords(!rlw.getRunningBit(), - rlw.getRunningLength() - 1); - const uword maskbogus = (static_cast(1) << 
(this->sizeinbits % wordinbits)) - 1; - x.addLiteralWord(maskbogus); - break; - } else { - if(rlw.getRunningLength()>0) x.fastaddStreamOfEmptyWords(!rlw.getRunningBit(), - rlw.getRunningLength()); - break; - } - } - if(rlw.getRunningLength()>0) x.fastaddStreamOfEmptyWords(!rlw.getRunningBit(), - rlw.getRunningLength()); - const uword * dw = i.dirtyWords(); - for (size_t k = 0; k + 1 < rlw.getNumberOfLiteralWords() ; ++k) { - x.addLiteralWord(~dw[k]); - } - const uword maskbogus = (this->sizeinbits % wordinbits != 0) ? (static_cast(1) << (this->sizeinbits % wordinbits)) - 1 : ~static_cast(0); - x.addLiteralWord((~dw[rlw.getNumberOfLiteralWords() - 1]) & maskbogus); - break; + if (rlw.getRunningLength() > 0) + x.fastaddStreamOfEmptyWords(!rlw.getRunningBit(), + rlw.getRunningLength()); + break; } - } - x.sizeinbits = this->sizeinbits; + } + if (rlw.getRunningLength() > 0) + x.fastaddStreamOfEmptyWords(!rlw.getRunningBit(), + rlw.getRunningLength()); + const uword *dw = i.dirtyWords(); + for (size_t k = 0; k + 1 < rlw.getNumberOfLiteralWords(); ++k) { + x.addLiteralWord(~dw[k]); + } + const uword maskbogus = + (this->sizeinbits % wordinbits != 0) + ? 
(static_cast(1) << (this->sizeinbits % wordinbits)) - 1 + : ~static_cast(0); + x.addLiteralWord((~dw[rlw.getNumberOfLiteralWords() - 1]) & maskbogus); + break; + } + } + x.sizeinbits = this->sizeinbits; } -template +template size_t EWAHBoolArray::addWord(const uword newdata, - const uint32_t bitsthatmatter) { - sizeinbits += bitsthatmatter; - if (newdata == 0) { - return addEmptyWord(0); - } else if (newdata == static_cast (~0)) { - return addEmptyWord(1); - } else { - return addLiteralWord(newdata); - } + const uint32_t bitsthatmatter) { + sizeinbits += bitsthatmatter; + if (newdata == 0) { + return addEmptyWord(0); + } else if (newdata == static_cast(~0)) { + return addEmptyWord(1); + } else { + return addLiteralWord(newdata); + } } -template -inline void EWAHBoolArray::writeBuffer(ostream & out) const { - if (!buffer.empty()) - out.write(reinterpret_cast (&buffer[0]), - sizeof(uword) * buffer.size()); +template +inline void EWAHBoolArray::writeBuffer(std::ostream &out) const { + if (!buffer.empty()) + out.write(reinterpret_cast(&buffer[0]), + sizeof(uword) * buffer.size()); } -template -inline void EWAHBoolArray::readBuffer(istream & in, - const size_t buffersize) { - buffer.resize(buffersize); - if (buffersize > 0) - in.read(reinterpret_cast (&buffer[0]), - sizeof(uword) * buffersize); +template +inline void EWAHBoolArray::readBuffer(std::istream &in, + const size_t buffersize) { + buffer.resize(buffersize); + if (buffersize > 0) + in.read(reinterpret_cast(&buffer[0]), sizeof(uword) * buffersize); } -template -void EWAHBoolArray::write(ostream & out, const bool savesizeinbits) const { - if (savesizeinbits) - out.write(reinterpret_cast (&sizeinbits), - sizeof(sizeinbits)); - const size_t buffersize = buffer.size(); - out.write(reinterpret_cast (&buffersize), sizeof(buffersize)); - if (buffersize > 0) - out.write(reinterpret_cast (&buffer[0]), - static_cast (sizeof(uword) * buffersize)); +template +size_t EWAHBoolArray::write(std::ostream &out, + const bool 
savesizeinbits) const { + size_t written = 0; + if (savesizeinbits) { + out.write(reinterpret_cast(&sizeinbits), sizeof(sizeinbits)); + written += sizeof(sizeinbits); + } + const size_t buffersize = buffer.size(); + out.write(reinterpret_cast(&buffersize), sizeof(buffersize)); + written += sizeof(buffersize); + + if (buffersize > 0) { + out.write(reinterpret_cast(&buffer[0]), + static_cast(sizeof(uword) * buffersize)); + written += sizeof(uword) * buffersize; + } + return written; } -template -void EWAHBoolArray::read(istream & in, const bool savesizeinbits) { - if (savesizeinbits) - in.read(reinterpret_cast (&sizeinbits), sizeof(sizeinbits)); - else - sizeinbits = 0; - size_t buffersize(0); - in.read(reinterpret_cast (&buffersize), sizeof(buffersize)); - buffer.resize(buffersize); - if (buffersize > 0) - in.read(reinterpret_cast (&buffer[0]), - static_cast (sizeof(uword) * buffersize)); +template +size_t EWAHBoolArray::write(char * out, size_t capacity, + const bool savesizeinbits) const { + size_t written = 0; + if (savesizeinbits) { + if(capacity < sizeof(sizeinbits)) return 0; + capacity -= sizeof(sizeinbits); + memcpy(out, &sizeinbits, sizeof(sizeinbits)); + out += sizeof(sizeinbits); + written += sizeof(sizeinbits); + } + const size_t buffersize = buffer.size(); + if(capacity < sizeof(buffersize)) return 0; + capacity -= sizeof(buffersize); + memcpy(out, &buffersize, sizeof(buffersize)); + out += sizeof(buffersize); + written += sizeof(buffersize); + + if (buffersize > 0) { + if(capacity < sizeof(uword) * buffersize) return 0; + memcpy(out, &buffer[0], sizeof(uword) * buffersize); + written += sizeof(uword) * buffersize; + } + return written; } -template + +template +size_t EWAHBoolArray::read(std::istream &in, const bool savesizeinbits) { + size_t read = 0; + if (savesizeinbits) { + in.read(reinterpret_cast(&sizeinbits), sizeof(sizeinbits)); + read += sizeof(sizeinbits); + } else { + sizeinbits = 0; + } + size_t buffersize(0); + 
in.read(reinterpret_cast(&buffersize), sizeof(buffersize)); + read += sizeof(buffersize); + buffer.resize(buffersize); + if (buffersize > 0) { + in.read(reinterpret_cast(&buffer[0]), + static_cast(sizeof(uword) * buffersize)); + read += sizeof(uword) * buffersize; + } + return read; +} + + +template +size_t EWAHBoolArray::read(const char * in, size_t capacity, const bool savesizeinbits) { + size_t read = 0; + if (savesizeinbits) { + if(capacity < sizeof(sizeinbits)) return 0; + capacity -= sizeof(sizeinbits); + memcpy(reinterpret_cast(&sizeinbits), in, sizeof(sizeinbits)); + read += sizeof(sizeinbits); + in += sizeof(sizeinbits); + } else { + sizeinbits = 0; + } + size_t buffersize(0); + if(capacity < sizeof(buffersize)) return 0; + capacity -= sizeof(buffersize); + memcpy(reinterpret_cast(&buffersize), in, sizeof(buffersize)); + in += sizeof(buffersize); + read += sizeof(buffersize); + + buffer.resize(buffersize); + if (buffersize > 0) { + if(capacity < sizeof(uword) * buffersize) return 0; + memcpy(&buffer[0], in, sizeof(uword) * buffersize); + read += sizeof(uword) * buffersize; + } + return read; +} + +template size_t EWAHBoolArray::addLiteralWord(const uword newdata) { - RunningLengthWord lastRunningLengthWord(buffer[lastRLW]); - uword numbersofar = lastRunningLengthWord.getNumberOfLiteralWords(); - if (numbersofar >= RunningLengthWord::largestliteralcount) {//0x7FFF) { - buffer.push_back(0); - lastRLW = buffer.size() - 1; - RunningLengthWord lastRunningLengthWord2(buffer[lastRLW]); - lastRunningLengthWord2.setNumberOfLiteralWords(1); - buffer.push_back(newdata); - return 2; - } - lastRunningLengthWord.setNumberOfLiteralWords( - static_cast (numbersofar + 1)); -#ifdef EWAHASSERT - assert(lastRunningLengthWord.getNumberOfLiteralWords() == numbersofar + 1); -#endif + RunningLengthWord lastRunningLengthWord(buffer[lastRLW]); + uword numbersofar = lastRunningLengthWord.getNumberOfLiteralWords(); + if (numbersofar >= + RunningLengthWord::largestliteralcount) { // 
0x7FFF) { + buffer.push_back(0); + lastRLW = buffer.size() - 1; + RunningLengthWord lastRunningLengthWord2(buffer[lastRLW]); + lastRunningLengthWord2.setNumberOfLiteralWords(1); buffer.push_back(newdata); - return 1; + return 2; + } + lastRunningLengthWord.setNumberOfLiteralWords( + static_cast(numbersofar + 1)); + buffer.push_back(newdata); + return 1; } -template +template size_t EWAHBoolArray::padWithZeroes(const size_t totalbits) { - size_t wordsadded = 0; - if ( totalbits <= sizeinbits ) - return wordsadded; - - size_t missingbits = totalbits - sizeinbits; - - - RunningLengthWord rlw( buffer[lastRLW] ); - if ( rlw.getNumberOfLiteralWords() > 0 ) - { - // Consume trailing zeroes of trailing literal word (past sizeinbits) - size_t remain = sizeinbits % wordinbits; - if ( remain > 0 ) // Is last word partial? - { - size_t avail = wordinbits - remain; - if ( avail > 0 ) - { - if ( missingbits > avail ) { - missingbits -= avail; - } else { - missingbits = 0; - } - sizeinbits += avail; - } - } - } - - if ( missingbits > 0 ) - { - size_t wordstoadd = missingbits / wordinbits; - if ( (missingbits % wordinbits) != 0) - ++wordstoadd; - - wordsadded = addStreamOfEmptyWords( false, wordstoadd ); - } -#ifdef EWAHASSERT - assert(sizeinbits >= totalbits); - assert(sizeinbits <= totalbits + wordinbits); -#endif - sizeinbits = totalbits; + size_t wordsadded = 0; + if (totalbits <= sizeinbits) return wordsadded; + + size_t missingbits = totalbits - sizeinbits; + + RunningLengthWord rlw(buffer[lastRLW]); + if (rlw.getNumberOfLiteralWords() > 0) { + // Consume trailing zeroes of trailing literal word (past sizeinbits) + size_t remain = sizeinbits % wordinbits; + if (remain > 0) // Is last word partial? 
+ { + size_t avail = wordinbits - remain; + if (avail > 0) { + if (missingbits > avail) { + missingbits -= avail; + } else { + missingbits = 0; + } + sizeinbits += avail; + } + } + } + + if (missingbits > 0) { + size_t wordstoadd = missingbits / wordinbits; + if ((missingbits % wordinbits) != 0) + ++wordstoadd; + + wordsadded = addStreamOfEmptyWords(false, wordstoadd); + } + sizeinbits = totalbits; + return wordsadded; } /** * This is a low-level iterator. */ -template -class EWAHBoolArrayRawIterator { +template class EWAHBoolArrayRawIterator { public: - - EWAHBoolArrayRawIterator(const EWAHBoolArray & p) : - pointer(0), myparent(&p.getBuffer()), rlw((*myparent)[pointer], this) { - } - EWAHBoolArrayRawIterator(const EWAHBoolArrayRawIterator & o) : - pointer(o.pointer), myparent(o.myparent), rlw(o.rlw) { - } - - bool hasNext() const { - return pointer < myparent->size(); - } - - BufferedRunningLengthWord & next() { -#ifdef EWAHASSERT - assert(pointer < myparent->size()); -#endif - rlw.read((*myparent)[pointer]); - pointer = static_cast (pointer + rlw.getNumberOfLiteralWords() - + 1); - return rlw; - } - - const uword * dirtyWords() const { -#ifdef EWAHASSERT - assert(pointer > 0); - assert(pointer >= rlw.getNumberOfLiteralWords()); -#endif - return myparent->data() + - static_cast (pointer - rlw.getNumberOfLiteralWords()); - } - - EWAHBoolArrayRawIterator & operator=(const EWAHBoolArrayRawIterator & other) { - pointer = other.pointer; - myparent = other.myparent; - rlw = other.rlw; - return *this; - } - - size_t pointer; - const vector * myparent; - BufferedRunningLengthWord rlw; - - EWAHBoolArrayRawIterator(); + EWAHBoolArrayRawIterator(const EWAHBoolArray &p) + : pointer(0), myparent(&p.getBuffer()), rlw((*myparent)[pointer], this) {} + EWAHBoolArrayRawIterator(const EWAHBoolArrayRawIterator &o) + : pointer(o.pointer), myparent(o.myparent), rlw(o.rlw) {} + + bool hasNext() const { return pointer < myparent->size(); } + + BufferedRunningLengthWord &next() { + 
rlw.read((*myparent)[pointer]); + pointer = static_cast(pointer + rlw.getNumberOfLiteralWords() + 1); + return rlw; + } + + const uword *dirtyWords() const { + return myparent->data() + + static_cast(pointer - rlw.getNumberOfLiteralWords()); + } + + EWAHBoolArrayRawIterator &operator=(const EWAHBoolArrayRawIterator &other) { + pointer = other.pointer; + myparent = other.myparent; + rlw = other.rlw; + return *this; + } + + size_t pointer; + const std::vector *myparent; + BufferedRunningLengthWord rlw; + + EWAHBoolArrayRawIterator(); }; -template +template EWAHBoolArrayIterator EWAHBoolArray::uncompress() const { - return EWAHBoolArrayIterator (buffer); + return EWAHBoolArrayIterator(buffer); } -template +template EWAHBoolArrayRawIterator EWAHBoolArray::raw_iterator() const { - return EWAHBoolArrayRawIterator (*this); -} - - -#ifndef ALTEQUAL - -template -bool EWAHBoolArray::operator==(const EWAHBoolArray & a) const { - EWAHBoolArrayRawIterator i = a.raw_iterator(); - EWAHBoolArrayRawIterator j = raw_iterator(); - if (!(i.hasNext() and j.hasNext())) {// hopefully this never happens... - return true; - } - // at this point, this should be safe: - BufferedRunningLengthWord & rlwi = i.next(); - BufferedRunningLengthWord & rlwj = j.next(); - //RunningLength; - while (true) { - bool i_is_prey(rlwi.size() < rlwj.size()); - BufferedRunningLengthWord & prey(i_is_prey ? rlwi : rlwj); - BufferedRunningLengthWord & predator(i_is_prey ? 
rlwj : rlwi); - uword predatorrl(predator.getRunningLength()); - const uword preyrl(prey.getRunningLength()); - if (predatorrl >= preyrl) { - const uword tobediscarded = preyrl; - if(tobediscarded) - if(prey.getRunningBit() ^ predator.getRunningBit()) - return false; - } else { - const uword tobediscarded = predatorrl; - if(predatorrl>0) { - if(prey.getRunningBit() ^ predator.getRunningBit()) - return false; - } - if (preyrl - tobediscarded > 0) { - return false; - } - } - predator.discardFirstWords(preyrl); - prey.discardFirstWords(preyrl); - - predatorrl = predator.getRunningLength(); - if (predatorrl > 0) { - - const uword nbre_dirty_prey(prey.getNumberOfLiteralWords()); - const uword tobediscarded = - (predatorrl >= nbre_dirty_prey) ? nbre_dirty_prey - : predatorrl; - if (tobediscarded > 0) { - return false; - } - } - // all that is left to do now is to AND the dirty words - uword nbre_dirty_prey(prey.getNumberOfLiteralWords()); - if (nbre_dirty_prey > 0) { - const uword * idirty = i.dirtyWords(); - const uword * jdirty = j.dirtyWords(); - - for (uword k = 0; k < nbre_dirty_prey; ++k) { - if((idirty[k] ^ jdirty[k])!=0) return false; - } - predator.discardFirstWords(nbre_dirty_prey); - } - if (i_is_prey) { - if (!i.hasNext()) - break; - rlwi = i.next(); - } else { - if (!j.hasNext()) - break; - rlwj = j.next(); - } - } - return true; + return EWAHBoolArrayRawIterator(*this); } -#else - -template -bool EWAHBoolArray::operator==(const EWAHBoolArray & x) const { - EWAHBoolArrayRawIterator i = x.raw_iterator(); - EWAHBoolArrayRawIterator j = raw_iterator(); - if (!(i.hasNext() and j.hasNext())) {// hopefully this never happens... 
- return true; - } - // at this point, this should be safe: - BufferedRunningLengthWord & rlwi = i.next(); - BufferedRunningLengthWord & rlwj = j.next(); - - - while ((rlwi.size() > 0) && (rlwj.size() > 0)) { - while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { - const bool i_is_prey = rlwi.getRunningLength() < rlwj .getRunningLength(); - BufferedRunningLengthWord & prey = i_is_prey ? rlwi : rlwj; - BufferedRunningLengthWord & predator = i_is_prey ? rlwj : rlwi; - size_t index = 0; - const bool nonzero = ((!predator.getRunningBit()) ? prey.nonzero_discharge( - predator.getRunningLength(),index) : prey.nonzero_dischargeNegated( - predator.getRunningLength(),index)); - if(nonzero) { - return false; - } - if(predator.getRunningLength() - index > 0) { - if(predator.getRunningBit()) { - return false; - } - } - predator.discardRunningWordsWithReload(); - - } - const size_t nbre_literal = min(rlwi.getNumberOfLiteralWords(),rlwj.getNumberOfLiteralWords()); - if (nbre_literal > 0) { - for (size_t k = 0; k < nbre_literal; ++k) - if((rlwi.getLiteralWordAt(k) ^ rlwj.getLiteralWordAt(k))!=0) return false; - rlwi.discardFirstWordsWithReload(nbre_literal); - rlwj.discardFirstWordsWithReload(nbre_literal); +template +bool EWAHBoolArray::operator==(const EWAHBoolArray &x) const { + EWAHBoolArrayRawIterator i = x.raw_iterator(); + EWAHBoolArrayRawIterator j = raw_iterator(); + if (!(i.hasNext() and j.hasNext())) { // hopefully this never happens... + return (i.hasNext() == false) && (j.hasNext() == false); + } + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); + + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { + const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); + BufferedRunningLengthWord &prey = i_is_prey ? rlwi : rlwj; + BufferedRunningLengthWord &predator = i_is_prey ? 
rlwj : rlwi; + size_t index = 0; + const bool nonzero = + ((!predator.getRunningBit()) + ? prey.nonzero_discharge(predator.getRunningLength(), index) + : prey.nonzero_dischargeNegated(predator.getRunningLength(), + index)); + if (nonzero) { + return false; + } + if (predator.getRunningLength() - index > 0) { + if (predator.getRunningBit()) { + return false; } + } + predator.discardRunningWordsWithReload(); } - const bool i_remains = rlwi.size() > 0; - BufferedRunningLengthWord & remaining = i_remains ? rlwi : rlwj; - return !remaining.nonzero_discharge(); + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); + if (nbre_literal > 0) { + for (size_t k = 0; k < nbre_literal; ++k) + if ((rlwi.getLiteralWordAt(k) ^ rlwj.getLiteralWordAt(k)) != 0) + return false; + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); + } + } + const bool i_remains = rlwi.size() > 0; + BufferedRunningLengthWord &remaining = i_remains ? rlwi : rlwj; + return !remaining.nonzero_discharge(); } -#endif - -template -void EWAHBoolArray::swap(EWAHBoolArray & x) { - buffer.swap(x.buffer); - size_t tmp = x.sizeinbits; - x.sizeinbits = sizeinbits; - sizeinbits = tmp; - tmp = x.lastRLW; - x.lastRLW = lastRLW; - lastRLW = tmp; +template void EWAHBoolArray::swap(EWAHBoolArray &x) { + buffer.swap(x.buffer); + size_t tmp = x.sizeinbits; + x.sizeinbits = sizeinbits; + sizeinbits = tmp; + tmp = x.lastRLW; + x.lastRLW = lastRLW; + lastRLW = tmp; } -template -void EWAHBoolArray::append(const EWAHBoolArray & x) { - if (sizeinbits % wordinbits == 0) { - // hoping for the best? - sizeinbits += x.sizeinbits; - ConstRunningLengthWord lRLW(buffer[lastRLW]); - if ((lRLW.getRunningLength() == 0) && (lRLW.getNumberOfLiteralWords() - == 0)) { - // it could be that the running length word is empty, in such a case, - // we want to get rid of it! 
-#ifdef EWAHASSERT - assert(lastRLW == buffer.size() - 1); -#endif - lastRLW = x.lastRLW + buffer.size() - 1; - buffer.resize(buffer.size() - 1); - buffer.insert(buffer.end(), x.buffer.begin(), x.buffer.end()); - } else { - lastRLW = x.lastRLW + buffer.size(); - buffer.insert(buffer.end(), x.buffer.begin(), x.buffer.end()); - } +template +void EWAHBoolArray::append(const EWAHBoolArray &x) { + if (sizeinbits % wordinbits == 0) { + // hoping for the best? + sizeinbits += x.sizeinbits; + ConstRunningLengthWord lRLW(buffer[lastRLW]); + if ((lRLW.getRunningLength() == 0) && + (lRLW.getNumberOfLiteralWords() == 0)) { + // it could be that the running length word is empty, in such a case, + // we want to get rid of it! + lastRLW = x.lastRLW + buffer.size() - 1; + buffer.resize(buffer.size() - 1); + buffer.insert(buffer.end(), x.buffer.begin(), x.buffer.end()); } else { - stringstream ss; - ss - << "This should really not happen! You are trying to append to a bitmap having a fractional number of words, that is, " - << static_cast (sizeinbits) - << " bits with a word size in bits of " - << static_cast (wordinbits) << ". "; - ss << "Size of the bitmap being appended: " << x.sizeinbits << " bits." - << endl; - throw invalid_argument(ss.str()); - } + lastRLW = x.lastRLW + buffer.size(); + buffer.insert(buffer.end(), x.buffer.begin(), x.buffer.end()); + } + } else { + std::stringstream ss; + ss << "This should really not happen! You are trying to append to a bitmap " + "having a fractional number of words, that is, " + << static_cast(sizeinbits) << " bits with a word size in bits of " + << static_cast(wordinbits) << ". "; + ss << "Size of the bitmap being appended: " << x.sizeinbits << " bits." 
+ << std::endl; + throw std::invalid_argument(ss.str()); + } } -template +template EWAHBoolArrayIterator::EWAHBoolArrayIterator( - const vector & parent) : - pointer(0), myparent(parent), compressedwords(0), literalwords(0), rl(0), - lw(0), b(0) { - if (pointer < myparent.size()) - readNewRunningLengthWord(); + const std::vector &parent) + : pointer(0), myparent(parent), compressedwords(0), literalwords(0), rl(0), + lw(0), b(0) { + if (pointer < myparent.size()) + readNewRunningLengthWord(); } -template +template void EWAHBoolArrayIterator::readNewRunningLengthWord() { - literalwords = 0; - compressedwords = 0; - ConstRunningLengthWord rlw(myparent[pointer]); - rl = rlw.getRunningLength(); - lw = rlw.getNumberOfLiteralWords(); - b = rlw.getRunningBit(); - if ((rl == 0) && (lw == 0)) { - if (pointer < myparent.size() - 1) { - ++pointer; - readNewRunningLengthWord(); - } else { -#ifdef EWAHASSERT - assert(pointer >= myparent.size() - 1); -#endif - pointer = myparent.size(); -#ifdef EWAHASSERT - assert(!hasNext()); -#endif - } + literalwords = 0; + compressedwords = 0; + ConstRunningLengthWord rlw(myparent[pointer]); + rl = rlw.getRunningLength(); + lw = rlw.getNumberOfLiteralWords(); + b = rlw.getRunningBit(); + if ((rl == 0) && (lw == 0)) { + if (pointer < myparent.size() - 1) { + ++pointer; + readNewRunningLengthWord(); + } else { + pointer = myparent.size(); } + } } -template +template BoolArray EWAHBoolArray::toBoolArray() const { - BoolArray ans(sizeinbits); - EWAHBoolArrayIterator i = uncompress(); - size_t counter = 0; - while (i.hasNext()) { - ans.setWord(counter++, i.next()); - } - return ans; + BoolArray ans(sizeinbits); + EWAHBoolArrayIterator i = uncompress(); + size_t counter = 0; + while (i.hasNext()) { + ans.setWord(counter++, i.next()); + } + return ans; } -template -template -void EWAHBoolArray::appendRowIDs(container & out, const size_t offset) const { - size_t pointer(0); - size_t currentoffset(offset); - if (RESERVEMEMORY) - 
out.reserve(buffer.size() + 64);// trading memory for speed. - while (pointer < buffer.size()) { - ConstRunningLengthWord rlw(buffer[pointer]); - if (rlw.getRunningBit()) { - for (size_t x = 0; x < static_cast (rlw.getRunningLength() - * wordinbits); ++x) { - out.push_back(currentoffset + x); - } - } - currentoffset = static_cast (currentoffset - + rlw.getRunningLength() * wordinbits); - ++pointer; - for (uword k = 0; k < rlw.getNumberOfLiteralWords(); ++k) { - const uword currentword = buffer[pointer]; - for (uint32_t kk = 0; kk < wordinbits; ++kk) { - if ((currentword & static_cast (static_cast (1) - << kk)) != 0) - out.push_back(currentoffset + kk); - } - currentoffset += wordinbits; - ++pointer; - } +template +template +void EWAHBoolArray::appendRowIDs(container &out, + const size_t offset) const { + size_t pointer(0); + size_t currentoffset(offset); + if (RESERVEMEMORY) + out.reserve(buffer.size() + 64); // trading memory for speed. + const size_t buffersize = buffer.size(); + while (pointer < buffersize) { + ConstRunningLengthWord rlw(buffer[pointer]); + const size_t productofrl = + static_cast(rlw.getRunningLength() * wordinbits); + if (rlw.getRunningBit()) { + const size_t upper_limit = currentoffset + productofrl; + for (; currentoffset < upper_limit; ++currentoffset) { + out.push_back(currentoffset); + } + } else { + currentoffset += productofrl; + } + ++pointer; + const size_t rlwlw = rlw.getNumberOfLiteralWords(); + for (uword k = 0; k < rlwlw; ++k) { + uword currentword = buffer[pointer]; + while (currentword != 0) { + uint64_t t = currentword & -currentword; + uint32_t r = numberOfTrailingZeros(t); + out.push_back(currentoffset + r); + currentword ^= t; + } + currentoffset += wordinbits; + ++pointer; } + } } -template -bool EWAHBoolArray::operator!=(const EWAHBoolArray & x) const { - return !(*this == x); +template +bool EWAHBoolArray::operator!=(const EWAHBoolArray &x) const { + return !(*this == x); } -template -bool EWAHBoolArray::operator==(const 
BoolArray & x) const { - // could be more efficient - return (this->toBoolArray() == x); +template +bool EWAHBoolArray::operator==(const BoolArray &x) const { + // could be more efficient + return (this->toBoolArray() == x); } -template -bool EWAHBoolArray::operator!=(const BoolArray & x) const { - // could be more efficient - return (this->toBoolArray() != x); +template +bool EWAHBoolArray::operator!=(const BoolArray &x) const { + // could be more efficient + return (this->toBoolArray() != x); } -template -size_t EWAHBoolArray::addStreamOfEmptyWords(const bool v, size_t number) { - if (number == 0) - return 0; - sizeinbits += number * wordinbits; - size_t wordsadded = 0; - if ((RunningLengthWord::getRunningBit(buffer[lastRLW]) != v) - && (RunningLengthWord::size(buffer[lastRLW]) == 0)) { - RunningLengthWord::setRunningBit(buffer[lastRLW], v); - } else if ((RunningLengthWord::getNumberOfLiteralWords( - buffer[lastRLW]) != 0) || (RunningLengthWord::getRunningBit( - buffer[lastRLW]) != v)) { - buffer.push_back(0); - ++wordsadded; - lastRLW = buffer.size() - 1; - if (v) - RunningLengthWord::setRunningBit(buffer[lastRLW], v); - } - const uword runlen = RunningLengthWord::getRunningLength( - buffer[lastRLW]); - - const uword - whatwecanadd = - number - < static_cast (RunningLengthWord::largestrunninglengthcount - - runlen) ? 
static_cast (number) - : static_cast (RunningLengthWord::largestrunninglengthcount - - runlen); +template +size_t EWAHBoolArray::addStreamOfEmptyWords(const bool v, + size_t number) { + if (number == 0) + return 0; + sizeinbits += number * wordinbits; + size_t wordsadded = 0; + if ((RunningLengthWord::getRunningBit(buffer[lastRLW]) != v) && + (RunningLengthWord::size(buffer[lastRLW]) == 0)) { + RunningLengthWord::setRunningBit(buffer[lastRLW], v); + } else if ((RunningLengthWord::getNumberOfLiteralWords( + buffer[lastRLW]) != 0) || + (RunningLengthWord::getRunningBit(buffer[lastRLW]) != v)) { + buffer.push_back(0); + ++wordsadded; + lastRLW = buffer.size() - 1; + if (v) + RunningLengthWord::setRunningBit(buffer[lastRLW], v); + } + const uword runlen = + RunningLengthWord::getRunningLength(buffer[lastRLW]); + + const uword whatwecanadd = + number < static_cast( + RunningLengthWord::largestrunninglengthcount - runlen) + ? static_cast(number) + : static_cast( + RunningLengthWord::largestrunninglengthcount - runlen); + RunningLengthWord::setRunningLength( + buffer[lastRLW], static_cast(runlen + whatwecanadd)); + + number -= static_cast(whatwecanadd); + while (number >= RunningLengthWord::largestrunninglengthcount) { + buffer.push_back(0); + ++wordsadded; + lastRLW = buffer.size() - 1; + if (v) + RunningLengthWord::setRunningBit(buffer[lastRLW], v); + RunningLengthWord::setRunningLength( + buffer[lastRLW], RunningLengthWord::largestrunninglengthcount); + number -= static_cast( + RunningLengthWord::largestrunninglengthcount); + } + if (number > 0) { + buffer.push_back(0); + ++wordsadded; + lastRLW = buffer.size() - 1; + if (v) + RunningLengthWord::setRunningBit(buffer[lastRLW], v); RunningLengthWord::setRunningLength(buffer[lastRLW], - static_cast (runlen + whatwecanadd)); - - number -= static_cast (whatwecanadd); - while (number >= RunningLengthWord::largestrunninglengthcount) { - buffer.push_back(0); - ++wordsadded; - lastRLW = buffer.size() - 1; - if (v) - 
RunningLengthWord::setRunningBit(buffer[lastRLW], v); - RunningLengthWord::setRunningLength(buffer[lastRLW], - RunningLengthWord::largestrunninglengthcount); - number - -= static_cast (RunningLengthWord::largestrunninglengthcount); - } - if (number > 0) { - buffer.push_back(0); - ++wordsadded; - lastRLW = buffer.size() - 1; - if (v) - RunningLengthWord::setRunningBit(buffer[lastRLW], v); - RunningLengthWord::setRunningLength(buffer[lastRLW], - static_cast (number)); - } - return wordsadded; + static_cast(number)); + } + return wordsadded; } - -template -void EWAHBoolArray::fastaddStreamOfEmptyWords(const bool v, size_t number) { - if ((RunningLengthWord::getRunningBit(buffer[lastRLW]) != v) - && (RunningLengthWord::size(buffer[lastRLW]) == 0)) { - RunningLengthWord::setRunningBit(buffer[lastRLW], v); - } else if ((RunningLengthWord::getNumberOfLiteralWords( - buffer[lastRLW]) != 0) || (RunningLengthWord::getRunningBit( - buffer[lastRLW]) != v)) { - buffer.push_back(0); - lastRLW = buffer.size() - 1; - if (v) - RunningLengthWord::setRunningBit(buffer[lastRLW], v); - } - const uword runlen = RunningLengthWord::getRunningLength( - buffer[lastRLW]); - - const uword - whatwecanadd = - number - < static_cast (RunningLengthWord::largestrunninglengthcount - - runlen) ? 
static_cast (number) - : static_cast (RunningLengthWord::largestrunninglengthcount - - runlen); +template +void EWAHBoolArray::fastaddStreamOfEmptyWords(const bool v, + size_t number) { + if (number == 0) + return; + if ((RunningLengthWord::getRunningBit(buffer[lastRLW]) != v) && + (RunningLengthWord::size(buffer[lastRLW]) == 0)) { + RunningLengthWord::setRunningBit(buffer[lastRLW], v); + } else if ((RunningLengthWord::getNumberOfLiteralWords( + buffer[lastRLW]) != 0) || + (RunningLengthWord::getRunningBit(buffer[lastRLW]) != v)) { + buffer.push_back(0); + lastRLW = buffer.size() - 1; + if (v) + RunningLengthWord::setRunningBit(buffer[lastRLW], v); + } + const uword runlen = + RunningLengthWord::getRunningLength(buffer[lastRLW]); + + const uword whatwecanadd = + number < static_cast( + RunningLengthWord::largestrunninglengthcount - runlen) + ? static_cast(number) + : static_cast( + RunningLengthWord::largestrunninglengthcount - runlen); + RunningLengthWord::setRunningLength( + buffer[lastRLW], static_cast(runlen + whatwecanadd)); + + number -= static_cast(whatwecanadd); + while (number >= RunningLengthWord::largestrunninglengthcount) { + buffer.push_back(0); + lastRLW = buffer.size() - 1; + if (v) + RunningLengthWord::setRunningBit(buffer[lastRLW], v); + RunningLengthWord::setRunningLength( + buffer[lastRLW], RunningLengthWord::largestrunninglengthcount); + number -= static_cast( + RunningLengthWord::largestrunninglengthcount); + } + if (number > 0) { + buffer.push_back(0); + lastRLW = buffer.size() - 1; + if (v) + RunningLengthWord::setRunningBit(buffer[lastRLW], v); RunningLengthWord::setRunningLength(buffer[lastRLW], - static_cast (runlen + whatwecanadd)); - - number -= static_cast (whatwecanadd); - while (number >= RunningLengthWord::largestrunninglengthcount) { - buffer.push_back(0); - lastRLW = buffer.size() - 1; - if (v) - RunningLengthWord::setRunningBit(buffer[lastRLW], v); - RunningLengthWord::setRunningLength(buffer[lastRLW], - 
RunningLengthWord::largestrunninglengthcount); - number - -= static_cast (RunningLengthWord::largestrunninglengthcount); - } - if (number > 0) { - buffer.push_back(0); - lastRLW = buffer.size() - 1; - if (v) - RunningLengthWord::setRunningBit(buffer[lastRLW], v); - RunningLengthWord::setRunningLength(buffer[lastRLW], - static_cast (number)); - } + static_cast(number)); + } } +template +size_t EWAHBoolArray::addStreamOfDirtyWords(const uword *v, + const size_t number) { + if (number == 0) + return 0; + uword rlw = buffer[lastRLW]; + size_t NumberOfLiteralWords = + RunningLengthWord::getNumberOfLiteralWords(rlw); + if (NumberOfLiteralWords + number <= + RunningLengthWord::largestliteralcount) { + RunningLengthWord::setNumberOfLiteralWords( + rlw, NumberOfLiteralWords + number); + buffer[lastRLW] = rlw; + sizeinbits += number * wordinbits; + buffer.insert(buffer.end(), v, v + number); + return number; + } + // we proceed the long way + size_t howmanywecanadd = + RunningLengthWord::largestliteralcount - NumberOfLiteralWords; + RunningLengthWord::setNumberOfLiteralWords( + rlw, RunningLengthWord::largestliteralcount); + buffer[lastRLW] = rlw; + buffer.insert(buffer.end(), v, v + howmanywecanadd); + size_t wordadded = howmanywecanadd; + sizeinbits += howmanywecanadd * wordinbits; + buffer.push_back(0); + lastRLW = buffer.size() - 1; + ++wordadded; + wordadded += + addStreamOfDirtyWords(v + howmanywecanadd, number - howmanywecanadd); + return wordadded; +} -template -size_t EWAHBoolArray::addStreamOfDirtyWords(const uword * v, - const size_t number) { - if (number == 0) - return 0; - RunningLengthWord lastRunningLengthWord(buffer[lastRLW]); - const uword NumberOfLiteralWords = - lastRunningLengthWord.getNumberOfLiteralWords(); -#ifdef EWAHASSERT - assert( - RunningLengthWord::largestliteralcount - >= NumberOfLiteralWords); -#endif - const size_t - whatwecanadd = - number - < static_cast (RunningLengthWord::largestliteralcount - - NumberOfLiteralWords) ? 
number - : static_cast (RunningLengthWord::largestliteralcount - - NumberOfLiteralWords);//0x7FFF-NumberOfLiteralWords); -#ifdef EWAHASSERT - assert(NumberOfLiteralWords + whatwecanadd >= NumberOfLiteralWords); - assert( - NumberOfLiteralWords + whatwecanadd - <= RunningLengthWord::largestliteralcount); -#endif - lastRunningLengthWord.setNumberOfLiteralWords( - static_cast (NumberOfLiteralWords + whatwecanadd)); -#ifdef EWAHASSERT - assert( - lastRunningLengthWord.getNumberOfLiteralWords() - == NumberOfLiteralWords + whatwecanadd); -#endif - const size_t leftovernumber = number - whatwecanadd; - // add the dirty words... - const size_t oldsize(buffer.size()); - buffer.resize(oldsize + whatwecanadd); - memcpy(&buffer[oldsize], v, whatwecanadd * sizeof(uword)); - sizeinbits += whatwecanadd * wordinbits; - size_t wordsadded(whatwecanadd); - if (leftovernumber > 0) { - //add - buffer.push_back(0); - lastRLW = buffer.size() - 1; - ++wordsadded; - wordsadded += addStreamOfDirtyWords(v + whatwecanadd, leftovernumber); - } -#ifdef EWAHASSERT - assert(wordsadded >= number); -#endif - return wordsadded; +template +void EWAHBoolArray::fastaddStreamOfDirtyWords(const uword *v, + const size_t number) { + if (number == 0) + return; + uword rlw = buffer[lastRLW]; + size_t NumberOfLiteralWords = + RunningLengthWord::getNumberOfLiteralWords(rlw); + if (NumberOfLiteralWords + number <= + RunningLengthWord::largestliteralcount) { + RunningLengthWord::setNumberOfLiteralWords( + rlw, NumberOfLiteralWords + number); + buffer[lastRLW] = rlw; + for (size_t i = 0; i < number; ++i) + buffer.push_back(v[i]); + // buffer.insert(buffer.end(), v, v+number); // seems slower than push_back? 
+ return; + } + // we proceed the long way + size_t howmanywecanadd = + RunningLengthWord::largestliteralcount - NumberOfLiteralWords; + RunningLengthWord::setNumberOfLiteralWords( + rlw, RunningLengthWord::largestliteralcount); + buffer[lastRLW] = rlw; + for (size_t i = 0; i < howmanywecanadd; ++i) + buffer.push_back(v[i]); + // buffer.insert(buffer.end(), v, v+howmanywecanadd);// seems slower than + // push_back? + buffer.push_back(0); + lastRLW = buffer.size() - 1; + fastaddStreamOfDirtyWords(v + howmanywecanadd, number - howmanywecanadd); } -template -size_t EWAHBoolArray::addStreamOfNegatedDirtyWords(const uword * v, - const size_t number) { - if (number == 0) - return 0; - RunningLengthWord lastRunningLengthWord(buffer[lastRLW]); - const uword NumberOfLiteralWords = - lastRunningLengthWord.getNumberOfLiteralWords(); -#ifdef EWAHASSERT - assert( - RunningLengthWord::largestliteralcount - >= NumberOfLiteralWords); -#endif - const size_t - whatwecanadd = - number - < static_cast (RunningLengthWord::largestliteralcount - - NumberOfLiteralWords) ? number - : static_cast (RunningLengthWord::largestliteralcount - - NumberOfLiteralWords);//0x7FFF-NumberOfLiteralWords); -#ifdef EWAHASSERT - assert(NumberOfLiteralWords + whatwecanadd >= NumberOfLiteralWords); - assert( - NumberOfLiteralWords + whatwecanadd - <= RunningLengthWord::largestliteralcount); -#endif - lastRunningLengthWord.setNumberOfLiteralWords( - static_cast (NumberOfLiteralWords + whatwecanadd)); -#ifdef EWAHASSERT - assert( - lastRunningLengthWord.getNumberOfLiteralWords() - == NumberOfLiteralWords + whatwecanadd); -#endif - const size_t leftovernumber = number - whatwecanadd; - // add the dirty words... 
- const size_t oldsize(buffer.size()); - buffer.resize(oldsize + whatwecanadd); - for(size_t k = 0; k 0) { - //add - buffer.push_back(0); - lastRLW = buffer.size() - 1; - ++wordsadded; - wordsadded += addStreamOfDirtyWords(v + whatwecanadd, leftovernumber); - } -#ifdef EWAHASSERT - assert(wordsadded >= number); -#endif - return wordsadded; +template +size_t EWAHBoolArray::addStreamOfNegatedDirtyWords(const uword *v, + const size_t number) { + if (number == 0) + return 0; + uword rlw = buffer[lastRLW]; + size_t NumberOfLiteralWords = + RunningLengthWord::getNumberOfLiteralWords(rlw); + if (NumberOfLiteralWords + number <= + RunningLengthWord::largestliteralcount) { + RunningLengthWord::setNumberOfLiteralWords( + rlw, NumberOfLiteralWords + number); + buffer[lastRLW] = rlw; + sizeinbits += number * wordinbits; + for (size_t k = 0; k < number; ++k) + buffer.push_back(~v[k]); + return number; + } + // we proceed the long way + size_t howmanywecanadd = + RunningLengthWord::largestliteralcount - NumberOfLiteralWords; + RunningLengthWord::setNumberOfLiteralWords( + rlw, RunningLengthWord::largestliteralcount); + buffer[lastRLW] = rlw; + for (size_t k = 0; k < howmanywecanadd; ++k) + buffer.push_back(~v[k]); + size_t wordadded = howmanywecanadd; + sizeinbits += howmanywecanadd * wordinbits; + buffer.push_back(0); + lastRLW = buffer.size() - 1; + ++wordadded; + wordadded += + addStreamOfDirtyWords(v + howmanywecanadd, number - howmanywecanadd); + return wordadded; } -template -size_t EWAHBoolArray::addEmptyWord(const bool v) { - RunningLengthWord lastRunningLengthWord(buffer[lastRLW]); - const bool noliteralword = (lastRunningLengthWord.getNumberOfLiteralWords() - == 0); - //first, if the last running length word is empty, we align it - // this - uword runlen = lastRunningLengthWord.getRunningLength(); - if ((noliteralword) && (runlen == 0)) { - lastRunningLengthWord.setRunningBit(v); -#ifdef EWAHASSERT - assert(lastRunningLengthWord.getRunningBit() == v); -#endif - } - if 
((noliteralword) && (lastRunningLengthWord.getRunningBit() == v) - && (runlen < RunningLengthWord::largestrunninglengthcount)) { - lastRunningLengthWord.setRunningLength(static_cast (runlen + 1)); -#ifdef EWAHASSERT - assert(lastRunningLengthWord.getRunningLength() == runlen + 1); -#endif - return 0; - } else { - // we have to start anew - buffer.push_back(0); - lastRLW = buffer.size() - 1; - RunningLengthWord lastRunningLengthWord2(buffer[lastRLW]); -#ifdef EWAHASSERT - assert(lastRunningLengthWord2.getRunningLength() == 0); - assert(lastRunningLengthWord2.getRunningBit() == 0); - assert(lastRunningLengthWord2.getNumberOfLiteralWords() == 0); -#endif - lastRunningLengthWord2.setRunningBit(v); -#ifdef EWAHASSERT - assert(lastRunningLengthWord2.getRunningBit() == v); -#endif - lastRunningLengthWord2.setRunningLength(1); -#ifdef EWAHASSERT - assert(lastRunningLengthWord2.getRunningLength() == 1); - assert(lastRunningLengthWord2.getNumberOfLiteralWords() == 0); -#endif - return 1; - } +template size_t EWAHBoolArray::addEmptyWord(const bool v) { + RunningLengthWord lastRunningLengthWord(buffer[lastRLW]); + const bool noliteralword = + (lastRunningLengthWord.getNumberOfLiteralWords() == 0); + // first, if the last running length word is empty, we align it + // this + uword runlen = lastRunningLengthWord.getRunningLength(); + if ((noliteralword) && (runlen == 0)) { + lastRunningLengthWord.setRunningBit(v); + } + if ((noliteralword) && (lastRunningLengthWord.getRunningBit() == v) && + (runlen < RunningLengthWord::largestrunninglengthcount)) { + lastRunningLengthWord.setRunningLength(static_cast(runlen + 1)); + return 0; + } else { + // we have to start anew + buffer.push_back(0); + lastRLW = buffer.size() - 1; + RunningLengthWord lastRunningLengthWord2(buffer[lastRLW]); + lastRunningLengthWord2.setRunningBit(v); + lastRunningLengthWord2.setRunningLength(1); + return 1; + } } +template +void fast_logicalor_tocontainer(size_t n, const EWAHBoolArray **inputs, + EWAHBoolArray 
&container) { + class EWAHBoolArrayPtr { -template -void EWAHBoolArray::logicalor(const EWAHBoolArray &a, EWAHBoolArray &container) const { - container.reset(); - if (RESERVEMEMORY) - container.buffer.reserve(buffer.size() + a.buffer.size()); - EWAHBoolArrayRawIterator i = a.raw_iterator(); - EWAHBoolArrayRawIterator j = raw_iterator(); - if (!(i.hasNext() and j.hasNext())) {// hopefully this never happens... - container.setSizeInBits(sizeInBits()); - return; - } - // at this point, this should be safe: - BufferedRunningLengthWord & rlwi = i.next(); - BufferedRunningLengthWord & rlwj = j.next(); - - while ((rlwi.size() > 0) && (rlwj.size() > 0)) { - while ((rlwi.getRunningLength() > 0) - || (rlwj.getRunningLength() > 0)) { - const bool i_is_prey = rlwi - .getRunningLength() < rlwj - .getRunningLength(); - BufferedRunningLengthWord & prey = i_is_prey ? rlwi - : rlwj; - BufferedRunningLengthWord & predator = i_is_prey ? rlwj - : rlwi; - if (predator.getRunningBit()) { - container.addStreamOfEmptyWords(true, - predator.getRunningLength()); - prey.discardFirstWordsWithReload(predator - .getRunningLength()); - } else { - const size_t index = prey.discharge(container, - predator.getRunningLength()); - container.addStreamOfEmptyWords(false, - predator.getRunningLength() - - index - ); - } - predator.discardRunningWordsWithReload(); - } - const size_t nbre_literal = min( - rlwi.getNumberOfLiteralWords(), - rlwj.getNumberOfLiteralWords()); - if (nbre_literal > 0) { - for (size_t k = 0; k < nbre_literal; ++k) { - container.addWord(rlwi.getLiteralWordAt(k) - | rlwj.getLiteralWordAt(k)); - } - rlwi.discardFirstWordsWithReload(nbre_literal); - rlwj.discardFirstWordsWithReload(nbre_literal); - } + public: + EWAHBoolArrayPtr(const EWAHBoolArray *p, bool o) : ptr(p), own(o) {} + const EWAHBoolArray *ptr; + bool own; // whether to clean + + bool operator<(const EWAHBoolArrayPtr &o) const { + return o.ptr->sizeInBytes() < ptr->sizeInBytes(); // backward on purpose } - const bool 
i_remains = rlwi.size() > 0; - BufferedRunningLengthWord & remaining = i_remains ? rlwi - : rlwj; - remaining.discharge(container); + }; + if (n == 0) { + container.reset(); + return; + } + if (n == 1) { + container = *inputs[0]; + return; + } + std::priority_queue pq; + for (size_t i = 0; i < n; i++) { + // could use emplace + pq.push(EWAHBoolArrayPtr(inputs[i], false)); + } + while (pq.size() > 2) { + + EWAHBoolArrayPtr x1 = pq.top(); + pq.pop(); + + EWAHBoolArrayPtr x2 = pq.top(); + pq.pop(); + + EWAHBoolArray *buffer = new EWAHBoolArray(); + x1.ptr->logicalor(*x2.ptr, *buffer); + + if (x1.own) { + delete x1.ptr; + } + if (x2.own) { + delete x2.ptr; + } + pq.push(EWAHBoolArrayPtr(buffer, true)); + } + EWAHBoolArrayPtr x1 = pq.top(); + pq.pop(); + + EWAHBoolArrayPtr x2 = pq.top(); + pq.pop(); + + x1.ptr->logicalor(*x2.ptr, container); + + if (x1.own) { + delete x1.ptr; + } + if (x2.own) { + delete x2.ptr; + } } -template -void EWAHBoolArray::logicalxor(const EWAHBoolArray &a, EWAHBoolArray &container) const { - container.reset(); - if (RESERVEMEMORY) - container.buffer.reserve(buffer.size() + a.buffer.size()); - EWAHBoolArrayRawIterator i = a.raw_iterator(); - EWAHBoolArrayRawIterator j = raw_iterator(); - if (!(i.hasNext() and j.hasNext())) {// hopefully this never happens... - container.setSizeInBits(sizeInBits()); - return; +template +void EWAHBoolArray::logicalor(const EWAHBoolArray &a, + EWAHBoolArray &container) const { + container.reset(); + if (RESERVEMEMORY) + container.buffer.reserve(buffer.size() + a.buffer.size()); + EWAHBoolArrayRawIterator i = a.raw_iterator(); + EWAHBoolArrayRawIterator j = raw_iterator(); + if (!(i.hasNext() and j.hasNext())) { // hopefully this never happens... 
+ container.setSizeInBits(sizeInBits()); + return; + } + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); + + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { + const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); + BufferedRunningLengthWord &prey = i_is_prey ? rlwi : rlwj; + BufferedRunningLengthWord &predator = i_is_prey ? rlwj : rlwi; + if (predator.getRunningBit()) { + container.fastaddStreamOfEmptyWords(true, predator.getRunningLength()); + prey.discardFirstWordsWithReload(predator.getRunningLength()); + } else { + const size_t index = + prey.discharge(container, predator.getRunningLength()); + container.fastaddStreamOfEmptyWords(false, predator.getRunningLength() - + index); + } + predator.discardRunningWordsWithReload(); } - // at this point, this should be safe: - BufferedRunningLengthWord & rlwi = i.next(); - BufferedRunningLengthWord & rlwj = j.next(); - - - while ((rlwi.size() > 0) && (rlwj.size() > 0)) { - while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { - const bool i_is_prey = rlwi.getRunningLength() < rlwj .getRunningLength(); - BufferedRunningLengthWord & prey = i_is_prey ? rlwi : rlwj; - BufferedRunningLengthWord & predator = i_is_prey ? rlwj : rlwi; - const size_t index = (!predator.getRunningBit()) ? 
prey.discharge(container, - predator.getRunningLength()) : prey.dischargeNegated(container, - predator.getRunningLength()); - container.addStreamOfEmptyWords(predator.getRunningBit(), predator.getRunningLength() - index); - predator.discardRunningWordsWithReload(); - } - const size_t nbre_literal = min(rlwi.getNumberOfLiteralWords(),rlwj.getNumberOfLiteralWords()); - if (nbre_literal > 0) { - for (size_t k = 0; k < nbre_literal; ++k) - container.addWord(rlwi.getLiteralWordAt(k) ^ rlwj.getLiteralWordAt(k)); - rlwi.discardFirstWordsWithReload(nbre_literal); - rlwj.discardFirstWordsWithReload(nbre_literal); - } + + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); + if (nbre_literal > 0) { + for (size_t k = 0; k < nbre_literal; ++k) { + container.addWord(rlwi.getLiteralWordAt(k) | rlwj.getLiteralWordAt(k)); + } + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); + } + } + const bool i_remains = rlwi.size() > 0; + BufferedRunningLengthWord &remaining = i_remains ? rlwi : rlwj; + remaining.discharge(container); + container.setSizeInBits(sizeInBits() > a.sizeInBits() ? sizeInBits() : a.sizeInBits()); +} + +template +size_t EWAHBoolArray::logicalorcount(const EWAHBoolArray &a) const { + size_t answer = 0; + EWAHBoolArrayRawIterator i = a.raw_iterator(); + EWAHBoolArrayRawIterator j = raw_iterator(); + if (!(i.hasNext() and j.hasNext())) { // hopefully this never happens... + return 0; + } + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); + + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { + const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); + BufferedRunningLengthWord &prey = i_is_prey ? rlwi : rlwj; + BufferedRunningLengthWord &predator = i_is_prey ? 
rlwj : rlwi; + if (predator.getRunningBit()) { + answer += predator.getRunningLength() * wordinbits; + prey.discardFirstWordsWithReload(predator.getRunningLength()); + + } else { + // const size_t index = + prey.dischargeCount(predator.getRunningLength(), &answer); + } + predator.discardRunningWordsWithReload(); } - const bool i_remains = rlwi.size() > 0; - BufferedRunningLengthWord & remaining = i_remains ? rlwi : rlwj; - remaining.discharge(container); - /// container.setSizeInBitsWithinLastWord(Math.max(sizeInBits(), a.sizeInBits())); + + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); + if (nbre_literal > 0) { + for (size_t k = 0; k < nbre_literal; ++k) { + answer += countOnes( + (uword)(rlwi.getLiteralWordAt(k) | rlwj.getLiteralWordAt(k))); + } + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); + } + } + const bool i_remains = rlwi.size() > 0; + BufferedRunningLengthWord &remaining = i_remains ? rlwi : rlwj; + answer += remaining.dischargeCount(); + return answer; } +template +void EWAHBoolArray::logicalxor(const EWAHBoolArray &a, + EWAHBoolArray &container) const { + container.reset(); + if (RESERVEMEMORY) + container.buffer.reserve(buffer.size() + a.buffer.size()); + EWAHBoolArrayRawIterator i = a.raw_iterator(); + EWAHBoolArrayRawIterator j = raw_iterator(); + if (!(i.hasNext() and j.hasNext())) { // hopefully this never happens... + container.setSizeInBits(sizeInBits()); + return; + } + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { + const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); + BufferedRunningLengthWord &prey = i_is_prey ? rlwi : rlwj; + BufferedRunningLengthWord &predator = i_is_prey ? 
rlwj : rlwi; + const size_t index = + (!predator.getRunningBit()) + ? prey.discharge(container, predator.getRunningLength()) + : prey.dischargeNegated(container, predator.getRunningLength()); + container.fastaddStreamOfEmptyWords(predator.getRunningBit(), + predator.getRunningLength() - index); + predator.discardRunningWordsWithReload(); + } + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); + if (nbre_literal > 0) { + for (size_t k = 0; k < nbre_literal; ++k) + container.addWord(rlwi.getLiteralWordAt(k) ^ rlwj.getLiteralWordAt(k)); + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); + } + } + const bool i_remains = rlwi.size() > 0; + BufferedRunningLengthWord &remaining = i_remains ? rlwi : rlwj; + remaining.discharge(container); + container.setSizeInBits(sizeInBits() > a.sizeInBits() ? sizeInBits() : a.sizeInBits()); +} + +template +size_t EWAHBoolArray::logicalxorcount(const EWAHBoolArray &a) const { + EWAHBoolArrayRawIterator i = a.raw_iterator(); + EWAHBoolArrayRawIterator j = raw_iterator(); + if (!i.hasNext()) + return a.numberOfOnes(); + if (!j.hasNext()) + return this->numberOfOnes(); + + size_t answer = 0; + + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { + const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); + BufferedRunningLengthWord &prey = i_is_prey ? rlwi : rlwj; + BufferedRunningLengthWord &predator = i_is_prey ? 
rlwj : rlwi; + size_t index; + + if (predator.getRunningBit()) { + index = + prey.dischargeCountNegated(predator.getRunningLength(), &answer); + } else { + index = prey.dischargeCount(predator.getRunningLength(), &answer); + } + if (predator.getRunningBit()) + answer += (predator.getRunningLength() - index) * wordinbits; + + predator.discardRunningWordsWithReload(); + } + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); + if (nbre_literal > 0) { + for (size_t k = 0; k < nbre_literal; ++k) { + answer += countOnes( + (uword)(rlwi.getLiteralWordAt(k) ^ rlwj.getLiteralWordAt(k))); + } + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); + } + } + const bool i_remains = rlwi.size() > 0; + BufferedRunningLengthWord &remaining = i_remains ? rlwi : rlwj; + answer += remaining.dischargeCount(); + return answer; +} -template +template void EWAHBoolArray::logicaland(const EWAHBoolArray &a, - EWAHBoolArray &container) const { - container.reset(); - if (RESERVEMEMORY) - container.buffer.reserve( - buffer.size() > a.buffer.size() ? buffer.size() - : a.buffer.size()); - EWAHBoolArrayRawIterator i = a.raw_iterator(); - EWAHBoolArrayRawIterator j = raw_iterator(); - if (!(i.hasNext() and j.hasNext())) {// hopefully this never happens... - container.setSizeInBits(sizeInBits()); - return; + EWAHBoolArray &container) const { + container.reset(); + if (RESERVEMEMORY) + container.buffer.reserve(buffer.size() > a.buffer.size() ? buffer.size() + : a.buffer.size()); + EWAHBoolArrayRawIterator i = a.raw_iterator(); + EWAHBoolArrayRawIterator j = raw_iterator(); + if (!(i.hasNext() and j.hasNext())) { // hopefully this never happens... 
+ container.setSizeInBits(sizeInBits()); + return; + } + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); + + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { + const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); + BufferedRunningLengthWord &prey(i_is_prey ? rlwi : rlwj); + BufferedRunningLengthWord &predator(i_is_prey ? rlwj : rlwi); + if (!predator.getRunningBit()) { + container.fastaddStreamOfEmptyWords(false, predator.getRunningLength()); + prey.discardFirstWordsWithReload(predator.getRunningLength()); + } else { + const size_t index = + prey.discharge(container, predator.getRunningLength()); + container.fastaddStreamOfEmptyWords(false, predator.getRunningLength() - + index); + } + predator.discardRunningWordsWithReload(); } - // at this point, this should be safe: - BufferedRunningLengthWord & rlwi = i.next(); - BufferedRunningLengthWord & rlwj = j.next(); - - while ((rlwi.size() > 0) && (rlwj.size() > 0)) { - while ((rlwi.getRunningLength() > 0) - || (rlwj.getRunningLength() > 0)) { - const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); - BufferedRunningLengthWord & prey(i_is_prey ? rlwi : rlwj); - BufferedRunningLengthWord & predator(i_is_prey ? 
rlwj : rlwi); - if (!predator.getRunningBit()) { - container.fastaddStreamOfEmptyWords(false, predator.getRunningLength()); - prey.discardFirstWordsWithReload(predator.getRunningLength()); - } else { - const size_t index = prey.discharge(container, predator.getRunningLength()); - container.fastaddStreamOfEmptyWords(false, predator.getRunningLength() - index); - } - predator.discardRunningWordsWithReload(); - } - const size_t nbre_literal = min(rlwi.getNumberOfLiteralWords(), rlwj.getNumberOfLiteralWords()); - if (nbre_literal > 0) { - for (size_t k = 0; k < nbre_literal; ++k) { - container.addWord(rlwi.getLiteralWordAt(k) & rlwj.getLiteralWordAt(k)); - } - rlwi.discardFirstWordsWithReload(nbre_literal); - rlwj.discardFirstWordsWithReload(nbre_literal); - } + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); + if (nbre_literal > 0) { + for (size_t k = 0; k < nbre_literal; ++k) { + container.addWord(rlwi.getLiteralWordAt(k) & rlwj.getLiteralWordAt(k)); + } + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); } + } + container.setSizeInBits(sizeInBits()); + container.setSizeInBits(sizeInBits() > a.sizeInBits() ? sizeInBits() : a.sizeInBits()); +} + +template +void EWAHBoolArray::logicalandnot(const EWAHBoolArray &a, + EWAHBoolArray &container) const { + container.reset(); + if (RESERVEMEMORY) + container.buffer.reserve(buffer.size() > a.buffer.size() ? buffer.size() + : a.buffer.size()); + EWAHBoolArrayRawIterator i = raw_iterator(); + EWAHBoolArrayRawIterator j = a.raw_iterator(); + if (!j.hasNext()) { // the other fellow is empty + container = *this; // just copy, stupidly, the data + return; + } + if (!(i.hasNext())) { // hopefully this never happens... 
container.setSizeInBits(sizeInBits()); + return; + } + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); + + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { + const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); + BufferedRunningLengthWord &prey(i_is_prey ? rlwi : rlwj); + BufferedRunningLengthWord &predator(i_is_prey ? rlwj : rlwi); + if (((predator.getRunningBit()) && (i_is_prey)) || + ((!predator.getRunningBit()) && (!i_is_prey))) { + container.fastaddStreamOfEmptyWords(false, predator.getRunningLength()); + prey.discardFirstWordsWithReload(predator.getRunningLength()); + } else if (i_is_prey) { + const size_t index = + prey.discharge(container, predator.getRunningLength()); + container.fastaddStreamOfEmptyWords(false, predator.getRunningLength() - + index); + } else { + const size_t index = + prey.dischargeNegated(container, predator.getRunningLength()); + container.fastaddStreamOfEmptyWords(true, predator.getRunningLength() - + index); + } + predator.discardRunningWordsWithReload(); + } + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); + if (nbre_literal > 0) { + for (size_t k = 0; k < nbre_literal; ++k) { + container.addWord(rlwi.getLiteralWordAt(k) & ~rlwj.getLiteralWordAt(k)); + } + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); + } + } + const bool i_remains = rlwi.size() > 0; + if (i_remains) { + rlwi.discharge(container); + } + container.setSizeInBits(sizeInBits()); +} + +template +size_t EWAHBoolArray::logicalandnotcount(const EWAHBoolArray &a) const { + EWAHBoolArrayRawIterator i = raw_iterator(); + EWAHBoolArrayRawIterator j = a.raw_iterator(); + if (!j.hasNext()) { // the other fellow is empty + return this->numberOfOnes(); + } + if (!(i.hasNext())) { // hopefully 
this never happens... + return 0; + } + size_t answer = 0; + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); + + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { + const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); + BufferedRunningLengthWord &prey(i_is_prey ? rlwi : rlwj); + BufferedRunningLengthWord &predator(i_is_prey ? rlwj : rlwi); + if (((predator.getRunningBit()) && (i_is_prey)) || + ((!predator.getRunningBit()) && (!i_is_prey))) { + prey.discardFirstWordsWithReload(predator.getRunningLength()); + } else if (i_is_prey) { + prey.dischargeCount(predator.getRunningLength(), &answer); + } else { + const size_t index = + prey.dischargeCountNegated(predator.getRunningLength(), &answer); + answer += (predator.getRunningLength() - index) * wordinbits; + } + predator.discardRunningWordsWithReload(); + } + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); + if (nbre_literal > 0) { + for (size_t k = 0; k < nbre_literal; ++k) { + answer += countOnes( + (uword)(rlwi.getLiteralWordAt(k) & (~rlwj.getLiteralWordAt(k)))); + } + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); + } + } + const bool i_remains = rlwi.size() > 0; + if (i_remains) { + answer += rlwi.dischargeCount(); + } + return answer; } +template +size_t EWAHBoolArray::logicalandcount(const EWAHBoolArray &a) const { + EWAHBoolArrayRawIterator i = a.raw_iterator(); + EWAHBoolArrayRawIterator j = raw_iterator(); + if (!(i.hasNext() and j.hasNext())) { // hopefully this never happens... 
+ return 0; + } + size_t answer = 0; + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); + + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { + const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); + BufferedRunningLengthWord &prey(i_is_prey ? rlwi : rlwj); + BufferedRunningLengthWord &predator(i_is_prey ? rlwj : rlwi); + if (!predator.getRunningBit()) { + prey.discardFirstWordsWithReload(predator.getRunningLength()); + } else { + // const size_t index = + prey.dischargeCount(predator.getRunningLength(), &answer); + } + predator.discardRunningWordsWithReload(); + } + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); + if (nbre_literal > 0) { + for (size_t k = 0; k < nbre_literal; ++k) { + answer += countOnes( + (uword)(rlwi.getLiteralWordAt(k) & rlwj.getLiteralWordAt(k))); + } + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); + } + } + return answer; +} -template +template bool EWAHBoolArray::intersects(const EWAHBoolArray &a) const { -EWAHBoolArrayRawIterator i = a.raw_iterator(); -EWAHBoolArrayRawIterator j = raw_iterator(); -if (!(i.hasNext() and j.hasNext())) {// hopefully this never happens... + EWAHBoolArrayRawIterator i = a.raw_iterator(); + EWAHBoolArrayRawIterator j = raw_iterator(); + if (!(i.hasNext() and j.hasNext())) { // hopefully this never happens... 
return false; -} -// at this point, this should be safe: -BufferedRunningLengthWord & rlwi = i.next(); -BufferedRunningLengthWord & rlwj = j.next(); + } + // at this point, this should be safe: + BufferedRunningLengthWord &rlwi = i.next(); + BufferedRunningLengthWord &rlwj = j.next(); -while ((rlwi.size() > 0) && (rlwj.size() > 0)) { - while ((rlwi.getRunningLength() > 0) - || (rlwj.getRunningLength() > 0)) { + while ((rlwi.size() > 0) && (rlwj.size() > 0)) { + while ((rlwi.getRunningLength() > 0) || (rlwj.getRunningLength() > 0)) { const bool i_is_prey = rlwi.getRunningLength() < rlwj.getRunningLength(); - BufferedRunningLengthWord & prey(i_is_prey ? rlwi : rlwj); - BufferedRunningLengthWord & predator(i_is_prey ? rlwj : rlwi); - if (!predator.getRunningBit()) { - prey.discardFirstWordsWithReload(predator.getRunningLength()); - } else { - size_t index = 0; - bool isnonzero = prey.nonzero_discharge(predator.getRunningLength(),index); - if(isnonzero) return true; - } - predator.discardRunningWordsWithReload(); + BufferedRunningLengthWord &prey(i_is_prey ? rlwi : rlwj); + BufferedRunningLengthWord &predator(i_is_prey ? 
rlwj : rlwi); + if (!predator.getRunningBit()) { + prey.discardFirstWordsWithReload(predator.getRunningLength()); + } else { + size_t index = 0; + bool isnonzero = + prey.nonzero_discharge(predator.getRunningLength(), index); + if (isnonzero) + return true; + } + predator.discardRunningWordsWithReload(); } - const size_t nbre_literal = min(rlwi.getNumberOfLiteralWords(), rlwj.getNumberOfLiteralWords()); + const size_t nbre_literal = std::min(rlwi.getNumberOfLiteralWords(), + rlwj.getNumberOfLiteralWords()); if (nbre_literal > 0) { - for (size_t k = 0; k < nbre_literal; ++k) { - if((rlwi.getLiteralWordAt(k) & rlwj.getLiteralWordAt(k))!=0) return true; - } - rlwi.discardFirstWordsWithReload(nbre_literal); - rlwj.discardFirstWordsWithReload(nbre_literal); + for (size_t k = 0; k < nbre_literal; ++k) { + if ((rlwi.getLiteralWordAt(k) & rlwj.getLiteralWordAt(k)) != 0) + return true; + } + rlwi.discardLiteralWordsWithReload(nbre_literal); + rlwj.discardLiteralWordsWithReload(nbre_literal); } -} -return false; + } + return false; } -template +template BitmapStatistics EWAHBoolArray::computeStatistics() const { - //uint totalcompressed(0), totalliteral(0); - BitmapStatistics bs; - EWAHBoolArrayRawIterator i = raw_iterator(); - while (i.hasNext()) { - BufferedRunningLengthWord &brlw(i.next()); - ++bs.runningwordmarker; - bs.totalliteral += brlw.getNumberOfLiteralWords(); - bs.totalcompressed += brlw.getRunningLength(); - if (brlw.getRunningLength() - == RunningLengthWord::largestrunninglengthcount) { - ++bs.maximumofrunningcounterreached; - } - } - return bs; + BitmapStatistics bs; + EWAHBoolArrayRawIterator i = raw_iterator(); + while (i.hasNext()) { + BufferedRunningLengthWord &brlw(i.next()); + ++bs.runningwordmarker; + bs.totalliteral += brlw.getNumberOfLiteralWords(); + bs.totalcompressed += brlw.getRunningLength(); + if (brlw.getRunningLength() == + RunningLengthWord::largestrunninglengthcount) { + ++bs.maximumofrunningcounterreached; + } + } + return bs; } -template 
-void EWAHBoolArray::debugprintout() const { - cout << "==printing out EWAHBoolArray==" << endl; - cout << "Number of compressed words: " << buffer.size() << endl; - size_t pointer = 0; - while (pointer < buffer.size()) { - ConstRunningLengthWord rlw(buffer[pointer]); - bool b = rlw.getRunningBit(); - const uword rl = rlw.getRunningLength(); - const uword lw = rlw.getNumberOfLiteralWords(); - cout << "pointer = " << pointer << " running bit=" << b - << " running length=" << rl << " lit. words=" << lw << endl; - for (uword j = 0; j < lw; ++j) { - const uword & w = buffer[pointer + j + 1]; - cout << toBinaryString(w) << endl; - } - pointer += lw + 1; - } - cout << "==END==" << endl; +template void EWAHBoolArray::debugprintout() const { + std::cout << "==printing out EWAHBoolArray==" << std::endl; + std::cout << "Number of compressed words: " << buffer.size() << std::endl; + size_t pointer = 0; + while (pointer < buffer.size()) { + ConstRunningLengthWord rlw(buffer[pointer]); + bool b = rlw.getRunningBit(); + const uword rl = rlw.getRunningLength(); + const uword lw = rlw.getNumberOfLiteralWords(); + std::cout << "pointer = " << pointer << " running bit=" << b + << " running length=" << rl << " lit. words=" << lw << std::endl; + for (uword j = 0; j < lw; ++j) { + const uword &w = buffer[pointer + j + 1]; + std::cout << toBinaryString(w) << std::endl; + } + pointer += lw + 1; + } + std::cout << "==END==" << std::endl; } -template -size_t EWAHBoolArray::sizeOnDisk() const { - return sizeof(sizeinbits) + sizeof(size_t) + sizeof(uword) * buffer.size(); +template +size_t EWAHBoolArray::sizeOnDisk(const bool savesizeinbits) const { + return (savesizeinbits ? 
sizeof(sizeinbits) : 0) + sizeof(size_t) + + sizeof(uword) * buffer.size(); } #endif diff --git a/yt/utilities/lib/ewahboolarray/ewahutil.h b/yt/utilities/lib/ewahboolarray/ewahutil.h index 0d5231b7d4e..1d5ee30a0f0 100644 --- a/yt/utilities/lib/ewahboolarray/ewahutil.h +++ b/yt/utilities/lib/ewahboolarray/ewahutil.h @@ -25,226 +25,216 @@ #include #include -#if defined(_WIN64) -#include -#endif - // taken from stackoverflow #ifndef NDEBUG -# define ASSERT(condition, message) \ - do { \ - if (! (condition)) { \ - std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ - << " line " << __LINE__ << ": " << message << std::endl; \ - std::exit(EXIT_FAILURE); \ - } \ - } while (false) +#define ASSERT(condition, message) \ + do { \ + if (!(condition)) { \ + std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ + << " line " << __LINE__ << ": " << message << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (false) #else -# define ASSERT(condition, message) do { } while (false) +#define ASSERT(condition, message) \ + do { \ + } while (false) #endif - +#ifdef _MSC_VER +#include +#endif static inline uint32_t ctz64(uint64_t n) { -#if defined(__GNUC__) && UINT_MAX >= UINT32_MAX - return static_cast(__builtin_ctzl(n)); -#elif defined(_WIN64) && defined(_MSC_VER) && _MSC_VER >= 1400 - uint32_t i; - _BitScanForward64((unsigned long *) &i, n); - return i; +#if defined(__GNUC__) && UINT_MAX >= UINT32_MAX && ULLONG_MAX >= UINT64_MAX + return static_cast(__builtin_ctzll(n)); +#elif defined(_WIN64) && defined(_MSC_VER) && _MSC_VER >= 1400 && \ + ULONG_MAX >= UINT64_MAX + uint32_t i; + _BitScanForward64((unsigned long *)&i, n); + return i; #else - uint32_t i = 1; - if ((n & static_cast(4294967295)) == 0) { - n >>= 32; - i += 32; - } - if ((n & static_cast(0x0000FFFFUL)) == 0) { - n >>= 16; - i += 16; - } - - if ((n & static_cast(0x000000FFUL)) == 0) { - n >>= 8; - i += 8; - } - - if ((n & static_cast(0x0000000FUL)) == 0) { - n >>= 4; - i += 4; - } - - if 
((n & static_cast(0x00000003UL)) == 0) { - n >>= 2; - i += 2; - } - i -= (n & 0x1); - return i; + uint32_t i = 1; + if ((n & static_cast(4294967295)) == 0) { + n >>= 32; + i += 32; + } + if ((n & static_cast(0x0000FFFFUL)) == 0) { + n >>= 16; + i += 16; + } + + if ((n & static_cast(0x000000FFUL)) == 0) { + n >>= 8; + i += 8; + } + + if ((n & static_cast(0x0000000FUL)) == 0) { + n >>= 4; + i += 4; + } + + if ((n & static_cast(0x00000003UL)) == 0) { + n >>= 2; + i += 2; + } + i -= (n & 0x1); + return i; #endif } - - - static inline uint32_t ctz32(uint32_t n) { #if defined(__GNUC__) && UINT_MAX >= UINT32_MAX - return static_cast(__builtin_ctz(n)); + return static_cast(__builtin_ctz(n)); #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) - uint32_t i; - __asm__("bsfl %1, %0" : "=r" (i) : "rm" (n)); - return i; + uint32_t i; + __asm__("bsfl %1, %0" : "=r"(i) : "rm"(n)); + return i; #elif defined(_MSC_VER) && _MSC_VER >= 1400 - uint32_t i; - _BitScanForward((unsigned long *) &i, n); - return i; + uint32_t i; + _BitScanForward((unsigned long *)&i, n); + return i; #else - uint32_t i = 1; + uint32_t i = 1; - if ((n & static_cast(0x0000FFFF)) == 0) { - n >>= 16; - i += 16; - } + if ((n & static_cast(0x0000FFFF)) == 0) { + n >>= 16; + i += 16; + } - if ((n & static_cast(0x000000FF)) == 0) { - n >>= 8; - i += 8; - } + if ((n & static_cast(0x000000FF)) == 0) { + n >>= 8; + i += 8; + } - if ((n & static_cast(0x0000000F)) == 0) { - n >>= 4; - i += 4; - } + if ((n & static_cast(0x0000000F)) == 0) { + n >>= 4; + i += 4; + } - if ((n & static_cast(0x00000003)) == 0) { - n >>= 2; - i += 2; - } + if ((n & static_cast(0x00000003)) == 0) { + n >>= 2; + i += 2; + } - i -= (n & 1); + i -= (n & 1); - return i; + return i; #endif } - static inline uint32_t ctz16(uint16_t n) { #if defined(__GNUC__) && UINT_MAX >= UINT32_MAX - return static_cast(__builtin_ctz(n)); + return static_cast(__builtin_ctz(n)); #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) - 
uint32_t i; - __asm__("bsfl %1, %0" : "=r" (i) : "rm" (n)); - return i; + uint32_t i; + __asm__("bsfl %1, %0" : "=r"(i) : "rm"(n)); + return i; #elif defined(_MSC_VER) && _MSC_VER >= 1400 - uint32_t i; - _BitScanForward((unsigned long *) &i, n); - return i; + uint32_t i; + _BitScanForward((unsigned long *)&i, n); + return i; #else - uint32_t i = 1; + uint32_t i = 1; - if ((n & static_cast(0x000000FF)) == 0) { - n >>= 8; - i += 8; - } + if ((n & static_cast(0x000000FF)) == 0) { + n >>= 8; + i += 8; + } - if ((n & static_cast(0x0000000F)) == 0) { - n >>= 4; - i += 4; - } + if ((n & static_cast(0x0000000F)) == 0) { + n >>= 4; + i += 4; + } - if ((n & static_cast(0x00000003)) == 0) { - n >>= 2; - i += 2; - } - i -= (n & 1); + if ((n & static_cast(0x00000003)) == 0) { + n >>= 2; + i += 2; + } + i -= (n & 1); - return i; + return i; #endif } - - - #ifdef __GNUC__ /** * count the number of bits set to one (32 bit version) */ inline uint32_t countOnes(uint32_t x) { - return static_cast(__builtin_popcount(x)); + return static_cast(__builtin_popcount(x)); } #elif defined(_MSC_VER) && _MSC_VER >= 1400 -inline uint32_t countOnes(uint32_t x) { - return __popcnt(x); -} +inline uint32_t countOnes(uint32_t x) { return __popcnt(x); } #else inline uint32_t countOnes(uint32_t v) { - v = v - ((v >> 1) & 0x55555555); - v = (v & 0x33333333) + ((v >> 2) & 0x33333333); - return static_cast((((v + (v >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24); + v = v - ((v >> 1) & 0x55555555); + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + return static_cast((((v + (v >> 4)) & 0x0F0F0F0F) * 0x01010101) >> + 24); } #endif - #ifdef __GNUC__ /** * count the number of bits set to one (64 bit version) */ inline uint32_t countOnes(uint64_t x) { - return static_cast(__builtin_popcountl(x)); + return static_cast(__builtin_popcountll(x)); } #elif defined(_WIN64) && defined(_MSC_VER) && _MSC_VER >= 1400 inline uint32_t countOnes(uint64_t x) { - return static_cast(__popcnt64(static_cast<__int64>(x))); + return 
static_cast(__popcnt64(static_cast<__int64>(x))); } #else inline uint32_t countOnes(uint64_t v) { - v = v - ((v >> 1) & 0x5555555555555555); - v = (v & 0x3333333333333333) + - ((v >> 2) & 0x3333333333333333); - v = ((v + (v >> 4)) & 0x0F0F0F0F0F0F0F0F); - return static_cast((v*(0x0101010101010101))>>56); + v = v - ((v >> 1) & 0x5555555555555555); + v = (v & 0x3333333333333333) + ((v >> 2) & 0x3333333333333333); + v = ((v + (v >> 4)) & 0x0F0F0F0F0F0F0F0F); + return static_cast((v * (0x0101010101010101)) >> 56); } #endif inline uint32_t countOnes(uint16_t v) { - return countOnes(static_cast(v)); + return countOnes(static_cast(v)); } - inline uint32_t numberOfTrailingZeros(uint32_t x) { - if (x == 0) return 32; - return ctz32(x); + if (x == 0) + return 32; + return ctz32(x); } - inline uint32_t numberOfTrailingZeros(uint64_t x) { - if (x == 0) return 64; - return ctz64(x); + if (x == 0) + return 64; + return ctz64(x); } inline uint32_t numberOfTrailingZeros(uint16_t x) { - if (x == 0) return 16; - return ctz16(x); + if (x == 0) + return 16; + return ctz16(x); } - /** * Returns the binary representation of a binary word. 
*/ -template -std::string toBinaryString(const uword w) { - std::ostringstream convert; - for (uint32_t k = 0; k < sizeof(uword) * 8; ++k) { - if (w & (static_cast (1) << k)) - convert << "1"; - else - convert << "0"; - } - return convert.str(); +template std::string toBinaryString(const uword w) { + std::ostringstream convert; + for (uint32_t k = 0; k < sizeof(uword) * 8; ++k) { + if (w & (static_cast(1) << k)) + convert << "1"; + else + convert << "0"; + } + return convert.str(); } #endif diff --git a/yt/utilities/lib/ewahboolarray/runninglengthword.h b/yt/utilities/lib/ewahboolarray/runninglengthword.h index 5ff441a62f8..85ccdf5ddad 100644 --- a/yt/utilities/lib/ewahboolarray/runninglengthword.h +++ b/yt/utilities/lib/ewahboolarray/runninglengthword.h @@ -7,474 +7,542 @@ #ifndef RUNNINGLENGTHWORD_H_ #define RUNNINGLENGTHWORD_H_ #include -using namespace std; /** * For expert users. * This class is used to represent a special type of word storing * a run length. It is defined by the Enhanced Word Aligned Hybrid (EWAH) * format. You don't normally need to access this class. */ -template -class RunningLengthWord { +template class RunningLengthWord { public: - RunningLengthWord(uword & data) : - mydata(data) { - } - - RunningLengthWord(const RunningLengthWord & rlw) : - mydata(rlw.mydata) { - } - - RunningLengthWord& operator=(const RunningLengthWord & rlw) { - mydata = rlw.mydata; - return *this; - } - - /** - * Which bit is being repeated? - */ - bool getRunningBit() const { - return mydata & static_cast (1); - } - - /** - * how many words should be filled by the running bit - */ - static inline bool getRunningBit(uword data) { - return data & static_cast (1); - } - - /** - * how many words should be filled by the running bit - */ - uword getRunningLength() const { - return static_cast((mydata >> 1) & largestrunninglengthcount); - } - - /** - * followed by how many literal words? 
- */ - static inline uword getRunningLength(uword data) { - return static_cast((data >> 1) & largestrunninglengthcount); - } - - /** - * followed by how many literal words? - */ - uword getNumberOfLiteralWords() const { - return static_cast (mydata >> (1 + runninglengthbits)); - } - - /** - * Total of getRunningLength() and getNumberOfLiteralWords() - */ - uword size() const { - return static_cast(getRunningLength() + getNumberOfLiteralWords()); - } - - - - /** - * Total of getRunningLength() and getNumberOfLiteralWords() - */ - static inline uword size(uword data) { - return static_cast(getRunningLength(data) + getNumberOfLiteralWords(data)); - } - - /** - * followed by how many literal words? - */ - static inline uword getNumberOfLiteralWords(uword data) { - return static_cast (data >> (1 + runninglengthbits)); - } - - /** - * running length of which type of bits - */ - void setRunningBit(bool b) { - if (b) - mydata |= static_cast (1); - else - mydata &= static_cast (~1); - } - - - void discardFirstWords(uword x) { -#ifdef EWAHASSERT - assert(x <= size()); -#endif - const uword rl(getRunningLength()); - if (rl >= x) { - setRunningLength(rl - x); - return; - } - x -= rl; - setRunningLength(0); - setNumberOfLiteralWords(getNumberOfLiteralWords() - x); - } - - /** - * running length of which type of bits - */ - static inline void setRunningBit(uword & data, bool b) { - if (b) - data |= static_cast (1); - else - data &= static_cast (~1); - } - - void setRunningLength(uword l) { - mydata |= shiftedlargestrunninglengthcount; - mydata &= static_cast ((l << 1) - | notshiftedlargestrunninglengthcount); - } - - // static call for people who hate objects - static inline void setRunningLength(uword & data, uword l) { - data |= shiftedlargestrunninglengthcount; - data &= static_cast ((l << 1) - | notshiftedlargestrunninglengthcount); - } - - void setNumberOfLiteralWords(uword l) { - mydata |= notrunninglengthplusrunningbit; - mydata &= static_cast ((l << (runninglengthbits + 
1)) - | runninglengthplusrunningbit); - } - // static call for people who hate objects - static inline void setNumberOfLiteralWords(uword & data, uword l) { - data |= notrunninglengthplusrunningbit; - data &= static_cast (l << (runninglengthbits + 1)) - | runninglengthplusrunningbit; - } - - - static const uint32_t runninglengthbits = sizeof(uword) * 4; - static const uint32_t literalbits = sizeof(uword) * 8 - 1 - runninglengthbits; - static const uword largestliteralcount = (static_cast (1) - << literalbits) - 1; - static const uword largestrunninglengthcount = (static_cast (1) - << runninglengthbits) - 1; - static const uword shiftedlargestrunninglengthcount = - largestrunninglengthcount << 1; - static const uword notshiftedlargestrunninglengthcount = - static_cast (~shiftedlargestrunninglengthcount); - static const uword runninglengthplusrunningbit = (static_cast (1) - << (runninglengthbits + 1)) - 1; - static const uword notrunninglengthplusrunningbit = - static_cast (~runninglengthplusrunningbit); - static const uword notlargestrunninglengthcount = - static_cast (~largestrunninglengthcount); - - uword & mydata; + RunningLengthWord(uword &data) : mydata(data) {} + + RunningLengthWord(const RunningLengthWord &rlw) : mydata(rlw.mydata) {} + + RunningLengthWord &operator=(const RunningLengthWord &rlw) { + mydata = rlw.mydata; + return *this; + } + + /** + * Which bit is being repeated? + */ + bool getRunningBit() const { return mydata & static_cast(1); } + + /** + * how many words should be filled by the running bit + */ + static inline bool getRunningBit(uword data) { + return data & static_cast(1); + } + + /** + * how many words should be filled by the running bit + */ + uword getRunningLength() const { + return static_cast((mydata >> 1) & largestrunninglengthcount); + } + + /** + * followed by how many literal words? 
+ */ + static inline uword getRunningLength(uword data) { + return static_cast((data >> 1) & largestrunninglengthcount); + } + + /** + * followed by how many literal words? + */ + uword getNumberOfLiteralWords() const { + return static_cast(mydata >> (1 + runninglengthbits)); + } + + /** + * Total of getRunningLength() and getNumberOfLiteralWords() + */ + uword size() const { + return static_cast(getRunningLength() + getNumberOfLiteralWords()); + } + + /** + * Total of getRunningLength() and getNumberOfLiteralWords() + */ + static inline uword size(uword data) { + return static_cast(getRunningLength(data) + + getNumberOfLiteralWords(data)); + } + + /** + * followed by how many literal words? + */ + static inline uword getNumberOfLiteralWords(uword data) { + return static_cast(data >> (1 + runninglengthbits)); + } + + /** + * running length of which type of bits + */ + void setRunningBit(bool b) { + if (b) + mydata |= static_cast(1); + else + mydata &= static_cast(~1); + } + + void discardFirstWords(uword x) { + const uword rl(getRunningLength()); + if (rl >= x) { + setRunningLength(rl - x); + return; + } + x -= rl; + setRunningLength(0); + setNumberOfLiteralWords(getNumberOfLiteralWords() - x); + } + + /** + * running length of which type of bits + */ + static inline void setRunningBit(uword &data, bool b) { + if (b) + data |= static_cast(1); + else + data &= static_cast(~1); + } + + void setRunningLength(uword l) { + mydata |= shiftedlargestrunninglengthcount; + mydata &= + static_cast((l << 1) | notshiftedlargestrunninglengthcount); + } + + // static call for people who hate objects + static inline void setRunningLength(uword &data, uword l) { + data |= shiftedlargestrunninglengthcount; + data &= static_cast((l << 1) | notshiftedlargestrunninglengthcount); + } + + void setNumberOfLiteralWords(uword l) { + mydata |= notrunninglengthplusrunningbit; + mydata &= static_cast((l << (runninglengthbits + 1)) | + runninglengthplusrunningbit); + } + // static call for 
people who hate objects + static inline void setNumberOfLiteralWords(uword &data, uword l) { + data |= notrunninglengthplusrunningbit; + data &= static_cast(l << (runninglengthbits + 1)) | + runninglengthplusrunningbit; + } + + static const uint32_t runninglengthbits = sizeof(uword) * 4; + static const uint32_t literalbits = sizeof(uword) * 8 - 1 - runninglengthbits; + static const uword largestliteralcount = + (static_cast(1) << literalbits) - 1; + static const uword largestrunninglengthcount = + (static_cast(1) << runninglengthbits) - 1; + static const uword shiftedlargestrunninglengthcount = + largestrunninglengthcount << 1; + static const uword notshiftedlargestrunninglengthcount = + static_cast(~shiftedlargestrunninglengthcount); + static const uword runninglengthplusrunningbit = + (static_cast(1) << (runninglengthbits + 1)) - 1; + static const uword notrunninglengthplusrunningbit = + static_cast(~runninglengthplusrunningbit); + static const uword notlargestrunninglengthcount = + static_cast(~largestrunninglengthcount); + + uword &mydata; }; /** * Same as RunningLengthWord, except that the values cannot be modified. */ -template -class ConstRunningLengthWord { +template class ConstRunningLengthWord { public: - - ConstRunningLengthWord() : - mydata(0) { - } - - ConstRunningLengthWord(const uword data) : - mydata(data) { - } - - ConstRunningLengthWord(const ConstRunningLengthWord & rlw) : - mydata(rlw.mydata) { - } - - /** - * Which bit is being repeated? - */ - bool getRunningBit() const { - return mydata & static_cast (1); - } - - /** - * how many words should be filled by the running bit - */ - uword getRunningLength() const { - return static_cast((mydata >> 1) - & RunningLengthWord::largestrunninglengthcount); - } - - /** - * followed by how many literal words? 
- */ - uword getNumberOfLiteralWords() const { - return static_cast (mydata >> (1 - + RunningLengthWord::runninglengthbits)); - } - - /** - * Total of getRunningLength() and getNumberOfLiteralWords() - */ - uword size() const { - return getRunningLength() + getNumberOfLiteralWords(); - } - - uword mydata; + ConstRunningLengthWord() : mydata(0) {} + + ConstRunningLengthWord(const uword data) : mydata(data) {} + + ConstRunningLengthWord(const ConstRunningLengthWord &rlw) + : mydata(rlw.mydata) {} + + /** + * Which bit is being repeated? + */ + bool getRunningBit() const { return mydata & static_cast(1); } + + /** + * how many words should be filled by the running bit + */ + uword getRunningLength() const { + return static_cast( + (mydata >> 1) & RunningLengthWord::largestrunninglengthcount); + } + + /** + * followed by how many literal words? + */ + uword getNumberOfLiteralWords() const { + return static_cast( + mydata >> (1 + RunningLengthWord::runninglengthbits)); + } + + /** + * Total of getRunningLength() and getNumberOfLiteralWords() + */ + uword size() const { return getRunningLength() + getNumberOfLiteralWords(); } + + uword mydata; }; -template -class EWAHBoolArray; +template class EWAHBoolArray; -template -class EWAHBoolArrayRawIterator; +template class EWAHBoolArrayRawIterator; /** * Same as RunningLengthWord, except that the values are buffered for quick * access. 
*/ -template -class BufferedRunningLengthWord { +template class BufferedRunningLengthWord { public: - BufferedRunningLengthWord(const uword & data, EWAHBoolArrayRawIterator * p) : - RunningBit(data & static_cast (1)), - RunningLength( - static_cast((data >> 1) - & RunningLengthWord::largestrunninglengthcount)), - NumberOfLiteralWords( - static_cast (data >> (1 + RunningLengthWord< - uword>::runninglengthbits))), parent(p) { - } - BufferedRunningLengthWord(const RunningLengthWord & p) : - RunningBit(p.mydata & static_cast (1)), - RunningLength( - (p.mydata >> 1) - & RunningLengthWord::largestrunninglengthcount), - NumberOfLiteralWords( - p.mydata >> (1 - + RunningLengthWord::runninglengthbits)), - parent(p.parent) { - } - - - void discharge(EWAHBoolArray &container) { - while (size() > 0) { - // first run - - size_t pl = getRunningLength(); - container.addStreamOfEmptyWords(getRunningBit(), pl); - size_t pd = getNumberOfLiteralWords(); - writeLiteralWords(pd, container); - discardFirstWordsWithReload(pl + pd); - } - } - - bool nonzero_discharge() { - while (size() > 0) { - // first run - size_t pl = getRunningLength(); - if((pl>0) && (getRunningBit())) return true; - size_t pd = getNumberOfLiteralWords(); - if(pd>0) return true; - discardFirstWordsWithReload(pl + pd); - } - return false; - } - - // Write out up to max words, returns how many were written - size_t discharge(EWAHBoolArray &container, size_t max) { - size_t index = 0; - while ((index < max) && (size() > 0)) { - // first run - size_t pl = getRunningLength(); - if (index + pl > max) { - pl = max - index; - } - container.addStreamOfEmptyWords(getRunningBit(), pl); - index += pl; - size_t pd = getNumberOfLiteralWords(); - if (pd + index > max) { - pd = max - index; - } - writeLiteralWords(pd, container); - index += pd; - discardFirstWordsWithReload(pl + pd); - } - return index; - } - - bool nonzero_discharge(size_t max, size_t & index) { - index = 0; - while ((index < max) && (size() > 0)) { - // first run 
- size_t pl = getRunningLength(); - if (index + pl > max) { - pl = max - index; - } - if((getRunningBit()) && (pl>0)) return true; - index += pl; - size_t pd = getNumberOfLiteralWords(); - if (pd + index > max) { - pd = max - index; - } - if(pd>0) return true; - discardFirstWordsWithReload(pl + pd); - } - return false; - } - - // Write out up to max words, returns how many were written - size_t dischargeNegated(EWAHBoolArray &container, size_t max) { - size_t index = 0; - while ((index < max) && (size() > 0)) { - // first run - size_t pl = getRunningLength(); - if (index + pl > max) { - pl = max - index; - } - container.addStreamOfEmptyWords(!getRunningBit(), pl); - index += pl; - size_t pd = getNumberOfLiteralWords(); - if (pd + index > max) { - pd = max - index; - } - writeNegatedLiteralWords(pd, container); - discardFirstWordsWithReload(pl + pd); - index += pd; - } - return index; - } - bool nonzero_dischargeNegated(size_t max, size_t & index) { - while ((index < max) && (size() > 0)) { - // first run - size_t pl = getRunningLength(); - if (index + pl > max) { - pl = max - index; - } - if((!getRunningBit()) && (pl>0)) return true; - index += pl; - size_t pd = getNumberOfLiteralWords(); - if (pd + index > max) { - pd = max - index; - } - if(pd>0) return true; - discardFirstWordsWithReload(pl + pd); - index += pd; - } - return false; - } - - uword getLiteralWordAt(size_t index) { - return parent->dirtyWords()[index]; - } - - - void writeLiteralWords(size_t numWords, EWAHBoolArray &container) { - container.addStreamOfDirtyWords(parent->dirtyWords(), numWords); - } - - - void writeNegatedLiteralWords(size_t numWords, EWAHBoolArray &container) { - container.addStreamOfNegatedDirtyWords(parent->dirtyWords(), numWords); - } - - void discardRunningWordsWithReload() { + enum { wordinbits = sizeof(uword) * 8 }; + + BufferedRunningLengthWord(const uword &data, + EWAHBoolArrayRawIterator *p) + : RunningBit(data & static_cast(1)), + RunningLength(static_cast( + (data >> 1) & 
RunningLengthWord::largestrunninglengthcount)), + NumberOfLiteralWords(static_cast( + data >> (1 + RunningLengthWord::runninglengthbits))), + parent(p) {} + BufferedRunningLengthWord(const RunningLengthWord &p) + : RunningBit(p.mydata & static_cast(1)), + RunningLength((p.mydata >> 1) & + RunningLengthWord::largestrunninglengthcount), + NumberOfLiteralWords(p.mydata >> + (1 + RunningLengthWord::runninglengthbits)), + parent(p.parent) {} + + void discharge(EWAHBoolArray &container) { + while (size() > 0) { + // first run + size_t pl = getRunningLength(); + container.fastaddStreamOfEmptyWords(getRunningBit(), pl); + size_t pd = getNumberOfLiteralWords(); + writeLiteralWords(pd, container); + if (!next()) + break; + } + } + + size_t dischargeCount() { + size_t answer = 0; + while (size() > 0) { + // first run + if (getRunningBit()) { + answer += wordinbits * getRunningLength(); + } + size_t pd = getNumberOfLiteralWords(); + for (size_t i = 0; i < pd; ++i) + answer += countOnes((uword)getLiteralWordAt(i)); + if (!next()) + break; + } + return answer; + } + + size_t dischargeCountNegated() { + size_t answer = 0; + while (size() > 0) { + // first run + if (!getRunningBit()) { + answer += wordinbits * getRunningLength(); + } + size_t pd = getNumberOfLiteralWords(); + for (size_t i = 0; i < pd; ++i) + answer += countOnes((uword)(~getLiteralWordAt(i))); + if (!next()) + break; + } + return answer; + } + + // Symbolically write out up to max words, returns how many were written, + // write to count the number bits written (we assume that count was initially + // zero) + size_t dischargeCount(size_t max, size_t *count) { + size_t index = 0; + while (true) { + if (index + RunningLength > max) { + const size_t offset = max - index; + if (getRunningBit()) + *count += offset * wordinbits; + RunningLength -= offset; + return max; + } + if (getRunningBit()) + *count += RunningLength * wordinbits; + index += RunningLength; + if (NumberOfLiteralWords + index > max) { + const size_t 
offset = max - index; + for (size_t i = 0; i < offset; ++i) + *count += countOnes((uword)getLiteralWordAt(i)); RunningLength = 0; - if(NumberOfLiteralWords == 0) - next(); - } - - bool next() { - if (!parent->hasNext()) { - NumberOfLiteralWords = 0; - RunningLength = 0; - return false; - } - parent->next(); + NumberOfLiteralWords -= offset; + return max; + } + for (size_t i = 0; i < NumberOfLiteralWords; ++i) + *count += countOnes((uword)getLiteralWordAt(i)); + index += NumberOfLiteralWords; + if (!next()) + break; + } + return index; + } + + size_t dischargeCountNegated(size_t max, size_t *count) { + size_t index = 0; + while (true) { + if (index + RunningLength > max) { + const size_t offset = max - index; + if (!getRunningBit()) + *count += offset * wordinbits; + RunningLength -= offset; + return max; + } + if (!getRunningBit()) + *count += RunningLength * wordinbits; + index += RunningLength; + if (NumberOfLiteralWords + index > max) { + const size_t offset = max - index; + for (size_t i = 0; i < offset; ++i) + *count += countOnes((uword)(~getLiteralWordAt(i))); + RunningLength = 0; + NumberOfLiteralWords -= offset; + return max; + } + for (size_t i = 0; i < NumberOfLiteralWords; ++i) + *count += countOnes((uword)(~getLiteralWordAt(i))); + index += NumberOfLiteralWords; + if (!next()) + break; + } + return index; + } + bool nonzero_discharge() { + while (size() > 0) { + // first run + size_t pl = getRunningLength(); + if ((pl > 0) && (getRunningBit())) return true; - } - - void read(const uword & data) { - RunningBit = data & static_cast (1); - RunningLength = static_cast((data >> 1) - & RunningLengthWord::largestrunninglengthcount); - NumberOfLiteralWords = static_cast (data >> (1 - + RunningLengthWord::runninglengthbits)); - } - - /** - * Which bit is being repeated? 
- */ - bool getRunningBit() const { - return RunningBit; - } - - void discardFirstWords(uword x) { - if (RunningLength >= x) { - RunningLength = static_cast (RunningLength - x); - return; - } - x = static_cast (x - RunningLength); + size_t pd = getNumberOfLiteralWords(); + if (pd > 0) + return true; + discardFirstWordsWithReload(pl + pd); + } + return false; + } + + // Write out up to max words, returns how many were written + size_t discharge(EWAHBoolArray &container, size_t max) { + size_t index = 0; + while (true) { + if (index + RunningLength > max) { + const size_t offset = max - index; + container.fastaddStreamOfEmptyWords(getRunningBit(), offset); + RunningLength -= offset; + return max; + } + container.fastaddStreamOfEmptyWords(getRunningBit(), RunningLength); + index += RunningLength; + if (NumberOfLiteralWords + index > max) { + const size_t offset = max - index; + writeLiteralWords(offset, container); RunningLength = 0; - NumberOfLiteralWords = static_cast (NumberOfLiteralWords - x); - } - - /** - * how many words should be filled by the running bit (see previous method) - */ - uword getRunningLength() const { - return RunningLength; - } - - /** - * followed by how many literal words? - */ - uword getNumberOfLiteralWords() const { - return NumberOfLiteralWords; - } - - /** - * Total of getRunningLength() and getNumberOfLiteralWords() - */ - uword size() const { - return static_cast (RunningLength + NumberOfLiteralWords); - } - - friend ostream& operator<< (ostream &out, const BufferedRunningLengthWord &a) { - out<<"{RunningBit:"< 0) { - if (RunningLength > x) { - RunningLength = static_cast(RunningLength - x); - return; - } - x = static_cast(x - RunningLength); - RunningLength = 0; - size_t toDiscard = - x > NumberOfLiteralWords ? 
NumberOfLiteralWords : x; - NumberOfLiteralWords = static_cast(NumberOfLiteralWords - - toDiscard); - x -= toDiscard; - if ((x > 0) || (size() == 0)) { - if (!next()) - break; - } - } - } - + NumberOfLiteralWords -= offset; + return max; + } + writeLiteralWords(NumberOfLiteralWords, container); + index += NumberOfLiteralWords; + if (!next()) + break; + } + return index; + } + + bool nonzero_discharge(size_t max, size_t &index) { + index = 0; + while ((index < max) && (size() > 0)) { + // first run + size_t pl = getRunningLength(); + if (index + pl > max) { + pl = max - index; + } + if ((getRunningBit()) && (pl > 0)) + return true; + index += pl; + size_t pd = getNumberOfLiteralWords(); + if (pd + index > max) { + pd = max - index; + } + if (pd > 0) + return true; + discardFirstWordsWithReload(pl + pd); + } + return false; + } + + // Write out up to max words, returns how many were written + size_t dischargeNegated(EWAHBoolArray &container, size_t max) { + // todo: could be optimized further + size_t index = 0; + while ((index < max) && (size() > 0)) { + // first run + size_t pl = getRunningLength(); + if (index + pl > max) { + pl = max - index; + } + container.fastaddStreamOfEmptyWords(!getRunningBit(), pl); + index += pl; + size_t pd = getNumberOfLiteralWords(); + if (pd + index > max) { + pd = max - index; + } + writeNegatedLiteralWords(pd, container); + discardFirstWordsWithReload(pl + pd); + index += pd; + } + return index; + } + bool nonzero_dischargeNegated(size_t max, size_t &index) { + while ((index < max) && (size() > 0)) { + // first run + size_t pl = getRunningLength(); + if (index + pl > max) { + pl = max - index; + } + if ((!getRunningBit()) && (pl > 0)) + return true; + index += pl; + size_t pd = getNumberOfLiteralWords(); + if (pd + index > max) { + pd = max - index; + } + if (pd > 0) + return true; + discardFirstWordsWithReload(pl + pd); + index += pd; + } + return false; + } + + uword getLiteralWordAt(size_t index) { return 
parent->dirtyWords()[index]; } + + void writeLiteralWords(size_t numWords, EWAHBoolArray &container) { + container.fastaddStreamOfDirtyWords(parent->dirtyWords(), numWords); + } + + void writeNegatedLiteralWords(size_t numWords, + EWAHBoolArray &container) { + container.addStreamOfNegatedDirtyWords(parent->dirtyWords(), numWords); + } + + void discardRunningWords() { RunningLength = 0; } + + void discardRunningWordsWithReload() { + RunningLength = 0; + if (NumberOfLiteralWords == 0) + next(); + } + + bool next() { + if (!parent->hasNext()) { + NumberOfLiteralWords = 0; + RunningLength = 0; + return false; + } + parent->next(); + return true; + } + + void read(const uword &data) { + RunningBit = data & static_cast(1); + RunningLength = static_cast( + (data >> 1) & RunningLengthWord::largestrunninglengthcount); + NumberOfLiteralWords = static_cast( + data >> (1 + RunningLengthWord::runninglengthbits)); + } + + /** + * Which bit is being repeated? + */ + bool getRunningBit() const { return RunningBit; } + + void discardFirstWords(uword x) { + if (RunningLength >= x) { + RunningLength = static_cast(RunningLength - x); + return; + } + x = static_cast(x - RunningLength); + RunningLength = 0; + NumberOfLiteralWords = static_cast(NumberOfLiteralWords - x); + } + + /** + * how many words should be filled by the running bit (see previous method) + */ + uword getRunningLength() const { return RunningLength; } + + /** + * followed by how many literal words? 
+ */ + uword getNumberOfLiteralWords() const { return NumberOfLiteralWords; } + + /** + * Total of getRunningLength() and getNumberOfLiteralWords() + */ + uword size() const { + return static_cast(RunningLength + NumberOfLiteralWords); + } + + friend std::ostream &operator<<(std::ostream &out, + const BufferedRunningLengthWord &a) { + out << "{RunningBit:" << a.RunningBit + << ",RunningLength:" << a.RunningLength + << ",NumberOfLiteralWords:" << a.NumberOfLiteralWords << "}"; + return out; + } + void discardLiteralWordsWithReload(uword x) { + assert(NumberOfLiteralWords >= x); + NumberOfLiteralWords -= x; + if (NumberOfLiteralWords == 0) + next(); + } + + void discardFirstWordsWithReload(uword x) { + while (x > 0) { + if (RunningLength > x) { + RunningLength = static_cast(RunningLength - x); + return; + } + x = static_cast(x - RunningLength); + RunningLength = 0; + size_t toDiscard = x > NumberOfLiteralWords ? NumberOfLiteralWords : x; + NumberOfLiteralWords = + static_cast(NumberOfLiteralWords - toDiscard); + x -= toDiscard; + if ((x > 0) || (size() == 0)) { + if (!next()) + break; + } + } + } private: - - bool RunningBit; - uword RunningLength; - uword NumberOfLiteralWords; - EWAHBoolArrayRawIterator * parent; - + bool RunningBit; + uword RunningLength; + uword NumberOfLiteralWords; + EWAHBoolArrayRawIterator *parent; }; - - #endif /* RUNNINGLENGTHWORD_H_ */ From 9e1f34e39219885c02785ce804886c2c6563bb03 Mon Sep 17 00:00:00 2001 From: John ZuHone Date: Thu, 28 May 2020 22:04:51 -0400 Subject: [PATCH 37/42] This fixes a compilation error --- yt/utilities/lib/ewah_bool_array.pxd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt/utilities/lib/ewah_bool_array.pxd b/yt/utilities/lib/ewah_bool_array.pxd index 9b55e626d9a..401a59a811d 100644 --- a/yt/utilities/lib/ewah_bool_array.pxd +++ b/yt/utilities/lib/ewah_bool_array.pxd @@ -12,7 +12,7 @@ from libcpp.vector cimport vector from libcpp.map cimport map from libcpp.string cimport string from 
libcpp cimport bool -from libc.stdint cimport uint64_t +from libc.stdint cimport uint64_t, uint32_t # Streams req for c++ IO cdef extern from "" namespace "std": @@ -89,7 +89,7 @@ cdef extern from "boolarray.h": uword getWord(size_t pos) size_t wordinbits -ctypedef np.uint32_t ewah_word_type +ctypedef uint32_t ewah_word_type ctypedef EWAHBoolArray[ewah_word_type] ewah_bool_array ctypedef EWAHBoolArraySetBitForwardIterator[ewah_word_type] ewah_bool_iterator ctypedef vector[size_t] bitset_array From bd1fb35fe29555979aec6fff1701b8792df0f827 Mon Sep 17 00:00:00 2001 From: John ZuHone Date: Fri, 29 May 2020 11:33:38 -0400 Subject: [PATCH 38/42] Make this platform-dependent --- yt/utilities/lib/ewah_bool_array.pxd | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt/utilities/lib/ewah_bool_array.pxd b/yt/utilities/lib/ewah_bool_array.pxd index 401a59a811d..4099e841b39 100644 --- a/yt/utilities/lib/ewah_bool_array.pxd +++ b/yt/utilities/lib/ewah_bool_array.pxd @@ -89,7 +89,10 @@ cdef extern from "boolarray.h": uword getWord(size_t pos) size_t wordinbits -ctypedef uint32_t ewah_word_type +IF UNAME_SYSNAME == "Windows": + ctypedef uint32_t ewah_word_type +ELSE: + ctypedef np.uint32_t ewah_word_type ctypedef EWAHBoolArray[ewah_word_type] ewah_bool_array ctypedef EWAHBoolArraySetBitForwardIterator[ewah_word_type] ewah_bool_iterator ctypedef vector[size_t] bitset_array From 25304b30ed01d1d3eb577528afacaab7a02019f1 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 29 May 2020 12:08:33 -0500 Subject: [PATCH 39/42] Update order-of-include and C++11 for particle_oct_container.pyx --- setup.py | 7 ++++--- yt/geometry/particle_oct_container.pyx | 23 +++++++++++------------ yt/utilities/lib/ewah_bool_array.pxd | 5 +++-- yt/utilities/lib/ewah_bool_wrap.pxd | 3 +-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/setup.py b/setup.py index 4dfbf9e2639..9911b736093 100644 --- a/setup.py +++ b/setup.py @@ -106,7 +106,8 @@ def _compile( 
include_dirs=["yt/utilities/lib/", "yt/utilities/lib/ewahboolarray"], language="c++", - libraries=std_libs), + libraries=std_libs, + extra_compile_args=["-std=c++11"]), Extension("yt.geometry.selection_routines", ["yt/geometry/selection_routines.pyx"], include_dirs=["yt/utilities/lib/"], @@ -153,7 +154,7 @@ def _compile( ], libraries=std_libs, language="c++", - extra_compile_arg=["-std=c++03"]), + extra_compile_args=["-std=c++03"]), Extension("yt.utilities.lib.cykdtree.utils", [ "yt/utilities/lib/cykdtree/utils.pyx", @@ -162,7 +163,7 @@ def _compile( depends=["yt/utilities/lib/cykdtree/c_utils.hpp"], libraries=std_libs, language="c++", - extra_compile_arg=["-std=c++03"]), + extra_compile_args=["-std=c++03"]), Extension("yt.utilities.lib.fnv_hash", ["yt/utilities/lib/fnv_hash.pyx"], include_dirs=["yt/utilities/lib/"], diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index cfef5378d95..66e9d04558f 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -7,38 +7,37 @@ Oct container tuned for Particles """ +from libc.stdlib cimport malloc, free, qsort +from libc.string cimport memset +from libc.math cimport floor, ceil, fmod +from libcpp.map cimport map +from libcpp.vector cimport vector +from yt.utilities.lib.ewah_bool_array cimport \ + ewah_bool_array, ewah_bool_iterator, ewah_map, bool_array, ewah_word_type +import numpy as np +cimport numpy as np + from oct_container cimport OctreeContainer, Oct, OctInfo, ORDER_MAX, \ SparseOctreeContainer, OctKey, OctAllocationContainer cimport oct_visitors from oct_visitors cimport cind, OctVisitor -from libc.stdlib cimport malloc, free, qsort -from libc.string cimport memset -from libc.math cimport floor, ceil, fmod from yt.utilities.lib.fp_utils cimport * from yt.utilities.lib.geometry_utils cimport bounded_morton, \ bounded_morton_dds, bounded_morton_relative_dds, \ bounded_morton_split_dds, bounded_morton_split_relative_dds, \ 
encode_morton_64bit, decode_morton_64bit, \ morton_neighbors_coarse, morton_neighbors_refined -import numpy as np -cimport numpy as np from selection_routines cimport SelectorObject, AlwaysSelector cimport cython from cython cimport floating +from cython.operator cimport dereference, preincrement from cpython.exc cimport PyErr_CheckSignals from collections import defaultdict from yt.funcs import get_pbar from particle_deposit cimport gind -from yt.utilities.lib.ewah_bool_array cimport \ - ewah_bool_array, ewah_bool_iterator, ewah_map, bool_array, ewah_word_type #from yt.utilities.lib.ewah_bool_wrap cimport \ from ..utilities.lib.ewah_bool_wrap cimport BoolArrayCollection -from libcpp cimport bool -from libcpp.map cimport map -from libcpp.vector cimport vector -from libcpp.pair cimport pair -from cython.operator cimport dereference, preincrement import struct import os diff --git a/yt/utilities/lib/ewah_bool_array.pxd b/yt/utilities/lib/ewah_bool_array.pxd index 4099e841b39..b8507b71f1f 100644 --- a/yt/utilities/lib/ewah_bool_array.pxd +++ b/yt/utilities/lib/ewah_bool_array.pxd @@ -6,8 +6,6 @@ Wrapper for EWAH Bool Array: https://github.com/lemire/EWAHBoolArray """ -cimport numpy as np -cimport cython from libcpp.vector cimport vector from libcpp.map cimport map from libcpp.string cimport string @@ -89,6 +87,9 @@ cdef extern from "boolarray.h": uword getWord(size_t pos) size_t wordinbits +cimport numpy as np +cimport cython + IF UNAME_SYSNAME == "Windows": ctypedef uint32_t ewah_word_type ELSE: diff --git a/yt/utilities/lib/ewah_bool_wrap.pxd b/yt/utilities/lib/ewah_bool_wrap.pxd index 4feeaf31e4f..229b6536b4c 100644 --- a/yt/utilities/lib/ewah_bool_wrap.pxd +++ b/yt/utilities/lib/ewah_bool_wrap.pxd @@ -1,11 +1,10 @@ -cimport numpy as np from libcpp.vector cimport vector from libcpp.set cimport set as cset from libcpp.pair cimport pair - from yt.utilities.lib.ewah_bool_array cimport \ sstream, ewah_map, ewah_bool_array, ewah_bool_iterator +cimport numpy as np 
ctypedef bint bitarrtype ctypedef pair[np.uint64_t, np.uint64_t] ind_pair From e5847d804acfc23e7c6ce8d519c626d81168670a Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Fri, 29 May 2020 12:15:31 -0500 Subject: [PATCH 40/42] Update to Bionic as per Kacper's suggestion --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 56834d22208..e7cfad98563 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ language: python -dist: xenial +dist: bionic cache: pip: true directories: From c85827ea9b3b3a957dc38e0cd4ff778b26e79bae Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Tue, 2 Jun 2020 16:57:12 -0500 Subject: [PATCH 41/42] Fix a handful of lint and style issues --- yt/geometry/particle_oct_container.pyx | 62 +++++++++----------------- 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/yt/geometry/particle_oct_container.pyx b/yt/geometry/particle_oct_container.pyx index 66e9d04558f..85e19a08419 100644 --- a/yt/geometry/particle_oct_container.pyx +++ b/yt/geometry/particle_oct_container.pyx @@ -491,7 +491,7 @@ cdef class ParticleBitmap: np.uint64_t file_id) except *: # Initialize cdef np.int64_t i, p - cdef np.uint64_t mi, miex, mi_max + cdef np.uint64_t mi, miex cdef np.uint64_t mi_split[3] cdef np.float64_t ppos[3] cdef np.float64_t s_ppos[3] # shifted ppos @@ -511,7 +511,6 @@ cdef class ParticleBitmap: cdef np.uint64_t msize = (1 << (self.index_order1 * 3)) cdef int axiter[3][2] cdef np.float64_t axiterv[3][2] - mi_max = (1 << self.index_order1) - 1 # Copy over things for this file (type cast necessary?) 
for i in range(3): LE[i] = self.left_edge[i] @@ -619,12 +618,9 @@ cdef class ParticleBitmap: if in_collection is None: in_collection = BoolArrayCollection() cdef BoolArrayCollection _in_coll = in_collection - cdef np.int64_t nsub out_collection = self.__refined_index_data_file(_in_coll, pos, hsml, mask, - sub_mi1, sub_mi2, - file_id, &nsub, count_threshold, mask_threshold) - return nsub, out_collection + return 0, out_collection @cython.boundscheck(False) @cython.wraparound(False) @@ -636,9 +632,6 @@ cdef class ParticleBitmap: np.ndarray[floating, ndim=2] pos, np.ndarray[floating, ndim=1] hsml, np.ndarray[np.uint8_t, ndim=1] mask, - np.ndarray[np.uint64_t, ndim=1] sub_mi1, - np.ndarray[np.uint64_t, ndim=1] sub_mi2, - np.uint64_t file_id, np.int64_t *nsub_mi, np.uint64_t count_threshold, np.uint8_t mask_threshold ): # Initialize @@ -647,7 +640,7 @@ cdef class ParticleBitmap: cdef np.uint64_t mi1, mi2 cdef np.float64_t ppos[3] cdef np.float64_t s_ppos[3] # shifted ppos - cdef int skip, Nex + cdef int skip cdef BoolArrayCollection this_collection, out_collection cdef np.uint64_t bounds[2][3] cdef np.uint8_t fully_enclosed @@ -660,29 +653,16 @@ cdef class ParticleBitmap: cdef np.float64_t radius cdef np.uint64_t mi_split1[3] cdef np.uint64_t mi_split2[3] - cdef np.uint64_t miex1, miex2, mi1_max, mi2_max + cdef np.uint64_t miex1 cdef np.uint64_t[:] particle_counts = self.particle_counts - cdef int Nex_min[3] - cdef int Nex_max[3] - cdef np.float64_t rpos_min, rpos_max - cdef np.uint64_t xex2_min, xex2_max, yex2_min, yex2_max, zex2_min, zex2_max cdef np.uint64_t xex, yex, zex - cdef np.uint64_t xex1, yex1, zex1 - cdef np.uint64_t xex2, yex2, zex2 - cdef int ix, iy, iz, ixe, iye, ize - cdef np.ndarray[np.uint64_t, ndim=1] xex1_range = np.empty(7, 'uint64') - cdef np.ndarray[np.uint64_t, ndim=1] yex1_range = np.empty(7, 'uint64') - cdef np.ndarray[np.uint64_t, ndim=1] zex1_range = np.empty(7, 'uint64') - cdef np.ndarray[np.uint64_t, ndim=1] xex2_range = np.empty(7, 
'uint64') - cdef np.ndarray[np.uint64_t, ndim=1] yex2_range = np.empty(7, 'uint64') - cdef np.ndarray[np.uint64_t, ndim=1] zex2_range = np.empty(7, 'uint64') - cdef np.float64_t clip_pos_l[3], clip_pos_r[3] - cdef np.int64_t msize = sub_mi1.shape[0] + cdef np.float64_t clip_pos_l[3] + cdef np.float64_t clip_pos_r[3] cdef int axiter[3][2] cdef np.float64_t axiterv[3][2] cdef CoarseRefinedSets coarse_refined_map cdef map[np.uint64_t, np.uint64_t] refined_count - cdef np.uint64_t nset = 0, nfully_enclosed = 0, n_calls = 0 + cdef np.uint64_t nfully_enclosed = 0, n_calls = 0 mi1_max = (1 << self.index_order1) - 1 mi2_max = (1 << self.index_order2) - 1 cdef np.uint64_t max_mi1_elements = 1 << (3*self.index_order1) @@ -731,7 +711,7 @@ cdef class ParticleBitmap: ppos[0], ppos[1], ppos[2], LE, dds1, dds2, mi_split2) if refined_count[mi1] == 0: coarse_refined_map[mi1].padWithZeroes(max_mi2_elements) - if coarse_refined_map[mi1].get(mi2) == False: + if not coarse_refined_map[mi1].get(mi2): coarse_refined_map[mi1].set(mi2) refined_count[mi1] += 1 else: # only hit if we have smoothing lengths. 
@@ -803,11 +783,9 @@ cdef class ParticleBitmap: n_calls += 1 refined_count[miex1] += self.__fill_refined_ranges(s_ppos, radius, LE, RE, dds1, xex, yex, zex, - dds2, mi1_max, mi2_max, miex1, - coarse_refined_map[miex1], ppos, mask[miex1], - max_mi2_elements) - cdef np.uint64_t count, vec_i - cdef np.uint64_t total_count = 0 + dds2, + coarse_refined_map[miex1]) + cdef np.uint64_t vec_i cdef bool_array *buf = NULL cdef ewah_word_type w this_collection = BoolArrayCollection() @@ -832,20 +810,20 @@ cdef class ParticleBitmap: cdef np.int64_t __fill_refined_ranges(self, np.float64_t s_ppos[3], np.float64_t radius, np.float64_t LE[3], np.float64_t RE[3], np.float64_t dds1[3], np.uint64_t xex, np.uint64_t yex, np.uint64_t zex, - np.float64_t dds2[3], - np.uint64_t mi1_max, np.uint64_t mi2_max, np.uint64_t miex1, - bool_array &refined_set, np.float64_t ppos[3], np.uint64_t mcount, - np.uint64_t max_mi2_elements) except -1: + np.float64_t dds2[3], bool_array &refined_set) except -1: cdef int i cdef np.uint64_t new_nsub = 0 cdef np.uint64_t bounds_l[3], bounds_r[3] - cdef np.uint64_t miex2, mi2, miex2_min, miex2_max - cdef np.float64_t clip_pos_l[3], clip_pos_r[3], cell_edge_l, cell_edge_r - cdef np.uint64_t ex1[3], ex2[3], ex3[3] - cdef np.uint64_t xex_max, yex_max, zex_max + cdef np.uint64_t miex2, miex2_min, miex2_max + cdef np.float64_t clip_pos_l[3] + cdef np.float64_t clip_pos_r[3] + cdef np.float64_t cell_edge_l, cell_edge_r + cdef np.uint64_t ex1[3] cdef np.uint64_t xiex_min, yiex_min, ziex_min cdef np.uint64_t xiex_max, yiex_max, ziex_max - ex1[0] = xex; ex1[1] = yex; ex1[2] = zex + ex1[0] = xex + ex1[1] = yex + ex1[2] = zex # Check a few special cases for i in range(3): # Figure out our bounds inside our coarse cell, in the space of the From 59290f019f0c981d4850823d70236ac6e72c7b67 Mon Sep 17 00:00:00 2001 From: Matthew Turk Date: Wed, 3 Jun 2020 11:42:45 -0500 Subject: [PATCH 42/42] Updating to new answer-store rev --- answer-store | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/answer-store b/answer-store index 49870bcc8f4..d607a2e1a47 160000 --- a/answer-store +++ b/answer-store @@ -1 +1 @@ -Subproject commit 49870bcc8f4d32fcd6980a65239574f2cdd3b159 +Subproject commit d607a2e1a47947971e7e004e9bfd92664714b14b