diff --git a/AUTHORS b/AUTHORS
index 25dda54..75e6363 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1 +1,2 @@
+Jingpeng Wu <jingpeng.wu@gmail.com>
 William Silversmith <william.silversmith@gmail.com>
diff --git a/ChangeLog b/ChangeLog
index b78e613..a924fb7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,26 @@
 CHANGES
 =======
 
+1.4.2
+-----
+
+* chore: loosen networkx requirement (#49)
+* Update README.md
+* docs: update memory usage diagram for version 1.4.0
+
+1.4.1
+-----
+
+* perf: switch source and target for dijkstra
+
+1.3.3
+-----
+
+* refactor: make type of 0L clear to std::max on Windows
+* Revert "fix: don't assume vertices are uint32"
+* fix: don't assume vertices are uint32
+* chore: update ChangeLog
+
 1.3.2
 -----
 
diff --git a/README.md b/README.md
index ef13e93..d4427a3 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ In the future, we may create a fully binary distribution.
 Fig. 2: Memory Usage on a 512x512x512 Densely Labeled Volume
 </p>
 
-Figure 2 shows the memory usage and processessing time (~390 seconds, about 6.5 minutes) required when Kimimaro 0.5.2 was applied to a 512x512x512 cutout, *labels*, from a connectomics dataset containing 2124 connected components. The different sections of the algorithm are depicted. Grossly, the preamble runs for about half a minute, skeletonization for about six minutes, and finalization within seconds. The peak memory usage was about 4.1 GB. The code below was used to process *labels*. The processing of the glia was truncated in due to a combination of *fix_borders* and max_paths.  
+Figure 2 shows the memory usage and processessing time (~390 seconds, about 6.5 minutes) required when Kimimaro 1.4.0 was applied to a 512x512x512 cutout, *labels*, from a connectomics dataset containing 2124 connected components. The different sections of the algorithm are depicted. Grossly, the preamble runs for about half a minute, skeletonization for about six minutes, and finalization within seconds. The peak memory usage was about 4.5 GB. The code below was used to process *labels*. The processing of the glia was truncated in due to a combination of *fix_borders* and max_paths.  
 
 Kimimaro has come a long way. Version 0.2.1 took over 15 minutes and had a  Preamble run time twice as long on the same dataset.    
 
@@ -61,6 +61,7 @@ skels = kimimaro.skeletonize(
   anisotropy=(16,16,40), # default True
   fix_branching=True, # default True
   fix_borders=True, # default True
+  fill_holes=False, # default False
   progress=True, # default False, show progress bar
   parallel=1, # <= 0 all cpu, 1 single process, 2+ multiprocess
   parallel_chunk_size=100, # how many skeletons to process before updating progress bar
@@ -152,6 +153,12 @@ This feature makes it easier to connect the skeletons of adjacent image volumes
 
 You'll probably never want to disable this, but base TEASAR is infamous for forking the skeleton at branch points way too early. This option makes it preferential to fork at a more reasonable place at a significant performance penalty. 
 
+#### `fill_holes`
+
+THIS WILL REMOVE INPUT LABELS THAT ARE DEEMED TO BE HOLES.
+
+If your segmentation contains artifacts that cause holes to appear in labels, you can preprocess the entire image to eliminate background holes and holes caused by entirely contained inclusions. This option adds a moderate amount of additional processing time at the beginning (perhaps ~30%). 
+
 #### `progress`
 
 Show a progress bar once the skeletonization phase begins.
@@ -193,7 +200,7 @@ Additionally, data quality issues can be challenging as well. If one is skeleton
 
 In our problem domain of skeletonizing neurons from anisotropic voxel labels, our chosen algorithm should produce tree structures, handle fine or coarse detail extraction depending on the circumstances, handle voxel anisotropy, and be reasonably efficient in CPU and memory usage. TEASAR fufills these criteria. Notably, TEASAR doesn't guarantee the centeredness of the skeleton within the shape, but it makes an effort. The basic TEASAR algorithm is known to cut corners around turns and branch too early. A 2001 paper by members of the original TEASAR team describes a method for reducing the early branching issue on page 204, section 4.2.2. [2]
 
-## TEASAR Derived Algorthm
+## TEASAR Derived Algorithm
 
 We implemented TEASAR but made several deviations from the published algorithm in order to improve path centeredness, increase performance, handle bulging cell somas, and enable efficient chunked evaluation of large images. We opted not to implement the gradient vector field step from [2] as our implementation is already quite fast. The paper claims a reduction of 70-85% in input voxels, so it might be worth investigating.  
 
diff --git a/automated_test.py b/automated_test.py
index 92dd53b..9586ac4 100644
--- a/automated_test.py
+++ b/automated_test.py
@@ -15,13 +15,13 @@ def test_binary_image():
 
   assert len(skels) == 1
 
-
-def test_square():
+@pytest.mark.parametrize('fill_holes', (True, False))
+def test_square(fill_holes):
   labels = np.ones( (1000, 1000), dtype=np.uint8)
   labels[-1,0] = 0
   labels[0,-1] = 0
   
-  skels = kimimaro.skeletonize(labels, fix_borders=False)
+  skels = kimimaro.skeletonize(labels, fix_borders=False, fill_holes=fill_holes)
 
   assert len(skels) == 1
 
@@ -34,7 +34,7 @@ def test_square():
   labels[0,0] = 0
   labels[-1,-1] = 0
 
-  skels = kimimaro.skeletonize(labels, fix_borders=False)
+  skels = kimimaro.skeletonize(labels, fix_borders=False, fill_holes=fill_holes)
 
   assert len(skels) == 1
 
@@ -236,7 +236,8 @@ def test_dimensions():
   except kimimaro.DimensionError:
     pass
 
-def test_joinability():
+@pytest.mark.parametrize('axis', ('x','y'))
+def test_joinability(axis):
   def skeletionize(labels, fix_borders):
     return kimimaro.skeletonize(
       labels,
@@ -257,34 +258,38 @@ def skeletionize(labels, fix_borders):
       parallel=1,
     )
 
-  def testlabels(labels):
-    skels1 = skeletionize(labels[:,:,:10], True)
-    skels1 = skels1[1]
+  labels = np.zeros((256, 256, 20), dtype=np.uint8)
+
+  if axis == 'x':
+    lslice = np.s_[ 32:160, :, : ]
+  elif axis == 'y':
+    lslice = np.s_[ :, 32:160, : ]
 
-    skels2 = skeletionize(labels[:,:,9:], True)
-    skels2 = skels2[1]
-    skels2.vertices[:,2] += 9
+  labels = np.zeros((256, 256, 20), dtype=np.uint8)
+  labels[lslice] = 1
 
-    skels = skels1.merge(skels2)
-    assert len(skels.components()) == 1
+  skels1 = skeletionize(labels[:,:,:10], True)
+  skels1 = skels1[1]
 
-    skels1 = skeletionize(labels[:,:,:10], False)
-    skels1 = skels1[1]
+  skels2 = skeletionize(labels[:,:,9:], True)
+  skels2 = skels2[1]
+  skels2.vertices[:,2] += 9
 
-    skels2 = skeletionize(labels[:,:,9:], False)
-    skels2 = skels2[1]
-    skels2.vertices[:,2] += 9
+  skels_fb = skels1.merge(skels2)
+  assert len(skels_fb.components()) == 1
 
-    skels = skels1.merge(skels2)
-    assert len(skels.components()) == 2
+  skels1 = skeletionize(labels[:,:,:10], False)
+  skels1 = skels1[1]
 
-  labels = np.zeros((256, 256, 20), dtype=np.uint8)
-  labels[ :, 32:160, : ] = 1
-  testlabels(labels)
+  skels2 = skeletionize(labels[:,:,9:], False)
+  skels2 = skels2[1]
+  skels2.vertices[:,2] += 9
 
-  labels = np.zeros((256, 256, 20), dtype=np.uint8)
-  labels[ 32:160, :, : ] = 1
-  testlabels(labels)
+  skels = skels1.merge(skels2)
+  # Ususally this results in 2 connected components,
+  # but random variation in how fp is handled can 
+  # result in a merge near the tails.
+  assert not Skeleton.equivalent(skels, skels_fb)
 
 def test_unique():
   for i in range(5):
@@ -384,3 +389,23 @@ def test_join_close_components_complex():
   assert len(res.components()) == 1
 
   assert np.all(res.edges == [[0,1], [0,3], [1,2], [3,4], [4,5], [5,6], [6,7]])
+
+def test_fill_all_holes():
+  labels = np.zeros((64, 32, 32), dtype=np.uint32)
+
+  labels[0:32,:,:] = 1
+  labels[32:64,:,:] = 8
+
+  noise = np.random.randint(low=1, high=8, size=(30, 30, 30))
+  labels[1:31,1:31,1:31] = noise
+
+  noise = np.random.randint(low=8, high=11, size=(30, 30, 30))
+  labels[33:63,1:31,1:31] = noise
+
+  noise_labels = np.unique(labels)
+  assert set(noise_labels) == set([1,2,3,4,5,6,7,8,9,10])
+
+  result = kimimaro.intake.fill_all_holes(labels)
+
+  filled_labels = np.unique(result)
+  assert set(filled_labels) == set([1,8])
diff --git a/ext/skeletontricks/skeletontricks.hpp b/ext/skeletontricks/skeletontricks.hpp
index 291ce4f..408050d 100644
--- a/ext/skeletontricks/skeletontricks.hpp
+++ b/ext/skeletontricks/skeletontricks.hpp
@@ -85,11 +85,13 @@ size_t _roll_invalidation_cube(
       x = loc - sx * (y + z * sy);
     }
 
-    minx = std::max(0L,    static_cast<int64_t>(x - (radius / wx)));
+    const int64_t ZERO = 0;
+
+    minx = std::max(ZERO,    static_cast<int64_t>(x - (radius / wx)));
     maxx = std::min(sx-1, static_cast<int64_t>(0.5 + (x + (radius / wx))));
-    miny = std::max(0L,    static_cast<int64_t>(y - (radius / wy)));
+    miny = std::max(ZERO,    static_cast<int64_t>(y - (radius / wy)));
     maxy = std::min(sy-1, static_cast<int64_t>(0.5 + (y + (radius / wy))));
-    minz = std::max(0L,    static_cast<int64_t>(z - (radius / wz)));
+    minz = std::max(ZERO,    static_cast<int64_t>(z - (radius / wz)));
     maxz = std::min(sz-1, static_cast<int64_t>(0.5 + (z + (radius / wz))));
 
     global_minx = std::min(global_minx, minx);
diff --git a/kimimaro/intake.py b/kimimaro/intake.py
index c0c8a48..7d3a517 100644
--- a/kimimaro/intake.py
+++ b/kimimaro/intake.py
@@ -54,7 +54,7 @@ def skeletonize(
     progress=False, fix_branching=True, in_place=False, 
     fix_borders=True, parallel=1, parallel_chunk_size=100,
     extra_targets_before=[], extra_targets_after=[],
-    fix_avocados=False
+    fill_holes=False, fix_avocados=False
   ):
   """
   Skeletonize all non-zero labels in a given 2D or 3D image.
@@ -83,6 +83,13 @@ def skeletonize(
     }
     dust_threshold: don't bother skeletonizing connected components smaller than
       this many voxels.
+    fill_holes: preemptively run a void filling algorithm on all connected
+      components and delete labels that get filled in. This can improve the
+      quality of the reconstruction if holes in the shapes are artifacts introduced
+      by the segmentation pipeline. This option incurs moderate overhead.
+
+      WARNING: THIS WILL REMOVE INPUT LABELS THAT ARE DEEMED TO BE HOLES.
+
     cc_safety_factor: Value between 0 and 1 that scales the size of the 
       disjoint set maps in connected_components. 1 is guaranteed to work,
       but is probably excessive and corresponds to every pixel being a different
@@ -141,6 +148,9 @@ def skeletonize(
   cc_labels, remapping = compute_cc_labels(all_labels, cc_safety_factor)
   del all_labels
 
+  if fill_holes:
+    cc_labels = fill_all_holes(cc_labels, progress)
+
   extra_targets_before = points_to_labels(extra_targets_before, cc_labels)
   extra_targets_after = points_to_labels(extra_targets_after, cc_labels)
 
@@ -550,6 +560,57 @@ def synapses_to_targets(labels, synapses, progress=False):
 
   return targets
 
+def fill_all_holes(cc_labels, progress=False, return_fill_count=False):
+  """
+  Fills the holes in each connected component and removes components that
+  get filled in. The idea is that holes (entirely contained labels or background) 
+  are artifacts in cell segmentations. A common example is a nucleus segmented 
+  separately from the rest of the cell or errors in a manual segmentation leaving
+  a void in a dendrite.
+
+  cc_labels: an image containing connected components with labels smaller than
+    the number of voxels in the image.
+  progress: Display a progress bar or not.
+  return_fill_count: if specified, return a tuple (filled_image, N) where N is
+    the number of voxels that were filled in.
+
+  Returns: filled_in_labels
+  """
+  import fill_voids
+
+  labels = fastremap.unique(cc_labels)
+  labels_set = set(labels)
+  labels_set.discard(0)
+
+  all_slices = find_objects(cc_labels)
+  pixels_filled = 0
+
+  for label in tqdm(labels, disable=(not progress), desc="Filling Holes"):
+    if label not in labels_set:
+      continue
+
+    slices = all_slices[label - 1]
+    if slices is None:
+      continue
+
+    binary_image = (cc_labels[slices] == label)
+    binary_image, N = fill_voids.fill(
+      binary_image, in_place=True, 
+      return_fill_count=True
+    )
+    pixels_filled += N
+    if N == 0:
+      continue 
+
+    sub_labels = set(fastremap.unique(cc_labels[slices] * binary_image))
+    sub_labels.remove(label)
+    labels_set -= sub_labels
+    cc_labels[slices] = cc_labels[slices] * ~binary_image + label * binary_image
+
+  if return_fill_count:
+    return cc_labels, pixels_filled
+  return cc_labels
+
 def print_quotes(parallel):
   if parallel == -1:
     print("Against the power of will I possess... The capability of my body is nothing.")
diff --git a/kimimaro/trace.py b/kimimaro/trace.py
index ce75a92..a11f38f 100644
--- a/kimimaro/trace.py
+++ b/kimimaro/trace.py
@@ -205,7 +205,11 @@ def compute_paths(
       target = kimimaro.skeletontricks.find_target(labels, DAF)
 
     if fix_branching:
-      path = dijkstra3d.dijkstra(parents, root, target, bidirectional=soma_mode)
+      # faster to trace from target to root than root to target
+      # because that way local exploration finds any zero weighted path
+      # and finishes vs exploring from the neighborhood of the entire zero
+      # weighted path
+      path = dijkstra3d.dijkstra(parents, target, root, bidirectional=soma_mode)
     else:
       path = dijkstra3d.path_from_parents(parents, target)
     
diff --git a/kimimaro_512x512x512_benchmark.png b/kimimaro_512x512x512_benchmark.png
index 8814b06..54bcc80 100644
Binary files a/kimimaro_512x512x512_benchmark.png and b/kimimaro_512x512x512_benchmark.png differ
diff --git a/manual_testing/manual_test.py b/manual_testing/manual_test.py
new file mode 100644
index 0000000..825812a
--- /dev/null
+++ b/manual_testing/manual_test.py
@@ -0,0 +1,7 @@
+import kimimaro
+import numpy as np
+
+from PIL import Image 
+
+img = Image.open('./crossstreet.png').asarray()
+print(img)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 84ab6cc..65af64c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ dijkstra3d>=1.4.1
 fill-voids>=1.1.0
 edt>=1.3.1
 fastremap>=1.5.1
-networkx==2.1
+networkx
 numpy>=1.16.1
 pathos
 pytest