diff --git a/tensornetwork/block_tensor/block_tensor.py b/tensornetwork/block_tensor/block_tensor.py index c39fa38e7..cb2976a00 100644 --- a/tensornetwork/block_tensor/block_tensor.py +++ b/tensornetwork/block_tensor/block_tensor.py @@ -135,7 +135,7 @@ def compute_nonzero_block_shapes(charges: List[np.ndarray], return charge_shape_dict -def retrieve_non_zero_diagonal_blocks( +def retrieve_non_zero_diagonal_blocks_deprecated( data: np.ndarray, charges: List[np.ndarray], flows: List[Union[bool, int]], @@ -143,6 +143,8 @@ def retrieve_non_zero_diagonal_blocks( """ Given the meta data and underlying data of a symmetric matrix, compute all diagonal blocks and return them in a dict. + This is a deprecated version which in general performs worse than the + current main implementation. Args: data: An np.ndarray of the data. The number of elements in `data` has to match the number of non-zero elements defined by `charges` @@ -166,43 +168,13 @@ def retrieve_non_zero_diagonal_blocks( dict: Dictionary mapping quantum numbers (integers) to either an np.ndarray or a python list of locations and shapes, depending on the value of `return_data`. """ - #TODO: this is currently way too slow!!!! - #Run the following benchmark for testing (typical MPS use case) - #retrieving the blocks is ~ 10 times as slow as multiplying them - - # D=4000 - # B=10 - # q1 = np.random.randint(0,B,D) - # q2 = np.asarray([0,1]) - # q3 = np.random.randint(0,B,D) - # i1 = Index(charges=q1,flow=1) - # i2 = Index(charges=q2,flow=1) - # i3 = Index(charges=q3,flow=-1) - # indices=[i1,i2,i3] - # A = BlockSparseTensor.random(indices=indices, dtype=np.complex128) - # A.reshape((D*2, D)) - # def multiply_blocks(blocks): - # for b in blocks.values(): - # np.dot(b.T, b) - # t1s=[] - # t2s=[] - # for n in range(10): - # print(n) - # t1 = time.time() - # b = A.get_diagonal_blocks() - # t1s.append(time.time() - t1) - # t1 = time.time() - # multiply_blocks(b) - # t2s.append(time.time() - t1) - # print('average retrieval time', np.average(t1s)) - # print('average multiplication time',np.average(t2s)) - if len(charges) != 2: raise ValueError("input has to be a two-dimensional symmetric matrix") check_flows(flows) if len(flows) != len(charges): raise ValueError("`len(flows)` is different from `len(charges)`") + #we multiply the flows into the charges row_charges = flows[0] * charges[0] # a list of charges on each row column_charges = flows[1] * charges[1] # a list of charges on each column @@ -210,25 +182,51 @@ def retrieve_non_zero_diagonal_blocks( unique_row_charges, row_dims = np.unique(row_charges, return_counts=True) unique_column_charges, column_dims = np.unique( column_charges, return_counts=True) + #get the charges common to rows and columns (only those matter) common_charges = np.intersect1d( unique_row_charges, -unique_column_charges, assume_unique=True) + #convenience container for storing the degeneracies of each + #row and column charge row_degeneracies = dict(zip(unique_row_charges, row_dims)) column_degeneracies = dict(zip(unique_column_charges, column_dims)) + # we only care about charges common to row and columns mask = np.isin(column_charges, -common_charges) - masked_charges = column_charges[mask] - degeneracy_vector = np.empty(len(masked_charges), dtype=np.int64) + relevant_column_charges = column_charges[mask] + + #some numpy magic to get the index locations of the blocks + #we generate a vector of `len(relevant_column_charges) which, + #for each charge `c` in `relevant_column_charges` holds the + #row-degeneracy of charge `c` 
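+  #(e.g. for `relevant_column_charges` = [0, 1, 0, 0, 3] and row-degeneracies
+  # 10, 20 and 30 for the charges 0, 1 and 3, this vector is [10, 20, 10, 10, 30])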
+ degeneracy_vector = np.empty(len(relevant_column_charges), dtype=np.int64) + #for each charge `c` in `common_charges` we generate a boolean mask + #for indexing the positions where `relevant_column_charges` has a value of `c`. masks = {} for c in common_charges: - mask = masked_charges == -c + mask = relevant_column_charges == -c masks[c] = mask degeneracy_vector[mask] = row_degeneracies[c] - summed_degeneracies = np.cumsum(degeneracy_vector) + + # the result of the cumulative sum is a vector containing + # the stop positions of the non-zero values of each column + # within the data vector. + # E.g. for `relevant_column_charges` = [0,1,0,0,3], and + # row_degeneracies[0] = 10 + # row_degeneracies[1] = 20 + # row_degeneracies[3] = 30 + # we have + # `stop_positions` = [10, 10+20, 10+20+10, 10+20+10+10, 10+20+10+10+30] + # The starting positions of consecutive elements (in column-major order) in + # each column with charge `c=0` within the data vector are then simply obtained using + # masks[0] = [True, False, True, True, False] + # and `stop_positions[masks[0]] - row_degeneracies[0]` + stop_positions = np.cumsum(degeneracy_vector) blocks = {} for c in common_charges: - a = np.expand_dims(summed_degeneracies[masks[c]] - row_degeneracies[c], 0) + #numpy broadcasting is substantially faster than kron! + a = np.expand_dims(stop_positions[masks[c]] - row_degeneracies[c], 0) b = np.expand_dims(np.arange(row_degeneracies[c]), 1) if not return_data: blocks[c] = [a + b, (row_degeneracies[c], column_degeneracies[-c])] @@ -238,11 +236,11 @@ def retrieve_non_zero_diagonal_blocks( return blocks -def retrieve_non_zero_diagonal_blocks_deprecated( +def retrieve_non_zero_diagonal_blocks( data: np.ndarray, charges: List[np.ndarray], flows: List[Union[bool, int]], - return_data: Optional[bool] = True) -> Dict: + return_data: Optional[bool] = False) -> Dict: """ Given the meta data and underlying data of a symmetric matrix, compute all diagonal blocks and return them in a dict. @@ -269,180 +267,51 @@ def retrieve_non_zero_diagonal_blocks_deprecated( dict: Dictionary mapping quantum numbers (integers) to either an np.ndarray or a python list of locations and shapes, depending on the value of `return_data`. """ - #TODO: this is currently way too slow!!!! 
- #Run the following benchmark for testing (typical MPS use case) - #retrieving the blocks is ~ 10 times as slow as multiplying them - - # D=4000 - # B=10 - # q1 = np.random.randint(0,B,D) - # q2 = np.asarray([0,1]) - # q3 = np.random.randint(0,B,D) - # i1 = Index(charges=q1,flow=1) - # i2 = Index(charges=q2,flow=1) - # i3 = Index(charges=q3,flow=-1) - # indices=[i1,i2,i3] - # A = BlockSparseTensor.random(indices=indices, dtype=np.complex128) - # A.reshape((D*2, D)) - # def multiply_blocks(blocks): - # for b in blocks.values(): - # np.dot(b.T, b) - # t1s=[] - # t2s=[] - # for n in range(10): - # print(n) - # t1 = time.time() - # b = A.get_diagonal_blocks() - # t1s.append(time.time() - t1) - # t1 = time.time() - # multiply_blocks(b) - # t2s.append(time.time() - t1) - # print('average retrieval time', np.average(t1s)) - # print('average multiplication time',np.average(t2s)) - if len(charges) != 2: raise ValueError("input has to be a two-dimensional symmetric matrix") check_flows(flows) if len(flows) != len(charges): raise ValueError("`len(flows)` is different from `len(charges)`") + #we multiply the flows into the charges row_charges = flows[0] * charges[0] # a list of charges on each row column_charges = flows[1] * charges[1] # a list of charges on each column - #get the unique charges - t1 = time.time() - unique_row_charges, row_dims = np.unique(row_charges, return_counts=True) - # print('finding unique row charges', time.time() - t1) - # t1 = time.time() - unique_column_charges, column_dims = np.unique( - column_charges, return_counts=True) - # print('finding unique column charges', time.time() - t1) - # t1 = time.time() - common_charges = np.intersect1d( - unique_row_charges, -unique_column_charges, assume_unique=True) - # print('finding unique intersections', time.time() - t1) - # t1 = time.time() - #common_charges = np.intersect1d(row_charges, -column_charges) + # we only care about charges common to rows and columns + common_charges = np.unique(np.intersect1d(row_charges, -column_charges)) + row_charges = row_charges[np.isin(row_charges, common_charges)] + column_charges = column_charges[np.isin(column_charges, -common_charges)] - # for each matrix column find the number of non-zero elements in it - # Note: the matrix is assumed to be symmetric, i.e. 
only elements where - # ingoing and outgoing charge are identical are non-zero + #get the unique charges + unique_row_charges, row_locations, row_dims = np.unique( + row_charges, return_inverse=True, return_counts=True) + unique_column_charges, column_locations, column_dims = np.unique( + column_charges, return_inverse=True, return_counts=True) - # get the degeneracies of each row and column charge + #convenience container for storing the degeneracies of each + #row and column charge row_degeneracies = dict(zip(unique_row_charges, row_dims)) column_degeneracies = dict(zip(unique_column_charges, column_dims)) - number_of_seen_elements = 0 - #idxs = {c: [] for c in common_charges} - idxs = { - c: np.empty( - row_degeneracies[c] * column_degeneracies[-c], dtype=np.int64) - for c in common_charges - } - idxs_stops = {c: 0 for c in common_charges} - t1 = time.time() - mask = np.isin(column_charges, -common_charges) - masked_charges = column_charges[mask] - print('finding mask', time.time() - t1) - # print(len(column_charges), len(masked_charges)) - t1 = time.time() - elements = {c: np.arange(row_degeneracies[c]) for c in common_charges} - for charge in masked_charges: - # idxs[-charge].append((number_of_seen_elements, - # row_degeneracies[-charge] + number_of_seen_elements)) - - idxs[-charge][ - idxs_stops[-charge]:idxs_stops[-charge] + - row_degeneracies[-charge]] = number_of_seen_elements + elements[-charge] - - # np.arange( - # number_of_seen_elements, - # row_degeneracies[-charge] + number_of_seen_elements) - - number_of_seen_elements += row_degeneracies[-charge] - idxs_stops[-charge] += row_degeneracies[-charge] - print('getting start and stop', time.time() - t1) - # t1 = time.time() - # for charge in masked_charges: - # tmp = np.arange(number_of_seen_elements, - # row_degeneracies[-charge] + number_of_seen_elements) - # number_of_seen_elements += row_degeneracies[-charge] - # print('running the partial loop', time.time() - t1) - - ####################################################################################### - #looks like this takes pretty long for rectangular matrices where shape[1] >> shape[0] - #it's mostly np.arange that causes the overhead. 
- # t1 = time.time() - # for charge in masked_charges: - # idxs[-charge].append( - # np.arange(number_of_seen_elements, - # row_degeneracies[-charge] + number_of_seen_elements)) - # number_of_seen_elements += row_degeneracies[-charge] - # print('running the full loop', time.time() - t1) - ####################################################################################### + #some numpy magic to get the index locations of the blocks + #we generate a vector of `len(relevant_column_charges) which, + #for each charge `c` in `relevant_column_charges` holds the + #row-degeneracy of charge `c` - blocks = {} - if not return_data: - for c, idx in idxs.items(): - #num_elements = np.sum([len(t) for t in idx]) - #indexes = np.empty(num_elements, dtype=np.int64) - #np.concatenate(idx, out=indexes) - blocks[c] = [idx, (row_degeneracies[c], column_degeneracies[-c])] - return blocks - - for c, idx in idxs.items(): - num_elements = np.sum([len(t) for t in idx]) - indexes = np.empty(num_elements, dtype=np.int64) - np.concatenate(idx, out=indexes) - blocks[c] = np.reshape(data[indexes], - (row_degeneracies[c], column_degeneracies[-c])) - return blocks + degeneracy_vector = row_dims[column_locations] + stop_positions = np.cumsum(degeneracy_vector) - -def retrieve_non_zero_diagonal_blocks_test( - data: np.ndarray, charges: List[np.ndarray], - flows: List[Union[bool, int]]) -> Dict: - """ - For testing purposes. Produces the same output as `retrieve_non_zero_diagonal_blocks`, - but computes it in a different way. - This is currently very slow for high rank tensors with many blocks, but can be faster than - `retrieve_non_zero_diagonal_blocks` in certain other cases. - It's pretty memory heavy too. - """ - if len(charges) != 2: - raise ValueError("input has to be a two-dimensional symmetric matrix") - check_flows(flows) - if len(flows) != len(charges): - raise ValueError("`len(flows)` is different from `len(charges)`") - - #get the unique charges - unique_row_charges, row_dims = np.unique( - flows[0] * charges[0], return_counts=True) - unique_column_charges, column_dims = np.unique( - flows[1] * charges[1], return_counts=True) - - #a 1d array of the net charges. - #this can use a lot of memory - net_charges = fuse_charges( - q1=charges[0], flow1=flows[0], q2=charges[1], flow2=flows[1]) - #a 1d array containing row charges added with zero column charges - #used to find the indices of in data corresponding to a given charge - #(see below) - #this can be very large - tmp = np.tile(charges[0] * flows[0], len(charges[1])) - - symmetric_indices = net_charges == 0 - charge_lookup = tmp[symmetric_indices] - - row_degeneracies = dict(zip(unique_row_charges, row_dims)) - column_degeneracies = dict(zip(unique_column_charges, column_dims)) blocks = {} - - common_charges = np.intersect1d(unique_row_charges, -unique_column_charges) for c in common_charges: - blocks[c] = np.reshape(data[charge_lookup == c], - (row_degeneracies[c], column_degeneracies[-c])) - + #numpy broadcasting is substantially faster than kron! 
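+    #`a` holds, for every column that carries charge `-c`, the position within
+    #`data` at which that column's non-zero elements start; `b` enumerates the
+    #rows of the block, so the broadcast `a + b` is the full 2D array of
+    #positions of the block's elements inside `data`.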
+    a = np.expand_dims(
+        stop_positions[column_locations == -c] - row_degeneracies[c], 0)
+    b = np.expand_dims(np.arange(row_degeneracies[c]), 1)
+    if not return_data:
+      blocks[c] = [a + b, (row_degeneracies[c], column_degeneracies[-c])]
+    else:
+      blocks[c] = np.reshape(data[a + b],
+                             (row_degeneracies[c], column_degeneracies[-c]))
   return blocks
 
 
@@ -614,6 +483,16 @@ def transpose(self, order):
 
     raise NotImplementedError('transpose is not implemented!!')
 
+  def reset_shape(self) -> None:
+    """
+    Bring the tensor back into its elementary shape.
+    """
+    elementary_indices = []
+    for i in self.indices:
+      elementary_indices.extend(i.get_elementary_indices())
+
+    self.indices = elementary_indices
+
   def reshape(self, shape: Union[Iterable[Index], Iterable[int]]) -> None:
     """
     Reshape `tensor` into `shape` in place.
@@ -668,19 +547,19 @@ def reshape(self, shape: Union[Iterable[Index], Iterable[int]]) -> None:
     index_copy = [i.copy() for i in self.indices]
 
     def raise_error():
-      #if this error is raised `shape` is incompatible
-      #with the elementary indices. We have to reset them
-      #to the original.
+      #if this error is raised then `shape` is incompatible
+      #with the elementary indices. We then reset the shape
+      #to what it was before the call to `reshape`.
       self.indices = index_copy
       elementary_indices = []
      for i in self.indices:
         elementary_indices.extend(i.get_elementary_indices())
-      print(elementary_indices)
 
      raise ValueError("The shape {} is incompatible with the "
                       "elementary shape {} of the tensor.".format(
                            dense_shape,
                            tuple([e.dimension for e in elementary_indices])))
 
+    self.reset_shape() #bring tensor back into its elementary shape
     for n in range(len(dense_shape)):
       if dense_shape[n] > self.shape[n]:
         while dense_shape[n] > self.shape[n]:
@@ -730,6 +609,33 @@ def get_diagonal_blocks(self, return_data: Optional[bool] = True) -> Dict:
         flows=self.flows,
         return_data=return_data)
 
+  def get_diagonal_blocks_deprecated(
+      self, return_data: Optional[bool] = True) -> Dict:
+    """
+    Obtain the diagonal blocks of a symmetric matrix.
+    BlockSparseTensor has to be a matrix.
+    Args:
+      return_data: If `True`, the return dictionary maps quantum numbers `q` to
+        actual `np.ndarray` with the data. This involves a copy of data.
+        If `False`, the returned dict maps quantum numbers to a list
+        [locations, shape], where `locations` is an np.ndarray of type np.int64
+        containing the locations of the tensor elements within A.data, i.e.
+        `A.data[locations]` contains the elements belonging to the tensor with
+        quantum numbers `(q,q)`. `shape` is the shape of the corresponding array.
+    Returns:
+      dict: Dictionary mapping charge to np.ndarray of rank 2 (a matrix)
+
+    """
+    if self.rank != 2:
+      raise ValueError(
+          "`get_diagonal_blocks` can only be called on a matrix, but found rank={}"
+          .format(self.rank))
+    return retrieve_non_zero_diagonal_blocks_deprecated(
+        data=self.data,
+        charges=self.charges,
+        flows=self.flows,
+        return_data=return_data)
+
 
 def reshape(tensor: BlockSparseTensor,
             shape: Union[Iterable[Index], Iterable[int]]) -> BlockSparseTensor:
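For reference, below is a small self-contained sketch (not part of the patch) of the cumsum/broadcasting trick that both block-retrieval versions above rely on. All names (`row_charges`, `column_charges`, `row_deg`, `stop_positions`, `blocks`) are simplified stand-ins chosen for this illustration; the only assumption carried over from the patch is that the non-zero elements are stored column after column in a flat `data` vector.

import numpy as np

# toy charges of a small symmetric matrix; flow signs are assumed to be
# already multiplied into the charges, so element (i, j) is non-zero iff
# row_charges[i] + column_charges[j] == 0
row_charges = np.array([0, 1, 0, 1, 1])
column_charges = np.array([0, -1, -1, 0])

# build a dense reference matrix and store its non-zero elements column after
# column in a flat `data` vector (this mimics the layout assumed above)
nonzero = row_charges[:, None] + column_charges[None, :] == 0
dense = np.zeros((len(row_charges), len(column_charges)))
dense[nonzero] = np.arange(1, np.sum(nonzero) + 1)
data = dense.T[dense.T != 0]

unique_row_charges, row_dims = np.unique(row_charges, return_counts=True)
row_deg = dict(zip(unique_row_charges, row_dims))

# number of non-zero elements in every column, and the stop position of each
# column's elements within `data`; subtracting the row degeneracy gives the
# start position (the patch derives this vector without a Python loop, a list
# comprehension just keeps the sketch simple)
degeneracy_vector = np.array([row_deg[-q] for q in column_charges])
stop_positions = np.cumsum(degeneracy_vector)

blocks = {}
for c in unique_row_charges:
  columns = column_charges == -c                 # columns belonging to block c
  starts = stop_positions[columns] - row_deg[c]  # start of each such column
  a = np.expand_dims(starts, 0)
  b = np.expand_dims(np.arange(row_deg[c]), 1)
  blocks[c] = data[a + b]                        # broadcasting -> 2D block

# every block equals the corresponding sub-matrix of the dense reference
for c, block in blocks.items():
  assert np.array_equal(
      block, dense[np.ix_(row_charges == c, column_charges == -c)])

The deprecated version builds the per-column degeneracies with one boolean mask per common charge, while the new version derives them from np.unique(..., return_inverse=True); both arrive at the same cumsum-and-broadcast indexing shown here.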