Merge pull request #61 from baolsen/fix_delimiters_in_keys

Replace separators found within keys
amirziai · Feb 3, 2020 · 1236f75
2 parents dcf0078 + 569cc84
commit 1236f75
Show file tree

Hide file tree

Showing 3 changed files with 122 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -27,13 +27,13 @@ flatten(dic)
 
 Results:
 ```python
-{'a': '1',
- 'b': '2',
- 'c_0_d_0': '2',
- 'c_0_d_1': '3',
- 'c_0_d_2': '4',
- 'c_0_e_0_f': '1',
- 'c_0_e_0_g': '2'}
+{'a': 1,
+ 'b': 2,
+ 'c_0_d_0': 2,
+ 'c_0_d_1': 3,
+ 'c_0_d_2': 4,
+ 'c_0_e_0_f': 1,
+ 'c_0_e_0_g': 2}
 ```
 
 ### Usage with Pandas
@@ -51,9 +51,9 @@ dic_flattened = (flatten(d) for d in dic)
 ```
 which creates an array of flattened objects:
 ```python
-[{'a': '1', 'b': '2', 'c_d': '3', 'c_e': '4'},
- {'a': '0.5', 'c_d': '3.2'},
- {'a': '0.8', 'b': '1.8'}]
+[{'a': 1, 'b': 2, 'c_d': 3, 'c_e': 4},
+ {'a': 0.5, 'c_d': 3.2},
+ {'a': 0.8, 'b': 1.8}]
 ```
 Finally you can use ```pd.DataFrame``` to capture the flattened array:
 ```python

diff --git a/flatten_json/__init__.py b/flatten_json/__init__.py
@@ -1,39 +1,47 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import sys
+import copy
 import json
+import re
+import sys
+from math import isnan
+
+import six
+
+from flatten_json.util import check_if_numbers_are_consecutive
 
 try:
     # 3.8 and up
     from collections.abc import Iterable
 except ImportError:
     from collections import Iterable
 
-from flatten_json.util import check_if_numbers_are_consecutive
-import six
-import copy
-import re
-from math import isnan
-
 
-def _construct_key(previous_key, separator, new_key):
+def _construct_key(previous_key, separator, new_key, replace_separators=None):
     """
     Returns the new_key if no previous key exists, otherwise concatenates
     previous key, separator, and new_key
     :param previous_key:
     :param separator:
     :param new_key:
+    :param str replace_separators: Replace separators within keys
     :return: a string if previous_key exists and simply passes through the
     new_key otherwise
     """
+    if replace_separators is not None:
+        new_key = str(new_key).replace(separator, replace_separators)
     if previous_key:
         return u"{}{}{}".format(previous_key, separator, new_key)
     else:
         return new_key
 
 
-def flatten(nested_dict, separator="_", root_keys_to_ignore=set()):
+def flatten(
+        nested_dict,
+        separator="_",
+        root_keys_to_ignore=set(),
+        replace_separators=None):
     """
     Flattens a dictionary with nested structure to a dictionary with no
     hierarchy
@@ -44,6 +52,7 @@ def flatten(nested_dict, separator="_", root_keys_to_ignore=set()):
     :param nested_dict: dictionary we want to flatten
     :param separator: string to separate dictionary keys by
     :param root_keys_to_ignore: set of root keys to ignore from flattening
+    :param str replace_separators: Replace separators within keys
     :return: flattened dictionary
     """
     assert isinstance(nested_dict, dict), "flatten requires a dictionary input"
@@ -69,12 +78,22 @@ def _flatten(object_, key):
         elif isinstance(object_, dict):
             for object_key in object_:
                 if not (not key and object_key in root_keys_to_ignore):
-                    _flatten(object_[object_key], _construct_key(key,
-                                                                 separator,
-                                                                 object_key))
+                    _flatten(
+                        object_[object_key],
+                        _construct_key(
+                            key,
+                            separator,
+                            object_key,
+                            replace_separators=replace_separators))
         elif isinstance(object_, (list, set, tuple)):
             for index, item in enumerate(object_):
-                _flatten(item, _construct_key(key, separator, index))
+                _flatten(
+                    item,
+                    _construct_key(
+                        key,
+                        separator,
+                        index,
+                        replace_separators=replace_separators))
         # Anything left take as is
         else:
             flattened_dict[key] = object_
@@ -88,7 +107,8 @@ def _flatten(object_, key):
 
 def flatten_preserve_lists(nested_dict, separator="_",
                            root_keys_to_ignore=set(),
-                           max_list_index=3, max_depth=3):
+                           max_list_index=3, max_depth=3,
+                           replace_separators=None):
     """
     Flattens a dictionary with nested structure to a dictionary with no
     hierarchy
@@ -106,6 +126,7 @@ def flatten_preserve_lists(nested_dict, separator="_",
     :param root_keys_to_ignore: set of root keys to ignore from flattening
     :param max_list_index: maximum list index to process
     :param max_depth: maximum nesting depth to process
+    :param str replace_separators: Replace separators within keys
     :return: flattened dictionary
     """
 
@@ -145,13 +166,23 @@ def _flatten(object_, key):
             else:
                 for object_key in object_:
                     if not (not key and object_key in root_keys_to_ignore):
-                        _flatten(object_[object_key],
-                                 _construct_key(key, separator, object_key)
-                                 )
+                        _flatten(
+                            object_[object_key],
+                            _construct_key(
+                                key,
+                                separator,
+                                object_key,
+                                replace_separators=replace_separators))
 
         elif isinstance(object_, list) or isinstance(object_, set):
             for index, item in enumerate(object_):
-                _flatten(item, _construct_key(key, separator, index))
+                _flatten(
+                    item,
+                    _construct_key(
+                        key,
+                        separator,
+                        index,
+                        replace_separators=replace_separators))
 
         else:
             flattened_dict[key] = object_
@@ -208,11 +239,15 @@ def _flatten_low_entropy(object_, key, cur_depth, max_depth_inner):
                                    (str(type(x[1])), len(str(x[1]))),
                                    reverse=False):
                         if not (not key and object_key in root_keys_to_ignore):
-                            _flatten_low_entropy(object_[object_key],
-                                                 _construct_key(key,
-                                                                separator,
-                                                                object_key),
-                                                 cur_depth, max_depth_inner)
+                            _flatten_low_entropy(
+                                object_[object_key],
+                                _construct_key(
+                                    key,
+                                    separator,
+                                    object_key,
+                                    replace_separators=replace_separators),
+                                cur_depth,
+                                max_depth_inner)
 
             # lists could go into rows, like in a relational database
             elif isinstance(object_, list) or isinstance(object_, set):

diff --git a/test_flatten.py b/test_flatten.py
@@ -1,8 +1,12 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import unittest
 import json
+import unittest
+
+from flatten_json import (cli, flatten, flatten_preserve_lists, unflatten,
+                          unflatten_list)
+from flatten_json.util import check_if_numbers_are_consecutive
 
 try:
     # python2
@@ -11,9 +15,6 @@
     # python3
     from io import StringIO
 
-from flatten_json import flatten, flatten_preserve_lists, unflatten, \
-    unflatten_list, cli
-from flatten_json.util import check_if_numbers_are_consecutive
 
 
 class UnitTests(unittest.TestCase):
@@ -2195,6 +2196,55 @@ def test_command_line(self):
         result = json.loads(output)
         self.assertEqual(result, dict(a_b=1))
 
+    def test_replace_separators_none(self):
+        dic = {
+            'a_with_separator': {'b': [1, 2, 3]},
+        }
+        expected = {
+            'a_with_separator_b_0': 1,
+            'a_with_separator_b_1': 2,
+            'a_with_separator_b_2': 3
+        }
+        actual = flatten(dic)
+        self.assertEqual(actual, expected)
+
+    def test_replace_separators_remove(self):
+        dic = {
+            'a_with_separator': {'b': [1, 2, 3]},
+        }
+        expected = {
+            'awithseparator_b_0': 1,
+            'awithseparator_b_1': 2,
+            'awithseparator_b_2': 3
+        }
+        actual = flatten(dic, replace_separators='')
+        self.assertEqual(actual, expected)
+
+    def test_replace_separators_something(self):
+        dic = {
+            'a_with_separator': {'b': [1, 2, 3]},
+        }
+        expected = {
+            'a.with.separator_b_0': 1,
+            'a.with.separator_b_1': 2,
+            'a.with.separator_b_2': 3
+        }
+        actual = flatten(dic, replace_separators='.')
+        self.assertEqual(actual, expected)
+
+    def test_replace_separators_nested(self):
+        dic = {
+            'a_with_separator': {'b_with_separator': [1, 2, 3]},
+        }
+        expected = {
+            'awithseparator_bwithseparator_0': 1,
+            'awithseparator_bwithseparator_1': 2,
+            'awithseparator_bwithseparator_2': 3
+        }
+        actual = flatten(dic, replace_separators='')
+        self.assertEqual(actual, expected)
+
+
 
 if __name__ == '__main__':
     unittest.main()