From 8d866c5a87fe0d3d53565e86b94744d0ff715488 Mon Sep 17 00:00:00 2001 From: Kay Date: Wed, 13 Nov 2024 23:34:41 +0800 Subject: [PATCH 1/3] feat: check for moved indicator via dataframe operations --- bigtree/tree/helper.py | 70 +++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/bigtree/tree/helper.py b/bigtree/tree/helper.py index c48b10ee..faf9da13 100644 --- a/bigtree/tree/helper.py +++ b/bigtree/tree/helper.py @@ -439,7 +439,7 @@ def get_tree_diff( indicator_col = "Exists" old_suffix = "_old" new_suffix = "_new" - tree_sep = tree.sep + moved_ind = "moved_ind" data, data_other = ( export.tree_to_dataframe( @@ -475,32 +475,38 @@ def get_tree_diff( data_path_diff = data_compare # Handle tree structure difference - paths_removed = list( - data_path_diff[data_path_diff[indicator_col] == "left_only"][path_col] - )[::-1] - paths_added = list( - data_path_diff[data_path_diff[indicator_col] == "right_only"][path_col] - )[::-1] - - moved_from_ind: List[bool] = [True for _ in range(len(paths_removed))] - moved_to_ind: List[bool] = [True for _ in range(len(paths_added))] - if detail: - names_removed = [path.split(tree_sep)[-1] for path in paths_removed] - names_added = [path.split(tree_sep)[-1] for path in paths_added] - moved_from_ind = [name in names_added for name in names_removed] - moved_to_ind = [name in names_removed for name in names_added] - - path_removed_to_suffix = { - path: "-" if not detail else ("moved from" if move_ind else "removed") - for path, move_ind in zip(paths_removed, moved_from_ind) + data_tree = data_path_diff[data_path_diff[indicator_col] == "left_only"] + data_tree_other = data_path_diff[data_path_diff[indicator_col] == "right_only"] + data_tree[moved_ind] = False + data_tree_other[moved_ind] = False + + if len(data_tree) and len(data_tree_other): + # Check for moved from and moved to + move_from_condition = data_tree[ + data_tree[name_col].isin(set(data_tree_other[name_col])) + ] + data_tree.loc[move_from_condition.index, moved_ind] = True + move_to_condition = data_tree_other[ + data_tree_other[name_col].isin(set(data_tree[name_col])) + ] + data_tree_other.loc[move_to_condition.index, moved_ind] = True + + path_move_from = data_tree.set_index(path_col)[[moved_ind]].to_dict(orient="index") + path_move_to = data_tree_other.set_index(path_col)[[moved_ind]].to_dict( + orient="index" + ) + + path_move_from_suffix = { + path: "-" if not detail else ("moved from" if v[moved_ind] else "removed") + for path, v in path_move_from.items() } - path_added_to_suffix = { - path: "+" if not detail else ("moved to" if move_ind else "added") - for path, move_ind in zip(paths_added, moved_to_ind) + path_move_to_suffix = { + path: "+" if not detail else ("moved to" if v[moved_ind] else "added") + for path, v in path_move_to.items() } # Check tree attribute difference - dict_attr_diff: Dict[str, Dict[str, Any]] = {} + path_attr_diff: Dict[str, Dict[str, Any]] = {} if attr_list: data_both = data_compare[data_compare[indicator_col] == "both"] condition_attr_diff = ( @@ -517,7 +523,7 @@ def get_tree_diff( data_attr_diff = data_both[eval(condition_attr_diff)] dict_attr_all = data_attr_diff.set_index(path_col).to_dict(orient="index") for path, node_attr in dict_attr_all.items(): - dict_attr_diff[path] = { + path_attr_diff[path] = { attr: ( node_attr[f"{attr}{old_suffix}"], node_attr[f"{attr}{new_suffix}"], @@ -531,24 +537,24 @@ def get_tree_diff( if only_diff: data_compare = data_compare[ (data_compare[indicator_col] != "both") - | (data_compare[path_col].isin(dict_attr_diff.keys())) + | (data_compare[path_col].isin(path_attr_diff.keys())) ] data_compare = data_compare[[path_col]].sort_values(path_col) if len(data_compare): tree_diff = construct.dataframe_to_tree( data_compare, node_type=tree.__class__, sep=tree.sep ) - for path in sorted(path_removed_to_suffix, reverse=True): + for path in sorted(path_move_from_suffix, reverse=True): _node = search.find_full_path(tree_diff, path) - _node.name += f""" ({path_removed_to_suffix[path]})""" - for path in sorted(path_added_to_suffix, reverse=True): + _node.name += f""" ({path_move_from_suffix[path]})""" + for path in sorted(path_move_to_suffix, reverse=True): _node = search.find_full_path(tree_diff, path) - _node.name += f""" ({path_added_to_suffix[path]})""" + _node.name += f""" ({path_move_to_suffix[path]})""" # Handle tree attribute difference - if dict_attr_diff: - tree_diff = construct.add_dict_to_tree_by_path(tree_diff, dict_attr_diff) - for path in sorted(dict_attr_diff, reverse=True): + if path_attr_diff: + tree_diff = construct.add_dict_to_tree_by_path(tree_diff, path_attr_diff) + for path in sorted(path_attr_diff, reverse=True): _node = search.find_full_path(tree_diff, path) _node.name += " (~)" return tree_diff From 28101f5b5cda980acd329dd768d87e191e8eb511 Mon Sep 17 00:00:00 2001 From: Kay Date: Wed, 13 Nov 2024 23:42:56 +0800 Subject: [PATCH 2/3] feat: check for moved indicator via dataframe operations --- bigtree/tree/helper.py | 60 ++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/bigtree/tree/helper.py b/bigtree/tree/helper.py index faf9da13..8bc1a58c 100644 --- a/bigtree/tree/helper.py +++ b/bigtree/tree/helper.py @@ -477,33 +477,41 @@ def get_tree_diff( # Handle tree structure difference data_tree = data_path_diff[data_path_diff[indicator_col] == "left_only"] data_tree_other = data_path_diff[data_path_diff[indicator_col] == "right_only"] - data_tree[moved_ind] = False - data_tree_other[moved_ind] = False - if len(data_tree) and len(data_tree_other): - # Check for moved from and moved to - move_from_condition = data_tree[ - data_tree[name_col].isin(set(data_tree_other[name_col])) - ] - data_tree.loc[move_from_condition.index, moved_ind] = True - move_to_condition = data_tree_other[ - data_tree_other[name_col].isin(set(data_tree[name_col])) - ] - data_tree_other.loc[move_to_condition.index, moved_ind] = True - - path_move_from = data_tree.set_index(path_col)[[moved_ind]].to_dict(orient="index") - path_move_to = data_tree_other.set_index(path_col)[[moved_ind]].to_dict( - orient="index" - ) - - path_move_from_suffix = { - path: "-" if not detail else ("moved from" if v[moved_ind] else "removed") - for path, v in path_move_from.items() - } - path_move_to_suffix = { - path: "+" if not detail else ("moved to" if v[moved_ind] else "added") - for path, v in path_move_to.items() - } + if detail: + data_tree[moved_ind] = False + data_tree_other[moved_ind] = False + + if len(data_tree) and len(data_tree_other): + # Check for moved from and moved to + move_from_condition = data_tree[ + data_tree[name_col].isin(set(data_tree_other[name_col])) + ] + data_tree.loc[move_from_condition.index, moved_ind] = True + move_to_condition = data_tree_other[ + data_tree_other[name_col].isin(set(data_tree[name_col])) + ] + data_tree_other.loc[move_to_condition.index, moved_ind] = True + + path_move_from = data_tree.set_index(path_col)[[moved_ind]].to_dict( + orient="index" + ) + path_move_to = data_tree_other.set_index(path_col)[[moved_ind]].to_dict( + orient="index" + ) + path_move_from_suffix = { + path: "moved from" if v[moved_ind] else "removed" + for path, v in path_move_from.items() + } + path_move_to_suffix = { + path: "moved to" if v[moved_ind] else "added" + for path, v in path_move_to.items() + } + else: + path_move_from_suffix = dict(zip(data_tree[path_col], "-" * len(data_tree))) + path_move_to_suffix = dict( + zip(data_tree_other[path_col], "+" * len(data_tree_other)) + ) # Check tree attribute difference path_attr_diff: Dict[str, Dict[str, Any]] = {} From 6d777ab7a779f3e86356fa09affad0f15c10da8d Mon Sep 17 00:00:00 2001 From: Kay Date: Wed, 13 Nov 2024 23:43:39 +0800 Subject: [PATCH 3/3] docs: update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc75754f..9d55b1bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed: - Tree Helper: Get tree diff logic to be faster to compare all attribute list and data at once (for attr diff). - Tree Helper: Get tree diff logic to be faster to add suffix at the end (for path diff). +- Tree Helper: Get tree diff logic to be faster to detect moved indicator using dataframe operations (for detail). ## [0.22.2] - 2024-11-11 ### Added: