-
-
Notifications
You must be signed in to change notification settings - Fork 239
/
Copy pathdiff.py
executable file
·1905 lines (1700 loc) · 88.3 KB
/
diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# In order to run the docstrings:
# python3 -m deepdiff.diff
# You might need to run it many times since dictionaries come in different orders
# every time you run the docstrings.
# However the docstring expects it in a specific order in order to pass!
import difflib
import logging
import types
import datetime
from enum import Enum
from copy import deepcopy
from math import isclose as is_close
from typing import List, Dict, Callable, Union, Any, Pattern, Tuple, Optional, Set, FrozenSet, TYPE_CHECKING, Protocol
from collections.abc import Mapping, Iterable, Sequence
from collections import defaultdict
from inspect import getmembers
from itertools import zip_longest
from functools import lru_cache
from deepdiff.helper import (strings, bytes_type, numbers, uuids, ListItemRemovedOrAdded, notpresent,
IndexedHash, unprocessed, add_to_frozen_set, basic_types,
convert_item_or_items_into_set_else_none, get_type,
convert_item_or_items_into_compiled_regexes_else_none,
type_is_subclass_of_type_group, type_in_type_group, get_doc,
number_to_string, datetime_normalize, KEY_TO_VAL_STR, booleans,
np_ndarray, np_floating, get_numpy_ndarray_rows, RepeatedTimer,
TEXT_VIEW, TREE_VIEW, DELTA_VIEW, detailed__dict__, add_root_to_paths,
np, get_truncate_datetime, dict_, CannotCompare, ENUM_INCLUDE_KEYS,
PydanticBaseModel, Opcode, SetOrdered, ipranges)
from deepdiff.serialization import SerializationMixin
from deepdiff.distance import DistanceMixin, logarithmic_similarity
from deepdiff.model import (
RemapDict, ResultDict, TextResult, TreeResult, DiffLevel,
DictRelationship, AttributeRelationship, REPORT_KEYS,
SubscriptableIterableRelationship, NonSubscriptableIterableRelationship,
SetRelationship, NumpyArrayRelationship, CUSTOM_FIELD,
FORCE_DEFAULT,
)
from deepdiff.deephash import DeepHash, combine_hashes_lists
from deepdiff.base import Base
from deepdiff.lfucache import LFUCache, DummyLFU
if TYPE_CHECKING:
from pytz.tzinfo import BaseTzInfo
logger = logging.getLogger(__name__)

# User-facing warnings emitted when DeepDiff stops early because a limit was hit.
MAX_PASSES_REACHED_MSG = (
    'DeepDiff has reached the max number of passes of {}. '
    'You can possibly get more accurate results by increasing the max_passes parameter.')
MAX_DIFFS_REACHED_MSG = (
    'DeepDiff has reached the max number of diffs of {}. '
    'You can possibly get more accurate results by increasing the max_diffs parameter.')

# Sentinel IndexedHash representing an item that is absent from one side of the comparison.
notpresent_indexed = IndexedHash(indexes=[0], item=notpresent)

# The DeepDiff class docstring is maintained externally in diff_doc.rst.
doc = get_doc('diff_doc.rst')

# Template for the periodic progress log line: elapsed seconds, pass count, diff count.
PROGRESS_MSG = "DeepDiff {} seconds in progress. Pass #{}, Diff #{}"
def _report_progress(_stats, progress_logger, duration):
    """Emit one progress line: elapsed seconds plus current pass and diff counts."""
    message = PROGRESS_MSG.format(duration, _stats[PASSES_COUNT], _stats[DIFF_COUNT])
    progress_logger(message)
# Keys of the shared `_stats` dictionary that tracks progress across recursive passes.
DISTANCE_CACHE_HIT_COUNT = 'DISTANCE CACHE HIT COUNT'
DIFF_COUNT = 'DIFF COUNT'
PASSES_COUNT = 'PASSES COUNT'
MAX_PASS_LIMIT_REACHED = 'MAX PASS LIMIT REACHED'
MAX_DIFF_LIMIT_REACHED = 'MAX DIFF LIMIT REACHED'
DISTANCE_CACHE_ENABLED = 'DISTANCE CACHE ENABLED'
PREVIOUS_DIFF_COUNT = 'PREVIOUS DIFF COUNT'
PREVIOUS_DISTANCE_CACHE_HIT_COUNT = 'PREVIOUS DISTANCE CACHE HIT COUNT'

# Error / validation messages raised for bad parameters or environments.
CANT_FIND_NUMPY_MSG = 'Unable to import numpy. This must be a bug in DeepDiff since a numpy array is detected.'
INVALID_VIEW_MSG = 'The only valid values for the view parameter are text and tree. But {} was passed.'
CUTOFF_RANGE_ERROR_MSG = 'cutoff_distance_for_pairs needs to be a positive float max 1.'
VERBOSE_LEVEL_RANGE_MSG = 'verbose_level should be 0, 1, or 2.'
PURGE_LEVEL_RANGE_MSG = 'cache_purge_level should be 0, 1, or 2.'

# Key into the shared parameters for the cache auto-tuning sampling interval.
_ENABLE_CACHE_EVERY_X_DIFF = '_ENABLE_CACHE_EVERY_X_DIFF'

# Pydantic bookkeeping attribute that is excluded when diffing pydantic objects.
model_fields_set = frozenset(["model_fields_set"])

# What is the threshold to consider 2 items to be pairs. Only used when ignore_order = True.
CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT = 0.3

# What is the threshold to calculate pairs of items between 2 iterables.
# For example 2 iterables that have nothing in common, do not need their pairs to be calculated.
CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT = 0.7

# The subset of DeepDiff parameters that are forwarded to DeepHash (see _get_deephash_params).
DEEPHASH_PARAM_KEYS = (
    'exclude_types',
    'exclude_paths',
    'include_paths',
    'exclude_regex_paths',
    'hasher',
    'significant_digits',
    'number_format_notation',
    'ignore_string_type_changes',
    'ignore_numeric_type_changes',
    'use_enum_value',
    'ignore_type_in_groups',
    'ignore_type_subclasses',
    'ignore_string_case',
    'exclude_obj_callback',
    'ignore_private_variables',
    'encodings',
    'ignore_encoding_errors',
    'default_timezone',
    'custom_operators',
)
class DeepDiffProtocol(Protocol):
    """
    Structural typing contract listing the attributes that DeepDiff instances carry.
    NOTE(review): presumably declared so the mixin classes (Serialization/Distance)
    can type-check against these attributes — confirm against the mixins' usage.
    """
    t1: Any
    t2: Any
    cutoff_distance_for_pairs: float
    use_log_scale: bool
    log_scale_similarity_threshold: float
    view: str
class DeepDiff(ResultDict, SerializationMixin, DistanceMixin, DeepDiffProtocol, Base):
__doc__ = doc
CACHE_AUTO_ADJUST_THRESHOLD = 0.25
def __init__(self,
             t1: Any,
             t2: Any,
             _original_type=None,
             cache_purge_level: int=1,
             cache_size: int=0,
             cache_tuning_sample_size: int=0,
             custom_operators: Optional[List[Any]] =None,
             cutoff_distance_for_pairs: float=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
             cutoff_intersection_for_pairs: float=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT,
             default_timezone:Union[datetime.timezone, "BaseTzInfo"]=datetime.timezone.utc,
             encodings: Optional[List[str]]=None,
             exclude_obj_callback: Optional[Callable]=None,
             exclude_obj_callback_strict: Optional[Callable]=None,
             exclude_paths: Union[str, List[str], Set[str], FrozenSet[str], None]=None,
             exclude_regex_paths: Union[str, List[str], Pattern[str], List[Pattern[str]], None]=None,
             exclude_types: Optional[List[Any]]=None,
             get_deep_distance: bool=False,
             group_by: Union[str, Tuple[str, str], None]=None,
             group_by_sort_key: Union[str, Callable, None]=None,
             hasher: Optional[Callable]=None,
             hashes: Optional[Dict]=None,
             ignore_encoding_errors: bool=False,
             ignore_nan_inequality: bool=False,
             ignore_numeric_type_changes: bool=False,
             ignore_order: bool=False,
             ignore_order_func: Optional[Callable]=None,
             ignore_private_variables: bool=True,
             ignore_string_case: bool=False,
             ignore_string_type_changes: bool=False,
             ignore_type_in_groups: Optional[List[Tuple]]=None,
             ignore_type_subclasses: bool=False,
             include_obj_callback: Optional[Callable]=None,
             include_obj_callback_strict: Optional[Callable]=None,
             include_paths: Union[str, List[str], None]=None,
             iterable_compare_func: Optional[Callable]=None,
             log_frequency_in_sec: int=0,
             log_scale_similarity_threshold: float=0.1,
             log_stacktrace: bool=False,
             math_epsilon: Optional[float]=None,
             max_diffs: Optional[int]=None,
             max_passes: int=10000000,
             number_format_notation: str="f",
             number_to_string_func: Optional[Callable]=None,
             progress_logger: Callable=logger.info,
             report_repetition: bool=False,
             significant_digits: Optional[int]=None,
             threshold_to_diff_deeper: float = 0.33,
             truncate_datetime: Optional[str]=None,
             use_enum_value: bool=False,
             use_log_scale: bool=False,
             verbose_level: int=1,
             view: str=TEXT_VIEW,
             zip_ordered_iterables: bool=False,
             _parameters=None,
             _shared_parameters=None,
             **kwargs):
    """
    Compute the difference of t1 and t2 and store the result on this instance.

    The diff runs eagerly inside __init__: by the time the constructor returns,
    `self` (a dict subclass) holds the requested view of the results.
    `_parameters` / `_shared_parameters` are internal-only and used when DeepDiff
    recursively re-invokes itself for nested passes.
    """
    super().__init__()
    # Reject unknown keyword arguments early with the full list of valid ones.
    if kwargs:
        raise ValueError((
            "The following parameter(s) are not valid: %s\n"
            "The valid parameters are ignore_order, report_repetition, significant_digits, "
            "number_format_notation, exclude_paths, include_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, "
            "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, truncate_datetime, "
            "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, "
            "view, hasher, hashes, max_passes, max_diffs, zip_ordered_iterables, "
            "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
            "cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, log_stacktrace,"
            "math_epsilon, iterable_compare_func, use_enum_value, _original_type, threshold_to_diff_deeper, default_timezone "
            "ignore_order_func, custom_operators, encodings, ignore_encoding_errors, use_log_scale, log_scale_similarity_threshold "
            "_parameters and _shared_parameters.") % ', '.join(kwargs.keys()))
    if _parameters:
        # Recursive pass: reuse the already-cleaned parameters from the root run.
        self.__dict__.update(_parameters)
    else:
        # Root pass: normalize and validate every user-supplied parameter.
        self.custom_operators = custom_operators or []
        self.ignore_order = ignore_order
        self.ignore_order_func = ignore_order_func
        ignore_type_in_groups = ignore_type_in_groups or []
        # Asking to treat all numbers (or strings) as one group implies ignoring
        # numeric (or string) type changes.
        if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups:
            ignore_numeric_type_changes = True
        self.ignore_numeric_type_changes = ignore_numeric_type_changes
        if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
            ignore_string_type_changes = True
        self.use_enum_value = use_enum_value
        self.log_scale_similarity_threshold = log_scale_similarity_threshold
        self.use_log_scale = use_log_scale
        self.default_timezone = default_timezone
        self.log_stacktrace = log_stacktrace
        self.threshold_to_diff_deeper = threshold_to_diff_deeper
        self.ignore_string_type_changes = ignore_string_type_changes
        self.ignore_type_in_groups = self.get_ignore_types_in_groups(
            ignore_type_in_groups=ignore_type_in_groups,
            ignore_string_type_changes=ignore_string_type_changes,
            ignore_numeric_type_changes=ignore_numeric_type_changes,
            ignore_type_subclasses=ignore_type_subclasses)
        self.report_repetition = report_repetition
        self.exclude_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(exclude_paths))
        self.include_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(include_paths))
        self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
        self.exclude_types = set(exclude_types) if exclude_types else None
        self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None  # we need tuple for checking isinstance
        self.ignore_type_subclasses = ignore_type_subclasses
        self.type_check_func = type_in_type_group if ignore_type_subclasses else type_is_subclass_of_type_group
        self.ignore_string_case = ignore_string_case
        self.exclude_obj_callback = exclude_obj_callback
        self.exclude_obj_callback_strict = exclude_obj_callback_strict
        self.include_obj_callback = include_obj_callback
        self.include_obj_callback_strict = include_obj_callback_strict
        self.number_to_string = number_to_string_func or number_to_string
        self.iterable_compare_func = iterable_compare_func
        self.zip_ordered_iterables = zip_ordered_iterables
        self.ignore_private_variables = ignore_private_variables
        self.ignore_nan_inequality = ignore_nan_inequality
        self.hasher = hasher
        self.cache_tuning_sample_size = cache_tuning_sample_size
        self.group_by = group_by
        # group_by_sort_key may be a callable or a key name; normalize to a callable (or None).
        if callable(group_by_sort_key):
            self.group_by_sort_key = group_by_sort_key
        elif group_by_sort_key:
            def _group_by_sort_key(x):
                return x[group_by_sort_key]
            self.group_by_sort_key = _group_by_sort_key
        else:
            self.group_by_sort_key = None
        self.encodings = encodings
        self.ignore_encoding_errors = ignore_encoding_errors
        self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
        self.math_epsilon = math_epsilon
        if self.math_epsilon is not None and self.ignore_order:
            logger.warning("math_epsilon in conjunction with ignore_order=True is only used for flat object comparisons. Custom math_epsilon will not have an effect when comparing nested objects.")
        self.truncate_datetime = get_truncate_datetime(truncate_datetime)
        self.number_format_notation = number_format_notation
        if verbose_level in {0, 1, 2}:
            self.verbose_level = verbose_level
        else:
            raise ValueError(VERBOSE_LEVEL_RANGE_MSG)
        if cache_purge_level not in {0, 1, 2}:
            raise ValueError(PURGE_LEVEL_RANGE_MSG)
        self.view = view
        # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running.
        self.max_passes = max_passes
        self.max_diffs = max_diffs
        self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs)
        self.cutoff_intersection_for_pairs = float(cutoff_intersection_for_pairs)
        if self.cutoff_distance_for_pairs < 0 or self.cutoff_distance_for_pairs > 1:
            raise ValueError(CUTOFF_RANGE_ERROR_MSG)
        # _Parameters are the clean _parameters to initialize DeepDiff with so we avoid all the above
        # cleaning functionalities when running DeepDiff recursively.
        # However DeepHash has its own set of _parameters that are slightly different than DeepDIff.
        # DeepDiff _parameters are transformed to DeepHash _parameters via _get_deephash_params method.
        self.progress_logger = progress_logger
        self.cache_size = cache_size
        # Snapshot of everything set so far; forwarded verbatim to recursive passes.
        _parameters = self.__dict__.copy()
        _parameters['group_by'] = None  # overwriting since these parameters will be passed on to other passes.
    if log_stacktrace:
        self.log_err = logger.exception
    else:
        self.log_err = logger.error
    # Non-Root
    if _shared_parameters:
        self.is_root = False
        self._shared_parameters = _shared_parameters
        self.__dict__.update(_shared_parameters)
        # We are in some pass other than root
        progress_timer = None
    # Root
    else:
        self.is_root = True
        # Caching the DeepDiff results for dynamic programming
        self._distance_cache = LFUCache(cache_size) if cache_size else DummyLFU()
        self._stats = {
            PASSES_COUNT: 0,
            DIFF_COUNT: 0,
            DISTANCE_CACHE_HIT_COUNT: 0,
            PREVIOUS_DIFF_COUNT: 0,
            PREVIOUS_DISTANCE_CACHE_HIT_COUNT: 0,
            MAX_PASS_LIMIT_REACHED: False,
            MAX_DIFF_LIMIT_REACHED: False,
            DISTANCE_CACHE_ENABLED: bool(cache_size),
        }
        self.hashes = dict_() if hashes is None else hashes
        self._numpy_paths = dict_()  # if _numpy_paths is None else _numpy_paths
        # Mutable state shared (by reference) with every recursive pass.
        self._shared_parameters = {
            'hashes': self.hashes,
            '_stats': self._stats,
            '_distance_cache': self._distance_cache,
            '_numpy_paths': self._numpy_paths,
            _ENABLE_CACHE_EVERY_X_DIFF: self.cache_tuning_sample_size * 10,
        }
        if log_frequency_in_sec:
            # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds.
            progress_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger)
        else:
            progress_timer = None
    self._parameters = _parameters
    self.deephash_parameters = self._get_deephash_params()
    self.tree = TreeResult()
    self._iterable_opcodes = {}
    # group_by turns list-of-dicts inputs into dicts keyed by the group_by field.
    # If t1 cannot be grouped we silently keep it; if only t2 fails, t1 is restored
    # so both sides stay ungrouped together.
    if group_by and self.is_root:
        try:
            original_t1 = t1
            t1 = self._group_iterable_to_dict(t1, group_by, item_name='t1')
        except (KeyError, ValueError):
            pass
        else:
            try:
                t2 = self._group_iterable_to_dict(t2, group_by, item_name='t2')
            except (KeyError, ValueError):
                t1 = original_t1
    self.t1 = t1
    self.t2 = t2
    try:
        root = DiffLevel(t1, t2, verbose_level=self.verbose_level)
        # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays.
        # The reason is that we convert the numpy array to python list and then later for distance calculations
        # we convert only the the last dimension of it into numpy arrays.
        self._diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type)
        if get_deep_distance and view in {TEXT_VIEW, TREE_VIEW}:
            self.tree['deep_distance'] = self._get_rough_distance()
        self.tree.remove_empty_keys()
        view_results = self._get_view_results(self.view)
        self.update(view_results)
    finally:
        # Root pass is responsible for tearing down shared state and stopping the timer.
        if self.is_root:
            if cache_purge_level:
                del self._distance_cache
                del self.hashes
                del self._shared_parameters
                del self._parameters
                for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT,
                            DISTANCE_CACHE_ENABLED):
                    del self._stats[key]
            if progress_timer:
                duration = progress_timer.stop()
                self._stats['DURATION SEC'] = duration
                logger.info('stats {}'.format(self.get_stats()))
            if cache_purge_level == 2:
                self.__dict__.clear()
def _get_deephash_params(self):
    """Translate this instance's parameters into the keyword set accepted by DeepHash."""
    params = {}
    for key in DEEPHASH_PARAM_KEYS:
        params[key] = self._parameters[key]
    # DeepHash speaks "ignore_repetition"; DeepDiff speaks "report_repetition".
    params['ignore_repetition'] = not self.report_repetition
    params['number_to_string_func'] = self.number_to_string
    return params
def _report_result(self, report_type, change_level, local_tree=None):
"""
Add a detected change to the reference-style result dictionary.
report_type will be added to level.
(We'll create the text-style report from there later.)
:param report_type: A well defined string key describing the type of change.
Examples: "set_item_added", "values_changed"
:param change_level: A DiffLevel object describing the objects in question in their
before-change and after-change object structure.
:local_tree: None
"""
if not self._skip_this(change_level):
change_level.report_type = report_type
tree = self.tree if local_tree is None else local_tree
tree[report_type].add(change_level)
def custom_report_result(self, report_type, level, extra_info=None):
    """
    Record a change detected by a custom operator on the reference-style result.

    :param report_type: A well defined string key describing the type of change.
                        Examples: "set_item_added", "values_changed"
    :param level: A DiffLevel object describing the objects in question in their
                  before-change and after-change object structure.
    :param extra_info: A dict that describes this result.
    :rtype: None
    """
    if self._skip_this(level):
        return
    level.report_type = report_type
    level.additional[CUSTOM_FIELD] = extra_info
    self.tree[report_type].add(level)
@staticmethod
def _dict_from_slots(object):
    """
    Build a {slot_name: value} mapping for an object (or class) that uses __slots__,
    walking the whole MRO so inherited slots are included. Slots whose value is not
    set on the instance are silently skipped.
    """
    def _accessor_name(attribute):
        # Dunder-looking slot names (except __weakref__) are name-mangled by Python.
        if attribute.startswith('__') and attribute != '__weakref__':
            return '_{type}{attribute}'.format(
                type=type(object).__name__,
                attribute=attribute
            )
        return attribute

    if isinstance(object, type):
        mro = object.__mro__  # pragma: no cover. I have not been able to write a test for this case. But we still check for it.
    else:
        mro = object.__class__.__mro__

    collected = []
    for klass in mro:
        slots = getattr(klass, '__slots__', None)
        if not slots:
            continue
        # __slots__ may be a single string or an iterable of strings.
        if isinstance(slots, strings):
            collected.append(slots)
        else:
            collected.extend(slots)

    result = {}
    for slot in collected:
        accessor = _accessor_name(slot)
        if hasattr(object, accessor):
            result[slot] = getattr(object, accessor)
    return result
def _diff_enum(self, level, parents_ids=frozenset(), local_tree=None):
    """Diff two Enum members by comparing their relevant attributes as an attribute dict."""
    override_t1 = detailed__dict__(level.t1, include_keys=ENUM_INCLUDE_KEYS)
    override_t2 = detailed__dict__(level.t2, include_keys=ENUM_INCLUDE_KEYS)
    self._diff_dict(
        level,
        parents_ids,
        print_as_attribute=True,
        override=True,
        override_t1=override_t1,
        override_t2=override_t2,
        local_tree=local_tree,
    )
def _diff_obj(self, level, parents_ids=frozenset(), is_namedtuple=False, local_tree=None, is_pydantic_object=False):
    """
    Difference of 2 custom objects: extract an attribute dictionary from each side
    (namedtuple, pydantic, __dict__, __slots__, or getmembers fallback) and diff those.
    Reports 'unprocessed' when the attributes cannot be extracted.
    """
    try:
        if is_namedtuple:
            t1 = level.t1._asdict()
            t2 = level.t2._asdict()
        elif is_pydantic_object:
            t1 = detailed__dict__(level.t1, ignore_private_variables=self.ignore_private_variables, ignore_keys=model_fields_set)
            t2 = detailed__dict__(level.t2, ignore_private_variables=self.ignore_private_variables, ignore_keys=model_fields_set)
        elif all('__dict__' in dir(t) for t in level):
            t1 = detailed__dict__(level.t1, ignore_private_variables=self.ignore_private_variables)
            t2 = detailed__dict__(level.t2, ignore_private_variables=self.ignore_private_variables)
        elif all('__slots__' in dir(t) for t in level):
            t1 = self._dict_from_slots(level.t1)
            t2 = self._dict_from_slots(level.t2)
        else:
            # Last resort: every non-callable member via inspect.getmembers.
            t1 = {k: v for k, v in getmembers(level.t1) if not callable(v)}
            t2 = {k: v for k, v in getmembers(level.t2) if not callable(v)}
    except AttributeError:
        self._report_result('unprocessed', level, local_tree=local_tree)
        return

    self._diff_dict(
        level,
        parents_ids,
        print_as_attribute=True,
        override=True,
        override_t1=t1,
        override_t2=t2,
        local_tree=local_tree,
    )
def _skip_this(self, level):
"""
Check whether this comparison should be skipped because one of the objects to compare meets exclusion criteria.
:rtype: bool
"""
level_path = level.path()
skip = False
if self.exclude_paths and level_path in self.exclude_paths:
skip = True
if self.include_paths and level_path != 'root':
if level_path not in self.include_paths:
skip = True
for prefix in self.include_paths:
if prefix in level_path or level_path in prefix:
skip = False
break
elif self.exclude_regex_paths and any(
[exclude_regex_path.search(level_path) for exclude_regex_path in self.exclude_regex_paths]):
skip = True
elif self.exclude_types_tuple and \
(isinstance(level.t1, self.exclude_types_tuple) or isinstance(level.t2, self.exclude_types_tuple)):
skip = True
elif self.exclude_obj_callback and \
(self.exclude_obj_callback(level.t1, level_path) or self.exclude_obj_callback(level.t2, level_path)):
skip = True
elif self.exclude_obj_callback_strict and \
(self.exclude_obj_callback_strict(level.t1, level_path) and
self.exclude_obj_callback_strict(level.t2, level_path)):
skip = True
elif self.include_obj_callback and level_path != 'root':
skip = True
if (self.include_obj_callback(level.t1, level_path) or self.include_obj_callback(level.t2, level_path)):
skip = False
elif self.include_obj_callback_strict and level_path != 'root':
skip = True
if (self.include_obj_callback_strict(level.t1, level_path) and
self.include_obj_callback_strict(level.t2, level_path)):
skip = False
return skip
def _skip_this_key(self, level, key):
# if include_paths is not set, than treet every path as included
if self.include_paths is None:
return False
if "{}['{}']".format(level.path(), key) in self.include_paths:
return False
if level.path() in self.include_paths:
# matches e.g. level+key root['foo']['bar']['veg'] include_paths ["root['foo']['bar']"]
return False
for prefix in self.include_paths:
if "{}['{}']".format(level.path(), key) in prefix:
# matches as long the prefix is longer than this object key
# eg.: level+key root['foo']['bar'] matches prefix root['foo']['bar'] from include paths
# level+key root['foo'] matches prefix root['foo']['bar'] from include_paths
# level+key root['foo']['bar'] DOES NOT match root['foo'] from include_paths This needs to be handled afterwards
return False
# check if a higher level is included as a whole (=without any sublevels specified)
# matches e.g. level+key root['foo']['bar']['veg'] include_paths ["root['foo']"]
# but does not match, if it is level+key root['foo']['bar']['veg'] include_paths ["root['foo']['bar']['fruits']"]
up = level.up
while up is not None:
if up.path() in self.include_paths:
return False
up = up.up
return True
def _get_clean_to_keys_mapping(self, keys, level):
    """
    Get a dictionary of cleaned value of keys to the keys themselves.
    This is mainly used to transform the keys when the type changes of keys should be ignored.

    Only called when ignore_string_type_changes, ignore_numeric_type_changes, or
    ignore_string_case is enabled. When two original keys collapse to the same
    cleaned key, the first mapping wins and a warning is logged.

    TODO: needs also some key conversion for groups of types other than the built-in strings and numbers.
    """
    result = dict_()
    for key in keys:
        if self.ignore_string_type_changes and isinstance(key, bytes):
            clean_key = key.decode('utf-8')
        elif self.use_enum_value and isinstance(key, Enum):
            clean_key = key.value
        elif isinstance(key, numbers):
            type_ = "number" if self.ignore_numeric_type_changes else key.__class__.__name__
            clean_key = self.number_to_string(key, significant_digits=self.significant_digits,
                                              number_format_notation=self.number_format_notation)
            clean_key = KEY_TO_VAL_STR.format(type_, clean_key)
        else:
            clean_key = key
        if self.ignore_string_case and isinstance(clean_key, str):
            clean_key = clean_key.lower()
        if clean_key in result:
            # Fix: the original warning named "ignore_numeric_type_changes" twice and,
            # due to implicit string concatenation, was missing a space between the
            # flag name and "or". Name both relevant flags with correct spacing.
            logger.warning(('{} and {} in {} become the same key when ignore_string_type_changes '
                            'or ignore_numeric_type_changes are set to be true.').format(
                                key, result[clean_key], level.path()))
        else:
            result[clean_key] = key
    return result
def _diff_dict(
    self,
    level,
    parents_ids=frozenset([]),
    print_as_attribute=False,
    override=False,
    override_t1=None,
    override_t2=None,
    local_tree=None,
):
    """Difference of 2 dictionaries"""
    if override:
        # for special stuff like custom objects and named tuples we receive preprocessed t1 and t2
        # but must not spoil the chain (=level) with it
        t1 = override_t1
        t2 = override_t2
    else:
        t1 = level.t1
        t2 = level.t2
    # Object attributes are reported with different keys/relationship than dict items.
    if print_as_attribute:
        item_added_key = "attribute_added"
        item_removed_key = "attribute_removed"
        rel_class = AttributeRelationship
    else:
        item_added_key = "dictionary_item_added"
        item_removed_key = "dictionary_item_removed"
        rel_class = DictRelationship
    # Collect the keys of each side, honoring private-variable and include-path filtering.
    if self.ignore_private_variables:
        t1_keys = SetOrdered([key for key in t1 if not(isinstance(key, str) and key.startswith('__')) and not self._skip_this_key(level, key)])
        t2_keys = SetOrdered([key for key in t2 if not(isinstance(key, str) and key.startswith('__')) and not self._skip_this_key(level, key)])
    else:
        t1_keys = SetOrdered([key for key in t1 if not self._skip_this_key(level, key)])
        t2_keys = SetOrdered([key for key in t2 if not self._skip_this_key(level, key)])
    # When key types/cases are ignored, compare "cleaned" keys but keep a mapping
    # back to the original keys for value lookups and reporting.
    if self.ignore_string_type_changes or self.ignore_numeric_type_changes or self.ignore_string_case:
        t1_clean_to_keys = self._get_clean_to_keys_mapping(keys=t1_keys, level=level)
        t2_clean_to_keys = self._get_clean_to_keys_mapping(keys=t2_keys, level=level)
        t1_keys = SetOrdered(t1_clean_to_keys.keys())
        t2_keys = SetOrdered(t2_clean_to_keys.keys())
    else:
        t1_clean_to_keys = t2_clean_to_keys = None
    t_keys_intersect = t2_keys & t1_keys
    t_keys_added = t2_keys - t_keys_intersect
    t_keys_removed = t1_keys - t_keys_intersect
    # If the two dicts share too few keys (below threshold_to_diff_deeper),
    # report the whole level as one values_changed instead of diffing key by key.
    if self.threshold_to_diff_deeper:
        if self.exclude_paths:
            t_keys_union = {f"{level.path()}[{repr(key)}]" for key in (t2_keys | t1_keys)}
            t_keys_union -= self.exclude_paths
            t_keys_union_len = len(t_keys_union)
        else:
            t_keys_union_len = len(t2_keys | t1_keys)
        if t_keys_union_len > 1 and len(t_keys_intersect) / t_keys_union_len < self.threshold_to_diff_deeper:
            self._report_result('values_changed', level, local_tree=local_tree)
            return
    for key in t_keys_added:
        if self._count_diff() is StopIteration:
            return
        key = t2_clean_to_keys[key] if t2_clean_to_keys else key
        change_level = level.branch_deeper(
            notpresent,
            t2[key],
            child_relationship_class=rel_class,
            child_relationship_param=key,
            child_relationship_param2=key,
        )
        self._report_result(item_added_key, change_level, local_tree=local_tree)
    for key in t_keys_removed:
        if self._count_diff() is StopIteration:
            return  # pragma: no cover. This is already covered for addition.
        key = t1_clean_to_keys[key] if t1_clean_to_keys else key
        change_level = level.branch_deeper(
            t1[key],
            notpresent,
            child_relationship_class=rel_class,
            child_relationship_param=key,
            child_relationship_param2=key,
        )
        self._report_result(item_removed_key, change_level, local_tree=local_tree)
    for key in t_keys_intersect:  # key present in both dicts - need to compare values
        if self._count_diff() is StopIteration:
            return  # pragma: no cover. This is already covered for addition.
        key1 = t1_clean_to_keys[key] if t1_clean_to_keys else key
        key2 = t2_clean_to_keys[key] if t2_clean_to_keys else key
        item_id = id(t1[key1])
        # Guard against infinite recursion on self-referencing structures.
        if parents_ids and item_id in parents_ids:
            continue
        parents_ids_added = add_to_frozen_set(parents_ids, item_id)
        # Go one level deeper
        next_level = level.branch_deeper(
            t1[key1],
            t2[key2],
            child_relationship_class=rel_class,
            child_relationship_param=key,
            child_relationship_param2=key,
        )
        self._diff(next_level, parents_ids_added, local_tree=local_tree)
def _diff_set(self, level, local_tree=None):
    """Difference of sets: hash every member and compare the two hash sets."""
    t1_hashtable = self._create_hashtable(level, 't1')
    t2_hashtable = self._create_hashtable(level, 't2')

    t1_hashes = set(t1_hashtable.keys())
    t2_hashes = set(t2_hashtable.keys())

    for added_hash in t2_hashes - t1_hashes:
        if self._count_diff() is StopIteration:
            return  # pragma: no cover. This is already covered for addition.
        change_level = level.branch_deeper(
            notpresent, t2_hashtable[added_hash].item, child_relationship_class=SetRelationship)
        self._report_result('set_item_added', change_level, local_tree=local_tree)

    for removed_hash in t1_hashes - t2_hashes:
        if self._count_diff() is StopIteration:
            return  # pragma: no cover. This is already covered for addition.
        change_level = level.branch_deeper(
            t1_hashtable[removed_hash].item, notpresent, child_relationship_class=SetRelationship)
        self._report_result('set_item_removed', change_level, local_tree=local_tree)
@staticmethod
def _iterables_subscriptable(t1, t2):
try:
if getattr(t1, '__getitem__') and getattr(t2, '__getitem__'):
return True
else: # pragma: no cover
return False # should never happen
except AttributeError:
return False
def _diff_iterable(self, level, parents_ids=frozenset(), _original_type=None, local_tree=None):
"""Difference of iterables"""
if (self.ignore_order_func and self.ignore_order_func(level)) or self.ignore_order:
self._diff_iterable_with_deephash(level, parents_ids, _original_type=_original_type, local_tree=local_tree)
else:
self._diff_iterable_in_order(level, parents_ids, _original_type=_original_type, local_tree=local_tree)
def _compare_in_order(
self, level,
t1_from_index=None, t1_to_index=None,
t2_from_index=None, t2_to_index=None
) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]:
"""
Default compare if `iterable_compare_func` is not provided.
This will compare in sequence order.
"""
if t1_from_index is None:
return [((i, i), (x, y)) for i, (x, y) in enumerate(
zip_longest(
level.t1, level.t2, fillvalue=ListItemRemovedOrAdded))]
else:
t1_chunk = level.t1[t1_from_index:t1_to_index]
t2_chunk = level.t2[t2_from_index:t2_to_index]
return [((i + t1_from_index, i + t2_from_index), (x, y)) for i, (x, y) in enumerate(
zip_longest(
t1_chunk, t2_chunk, fillvalue=ListItemRemovedOrAdded))]
def _get_matching_pairs(
self, level,
t1_from_index=None, t1_to_index=None,
t2_from_index=None, t2_to_index=None
) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]:
"""
Given a level get matching pairs. This returns list of two tuples in the form:
[
(t1 index, t2 index), (t1 item, t2 item)
]
This will compare using the passed in `iterable_compare_func` if available.
Default it to compare in order
"""
if self.iterable_compare_func is None:
# Match in order if there is no compare function provided
return self._compare_in_order(
level,
t1_from_index=t1_from_index, t1_to_index=t1_to_index,
t2_from_index=t2_from_index, t2_to_index=t2_to_index,
)
try:
matches = []
y_matched = set()
y_index_matched = set()
for i, x in enumerate(level.t1):
x_found = False
for j, y in enumerate(level.t2):
if(j in y_index_matched):
# This ensures a one-to-one relationship of matches from t1 to t2.
# If y this index in t2 has already been matched to another x
# it cannot have another match, so just continue.
continue
if(self.iterable_compare_func(x, y, level)):
deep_hash = DeepHash(y,
hashes=self.hashes,
apply_hash=True,
**self.deephash_parameters,
)
y_index_matched.add(j)
y_matched.add(deep_hash[y])
matches.append(((i, j), (x, y)))
x_found = True
break
if(not x_found):
matches.append(((i, -1), (x, ListItemRemovedOrAdded)))
for j, y in enumerate(level.t2):
deep_hash = DeepHash(y,
hashes=self.hashes,
apply_hash=True,
**self.deephash_parameters,
)
if(deep_hash[y] not in y_matched):
matches.append(((-1, j), (ListItemRemovedOrAdded, y)))
return matches
except CannotCompare:
return self._compare_in_order(
level,
t1_from_index=t1_from_index, t1_to_index=t1_to_index,
t2_from_index=t2_from_index, t2_to_index=t2_to_index
)
def _diff_iterable_in_order(self, level, parents_ids=frozenset(), _original_type=None, local_tree=None):
# We're handling both subscriptable and non-subscriptable iterables. Which one is it?
subscriptable = self._iterables_subscriptable(level.t1, level.t2)
if subscriptable:
child_relationship_class = SubscriptableIterableRelationship
else:
child_relationship_class = NonSubscriptableIterableRelationship
if (
not self.zip_ordered_iterables
and isinstance(level.t1, Sequence)
and isinstance(level.t2, Sequence)
and self._all_values_basic_hashable(level.t1)
and self._all_values_basic_hashable(level.t2)
and self.iterable_compare_func is None
):
local_tree_pass = TreeResult()
opcodes_with_values = self._diff_ordered_iterable_by_difflib(
level,
parents_ids=parents_ids,
_original_type=_original_type,
child_relationship_class=child_relationship_class,
local_tree=local_tree_pass,
)
# Sometimes DeepDiff's old iterable diff does a better job than DeepDiff
if len(local_tree_pass) > 1:
local_tree_pass2 = TreeResult()
self._diff_by_forming_pairs_and_comparing_one_by_one(
level,
parents_ids=parents_ids,
_original_type=_original_type,
child_relationship_class=child_relationship_class,
local_tree=local_tree_pass2,
)
if len(local_tree_pass) >= len(local_tree_pass2):
local_tree_pass = local_tree_pass2
else:
self._iterable_opcodes[level.path(force=FORCE_DEFAULT)] = opcodes_with_values
for report_type, levels in local_tree_pass.items():
if levels:
self.tree[report_type] |= levels
else:
self._diff_by_forming_pairs_and_comparing_one_by_one(
level,
parents_ids=parents_ids,
_original_type=_original_type,
child_relationship_class=child_relationship_class,
local_tree=local_tree,
)
def _all_values_basic_hashable(self, iterable):
"""
Are all items basic hashable types?
Or there are custom types too?
"""
# We don't want to exhaust a generator
if isinstance(iterable, types.GeneratorType):
return False
for item in iterable:
if not isinstance(item, basic_types):
return False
return True
    def _diff_by_forming_pairs_and_comparing_one_by_one(
        self, level, local_tree, parents_ids=frozenset(),
        _original_type=None, child_relationship_class=None,
        t1_from_index=None, t1_to_index=None,
        t2_from_index=None, t2_to_index=None,
    ):
        """
        Walk the (t1 index, t2 index) pairs produced by _get_matching_pairs and
        report into `local_tree`: items only in t1 as removed, items only in t2
        as added, pairs at differing indexes as moved, and recurse via _diff on
        every genuinely paired item to find deeper changes.
        """
        for (i, j), (x, y) in self._get_matching_pairs(
            level,
            t1_from_index=t1_from_index, t1_to_index=t1_to_index,
            t2_from_index=t2_from_index, t2_to_index=t2_to_index
        ):
            # _count_diff returns StopIteration when no more diffs should be
            # reported; abort the whole walk in that case.
            if self._count_diff() is StopIteration:
                return  # pragma: no cover. This is already covered for addition.

            reference_param1 = i
            reference_param2 = j
            if y is ListItemRemovedOrAdded:  # item removed completely
                change_level = level.branch_deeper(
                    x,
                    notpresent,
                    child_relationship_class=child_relationship_class,
                    child_relationship_param=reference_param1,
                    child_relationship_param2=reference_param2,
                )
                self._report_result('iterable_item_removed', change_level, local_tree=local_tree)

            elif x is ListItemRemovedOrAdded:  # new item added
                change_level = level.branch_deeper(
                    notpresent,
                    y,
                    child_relationship_class=child_relationship_class,
                    child_relationship_param=reference_param1,
                    child_relationship_param2=reference_param2,
                )
                self._report_result('iterable_item_added', change_level, local_tree=local_tree)

            else:  # check if item value has changed
                # A "move": the indexes differ and either the items compare
                # equal or a custom iterable_compare_func matched them.
                if (i != j and ((x == y) or self.iterable_compare_func)):
                    # Item moved
                    change_level = level.branch_deeper(
                        x,
                        y,
                        child_relationship_class=child_relationship_class,
                        child_relationship_param=reference_param1,
                        child_relationship_param2=reference_param2
                    )
                    self._report_result('iterable_item_moved', change_level, local_tree=local_tree)

                    if self.iterable_compare_func:
                        # Mark additional context denoting that we have moved an item.
                        # This will allow for correctly setting paths relative to t2 when using an iterable_compare_func
                        # (with a compare func the pair may still differ internally,
                        # so we fall through to the deeper diff below).
                        level.additional["moved"] = True
                    else:
                        # Equal items that merely moved need no deeper diff.
                        continue

                # Cycle guard: skip objects already seen on this traversal path
                # (presumably ancestor object ids — matches add_to_frozen_set use).
                item_id = id(x)
                if parents_ids and item_id in parents_ids:
                    continue
                parents_ids_added = add_to_frozen_set(parents_ids, item_id)

                # Go one level deeper
                next_level = level.branch_deeper(
                    x,
                    y,
                    child_relationship_class=child_relationship_class,
                    child_relationship_param=reference_param1,
                    child_relationship_param2=reference_param2
                )
                self._diff(next_level, parents_ids_added, local_tree=local_tree)
def _diff_ordered_iterable_by_difflib(
self, level, local_tree, parents_ids=frozenset(), _original_type=None, child_relationship_class=None,
):
seq = difflib.SequenceMatcher(isjunk=None, a=level.t1, b=level.t2, autojunk=False)
opcodes = seq.get_opcodes()
opcodes_with_values = []
# TODO: this logic should be revisted so we detect reverse operations
# like when a replacement happens at index X and a reverse replacement happens at index Y
# in those cases we have a "iterable_item_moved" operation.
for tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index in opcodes:
if tag == 'equal':
opcodes_with_values.append(Opcode(
tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index,
))
continue
# print('{:7} t1[{}:{}] --> t2[{}:{}] {!r:>8} --> {!r}'.format(
# tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index, level.t1[t1_from_index:t1_to_index], level.t2[t2_from_index:t2_to_index]))
opcodes_with_values.append(Opcode(
tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index,
old_values = level.t1[t1_from_index: t1_to_index],