diff --git a/pm4py/__init__.py b/pm4py/__init__.py index e9a2ddcb6..7bc4f9f71 100644 --- a/pm4py/__init__.py +++ b/pm4py/__init__.py @@ -26,7 +26,7 @@ filter_directly_follows_relation, filter_time_range, \ filter_eventually_follows_relation, filter_event_attribute_values, filter_trace_attribute_values, \ filter_between, filter_case_size, filter_case_performance, filter_activities_rework, filter_paths_performance, \ - filter_variants_by_coverage_percentage, filter_variants_top_k, filter_ocel_event_attribute, filter_ocel_object_attribute, \ + filter_variants_by_coverage_percentage, filter_variants_by_maximum_coverage_percentage, filter_variants_top_k, filter_ocel_event_attribute, filter_ocel_object_attribute, \ filter_ocel_object_types_allowed_activities, filter_ocel_object_per_type_count, filter_ocel_start_events_per_object_type, \ filter_ocel_end_events_per_object_type, filter_ocel_events_timestamp, filter_prefixes, filter_suffixes, \ filter_four_eyes_principle, filter_activity_done_different_resources, filter_ocel_events, filter_ocel_objects, \ diff --git a/pm4py/algo/filtering/log/variants/variants_filter.py b/pm4py/algo/filtering/log/variants/variants_filter.py index a56eb2b1c..23516760a 100644 --- a/pm4py/algo/filtering/log/variants/variants_filter.py +++ b/pm4py/algo/filtering/log/variants/variants_filter.py @@ -131,6 +131,39 @@ def filter_variants_by_coverage_percentage(log, min_coverage_percentage, paramet return apply(log, allowed_variants, parameters=parameters) +def filter_variants_by_maximum_coverage_percentage(log, max_coverage_percentage, parameters=None): + """ + Filters the variants of the log by a maximum coverage percentage + (e.g., if max_coverage_percentage=0.4, and we have a log with 1000 cases, + of which 500 of the variant 1, 400 of the variant 2, and 100 of the variant 3, + the filter keeps only the traces of variant 2 and variant 3). 
+ + Parameters + --------------- + log + Event log + max_coverage_percentage + Maximum allowed percentage of coverage + parameters + Parameters + + Returns + --------------- + filtered_log + Filtered log + """ + if parameters is None: + parameters = {} + + log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters) + + variants = get_variants(log, parameters=parameters) + variants = {x: len(y) for x, y in variants.items()} + allowed_variants = [x for x, y in variants.items() if y <= max_coverage_percentage * len(log)] + + return apply(log, allowed_variants, parameters=parameters) + + def filter_log_variants_percentage(log, percentage=0.8, parameters=None): """ Filters a log by variants percentage diff --git a/pm4py/algo/filtering/pandas/variants/variants_filter.py b/pm4py/algo/filtering/pandas/variants/variants_filter.py index 79787a0c1..56d3cdc65 100644 --- a/pm4py/algo/filtering/pandas/variants/variants_filter.py +++ b/pm4py/algo/filtering/pandas/variants/variants_filter.py @@ -138,3 +138,35 @@ def filter_variants_by_coverage_percentage(log, min_coverage_percentage, paramet allowed_variants = [x for x, y in variants.items() if y >= min_coverage_percentage * log[case_id_glue].nunique()] return apply(log, allowed_variants, parameters=parameters) + + +def filter_variants_by_maximum_coverage_percentage(log, max_coverage_percentage, parameters=None): + """ + Filters the variants of the log by a maximum coverage percentage + (e.g., if max_coverage_percentage=0.4, and we have a log with 1000 cases, + of which 500 of the variant 1, 400 of the variant 2, and 100 of the variant 3, + the filter keeps only the traces of variant 2 and variant 3). 
+ + Parameters + --------------- + log + Event log + max_coverage_percentage + Maximum allowed percentage of coverage + parameters + Parameters + + Returns + --------------- + filtered_log + Filtered log + """ + if parameters is None: + parameters = {} + + case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME) + + variants = variants_get.get_variants_count(log, parameters=parameters) + allowed_variants = [x for x, y in variants.items() if y <= max_coverage_percentage * log[case_id_glue].nunique()] + + return apply(log, allowed_variants, parameters=parameters) \ No newline at end of file diff --git a/pm4py/filtering.py b/pm4py/filtering.py index 67a572181..59a944e03 100644 --- a/pm4py/filtering.py +++ b/pm4py/filtering.py @@ -651,6 +651,39 @@ def filter_variants_by_coverage_percentage(log: Union[EventLog, pd.DataFrame], m return variants_filter.filter_variants_by_coverage_percentage(log, min_coverage_percentage, parameters=parameters) +def filter_variants_by_maximum_coverage_percentage(log: Union[EventLog, pd.DataFrame], max_coverage_percentage: float, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: + """ + Filters the variants of the log by a maximum coverage percentage + (e.g., if max_coverage_percentage=0.4, and we have a log with 1000 cases, + of which 500 of the variant 1, 400 of the variant 2, and 100 of the variant 3, + the filter keeps only the traces of variant 2 and variant 3). + + :param log: event log / Pandas dataframe + :param max_coverage_percentage: maximum allowed percentage of coverage + :param activity_key: attribute to be used for the activity + :param timestamp_key: attribute to be used for the timestamp + :param case_id_key: attribute to be used as case identifier + :rtype: ``Union[EventLog, pd.DataFrame]`` + + .. 
code-block:: python3 + + import pm4py + + filtered_dataframe = pm4py.filter_variants_by_maximum_coverage_percentage(dataframe, 0.1, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') + """ + if type(log) not in [pd.DataFrame, EventLog, EventStream]: raise Exception("the method can be applied only to a traditional event log!") + __event_log_deprecation_warning(log) + + parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + if check_is_pandas_dataframe(log): + check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + from pm4py.algo.filtering.pandas.variants import variants_filter + return variants_filter.filter_variants_by_maximum_coverage_percentage(log, max_coverage_percentage, parameters=parameters) + else: + from pm4py.algo.filtering.log.variants import variants_filter + return variants_filter.filter_variants_by_maximum_coverage_percentage(log, max_coverage_percentage, parameters=parameters) + + def filter_prefixes(log: Union[EventLog, pd.DataFrame], activity: str, strict=True, first_or_last="first", activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: """ Filters the log, keeping the prefixes to a given activity. E.g., for a log with traces: