Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added functionality to filter for a maximum coverage percentage #383

Merged
merged 1 commit into from
Feb 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pm4py/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
filter_directly_follows_relation, filter_time_range, \
filter_eventually_follows_relation, filter_event_attribute_values, filter_trace_attribute_values, \
filter_between, filter_case_size, filter_case_performance, filter_activities_rework, filter_paths_performance, \
filter_variants_by_coverage_percentage, filter_variants_top_k, filter_ocel_event_attribute, filter_ocel_object_attribute, \
filter_variants_by_coverage_percentage, filter_variants_by_maximum_coverage_percentage, filter_variants_top_k, filter_ocel_event_attribute, filter_ocel_object_attribute, \
filter_ocel_object_types_allowed_activities, filter_ocel_object_per_type_count, filter_ocel_start_events_per_object_type, \
filter_ocel_end_events_per_object_type, filter_ocel_events_timestamp, filter_prefixes, filter_suffixes, \
filter_four_eyes_principle, filter_activity_done_different_resources, filter_ocel_events, filter_ocel_objects, \
Expand Down
33 changes: 33 additions & 0 deletions pm4py/algo/filtering/log/variants/variants_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,39 @@ def filter_variants_by_coverage_percentage(log, min_coverage_percentage, paramet
return apply(log, allowed_variants, parameters=parameters)


def filter_variants_by_maximum_coverage_percentage(log, max_coverage_percentage, parameters=None):
    """
    Keeps only the variants whose number of cases does not exceed a given
    share of the cases of the log
    (e.g., if max_coverage_percentage=0.4, and we have a log with 1000 cases,
    of which 500 of the variant 1, 400 of the variant 2, and 100 of the variant 3,
    the filter keeps only the traces of variant 2 and variant 3).

    Parameters
    ---------------
    log
        Event log
    max_coverage_percentage
        Maximum allowed coverage of a variant, expressed as a fraction of the
        number of cases of the log (0.4 in the example above); the bound is inclusive
    parameters
        Parameters forwarded to the log conversion, to the variants retrieval
        and to the variants filter

    Returns
    ---------------
    filtered_log
        Filtered log
    """
    parameters = {} if parameters is None else parameters

    event_log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)

    # maps each variant to the collection of its traces
    traces_per_variant = get_variants(event_log, parameters=parameters)

    # a variant is kept when its number of traces is at most the given share of the log
    allowed_variants = [
        variant
        for variant, traces in traces_per_variant.items()
        if len(traces) <= max_coverage_percentage * len(event_log)
    ]

    return apply(event_log, allowed_variants, parameters=parameters)


def filter_log_variants_percentage(log, percentage=0.8, parameters=None):
"""
Filters a log by variants percentage
Expand Down
32 changes: 32 additions & 0 deletions pm4py/algo/filtering/pandas/variants/variants_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,35 @@ def filter_variants_by_coverage_percentage(log, min_coverage_percentage, paramet
allowed_variants = [x for x, y in variants.items() if y >= min_coverage_percentage * log[case_id_glue].nunique()]

return apply(log, allowed_variants, parameters=parameters)


def filter_variants_by_maximum_coverage_percentage(log, max_coverage_percentage, parameters=None):
    """
    Filters the variants of the log by a maximum coverage percentage
    (e.g., if max_coverage_percentage=0.4, and we have a log with 1000 cases,
    of which 500 of the variant 1, 400 of the variant 2, and 100 of the variant 3,
    the filter keeps only the traces of variant 2 and variant 3).

    Parameters
    ---------------
    log
        Event log (Pandas dataframe)
    max_coverage_percentage
        Maximum allowed coverage of a variant, expressed as a fraction of the
        number of distinct cases (0.4 in the example above); the bound is inclusive
    parameters
        Parameters of the algorithm, including:
            - Parameters.CASE_ID_KEY => column containing the case identifier
              (defaults to CASE_CONCEPT_NAME)

    Returns
    ---------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)

    variants = variants_get.get_variants_count(log, parameters=parameters)

    # the number of distinct cases does not depend on the variant: compute it once
    # instead of scanning the case identifier column again for every variant
    num_cases = log[case_id_glue].nunique()

    allowed_variants = [x for x, y in variants.items() if y <= max_coverage_percentage * num_cases]

    return apply(log, allowed_variants, parameters=parameters)
33 changes: 33 additions & 0 deletions pm4py/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,39 @@ def filter_variants_by_coverage_percentage(log: Union[EventLog, pd.DataFrame], m
return variants_filter.filter_variants_by_coverage_percentage(log, min_coverage_percentage, parameters=parameters)


def filter_variants_by_maximum_coverage_percentage(log: Union[EventLog, pd.DataFrame], max_coverage_percentage: float, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]:
    """
    Keeps only the variants of the log whose coverage does not exceed a maximum percentage
    (e.g., if max_coverage_percentage=0.4, and we have a log with 1000 cases,
    of which 500 of the variant 1, 400 of the variant 2, and 100 of the variant 3,
    the filter keeps only the traces of variant 2 and variant 3).

    The filtering is delegated to the Pandas implementation when a dataframe is provided,
    and to the event log implementation otherwise.

    :param log: event log / Pandas dataframe
    :param max_coverage_percentage: maximum allowed coverage of a variant, as a fraction of the number of cases (e.g., 0.4)
    :param activity_key: attribute to be used for the activity
    :param timestamp_key: attribute to be used for the timestamp
    :param case_id_key: attribute to be used as case identifier
    :rtype: ``Union[EventLog, pd.DataFrame]``

    .. code-block:: python3

        import pm4py

        filtered_dataframe = pm4py.filter_variants_by_maximum_coverage_percentage(dataframe, 0.1, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name')
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")
    __event_log_deprecation_warning(log)

    properties = get_properties(
        log,
        activity_key=activity_key,
        timestamp_key=timestamp_key,
        case_id_key=case_id_key,
    )

    if not check_is_pandas_dataframe(log):
        # event log / event stream objects go through the log-based implementation
        from pm4py.algo.filtering.log.variants import variants_filter as log_variants_filter
        return log_variants_filter.filter_variants_by_maximum_coverage_percentage(
            log, max_coverage_percentage, parameters=properties
        )

    # dataframes: make sure the required columns exist before filtering
    check_pandas_dataframe_columns(
        log,
        activity_key=activity_key,
        timestamp_key=timestamp_key,
        case_id_key=case_id_key,
    )
    from pm4py.algo.filtering.pandas.variants import variants_filter as pandas_variants_filter
    return pandas_variants_filter.filter_variants_by_maximum_coverage_percentage(
        log, max_coverage_percentage, parameters=properties
    )


def filter_prefixes(log: Union[EventLog, pd.DataFrame], activity: str, strict=True, first_or_last="first", activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]:
"""
Filters the log, keeping the prefixes to a given activity. E.g., for a log with traces:
Expand Down