From fdc592caee3654bbfeb8a308e89d33dd18491baa Mon Sep 17 00:00:00 2001 From: Andrew Yin Date: Wed, 7 Jul 2021 14:17:33 -0500 Subject: [PATCH 1/4] Add entity percentages to gh-pages --- docs/source/index.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 6050300c1..23b4cecf3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -140,6 +140,11 @@ The format for an unstructured profile is below: "true_char_level": dict(int), "postprocess_char_level": dict(int) }, + "entity_percentages": { + "word_level": dict(float), + "true_char_level": dict(float), + "postprocess_char_level": dict(float) + } "times": dict(float) }, "statistics": { @@ -436,4 +441,3 @@ Versions .. _0.5.2: ../../0.5.2/html/index.html .. _0.5.3: ../../0.5.3/html/index.html - From 7234d727671bb7102adb3454e674c084da60023f Mon Sep 17 00:00:00 2001 From: Andrew Yin Date: Wed, 7 Jul 2021 14:25:08 -0500 Subject: [PATCH 2/4] Add missing comma --- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 23b4cecf3..589405807 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -144,7 +144,7 @@ The format for an unstructured profile is below: "word_level": dict(float), "true_char_level": dict(float), "postprocess_char_level": dict(float) - } + }, "times": dict(float) }, "statistics": { From 55962035f8e93b461e0e8c18d5fd35e1e033eee6 Mon Sep 17 00:00:00 2001 From: Andrew Yin Date: Tue, 10 Aug 2021 15:27:20 -0500 Subject: [PATCH 3/4] Add mode and median to gh-pages --- docs/source/index.rst | 3 ++- docs/source/profiler.rst | 22 +++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index aaefa381b..1486b3932 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -86,6 +86,8 @@ The format for a structured profile is below: "min": [null, float], "max": 
[null, float], "sum": float, + "mode": list(float), + "median": float, "mean": float, "variance": float, "stddev": float, @@ -457,4 +459,3 @@ Versions .. _0.7.0: ../../0.7.0/html/index.html .. _0.7.1: ../../0.7.1/html/index.html - diff --git a/docs/source/profiler.rst b/docs/source/profiler.rst index 0fbfc5923..68b8a6362 100644 --- a/docs/source/profiler.rst +++ b/docs/source/profiler.rst @@ -465,6 +465,13 @@ Below is an breakdown of all the options. * max - Finds maximum value in a column * is_enabled - (Boolean) Enables or disables max + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the maximum number of modes to return if multiple exist. Defaults to 5. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median * sum - Finds sum of all values in a column * is_enabled - (Boolean) Enables or disables sum @@ -510,6 +517,13 @@ Below is an breakdown of all the options. * max - Finds maximum value in a column * is_enabled - (Boolean) Enables or disables max + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the maximum number of modes to return if multiple exist. Defaults to 5. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median * sum - Finds sum of all values in a column * is_enabled - (Boolean) Enables or disables sum @@ -552,6 +566,13 @@ Below is an breakdown of all the options. * max - Finds maximum value in a column * is_enabled - (Boolean) Enables or disables max + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the maximum number of modes to return if multiple exist. Defaults to 5.
+ * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median * sum - Finds sum of all values in a column * is_enabled - (Boolean) Enables or disables sum @@ -675,4 +696,3 @@ For every profile, we can provide a report and customize it with a couple option report = profile.report(report_options={"output_format": "compact"}) report = profile.report(report_options={"output_format": "serializable"}) report = profile.report(report_options={"output_format": "flat"}) - From bc963f2e29590fcf27d12eb207a40097374b763b Mon Sep 17 00:00:00 2001 From: Andrew Yin Date: Tue, 10 Aug 2021 15:46:09 -0500 Subject: [PATCH 4/4] Correct formatting --- docs/source/index.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 1486b3932..37a354865 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,8 +66,8 @@ The format for a structured profile is below: "duplicate_row_count": int, "file_type": string, "encoding": string, - "correlation_matrix": list(list(int)), (*) - "profile_schema": dict[string, list(int)] + "correlation_matrix": list[list[int]], (*) + "profile_schema": dict[string, list[int]] }, "data_stats": [ { @@ -76,17 +76,17 @@ The format for a structured profile is below: "data_label": string, "categorical": bool, "order": string, - "samples": list(str), + "samples": list[str], "statistics": { "sample_size": int, "null_count": int, - "null_types": list(string), - "null_types_index": dict[string, list(int)], - "data_type_representation": dict[string, list(string)], + "null_types": list[string], + "null_types_index": dict[string, list[int]], + "data_type_representation": dict[string, list[string]], "min": [null, float], "max": [null, float], "sum": float, - "mode": list(float), + "mode": list[float], "median": float, "mean": float, "variance": float, @@ -96,16 +96,16 @@ The format for a structured profile is below: "num_zeros": int, 
"num_negatives": int, "histogram": { - "bin_counts": list(int), - "bin_edges": list(float), + "bin_counts": list[int], + "bin_edges": list[float] }, "quantiles": { int: float }, - "vocab": list(char), + "vocab": list[char], "avg_predictions": dict[string, float], "data_label_representation": dict[string, float], - "categories": list(str), + "categories": list[str], "unique_count": int, "unique_ratio": float, "categorical_count": dict[string, int], @@ -155,9 +155,9 @@ The format for an unstructured profile is below: "times": dict[string, float] }, "statistics": { - "vocab": list(char), + "vocab": list[char], "vocab_count": dict[string, int], - "words": list(string), + "words": list[string], "word_count": dict[string, int], "times": dict[string, float] }