
Commit

Merge pull request #116 from ing-bank/develop
v0.4.0
sbrugman authored Apr 16, 2021
2 parents e661970 + c12783b commit 6bab922
Showing 54 changed files with 805 additions and 3,083 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/build.yml
@@ -37,12 +37,15 @@ jobs:
SPARK_VERSION: "2.4.7"
HADOOP_VERSION: "2.7"
SPARK_HOME: "/home/runner/work/spark/" #${{ github.workspace }}/spark/
SPARK_LOCAL_IP: "localhost"
run: |
sudo apt-get -y install openjdk-8-jdk
curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz --output ${BUILD_DIR}/spark.tgz
tar -xvzf ${BUILD_DIR}/spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} ${SPARK_HOME}
pip install "pytest-spark>=0.6.0" "pyarrow>=0.8.0" pyspark==2.4.7
- name: Test with pytest (spark-specific)
env:
SPARK_LOCAL_IP: "localhost"
run: |
pytest -m spark
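
The added ``SPARK_LOCAL_IP: "localhost"`` makes Spark bind to localhost on the CI runner. For illustration, a hedged sketch of what a ``spark``-marked test might look like (the test body is an assumption, not taken from the repository):

.. code-block:: python

    import pytest
    from pyspark.sql import SparkSession

    @pytest.mark.spark
    def test_spark_session_counts():
        spark = SparkSession.builder.master("local[1]").getOrCreate()
        # spark.range(10) yields ids 0..9, so the count is 10
        assert spark.range(10).count() == 10
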
11 changes: 11 additions & 0 deletions .github/workflows/commit.yml
@@ -0,0 +1,11 @@
name: Lint Commit Messages
on: [pull_request]

jobs:
commitlint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- uses: wagoid/commitlint-github-action@v3
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -4,18 +4,18 @@ repos:
hooks:
- id: black
- repo: https://github.com/pycqa/isort
rev: 5.7.0
rev: 5.8.0
hooks:
- id: isort
files: '.*'
args: [ --profile=black, --project=popmon, --thirdparty histogrammar, --thirdparty pybase64 ]
- repo: https://gitlab.com/pycqa/flake8
rev: "3.8.4"
rev: "3.9.0"
hooks:
- id: flake8
args: [ "--select=E9,F63,F7,F82"]
- repo: https://github.com/asottile/pyupgrade
rev: v2.10.0
rev: v2.12.0
hooks:
- id: pyupgrade
args: ['--py36-plus','--exit-zero-even-if-changed']
14 changes: 14 additions & 0 deletions CHANGES.rst
@@ -2,6 +2,20 @@
Release notes
=============

Version 0.4.0 (16-04-2021)
---------------------------
Documentation:

* **docs**: include BDTWS presentation
* **docs**: clarify that ``time_axis`` should be date or numeric
* **docs**: initialize spark with both histogrammar jar files

Build system:

* **build**: migrate to version 1.0.25 of ``histogrammar``
* **build**: update ``pyupgrade`` to v2.12.0
* **build**: update ``isort`` to 5.8.0
* **build**: update ``flake8`` to 3.9.0

Version 0.3.14, Feb 2021
------------------------
42 changes: 21 additions & 21 deletions README.rst
@@ -29,24 +29,22 @@ With Spark 3.0, based on Scala 2.12, make sure to pick up the correct `histogrammar` jar files:

.. code-block:: python
spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.11,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.11").getOrCreate()
spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20").getOrCreate()
For Spark 2.x compiled against Scala 2.11, simply replace 2.12 with 2.11 in the string above.
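
For illustration, a minimal sketch of that substitution (assuming the Scala 2.11 builds of the same jar version are published on Maven Central):

.. code-block:: python

    spark = SparkSession.builder.config(
        "spark.jars.packages",
        "io.github.histogrammar:histogrammar_2.11:1.0.20,"
        "io.github.histogrammar:histogrammar-sparksql_2.11:1.0.20",
    ).getOrCreate()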

`January 29, 2021`
Examples
========

- `Flight Delays and Cancellations Kaggle data <https://crclz.com/popmon/reports/flight_delays_report.html>`_
- `Synthetic data (code example below) <https://crclz.com/popmon/reports/test_data_report.html>`_

Documentation
=============

The entire `popmon` documentation including tutorials can be found at `read-the-docs <https://popmon.readthedocs.io>`_.


Examples
========

- `Flight Delays and Cancellations Kaggle data <https://crclz.com/popmon/reports/flight_delays_report.html>`_
- `Synthetic data (code example below) <https://crclz.com/popmon/reports/test_data_report.html>`_

Notebooks
=========

@@ -151,19 +149,21 @@ Resources
Presentations
-------------

+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+------------------+-------------------------+
| Title | Host | Date | Speaker |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+------------------+-------------------------+
| Popmon - population monitoring made easy | `Data Lunch @ Eneco <https://www.eneco.nl/>`_ | October 29, 2020 | Max Baak, Simon Brugman |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+------------------+-------------------------+
| Popmon - population monitoring made easy | `Data Science Summit 2020 <https://dssconf.pl/en/>`_ | October 16, 2020 | Max Baak |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+------------------+-------------------------+
| `Population Shift Monitoring Made Easy: the popmon package <https://youtu.be/PgaQpxzT_0g>`_     | `Online Data Science Meetup @ ING WBAA <https://www.meetup.com/nl-NL/Tech-Meetups-ING/events/>`_ | July 8, 2020     | Tomas Sostak            |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+------------------+-------------------------+
| `Popmon: Population Shift Monitoring Made Easy <https://www.youtube.com/watch?v=HE-3YeVYqPY>`_ | `PyData Fest Amsterdam 2020 <https://amsterdam.pydata.org/>`_ | June 16, 2020 | Tomas Sostak |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+------------------+-------------------------+
| Popmon: Population Shift Monitoring Made Easy | `Amundsen Community Meetup <https://github.com/amundsen-io/amundsen>`_ | June 4, 2020 | Max Baak |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+------------------+-------------------------+
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+-------------------------+
| Title | Host | Date | Speaker |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+-------------------------+
| Popmon - population monitoring made easy | `Big Data Technology Warsaw Summit 2021 <https://bigdatatechwarsaw.eu/>`_ | February 25, 2021 | Simon Brugman |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+-------------------------+
| Popmon - population monitoring made easy | `Data Lunch @ Eneco <https://www.eneco.nl/>`_ | October 29, 2020 | Max Baak, Simon Brugman |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+-------------------------+
| Popmon - population monitoring made easy | `Data Science Summit 2020 <https://dssconf.pl/en/>`_ | October 16, 2020 | Max Baak |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+-------------------------+
| `Population Shift Monitoring Made Easy: the popmon package <https://youtu.be/PgaQpxzT_0g>`_     | `Online Data Science Meetup @ ING WBAA <https://www.meetup.com/nl-NL/Tech-Meetups-ING/events/>`_ | July 8, 2020      | Tomas Sostak            |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+-------------------------+
| `Popmon: Population Shift Monitoring Made Easy <https://www.youtube.com/watch?v=HE-3YeVYqPY>`_ | `PyData Fest Amsterdam 2020 <https://amsterdam.pydata.org/>`_ | June 16, 2020 | Tomas Sostak |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+-------------------------+
| Popmon: Population Shift Monitoring Made Easy | `Amundsen Community Meetup <https://github.com/amundsen-io/amundsen>`_ | June 4, 2020 | Max Baak |
+------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+-------------------------+


Articles
9 changes: 5 additions & 4 deletions docs/source/configuration.rst
@@ -57,6 +57,7 @@ To specify the time-axis binning alone, do:
report = df.pm_stability_report(time_axis='date', time_width='1w', time_offset='2020-1-6')
The ``time_axis`` argument should be the name of a column that is of type **numeric (e.g. batch id, time in ns) or date(time)**.
The default time width is 30 days ('30d'), with time offset 2010-1-4 (a Monday).
All other features (except for 'date') are auto-binned in this example.
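
For instance, a minimal sketch with a numeric time axis (``batch_id`` is a hypothetical column name; everything else is left at its defaults):

.. code-block:: python

    # slice the data along a numeric batch id instead of a date column;
    # all other features are auto-binned as described above
    report = df.pm_stability_report(time_axis="batch_id")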

@@ -203,7 +204,7 @@ Spark usage
from pyspark.sql import SparkSession
# downloads histogrammar jar files if not already installed, used for histogramming of spark dataframe
spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.11,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.11").getOrCreate()
spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20").getOrCreate()
# load a dataframe
spark_df = spark.read.format('csv').options(header='true').load('file.csv')
@@ -221,8 +222,8 @@ This snippet contains the instructions for setting up a minimal environment for
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!wget -P /content/spark-2.4.7-bin-hadoop2.7/jars/ -q https://repo1.maven.org/maven2/io/github/histogrammar/histogrammar-sparksql_2.12/1.0.11/histogrammar-sparksql_2.12-1.0.11.jar
!wget -P /content/spark-2.4.7-bin-hadoop2.7/jars/ -q https://repo1.maven.org/maven2/io/github/histogrammar/histogrammar_2.12/1.0.11/histogrammar_2.12-1.0.11.jar
!wget -P /content/spark-2.4.7-bin-hadoop2.7/jars/ -q https://repo1.maven.org/maven2/io/github/histogrammar/histogrammar-sparksql_2.12/1.0.20/histogrammar-sparksql_2.12-1.0.20.jar
!wget -P /content/spark-2.4.7-bin-hadoop2.7/jars/ -q https://repo1.maven.org/maven2/io/github/histogrammar/histogrammar_2.12/1.0.20/histogrammar_2.12-1.0.20.jar
!pip install -q findspark popmon
Now that spark is installed, restart the runtime.
@@ -239,7 +240,7 @@ Now that spark is installed, restart the runtime.
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]") \
.config("spark.jars", "/content/jars/histogrammar_2.12-1.0.11.jar,/content/jars/histogrammar-sparksql_2.12-1.0.11.jar") \
.config("spark.jars", "/content/jars/histogrammar_2.12-1.0.20.jar,/content/jars/histogrammar-sparksql_2.12-1.0.20.jar") \
.config("spark.sql.execution.arrow.enabled", "false") \
.config("spark.sql.session.timeZone", "GMT") \
.getOrCreate()
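
A hedged sketch of the step that typically precedes this snippet on Colab (assuming Spark was unpacked to ``/content/spark-2.4.7-bin-hadoop2.7`` as above):

.. code-block:: python

    import findspark

    # point findspark at the unpacked Spark distribution
    # before building the SparkSession
    findspark.init("/content/spark-2.4.7-bin-hadoop2.7")
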
10 changes: 7 additions & 3 deletions popmon/__init__.py
@@ -18,12 +18,16 @@
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


# flake8: noqa
# histogram and report functions
from histogrammar.dfinterface.make_histograms import (
get_bin_specs,
get_time_axes,
make_histograms,
)

# pandas/spark dataframe decorators
from popmon import decorators

# histogram and report functions
from .hist.filling import get_bin_specs, get_time_axes, make_histograms
from .pipeline.metrics import df_stability_metrics, stability_metrics
from .pipeline.report import df_stability_report, stability_report
from .stitching import stitch_histograms
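
The re-exported names keep ``popmon``'s public API stable after the migration to ``histogrammar``'s dataframe interface. A minimal usage sketch (synthetic data; the output file name is illustrative):

.. code-block:: python

    import pandas as pd

    import popmon  # noqa: F401  (registers the pm_stability_report accessor)

    df = pd.DataFrame(
        {
            "date": pd.date_range("2021-01-01", periods=100, freq="D"),
            "value": range(100),
        }
    )
    report = df.pm_stability_report(time_axis="date")
    report.to_file("report.html")
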
2 changes: 1 addition & 1 deletion popmon/alerting/compute_tl_bounds.py
@@ -329,7 +329,7 @@ def df_single_op_pull_bounds(
:param list cols: list of cols to calculate bounds of (optional)
"""
if len(df.index) == 0:
raise RuntimeError("input df has zero length")
raise ValueError("input df has zero length")
row = df.iloc[0]
return pull_bounds(
row, red_high, yellow_high, yellow_low, red_low, suffix_mean, suffix_std, cols
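
The change above swaps ``RuntimeError`` for ``ValueError`` when the input dataframe is empty. A minimal sketch of the pattern in isolation (function name hypothetical):

.. code-block:: python

    import pandas as pd

    def first_row(df: pd.DataFrame) -> pd.Series:
        # an empty frame is invalid *input*, so ValueError is the idiomatic
        # exception; RuntimeError would suggest an internal failure instead
        if len(df.index) == 0:
            raise ValueError("input df has zero length")
        return df.iloc[0]
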
28 changes: 14 additions & 14 deletions popmon/analysis/comparison/hist_comparer.py
@@ -39,7 +39,7 @@
get_consistent_numpy_entries,
)
from ...base import Pipeline
from ...hist.histogram import HistogramContainer
from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric
from ...stats.numpy import googl_test, ks_prob, ks_test, uu_chi2


@@ -78,21 +78,21 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
hist_name1 = cols[0]
hist_name2 = cols[1]
if not all([name in cols for name in [hist_name1, hist_name2]]):
raise RuntimeError("Need to provide two histogram column names.")
raise ValueError("Need to provide two histogram column names.")

# basic histogram checks
hc1 = row[hist_name1]
hc2 = row[hist_name2]
if not all([isinstance(hc, HistogramContainer) for hc in [hc1, hc2]]):
hist1 = row[hist_name1]
hist2 = row[hist_name2]
if not all([isinstance(hist, COMMON_HIST_TYPES) for hist in [hist1, hist2]]):
return x
if not check_similar_hists([hc1, hc2]):
if not check_similar_hists([hist1, hist2]):
return x

# compare
is_num = hc1.is_num
if hc1.n_dim == 1:
is_num = is_numeric(hist1)
if hist1.n_dim == 1:
if is_num:
numpy_1dhists = get_consistent_numpy_1dhists([hc1, hc2])
numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2])
entries_list = [nphist[0] for nphist in numpy_1dhists]
# KS-test only properly defined for (ordered) 1D interval variables
ks_testscore = ks_test(*entries_list)
@@ -101,14 +101,14 @@
x["ks_pvalue"] = ks_pvalue
x["ks_zscore"] = -norm.ppf(ks_pvalue)
else: # categorical
entries_list = get_consistent_numpy_entries([hc1, hc2])
entries_list = get_consistent_numpy_entries([hist1, hist2])
# check consistency of bin_labels
labels1 = hc1.hist.bin_labels()
labels2 = hc2.hist.bin_labels()
labels1 = hist1.bin_labels()
labels2 = hist2.bin_labels()
subset = set(labels1) <= set(labels2)
unknown_labels = int(not subset)
elif hc1.n_dim == 2:
numpy_2dgrids = get_consistent_numpy_2dgrids([hc1, hc2])
elif hist1.n_dim == 2:
numpy_2dgrids = get_consistent_numpy_2dgrids([hist1, hist2])
entries_list = [entry.flatten() for entry in numpy_2dgrids]

# calculate pearson coefficient
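
For intuition on the numeric 1D branch above: popmon's ``ks_test``/``ks_prob`` operate on binned entries, but the underlying idea is the classic two-sample KS test. A standalone, illustrative sketch on raw samples (not the binned code path used by ``hist_compare``):

.. code-block:: python

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(42)
    reference = rng.normal(0.0, 1.0, size=1000)
    incoming = rng.normal(0.3, 1.0, size=1000)  # slightly shifted population

    # KS statistic: the maximum distance between the two empirical CDFs
    result = stats.ks_2samp(reference, incoming)

    # as in hist_compare, the p-value can be expressed as a z-score
    # via the inverse normal CDF
    zscore = -stats.norm.ppf(result.pvalue)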
