Regression Tests and GitLab CI support #4228

Closed · wants to merge 37 commits

Commits (37)
250b882
[python-package] create Dataset from sampled data.
cyfdecyf Mar 1, 2021
43c0a20
[python-package] create Dataset from List[Sequence].
Willian-Zhang Mar 16, 2021
265ae97
[python-package] example: create Dataset from multiple HDF5 file.
cyfdecyf Mar 21, 2021
a8bc7d9
fix: revert is_class implementation for seq
Willian-Zhang Mar 29, 2021
1c31b64
fix: unwanted memory view reference for seq
Willian-Zhang Mar 29, 2021
2b1bd95
fix: seq is_class accepts sklearn matrices
Willian-Zhang Mar 29, 2021
744fee3
fix: requirements for example
Willian-Zhang Mar 29, 2021
8c38451
fix: pycode
Willian-Zhang Mar 29, 2021
10bd79f
feat: print static code linting stage
Willian-Zhang Mar 29, 2021
8663b71
fix: linting: avoid shell str regex conversion
Willian-Zhang Mar 29, 2021
27544d3
code style: doc style
Willian-Zhang Mar 29, 2021
fcd637f
code style: isort
Willian-Zhang Mar 29, 2021
771198b
fix ci dependency: h5py on windows
Willian-Zhang Mar 29, 2021
ae7b18d
[py] remove rm files in test seq
Willian-Zhang Apr 16, 2021
1990980
docs(python): init_from_sample summary
Willian-Zhang Apr 16, 2021
ecb6a10
ci(regression-test): and gitlab ci
Willian-Zhang Apr 20, 2021
49c5075
ci(gitlab-ci): change build docker image for gitlab ci
Willian-Zhang Apr 20, 2021
d714757
ci(gitlab-ci): change docker image
Willian-Zhang Apr 20, 2021
cb3d8a8
ci(gitlab): use custom docker image
Willian-Zhang Apr 21, 2021
2ea720d
docs(regression): rename README
Willian-Zhang Apr 21, 2021
6fa3278
ci(gitlab): try fix build missing submodule
Willian-Zhang Apr 21, 2021
66ec3ca
ci(regression): fix color on gitlab ci
Willian-Zhang Apr 21, 2021
60a3f7b
docs(test): Add tips for regression
Willian-Zhang Apr 21, 2021
2646b6c
ci(regression): no remove data file option
Willian-Zhang Apr 21, 2021
569b95c
ci(regression): add sample test script
Willian-Zhang Apr 21, 2021
05b398a
ci(regression): unignore data for test
Willian-Zhang Apr 22, 2021
7906fc9
ci(gitlab): fix ansi color output
Willian-Zhang Apr 22, 2021
f1fbcf8
ci(regression): add bin example data test
Willian-Zhang Apr 22, 2021
695f960
ci(gitlab): make install python
Willian-Zhang Apr 23, 2021
af7c340
ci(gitlab): fix python install
Willian-Zhang Apr 23, 2021
a6e2064
ci(gitlab): try fix cannot find lgbm after build
Willian-Zhang Apr 23, 2021
eec226c
ci(gitlab): try fix artifacts not found err
Willian-Zhang Apr 23, 2021
b7819bd
ci(gitlab): try fix artifacts not found err 3
Willian-Zhang Apr 23, 2021
96704db
ci(gitlab): try fix regression build keep conda
Willian-Zhang Apr 26, 2021
b822cf3
ci(gitlab): try fix test after build dep by extend
Willian-Zhang Apr 26, 2021
5f63feb
ci(gitlab): show sha1 for diff result
Willian-Zhang Apr 26, 2021
839ee4e
ci(regression): fix regression data precision
Willian-Zhang Apr 26, 2021
41 changes: 41 additions & 0 deletions .ci/regression/README.md
@@ -0,0 +1,41 @@
# Regression Test

## Typical structure

```
.
├── sample
│   ├── ref_data
│   │   ├── 1.ref
│   │   └── 2.ref
│   ├── sample_1.sh
│   └── sample_test2.sh
└── verify_result.sh
```

Each `sample_*.sh` script generates `data/{1.ref, 2.ref}`, which are compared against the reference files in `ref_data`.

## Steps

### Create `<task_dir>` and add a test job to CI

In the example: `sample`.

### Create reference truth data

Put the files in `<task_dir>/ref_data` (in the example, `1.ref` and `2.ref`). A reference can be a binary dataset file, a model file, or any other file.

### Create test scripts

Each script should generate `1.ref` and `2.ref` in the directory passed as `$1` (in the example, `sample_1.sh` and `sample_test2.sh`).

### Test run

Run `bash ../verify_result.sh` with `<task_dir>` as the working directory.

## Tips

To test locally, run `verify_result.sh` from the test directory:

1. `cd my-tests-on-higgs`
2. `bash ../verify_result.sh`

22 changes: 22 additions & 0 deletions .ci/regression/dataset_from_examples/gen_bin.py
@@ -0,0 +1,22 @@
import sys
import lightgbm as lgb
import pandas as pd

data_path = sys.argv[1]

print('Loading data...')
# load or create your dataset
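# float_precision='round_trip' parses floats so they round-trip exactly back to
# the original text, keeping the generated reference binary reproducible.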
df_train = pd.read_csv('../../../examples/regression/regression.train', header=None, sep='\t', float_precision='round_trip')
df_test = pd.read_csv('../../../examples/regression/regression.test', header=None, sep='\t', float_precision='round_trip')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

lgb_train.save_binary(f"{data_path}/train.bin")
# lgb_eval.save_binary(f"{data_path}/valid.bin")
2 changes: 2 additions & 0 deletions .ci/regression/dataset_from_examples/gen_bin.sh
@@ -0,0 +1,2 @@

python gen_bin.py "$1"
Binary file not shown.
Empty file.
Empty file.
4 changes: 4 additions & 0 deletions .ci/regression/sample/sample_1.sh
@@ -0,0 +1,4 @@
data_path="$1"
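# A real test would train a model and write its outputs here; this sample just
# creates empty files matching the references in ref_data/.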

touch "$data_path"/1.ref
touch "$data_path"/2.ref
4 changes: 4 additions & 0 deletions .ci/regression/sample/sample_test2.sh
@@ -0,0 +1,4 @@
data_path="$1"

touch "$data_path"/1.ref
touch "$data_path"/2.ref
87 changes: 87 additions & 0 deletions .ci/regression/verify_result.sh
@@ -0,0 +1,87 @@
#!/bin/bash

p=$(pwd)
# ls "$path/ref_data"

POSITIONAL=()
while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --no-rm)
            NO_RM_DATA=YES
            shift # past argument
            ;;
        *) # unknown option
            POSITIONAL+=("$1") # save it in an array for later
            shift # past argument
            ;;
    esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters

if (( $(ls *.sh | wc -l) < 1 )); then
    echo "ERROR: No test scripts found in $p"
    exit 1
fi

if [ "$(ls ref_data/* | wc -l)" -lt 1 ]; then
    echo "ERROR: No test data found in $p/ref_data"
    exit 1
fi

colorize() {
    # prefer terminal-safe colored and bold text when tput is supported
    if tput setaf 0 &>/dev/null; then
        _RESET="$(tput sgr0)"
        _BOLD="$(tput bold)"
        _BLUE="${_BOLD}$(tput setaf 4)"
        _GREEN="${_BOLD}$(tput setaf 2)"
        _RED="${_BOLD}$(tput setaf 1)"
        _YELLOW="${_BOLD}$(tput setaf 3)"
    else
        _RESET="\e[0m"
        _BOLD="\e[1m"
        _BLUE="${_BOLD}\e[34m"
        _GREEN="${_BOLD}\e[32m"
        _RED="${_BOLD}\e[31m"
        _YELLOW="${_BOLD}\e[33m"
    fi
    readonly _RESET _BOLD _BLUE _GREEN _RED _YELLOW
}
colorize

exit_code=0
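# Run every *.sh test script in this directory, then byte-compare each file it
# writes into "$script_data_dir" against its counterpart in ref_data/.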
for script in *.sh; do
    echo -e "$_BLUE RUN$_RESET" "$script"
    script_data_dir="data"
    # script_data_dir=${script%".sh"}
    mkdir -p "$script_data_dir"
    bash "$script" "$script_data_dir"

    for ref_data in ref_data/*; do
        d_path=${ref_data#"ref_data/"}
        script_data="$script_data_dir/$d_path"

        if cmp -s "$ref_data" "$script_data"; then
            echo -e "$_GREEN SUCCESS$_RESET" "$script_data" is the same as "$ref_data"
        else
            echo -e "$_RED FAILED$_RESET" "$script_data" differs from "$ref_data"
            echo "sha1 $(sha1sum "$script_data")"
            echo "sha1 $(sha1sum "$ref_data")"
            exit_code=2
        fi
    done

    if [ -n "$NO_RM_DATA" ]; then
        echo "$script_data_dir" not removed
    else
        rm -r "$script_data_dir"
    fi
done

exit $exit_code

7 changes: 6 additions & 1 deletion .ci/test.sh
@@ -57,9 +57,13 @@ if [[ $TASK == "lint" ]]; then
"r-lintr>=2.0"
pip install --user cpplint isort mypy
echo "Linting Python code"
echo "..pycodestyle"
pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1
pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1
echo "..pydocstyle"
pydocstyle --convention=numpy --add-ignore=D105 --match-dir='^(?!^external_libs|test|example).*' --match='(?!^test_|setup).*\.py' . || exit -1
echo "..isort"
isort . --check-only || exit -1
echo "..mypy"
mypy --ignore-missing-imports python-package/ || true
echo "Linting R code"
Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
@@ -218,6 +222,7 @@ import matplotlib\
matplotlib.use\(\"Agg\"\)\
' plot_example.py # prevent interactive window mode
sed -i'.bak' 's/graph.render(view=True)/graph.render(view=False)/' plot_example.py
conda install -q -y -n $CONDA_ENV h5py # requirements for example
for f in *.py **/*.py; do python $f || exit -1; done # run all examples
cd $BUILD_DIRECTORY/examples/python-guide/notebooks
conda install -q -y -n $CONDA_ENV ipywidgets notebook
2 changes: 1 addition & 1 deletion .ci/test_windows.ps1
@@ -41,7 +41,7 @@ if ($env:TASK -eq "swig") {
Exit 0
}

conda install -q -y -n $env:CONDA_ENV joblib matplotlib numpy pandas psutil pytest python-graphviz scikit-learn scipy ; Check-Output $?
conda install -q -y -n $env:CONDA_ENV joblib matplotlib numpy pandas psutil pytest python-graphviz scikit-learn scipy h5py ; Check-Output $?

if ($env:TASK -eq "regular") {
mkdir $env:BUILD_SOURCESDIRECTORY/build; cd $env:BUILD_SOURCESDIRECTORY/build
5 changes: 4 additions & 1 deletion .gitignore
@@ -139,7 +139,7 @@ publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
@@ -455,3 +455,6 @@ dask-worker-space/
*.pub
*.rdp
*_rsa

# Regression test data
!.ci/regression/*/ref_data/*.bin
44 changes: 44 additions & 0 deletions .gitlab-ci.yml
@@ -0,0 +1,44 @@
stages:
  - build
  - test

default:
  image: willianz/lightgbm-build-env:latest

variables:
  GIT_SUBMODULE_STRATEGY: recursive

build:
  stage: build
  script:
    - mkdir build
    - cd build
    - cmake ..
    - make -j4
    - make install
    - cd ../
    - cd python-package
    - python setup.py install
    - cd -
  artifacts:
    untracked: true
    expire_in: 1 days

.before_tests:
  before_script:
    - pip install -e python-package

test-sample:
  extends: .before_tests
  stage: test
  script:
    - cd .ci/regression/sample
    - bash ../verify_result.sh

test-bin-example:
  extends: .before_tests
  stage: test
  script:
    - cd .ci/regression/dataset_from_examples
    - bash ../verify_result.sh

3 changes: 3 additions & 0 deletions examples/python-guide/README.md
@@ -61,3 +61,6 @@ Examples include:
  - Plot split value histogram
  - Plot one specified tree
  - Plot one specified tree with Graphviz
- [dataset_from_multi_hdf5.py](https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/dataset_from_multi_hdf5.py)
  - Construct Dataset from multiple HDF5 files
  - Avoids loading all data into memory
102 changes: 102 additions & 0 deletions examples/python-guide/dataset_from_multi_hdf5.py
@@ -0,0 +1,102 @@
import h5py
import numpy as np
import pandas as pd

import lightgbm as lgb


class HDFSequence(lgb.Sequence):
    def __init__(self, hdf_dataset, batch_size):
        """
        Parameters
        ----------
        hdf_dataset : h5py.Dataset
            Dataset in the HDF5 file.
        batch_size : int
            Number of rows read in each batch when constructing the lightgbm Dataset.
        """
        # We could also open the HDF5 file here; taking an already-open
        # h5py.Dataset instead lets callers share a single file handle.
        self.data = hdf_dataset
        self.batch_size = batch_size

    def __getitem__(self, idx):
        # idx may be a single row index or a slice; h5py.Dataset supports both.
        return self.data[idx]

    def __len__(self):
        return len(self.data)


def create_dataset_from_multiple_hdf(input_flist, batch_size):
    data = []
    ylist = []
    for f in input_flist:
        f = h5py.File(f, 'r')
        data.append(HDFSequence(f['X'], batch_size))
        ylist.append(f['Y'][:])

    # params = {
    #     'bin_construct_sample_cnt': 200000,
    #     'max_bin': 255,
    # }
    params = None
    y = np.concatenate(ylist)
    # A list of Sequence objects is concatenated row-wise into a single Dataset.
    dataset = lgb.Dataset(data, label=y, params=params)
    # With the binary dataset created, we can use either the Python API or the cmdline version to train.
    #
    # Note: to create exactly the same dataset as the one created in simple_example.py, we need to
    # modify simple_example.py to pass a numpy array instead of a pandas DataFrame to the Dataset
    # constructor, because DataFrame column names are used in the Dataset. For a DataFrame with
    # Int64Index as columns, Dataset uses column names like ["0", "1", "2", ...], while for a numpy
    # array, column names are the defaults assigned in C++ code (dataset_loader.cpp), like
    # ["Column_0", "Column_1", ...].
    dataset.save_binary('regression.train.from_hdf.bin')


def save2hdf(input_data, fname, batch_size):
    """Store numpy array to HDF5 file.

    Please note chunk size settings in the implementation for I/O performance optimization.
    """
    with h5py.File(fname, 'w') as f:
        for name, data in input_data.items():
            nrow, ncol = data.shape
            if ncol == 1:
                # Y has a single column and we read it in a single shot, so store it as a 1-d array.
                chunk = (nrow,)
                data = data.values.flatten()
            else:
                # We use random access for data sampling when creating a LightGBM Dataset from Sequence.
                # When accessing any element in an HDF5 chunk, the whole chunk is read.
                # To save I/O during sampling, keep the total number of chunks much larger than the sample count.
                # Here we simply use a chunk size that matches batch_size.
                #
                # Also note that the data is stored in row-major order to avoid an extra copy when passing it
                # to the lightgbm Dataset.
                chunk = (batch_size, ncol)
            f.create_dataset(name, data=data, chunks=chunk, compression='lzf')


def generate_hdf(input_fname, output_basename, batch_size):
    # Save to 2 HDF5 files for demonstration.
    df = pd.read_csv(input_fname, header=None, sep='\t')

    mid = len(df) // 2
    df1 = df.iloc[:mid]
    df2 = df.iloc[mid:]

    # Multiple datasets can be stored inside a single HDF5 file.
    # X and Y are stored separately so the best chunk size can be chosen for each.
    save2hdf({'Y': df1.iloc[:, :1], 'X': df1.iloc[:, 1:]}, '{}1.h5'.format(output_basename), batch_size)
    save2hdf({'Y': df2.iloc[:, :1], 'X': df2.iloc[:, 1:]}, '{}2.h5'.format(output_basename), batch_size)


def main():
    batch_size = 64
    generate_hdf('../regression/regression.train', 'regression', batch_size)
    create_dataset_from_multiple_hdf(
        ['regression1.h5', 'regression2.h5'],
        batch_size=batch_size,
    )


if __name__ == '__main__':
    main()
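With the binary dataset saved, training can load it directly. A minimal sketch (assuming `regression.train.from_hdf.bin` was produced by the script above; the parameter values are illustrative only):

```python
import lightgbm as lgb

# Constructing a Dataset from a saved .bin file skips feature binning entirely.
train_data = lgb.Dataset('regression.train.from_hdf.bin')
params = {'objective': 'regression', 'metric': 'l2'}
booster = lgb.train(params, train_data, num_boost_round=10)
booster.save_model('model_from_hdf.txt')
```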
11 changes: 11 additions & 0 deletions include/LightGBM/c_api.h
@@ -214,6 +214,17 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
const DatasetHandle reference,
DatasetHandle* out);

/*!
* \brief Create sample indices for total nrow.
* \param total_nrow Number of all data rows
* \param parameters Additional parameters; specify the sample count and random seed here
* \param[out] out Created indices, type is int32_t; the caller should ensure that out has enough space to hold the indices
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_SampleIndices(int32_t total_nrow,
const char* parameters,
void* out);

/*!
* \brief Create dataset from dense matrix.
* \param data Pointer to the data space
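For reference, a minimal sketch of exercising the new entry point from Python via ctypes. Assumptions: `_LIB`, `_safe_call`, and `c_str` are internal helpers of the python-package and may change; per the doc comment above, the number of indices written is controlled by `bin_construct_sample_cnt`, so the caller sizes the buffer accordingly:

```python
import ctypes

import numpy as np
from lightgbm.basic import _LIB, _safe_call, c_str

total_nrow = 1000
sample_cnt = 100
# The caller allocates the output buffer, as the doc comment requires.
indices = np.zeros(sample_cnt, dtype=np.int32)
_safe_call(_LIB.LGBM_SampleIndices(
    ctypes.c_int32(total_nrow),
    c_str(f"bin_construct_sample_cnt={sample_cnt} data_random_seed=42"),
    indices.ctypes.data_as(ctypes.c_void_p),
))
```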
2 changes: 1 addition & 1 deletion include/LightGBM/dataset_loader.h
@@ -29,7 +29,7 @@ class DatasetLoader {

LIGHTGBM_EXPORT Dataset* ConstructFromSampleData(double** sample_values,
int** sample_indices, int num_col, const int* num_per_col,
size_t total_sample_size, data_size_t num_data);
size_t total_sample_size, data_size_t num_data, const std::string& dump_filename);

/*! \brief Disable copy */
DatasetLoader& operator=(const DatasetLoader&) = delete;