Skip to content

Commit

Permalink
LightGBM model locale fix (#53)
Browse files Browse the repository at this point in the history
This implements the fix for model locale with Feedzai's patch to LightGBM v3.0.0.
That patch currently lives at feedzai/LightGBM, as it has not yet been merged into Microsoft's mainline code.

Also, it integrates the latest version of make-lightgbm with support for build caches to speed up repeated builds of LightGBM.
  • Loading branch information
AlbertoEAF authored and Joao Ramos Azevedo committed Dec 2, 2020
1 parent aba2523 commit 8dabaa6
Show file tree
Hide file tree
Showing 9 changed files with 7,879 additions and 33 deletions.
2 changes: 1 addition & 1 deletion openml-lightgbm/lightgbm-builder/make-lightgbm
2 changes: 1 addition & 1 deletion openml-lightgbm/lightgbm-builder/make.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ git submodule update --init

echo "Building LightGBM $LIGHTGBM_COMMIT_REF as lightgbmlib $LIGHTGBMLIB_VERSION"
cd make-lightgbm
bash make.sh "$LIGHTGBM_COMMIT_REF" "$LIGHTGBMLIB_VERSION"
bash make.sh "$LIGHTGBM_COMMIT_REF" "$LIGHTGBMLIB_VERSION" --cache

21 changes: 6 additions & 15 deletions openml-lightgbm/lightgbm-builder/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,21 @@

<groupId>com.feedzai.openml.lightgbm</groupId>
<artifactId>lightgbm-lib</artifactId>
<version>3.0.0</version>
<version>3.0.0-with_model_locale_fix_for_java</version>

<packaging>jar</packaging>
<name>Openml LightGBM lib</name>
<description>
LightGBM build for Java generated with make-lightgbm.

Build created with command: `bash make.sh v3.0.0 3.0.0`

---

Build info:

LIGHTGBM_REPO_URL=https://github.com/microsoft/LightGBM
LIGHTGBM_VERSION=v3.0.0
LIGHTGBM_COMMIT=7e11d4aeabd4a39ffa4afb382299c6d00ddf01e7
PACKAGE_TIMESTAMP=2020/09/01 18:20:49
</description>
<url>https://github.com/feedzai/make-lightgbm</url>

<properties>
<lightgbm.repo.url>https://github.com/microsoft/LightGBM</lightgbm.repo.url>
<lightgbm.version>v3.0.0</lightgbm.version>
<lightgbmlib.version>3.0.0</lightgbmlib.version>
<!-- Microsoft hasn't merged our model-locale-fix patch yet. -->
<!--<lightgbm.repo.url>https://github.com/microsoft/LightGBM</lightgbm.repo.url>-->
<lightgbm.repo.url>https://github.com/feedzai/LightGBM.git</lightgbm.repo.url>
<lightgbmlib.version>3.0.0-with_model_locale_fix_for_java</lightgbmlib.version>
<lightgbm.version>v3.0.0-with_model_locale_fix_for_java</lightgbm.version>
</properties>

<build>
Expand Down
18 changes: 6 additions & 12 deletions openml-lightgbm/lightgbm-provider/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
<description>OpenML Microsoft LightGBM Machine Learning Model and Classifier provider</description>

<properties>
<lightgbmlib.version>3.0.0</lightgbmlib.version>
<lightgbmlib.version>3.0.0-with_model_locale_fix_for_java</lightgbmlib.version>
</properties>

<dependencies>
Expand Down Expand Up @@ -91,6 +91,11 @@
<artifactId>commons-csv</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

<build>
Expand Down Expand Up @@ -119,17 +124,6 @@
</execution>
</executions>
</plugin>

<!-- Set locale for tests otherwise the LightGBM core implementation gives wrong results. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<environmentVariables>
<LC_ALL>C</LC_ALL>
</environmentVariables>
</configuration>
</plugin>
</plugins>
</build>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,19 @@
import com.feedzai.openml.data.Instance;
import com.feedzai.openml.data.schema.DatasetSchema;
import com.feedzai.openml.provider.exception.ModelLoadingException;
import com.google.common.io.Files;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.io.FileUtils;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.net.URISyntaxException;
import java.nio.file.Path;

import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertTrue;
Expand Down Expand Up @@ -197,7 +201,7 @@ public void getClassDistributionScoresWithWrongFeaturesOrderTest()
* (which also contains the reference score)
* according to the schema.
*
* @param model Model that will score
* @param model Model that will score.
* @param schema Schema to use (controls CSVRecord extraction)
* @param record Shall contain at least the features and reference score (at field "score").
* @return true if scores match, false if they don't.
Expand All @@ -217,10 +221,11 @@ private boolean modelScoreMatchesReferenceScore(final LightGBMBinaryClassificati

/**
* Compares two doubles, being equal if all but the last excludedDigits of the least precise double match.
* @param a First double.
* @param b Second double.
*
* @param a First double.
* @param b Second double.
* @param excludedDigits Exclude comparison of this many digits from the least precise number and compare.
* @return true if and only if a == b (at the chosen precision level)
* @return true if and only if a == b (at the chosen precision level).
*/
private boolean compareDoubles(final double a, final double b, final int excludedDigits) {

Expand All @@ -236,4 +241,81 @@ private boolean compareDoubles(final double a, final double b, final int exclude
);
}

/**
 * Verifies that two files hold exactly the same content, failing the assertion otherwise.
 *
 * @param name      Label identifying the compared file in the assertion description.
 * @param filepath1 Location of the first file.
 * @param filepath2 Location of the second file.
 * @throws IOException Raised in case of failure reading the files.
 * @since 1.0.19
 */
private void assertEqualFileContents(final String name,
                                     final Path filepath1,
                                     final Path filepath2) throws IOException {

    final boolean sameContent = FileUtils.contentEquals(filepath1.toFile(), filepath2.toFile());
    final String description = String.format("%s file comparison", name);

    assertThat(sameContent)
            .as(description)
            .isTrue();
}

/**
 * This functional test ensures that LightGBM can read a model file and output one exactly alike the one read in.
 * This is to ensure the new code to rewrite the model read/write layers is completely functional.
 *
 * <p>Two models are loaded with the current code (in the current locale) and saved into a temporary folder
 * as {@code 4f.txt} and {@code 42f.txt}. Each saved file is then compared with the file of the same name in
 * the {@code standard_code_models} resource folder; those reference models were generated with
 * LightGBM's v3.0.0 code. There should be no mismatches.
 *
 * @throws URISyntaxException    For invalid resource paths.
 * @throws ModelLoadingException Errors when loading the model resources.
 * @throws IOException           IO Errors opening/writing.
 * @throws InterruptedException  Thrown if the model report fails to await for the process.
 * @implNote If you have mismatches in the models, run test/resources/diff_models.py by giving it
 *           the two model folders. It will compare files with the same name across the two folders.
 * @since 1.0.19
 */
@Test
public void ensureModelReadWriteRoundTripMatchesStandardLightGBMOutput()
        throws URISyntaxException, ModelLoadingException, IOException, InterruptedException {

    final Path referenceModelsFolder = TestResources.getResourcePath("standard_code_models");
    final String firstModelFilename = "4f.txt";
    final String secondModelFilename = "42f.txt";

    // Fully qualified on purpose: in this file the simple name "Files" is bound to Guava's
    // com.google.common.io.Files, whose createTempDir() is deprecated.
    final Path tempDirPath = java.nio.file.Files.createTempDirectory("lightgbm-model-round-trip");

    try {
        // Round-trip read+write models 4f.txt and 42f.txt with the current code

        LightGBMSWIG swig = new LightGBMSWIG(
                TestResources.getModelFilePath().toString(),
                TestSchemas.NUMERICALS_SCHEMA_WITH_LABEL_AT_END,
                "");
        swig.saveModelToDisk(tempDirPath.resolve(firstModelFilename));

        swig = new LightGBMSWIG(
                TestResources.getResourcePath("lightgbm_model_42_numericals.txt").toString(),
                TestSchemas.NUMERICALS_SCHEMA_WITH_LABEL_AT_END,
                "");
        swig.saveModelToDisk(tempDirPath.resolve(secondModelFilename));

        // Compare generated model files with the round-trip read+write models generated with LightGBM v3.0.0

        assertEqualFileContents(
                firstModelFilename,
                referenceModelsFolder.resolve(firstModelFilename),
                tempDirPath.resolve(firstModelFilename));

        // Label this comparison with the second file's name so that a failure reports the right model.
        assertEqualFileContents(
                secondModelFilename,
                referenceModelsFolder.resolve(secondModelFilename),
                tempDirPath.resolve(secondModelFilename));
    } finally {
        FileUtils.deleteDirectory(tempDirPath.toFile()); // Recursive delete
    }
}
}
111 changes: 111 additions & 0 deletions openml-lightgbm/lightgbm-provider/src/test/resources/diff_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""
This script compares two folders with LGBM models.
It compares models with the same name across the two folders.
If a file with the same name is found in both folders, the two copies are compared.
Any difference in such a pair of files prints a report and triggers exit(1).
The report focuses on differences in the parameters of each line.
Requirements:
- Python 3
- (Optional) Python's termcolor for best results (`pip install termcolor`)
Usage:
script <reference_models_folder> <new_models_folder>
Exit code:
0 if no differences in matching files were found
1 if any file has differences
Author: Alberto Ferreira
Copyright: 2020, Feedzai
License: Same as the repo.
"""

import argparse
import filecmp
import sys
from pathlib import Path

try:
    from termcolor import colored
except ModuleNotFoundError:
    def colored(msg, color):
        """Plain-text stand-in for termcolor's colored: tags red/green text instead of coloring it."""
        if color == "red":
            return f"Original=<{msg}>"
        if color == "green":
            return f"Current=<{msg}>"
        # Any other color: the message is passed through untouched.
        return msg

    def warn_missing_libs():
        """Tells the user that termcolor is needed for the colored diff."""
        print("Warning: Please install Python's 'termcolor' library to see detailed colored model diff!")
else:
    def warn_missing_libs():
        """No-op: termcolor was imported, so there is nothing to warn about."""
        pass


def file_diff_report(a, b, delim=" "):
    """
    Reports the file diff by printing different lines and different elements in them.

    The two files are compared line by line. For every pair of lines that differ, both lines are
    printed, followed by each pair of `delim`-separated elements that differ between them.
    The report stops early when the files have a different number of lines, or when a pair of
    differing lines splits into a different number of elements.

    Args:
        a: Path to the first file (its content is printed in "red").
        b: Path to the second file (its content is printed in "green").
        delim: Separator used to split each line into elements.
    """
    # Context managers guarantee both file handles are closed, even if reading fails.
    with open(a) as a_file, open(b) as b_file:
        a_lines = a_file.readlines()
        b_lines = b_file.readlines()

    if len(a_lines) != len(b_lines):
        print("Different file line sizes!")
        return

    for a_line, b_line in zip(a_lines, b_lines):
        a_elems = a_line.split(delim)
        b_elems = b_line.split(delim)

        if a_elems != b_elems:
            print(
                colored("\n\n\n>>> Different line contents between lines\n\n", "yellow"),
                colored(a_line, "red"),
                colored(b_line, "green"),
                sep=""
            )
            print(colored("\n>> Different elements in lines\n", "yellow"))
            if len(a_elems) != len(b_elems):
                # Elements cannot be paired up one-to-one, so the report ends here.
                print("Different line sizes!")
                return
            for a_elem, b_elem in zip(a_elems, b_elems):
                if a_elem != b_elem:
                    print(
                        colored(a_elem, "red"),
                        "\n",
                        colored(b_elem, "green"),
                        sep=""
                    )

def compare_folders_with_model_files(folder_ref, folder_new):
    """
    Compares two folders with lgbm .txt models inside.

    Every file whose name is found in both folders is compared by content.
    For each pair that differs a diff report is printed and, once all reports
    are printed, the process exits with code 1.
    If no compared file differs, the function simply returns.

    Args:
        folder_ref: Folder with the reference models.
        folder_new: Folder with the new models.
    """
    dir_ref = Path(folder_ref)
    dir_new = Path(folder_new)

    common_filenames = filecmp.dircmp(dir_ref, dir_new).common_files

    # dircmp's own diff_files relies on a shallow comparison (file type, size and
    # modification time), which treats files with equal signatures as identical.
    # Compare the actual contents instead so no mismatch goes unnoticed.
    _, diff_filenames, _ = filecmp.cmpfiles(dir_ref, dir_new, common_filenames, shallow=False)

    if diff_filenames:
        warn_missing_libs()

        for filename in diff_filenames:
            print(colored(f"Found mismatches in file {filename}", "yellow"))
            file_diff_report(dir_ref / filename, dir_new / filename)

        # The warning is issued both before and after the reports.
        warn_missing_libs()
        sys.exit(1)

if __name__ == "__main__":
    # Command-line entry point: the two folders to compare are given as positional arguments.
    cli = argparse.ArgumentParser()
    for positional in ("reference_models_folder", "new_models_folder"):
        cli.add_argument(positional)
    options = cli.parse_args()

    compare_folders_with_model_files(options.reference_models_folder, options.new_models_folder)
Loading

0 comments on commit 8dabaa6

Please sign in to comment.