Skip to content

Commit

Permalink
[SYSTEMDS-3696] New sliceLineDebug built-in function for usability
Browse files Browse the repository at this point in the history
This patch adds a new sliceLineDebug function to present the top-k
worst slices returned from sliceLine (slicefinder) in a human-readable
format. This is the output for the Salaries dataset:

sliceLineDebug:
-- Slice #1: score=0.4041683676825298, size=248.0
---- avg error=6.558681888351787E8, max error=8.524558818262574E9
---- predicate: "rank" = "Prof" AND "sex" = "Male"
-- Slice #2: score=0.3731763935666855, size=42.0
---- avg error=8.271958572009121E8, max error=4.553584116646141E9
---- predicate: "rank" = "Prof" AND "yrs.since.phd" = 31.25
-- Slice #3: score=0.3675193573989536, size=125.0
---- avg error=6.758211389786526E8, max error=8.524558818262574E9
---- predicate: "rank" = "Prof" AND "discipline" = "B" AND "sex" =
"Male"
-- Slice #4: score=0.35652331744984933, size=266.0
---- avg error=6.307265846260264E8, max error=8.524558818262574E9
---- predicate: "rank" = "Prof"
  • Loading branch information
mboehm7 committed May 20, 2024
1 parent 2972d6d commit 9e99f3c
Show file tree
Hide file tree
Showing 9 changed files with 582 additions and 387 deletions.
2 changes: 1 addition & 1 deletion scripts/builtin/lmPredictStats.dml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ m_lmPredictStats = function(Matrix[Double] yhat, Matrix[Double] ytest, Boolean l
R2 = 1 - ss_res / (sum_sq_y_test - n * (sum_y_test/n)^2);
else
R2 = sum((yhat - mean_y_test)^2) / sum((ytest - mean_y_test)^2)

avg_tot = sum_y_test / n;
ss_tot = sum_sq_y_test;
ss_avg_tot = ss_tot - n * avg_tot ^ 2;
Expand Down
68 changes: 68 additions & 0 deletions scripts/builtin/sliceLineDebug.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# This builtin function takes the outputs of SliceLine and the
# original transformencode meta data in order to print a human-
# readable debug output of the resulting top-k slices.
#
# INPUT:
# ------------------------------------------------------------------------------
# TK top-k slices (k x ncol(X) if successful)
# TKC score, total error, max error, size of slices (k x 4)
# tfmeta transformencode meta data
# tfspec transform specification
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# S top-k slices (TK passed through unchanged); the debug output is printed
# ------------------------------------------------------------------------------

m_sliceLineDebug = function(Matrix[Double] TK,
  Matrix[Double] TKC, Frame[Unknown] tfmeta, String tfspec)
  return(Matrix[Double] S)
{
  # Prints one human-readable summary per top-k slice (score, size,
  # avg/max error, and the decoded predicate) and returns TK unchanged.
  # Columns of TKC as read below: 1=score, 2=total error, 3=max error, 4=size.

  # FIXME: frame toString always pads to 100 rows
  # print("sliceLineDebug: input\n"+toString(TK)+"\n"+toString(TKC)+"\n"+toString(tfmeta));

  print("sliceLineDebug:");

  # Guard against an empty top-k set (slicefinder only returns k rows
  # "if successful"): a DML range 1:0 counts downward instead of being
  # empty, so the loop below would otherwise index non-existent rows.
  if( nrow(TK) > 0 ) {
    # prepare essential decoding info
    N = colnames(tfmeta);
    # replace 0 (feature not part of the slice) by 1 so the entire matrix
    # can be decoded in one vectorized call; the decoded values of these
    # cells are never printed because the loop below checks TK, not TKsafe
    TKsafe = TK + (TK==0);
    FTK = transformdecode(target=TKsafe, meta=tfmeta, spec=tfspec);

    # actual debug output
    for(i in 1:nrow(TK)) {
      print("-- Slice #"+i+": score="+as.scalar(TKC[i,1])+", size="+as.scalar(TKC[i,4]));
      # avg error = total error / slice size
      print("---- avg error="+as.scalar(TKC[i,2]/TKC[i,4])+", max error="+as.scalar(TKC[i,3]));
      # conjunction of "<column name> = <decoded value>" over all
      # features with a non-zero entry in this slice
      pred = "";
      for(j in 1:ncol(TK)) {
        if( as.scalar(TK[i,j]) != 0 ) {
          tmp = as.scalar(N[1,j]) + " = " + as.scalar(FTK[i,j]);
          pred = ifelse(pred=="", tmp, pred+" AND "+tmp);
        }
      }
      print("---- predicate: "+pred);
    }
  }
  else {
    print("-- no slices found");
  }

  # pass-through of the input slices (the debug output itself is printed)
  S = TK;
}

26 changes: 13 additions & 13 deletions scripts/builtin/slicefinder.dml
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,19 @@
# OUTPUT:
# -----------------------------------------------------------------------------------------
# TK top-k slices (k x ncol(X) if successful)
# TKC score, size, error of slices (k x 3)
# TKC score, total/max error, size of slices (k x 4)
# D debug matrix, populated with enumeration stats if verbose
# -----------------------------------------------------------------------------------------

m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE,
m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE,
Int tpBlksz = 16, Boolean selFeat = FALSE, Boolean verbose = FALSE)
return(Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D)
{
t1 = time();

# init debug matrix: levelID, enumerated S, valid S, TKmax, TKmin
D = matrix(0, 0, 5);
D = matrix(0, 0, 5);

m = nrow(X);
n = ncol(X);
Expand Down Expand Up @@ -96,7 +96,7 @@ m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,

# enumerate candidate join pairs, incl size/error pruning
nrS = nrow(S);
S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, foffb, foffe);
S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, foffb, foffe);
S2 = S;
if(selFeat)
S2 = removeEmpty(target=S, margin="cols", select=t(selCols));
Expand All @@ -109,10 +109,10 @@ m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
if( nrow(S) > 0 ) {
# extract and evaluate candidate slices
if( tpEval ) { # task-parallel
# hybrid task-parallel w/ 1 matrix-matrix for blocks of 16 matrix-vector
# hybrid task-parallel w/ 1 matrix-matrix for blocks of 16 matrix-vector
R = matrix(0, nrow(S), 4)
parfor( i in 1:ceil(nrow(S)/tpBlksz), check=0 ) {
beg = (i-1)*tpBlksz + 1;
beg = (i-1)*tpBlksz + 1;
end = min(i*tpBlksz, nrow(R));
R[beg:end,] = evalSlice(X2, e, eAvg, t(S2[beg:end,]), level, alpha);
}
Expand Down Expand Up @@ -143,7 +143,7 @@ m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
}
}

createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e,
createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e,
Double eAvg, Double minSup, Double alpha, Boolean verbose)
return(Matrix[Double] S, Matrix[Double] R, Matrix[Double] selCols)
{
Expand All @@ -165,7 +165,7 @@ createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e,
sm = removeEmpty(target=merr, margin="rows", select=selCols);
S = table(seq(1,nrow(attr)), attr, nrow(attr), n2);

# score 1-slices and create initial top-k
# score 1-slices and create initial top-k
sc = score(ss, se, eAvg, alpha, nrow(X2));
R = cbind(sc, se, sm, ss);
}
Expand Down Expand Up @@ -290,7 +290,7 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R,
# and to void creating huge sparse intermediates
[ID, M] = transformencode(target=as.frame(ID), spec="{ids:true,recode:[1]}")

# size pruning, with rowMin-rowMax transform
# size pruning, with rowMin-rowMax transform
# to avoid densification (ignored zeros)
map = table(ID, seq(1,nrow(P)), max(ID), nrow(P))
ubSizes = 1/rowMaxs(map * (1/t(ss)));
Expand All @@ -304,13 +304,13 @@ getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R,
ubMError = replace(target=ubMError, pattern=Inf, replacement=0);
ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2);
[maxsc, minsc] = analyzeTopK(TKC);
fScores = (ubScores > minsc & ubScores > 0)
fScores = (ubScores > minsc & ubScores > 0)

# missing parents pruning
numParents = rowSums((map %*% P12) != 0)
numParents = rowSums((map %*% P12) != 0)
fParents = (numParents == level);

# apply all pruning
# apply all pruning
fall = (fSizes & fScores & fParents);

# deduplication of join outputs
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/apache/sysds/common/Builtins.java
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,8 @@ public enum Builtins {
SIGN("sign", false),
SIN("sin", false),
SINH("sinh", false),
SLICEFINDER("slicefinder", true),
SLICEFINDER("slicefinder", true), //TODO rename
SLICELINE_DEBUG("sliceLineDebug", true),
SKEWNESS("skewness", true),
SMAPE("smape", true),
SMOTE("smote", true),
Expand Down
Loading

0 comments on commit 9e99f3c

Please sign in to comment.