Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MINOR] Adding decision tree to cleaning pipelines evaluation #1881

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/builtin/abstain.dml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ return (Matrix[Double] Xout, Matrix[Double] Yout)
Yout = Y
if(min(Y) != max(Y) & max(Y) <= 2)
{
betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=100, maxii=0, verbose=verbose)
betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=50, maxii=0, verbose=verbose)
[prob, yhat, accuracy] = multiLogRegPredict(X, betas, Y, FALSE)

inc = ((yhat != Y) & (rowMaxs(prob) > threshold))
Expand Down
4 changes: 2 additions & 2 deletions scripts/builtin/apply_pipeline.dml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ s_apply_pipeline = function(Frame[Unknown] testData, Frame[Unknown] metaData = a
[schema, mask, fdMask, maskY] = topk::prepareMeta(testData, metaData)
pip = removeEmpty(target=pip, margin="cols")
applyFunc = removeEmpty(target=applyFunc, margin="cols")
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"))
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"), tfspec=as.scalar(exState[3]))
ctx = list(prefix="----"); #TODO include seed
# separate the label
[Xtest, Ytest] = topk::getLabel(testData, isLastLabel)
Expand All @@ -75,7 +75,7 @@ s_apply_pipeline = function(Frame[Unknown] testData, Frame[Unknown] metaData = a
M = as.frame(exState[2])
if(sum(mask) > 0)
{
index = vectorToCsv(mask)
index = vectorToCsv(mask, ncol(mask))
jspecR = "{ids:true, recode:["+index+"]}"
eXtest = transformapply(target=Xtest, spec=jspecR, meta=M);
}
Expand Down
28 changes: 16 additions & 12 deletions scripts/builtin/bandit.dml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
totalPruneCount = 0
FLAG_VARIABLE = 5
pipelines_executed = 0
HYPERPARAM_LENGTH = ((ncol(lp) + 2) * FLAG_VARIABLE * 3) + 1 ## num of col in logical * 5 meta flag vars * max hyperparam per op + 1 accuracy col
maxValueInParam = max(as.matrix(param[, 3]))
HYPERPARAM_LENGTH = ((ncol(lp) + 2) * FLAG_VARIABLE * maxValueInParam) + 1 ## num of col in logical * 5 meta flag vars * max hyperparam per op + 1 accuracy col
bestPipeline = frame("", rows=1, cols=1)
bestHyperparams = as.matrix(0)
bestAccuracy = as.matrix(0)
Expand Down Expand Up @@ -111,7 +112,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
configurations = configurations[1:n_i, ]
pipelines_executed = pipelines_executed + (n_i * r_i)
[outPip,outHp, pruneCount] = run_with_hyperparam(ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, ref=ref, seed = seed, enablePruning=enablePruning)
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, maxOpCount=maxValueInParam, cv=cv, cvk=cvk, ref=ref, seed = seed, enablePruning=enablePruning)
totalPruneCount = totalPruneCount + pruneCount
# sort the pipelines by order of accuracy decreasing
IX = order(target = outPip, by = 1, decreasing=TRUE, index.return=TRUE)
Expand Down Expand Up @@ -214,19 +215,21 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon
# # this method will call the execute pipelines with their hyper-parameters
run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp,
Frame[Unknown] param, Boolean cv = FALSE, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean default = FALSE)
Frame[Unknown] param, Boolean cv = FALSE, Integer maxOpCount=3, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean default = FALSE)
return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Integer pruneCount, Matrix[Double] changesByPipMatrix)
{
# # # TODO: there is a partial overlap, but it is negligible, so we will not rewrite the scripts; lineage-based reuse will get rid of it
tfspec=as.scalar(metaList["tfspec"])
mask=as.matrix(metaList["mask"])
changesByPipMatrix = matrix(0, rows=nrow(ph_pip) * r_i, cols=1)
pruneCount = 0
output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * 3)
output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * maxOpCount)
output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1)
output_pipelines = matrix(0, nrow(ph_pip)*r_i, 3)
# rows in validation set
ids = as.matrix(ph_pip[, 1:2])
ph_pip = ph_pip[, 3:ncol(ph_pip)]
inputHpMatrix = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * 3 + 1)
inputHpMatrix = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * maxOpCount + 1)
# prepare the pipelines and resources
allPipelines = frame(0, rows = nrow(ph_pip) * r_i, cols=ncol(ph_pip))
allApplyFunctions = frame(0, rows = nrow(ph_pip) * r_i, cols=ncol(ph_pip))
Expand Down Expand Up @@ -286,7 +289,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do
hp = hp[, 2:totalVals]
applyFunctions = allApplyFunctions[i]
no_of_res = nrow(hp)
# print("PIPELINE EXECUTION START ... "+toString(op))
print("PIPELINE EXECUTION START ... "+toString(op))
hpForPruning = matrix(0, rows=1, cols=ncol(op))
changesByOp = matrix(0, rows=1, cols=ncol(op))
metaList2 = metaList; #ensure metaList is no result var
Expand Down Expand Up @@ -317,7 +320,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do
else if(changesByPip < ref)
print("prunningAlert 2: not training the model due to minimum changes")
else
evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=mask, evalFunHp=evalFunHp, tfspec=tfspec))
accuracy = as.scalar(evalFunOutput[1, 1])
}

Expand Down Expand Up @@ -506,12 +509,13 @@ crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk, Matrix[Double
Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Double ref = 0)
return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double allChanges)
{

tfspec = as.scalar(metaList['tfspec'])
# # in the below condition we compute the hp using cv method on train dataset
if(is.na(as.scalar(evalFunHp[1,1]))) {
forEvalHp = eval(evalFunc, list(X=X, Y=y, Xtest=X, Ytest=y, Xorig=as.matrix(0), evalFunHp=evalFunHp))
if(is.na(as.scalar(evalFunHp[1,1])) & tfspec=="NA") {
forEvalHp = eval(evalFunc, list(X=X, Y=y, Xtest=X, Ytest=y, Xorig=as.matrix(0), evalFunHp=evalFunHp, tfspec=tfspec))
evalFunHp = forEvalHp[1, 2:ncol(forEvalHp)]
}
mask = as.matrix(metaList['mask'])
changesByPip = 0
cvChanges = matrix(0, rows=cvk, cols=ncol(changesByOp))
accuracyMatrix = matrix(0, cvk, 1)
Expand Down Expand Up @@ -547,9 +551,9 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning,
allChanges[i] = changesByPip
}
if(changesByPip < ref)
print("prunning alert 2: no training the model due to minimum changes")
print("pruning alert 2: no training the model due to minimum changes")
else {
res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp))
res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=mask , evalFunHp=evalFunHp, tfspec=tfspec))
accuracyMatrix[i] = res[1, 1]
}

Expand Down
1 change: 1 addition & 0 deletions scripts/builtin/correctTyposApply.dml
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,4 @@ replaceStrings1 = function(String replacement, String to_replace, Frame[String]
{
strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
}

21 changes: 11 additions & 10 deletions scripts/builtin/executePipeline.dml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat
else {
print("not applying operation executeFlag = 0")
}

if(ncol(Xtest) == d & nrow(Xtest) == nrow(XtestClone) & ncol(hpForPruning) > 1) {
changesSingle = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=XtestClone, pattern=NaN, replacement=0)) > 0.001 )
changesAll = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=Xorig, pattern=NaN, replacement=0)) > 0.001 )
Expand Down Expand Up @@ -204,7 +203,7 @@ return (Matrix[Double] X)
# X without numerics
Xcat = removeEmpty(target=originalX, margin="cols", select=mask)
nanMask = is.na(Xcat)
Xcat = replace(target = Xcat, pattern = NaN, replacement = -1111)
Xcat = abs(round(replace(target = Xcat, pattern = NaN, replacement = 4444)))

# reconstruct the original matrix
p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
Expand All @@ -214,24 +213,24 @@ return (Matrix[Double] X)
X = (nX %*% p) + (Xcat %*% q)

X = replace(target = X, pattern = maxDummy, replacement = NaN)
X = replace(target = X, pattern = -1111, replacement = NaN)
X = replace(target = X, pattern = 4444, replacement = NaN)
}
else if(dataFlag == 1 & (sum(mask) > 0) & (sum(mask) != ncol(originalX)))
{
maxDummy = max(replace(target=nX, pattern=NaN, replacement=0)) + 1
nX = replace(target = nX, pattern = NaN, replacement = maxDummy)
maxDummy = abs(round(max(replace(target=nX, pattern=NaN, replacement=0)) + 1))
nX = abs(round(replace(target = nX, pattern = NaN, replacement = maxDummy)))
# X without categorical
Xnum = removeEmpty(target=originalX, margin="cols", select=(mask==0))
nanMask = is.na(Xnum)
Xnum = replace(target = Xnum, pattern = NaN, replacement = -1111)
Xnum = replace(target = Xnum, pattern = NaN, replacement = 4444)
# reconstruct the original matrix
p = table(seq(1, ncol(Xnum)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
select=t(mask==0)), ncol(Xnum), ncol(originalX))
q = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
select=t(mask)), ncol(nX), ncol(originalX))
X = (nX %*% q) + (Xnum %*% p)
X = replace(target = X, pattern = maxDummy, replacement = NaN)
X = replace(target = X, pattern = -1111, replacement = NaN)
X = replace(target = X, pattern = 4444, replacement = NaN)

}
else X = nX
Expand All @@ -247,14 +246,14 @@ return (Matrix[Double] X)
#######################################################################

dummycoding = function(Matrix[Double] X, Matrix[Double] mask)
return (Matrix[Double] X, String jspec, Frame[Unknown] meta) {

return (Matrix[Double] X, String jspec, Frame[Unknown] meta) {
meta = as.frame("NULL")
jspec = ""
if(sum(mask) > 0)
{
X = replace(target=X, pattern=NaN, replacement=0)
idx = vectorToCsv(mask)
idx = vectorToCsv(mask, ncol(X))
# specifications for one-hot encoding of categorical features
jspec = "{ids:true, dummycode:["+idx+"]}";
# OHE of categorical features
Expand All @@ -268,6 +267,7 @@ return (Matrix[Double] Y) {

if(jspec != "")
{
X = replace(target=X, pattern=NaN, replacement=0)
Y = transformapply(target=as.frame(X), spec=jspec, meta=meta);
}
else Y = X
Expand All @@ -286,6 +286,7 @@ return (Matrix[Double] X, Matrix[Double] fillMatrix)
if(sum(fdMask) > 0)
{
t = replace(target=X, pattern=NaN, replacement=1)
t = replace(target=t, pattern=0, replacement=1)
fdMask = removeEmpty(target=fdMask, margin="cols")
FD = discoverFD(X=t, Mask=fdMask, threshold=threshold)
FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD
Expand Down
19 changes: 10 additions & 9 deletions scripts/builtin/fit_pipeline.dml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ source("scripts/builtin/bandit.dml") as bandit;

s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp,
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
Boolean isLastLabel = TRUE, String tfspec="NA", Boolean OHE=TRUE, Boolean correctTypos=FALSE)
return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
{
externalState = list()
Expand All @@ -57,14 +57,17 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe

pip = removeEmpty(target=pip, margin="cols")
applyFunc = removeEmpty(target=applyFunc, margin="cols")
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"))
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"), tfspec=tfspec)
ctx = list(prefix="----"); #TODO include seed
# separate the label
[Xtrain, Ytrain] = topk::getLabel(trainData, isLastLabel)
[Xtest, Ytest] = topk::getLabel(testData, isLastLabel)

# always recode the label
if(maskY == 1) {
sc = detectSchema(Ytrain)
Ytrain = applySchema(Ytrain, sc)
Ytest = applySchema(Ytest, sc)
[eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}");
eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M);
externalState = append(externalState, M)
Expand All @@ -77,12 +80,13 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
# # # when the evaluation function is called first we also compute and keep hyperparams of target application
ctx = list(prefix="evaluate Pipeline")
dirtyScore = topk::getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, ctx=ctx)
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, OHE=OHE, ctx=ctx)
[Xtrain, Xtest] = topk::runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx)

# # # if mask has 1s then there are categorical features
[eXtrain, eXtest, M1] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode")
externalState = append(externalState, M1)
externalState = append(externalState, tfspec)
# # # do the early dropping
# [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE)
metaList["applyFunc"] = applyFunc
Expand All @@ -94,25 +98,22 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe

[trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp,
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
print("train score cv: "+toString(trainScore))


# # # now test accuracy
[eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)

if(max(eYtrain) == min(eYtrain))
stop("Y contains only one class")

# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
# trainAccuracy = as.scalar(score[1, 1])

score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=mask, evalFunHp=evalFunHp, tfspec=tfspec))
testAccuracy = as.scalar(score[1, 1])

scores = matrix(0, rows=1, cols=3)
scores[1, 1] = dirtyScore
# scores[1, 2] = trainAccuracy
scores[1, 2] = trainScore
scores[1, 3] = testAccuracy
cleanTrain = cbind(eXtrain, eYtrain)
cleanTest = cbind(eXtest, eYtest)
Expand Down
2 changes: 1 addition & 1 deletion scripts/builtin/frameSort.dml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc = TRUE)
return (Frame[String] f_ordered)
{
index = vectorToCsv(mask)
index = vectorToCsv(mask, ncol(F))
# recode logical pipelines for easy handling
jspecR = "{ids:true, recode:["+index+"]}";
[X, M] = transformencode(target=F, spec=jspecR);
Expand Down
2 changes: 1 addition & 1 deletion scripts/builtin/mice.dml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
d = ncol(X1)
n = nrow(X1)
# compute index of categorical features
index = vectorToCsv(cMask)
index = vectorToCsv(cMask, ncol(cMask))
# specifications for one-hot encoding of categorical features
jspecDC = "{ids:true, dummycode:["+index+"]}";
[dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC);
Expand Down
2 changes: 1 addition & 1 deletion scripts/builtin/miceApply.dml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ m_miceApply = function(Matrix[Double] X, Matrix[Double] meta, Double threshold,
n = nrow(X1)

# compute index of categorical features
index = vectorToCsv(mask)
index = vectorToCsv(mask, ncol(mask))
# specifications for one-hot encoding of categorical features
jspecDC = "{ids:true, dummycode:["+index+"]}";

Expand Down
Loading