# Implement basic decision tree model functionality with SPU #212
shazi4399: Give it to me
After discussion, this task has been reclaimed and is currently open for claiming again. Everyone is welcome to pick it up~
When working on LR, I asked about this and was told that condition checks can only be implemented by converting to plaintext. In XGB, if we use simulation, this should be impossible to implement, right?
Tree-based algorithms are indeed special. A purely secret-shared implementation would foreseeably be very slow, but as I understand it, it is both feasible and necessary to implement (by using 0/1 indicators to mark conditions). To improve performance, some information would need to be revealed, at which point security can no longer be guaranteed. In principle we can accept such implementations as long as there is no obvious security risk, but the developer must clearly document all information leaked throughout the entire training/prediction process.
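To make the 0/1-indicator idea concrete, here is a minimal cleartext sketch of mine (not from the original discussion): a data-dependent branch becomes arithmetic over an indicator, so the same instructions execute regardless of the secret condition.

```python
import jax.numpy as jnp

# Hypothetical sketch: an oblivious "if" via a 0/1 indicator.
def oblivious_select(lhs, rhs, if_true, if_false):
    # Under MPC the comparison result stays secret-shared; no branch is taken.
    ind = jnp.greater_equal(lhs, rhs).astype(jnp.float32)
    return ind * if_true + (1 - ind) * if_false

x = jnp.array([1.0, 5.0, 3.0])
print(oblivious_select(x, 2.5, jnp.ones(3), jnp.zeros(3)))  # [0. 1. 1.]
```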
If information is revealed midway, it indeed cannot be tested directly via simulation; you would need a distributed API similar to sf's. SPU ...
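For reference, a reveal-free JAX function can be tested locally with SPU's simulator. A minimal sketch of mine, assuming the `spu.utils.simulation` API (`Simulator.simple`, `sim_jax`) as used in SPU's own unit tests:

```python
import jax.numpy as jnp
import spu.spu_pb2 as spu_pb2
import spu.utils.simulation as spsim

def f(x, y):
    # any branch-free jax function can be run under simulation like this
    return jnp.dot(x, y)

sim = spsim.Simulator.simple(3, spu_pb2.ProtocolKind.ABY3, spu_pb2.FieldType.FM64)
print(spsim.sim_jax(sim, f)(jnp.ones((2, 3)), jnp.ones(3)))
```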
ElleryQu: Give it to me
Would you mind describing your implementation in several sentences? Thanks |
The implementation is based on GTree, with some adjustments to the algorithm and extended support for multi-class labels.

```python
# sml/tree/tree.py
import jax.numpy as jnp

'''
The protocols of GTree.
'''


def oblivious_array_access(array, index):
    # (n_array)
    count_array = jnp.arange(0, array.shape[-1])
    # (n_array, n_index)
    E = jnp.equal(index, count_array[:, jnp.newaxis])

    # OAA basic case
    if len(array.shape) == 1:
        # (n_array, n_index)
        O = array[:, jnp.newaxis] * E  # select shares
        zu = jnp.sum(O, axis=0)
    # OAA vectorization variant
    elif len(array.shape) == 2:
        # (n_arrays, n_array, n_index)
        O = array[:, :, jnp.newaxis] * E[jnp.newaxis, :, :]  # select shares
        zu = jnp.sum(O, axis=1)
    else:
        raise NotImplementedError("oaa only supports 1D or 2D arrays.")
    return zu


def oaa_elementwise(array, index_array):
    assert index_array.shape[0] == array.shape[0], "n_arrays must be equal to n_index."
    count_array = jnp.arange(0, array.shape[-1])
    # (n_arrays, n_array): one-hot row per selected position
    E = jnp.equal(index_array[:, jnp.newaxis], count_array)
    if len(array.shape) == 2:
        O = array * E
        zu = jnp.sum(O, axis=1)
    else:
        raise NotImplementedError("oaae only supports 2D arrays.")
    return zu
def oblivious_learning(X, y, T, F, M, h, Cn, n_labels):
    '''Partition the data and count the number of data samples.

    params:
        D: data samples, which are split into X, y.
           X: (n_samples, n_features), y: (n_samples, 1).
        T: tree structure representing split features. (total_nodes)
        F: tree structure representing node types. (total_nodes)
           0 for internal, 1 for leaf, 2 for dummy.
        M: which leaf node D[i] belongs to (for level h-1). (n_samples)
        Cn: statistical information of the data samples. (n_leaves, n_labels+1, 2*n_features)
        h: int, current depth of the tree.
        n_labels: the number of label classes.
    '''
    # line 1-5, partition the data into new leaves.
    n_d, n_f = X.shape
    n_h = 2 ** h
    if h != 0:
        # one level of oblivious descent: look up each sample's split feature,
        # read its (binary) value, and move to child 2*M+1 (left) or 2*M+2 (right)
        Tval = oaa(T, M)
        Dval = oaae(X, Tval)
        M = 2 * M + Dval + 1

    # (n_leaves)
    LCidx = jnp.arange(0, n_h)
    isLeaf = jnp.equal(F[n_h - 1 : 2 * n_h - 1], jnp.ones(n_h))
    # (n_samples, n_leaves)
    LCF = jnp.equal(M[:, jnp.newaxis] - n_h + 1, LCidx)
    LCF = LCF * isLeaf

    # (n_samples, n_leaves, n_labels+1, 2*n_features)
    Cd = jnp.zeros((n_d, n_h, n_labels + 1, 2 * n_f))
    Cd = Cd.at[:, :, 0, 0::2].set(jnp.tile((1 - X)[:, jnp.newaxis, :], (1, n_h, 1)))
    Cd = Cd.at[:, :, 0, 1::2].set(jnp.tile(X[:, jnp.newaxis, :], (1, n_h, 1)))
    for i in range(n_labels):
        Cd = Cd.at[:, :, i + 1, 0::2].set(
            jnp.tile(((1 - X) * (i == y)[:, jnp.newaxis])[:, jnp.newaxis, :], (1, n_h, 1))
        )
        Cd = Cd.at[:, :, i + 1, 1::2].set(
            jnp.tile((X * (i == y)[:, jnp.newaxis])[:, jnp.newaxis, :], (1, n_h, 1))
        )
    Cd = Cd * LCF[:, :, jnp.newaxis, jnp.newaxis]

    # (n_leaves, n_labels+1, 2*n_features)
    new_Cn = jnp.sum(Cd, axis=0)

    if h != 0:
        Cn = Cn.repeat(2, axis=0)
    # carry over the duplicated parent statistics for non-leaf positions
    new_Cn = new_Cn + Cn * (1 - isLeaf[:, jnp.newaxis, jnp.newaxis])
    return new_Cn, M
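
# Layout of Cn (my reading of the code above): Cn[n, 0, 2*f] and Cn[n, 0, 2*f+1]
# count the samples at leaf n whose binary feature f equals 0 and 1 respectively;
# rows 1..n_labels hold the same counts restricted to each label class.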


def oblivious_heuristic_computation(Cn, gamma, F, h, n_labels):
    '''Compute the Gini index, find the best feature, and update F.

    params:
        Cn: statistical information of the data samples. (n_leaves, n_labels+1, 2*n_features)
        gamma: gamma[n][i] indicates whether feature i has been assigned at node n. (n_leaves, n_features)
        F: tree structure representing node types. (total_nodes)
           0 for internal, 1 for leaf, 2 for dummy.
        h: int, current depth of the tree.
        n_labels: int, number of label classes.
    '''
    n_leaves = Cn.shape[0]
    n_features = gamma.shape[1]
    Ds0 = Cn[:, 0, 0::2]
    Ds1 = Cn[:, 0, 1::2]
    D = Ds0 + Ds1
    Q = D * Ds0 * Ds1
    P = jnp.zeros(gamma.shape)
    for i in range(n_labels):
        P = P - Ds1 * (Cn[:, i + 1, 0::2] ** 2) - Ds0 * (Cn[:, i + 1, 1::2] ** 2)
    # ratio form of the Gini heuristic; the +1 avoids division by zero
    gini = Q / (Q + P + 1)
    gini = gini * gamma
    # (n_leaves)
    SD = jnp.argmax(gini, axis=1)
    index = jnp.arange(0, n_features)
    gamma = gamma * jnp.not_equal(index[jnp.newaxis, :], SD[:, jnp.newaxis])
    new_gamma = jnp.zeros((n_leaves * 2, n_features))
    new_gamma = new_gamma.at[0::2, :].set(gamma)
    new_gamma = new_gamma.at[1::2, :].set(gamma)

    # psi is zero iff the node is pure (all samples share one label) or empty
    psi = jnp.zeros((n_leaves, n_labels))
    for i in range(n_labels):
        psi = psi.at[:, i].set(Cn[:, i + 1, 0] + Cn[:, i + 1, 1])
    total = jnp.sum(psi, axis=1)
    psi = total[:, jnp.newaxis] - psi
    psi = jnp.prod(psi, axis=1)
    # current level: a node stays a leaf (1) iff psi == 0, else it becomes internal (0);
    # next level: children of internal nodes become leaf candidates (1),
    # children of leaf/dummy nodes become dummies (2)
    F = F.at[2**h - 1 : 2 ** (h + 1) - 1].set(jnp.equal(psi * F[2**h - 1 : 2 ** (h + 1) - 1], 0))
    F = F.at[2 ** (h + 1) - 1 : 2 ** (h + 2) - 1 : 2].set(2 - jnp.equal(F[2**h - 1 : 2 ** (h + 1) - 1], 0))
    F = F.at[2 ** (h + 1) : 2 ** (h + 2) - 1 : 2].set(F[2 ** (h + 1) - 1 : 2 ** (h + 2) - 1 : 2])
    return SD, new_gamma, F
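
# Note on indexing: T and F use a 0-based heap layout, i.e. node k has
# children 2*k+1 and 2*k+2, and level h occupies indices [2**h - 1, 2**(h+1) - 1).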


def oblivious_node_split(SD, T, F, Cn, h, max_depth):
    '''Convert each node into an internal node and generate new leaves at the next level.

    params:
        SD: the best feature at the current level. (n_leaves)
        T: tree structure representing split features. (total_nodes)
        F: tree structure representing node types. (total_nodes)
           0 for internal, 1 for leaf, 2 for dummy.
        Cn: statistical information of the data samples. (n_leaves, n_labels+1, 2*n_features)
        h: current level.
        max_depth: max depth of the tree.
    '''
    T = T.at[2**h - 1 : 2 ** (h + 1) - 1].set(SD)
    return T, Cn


def oblivious_DT_training(X, y, max_depth, n_labels):
    n_samples, n_features = X.shape
    T = jnp.zeros((2 ** (max_depth + 1) - 1))
    # F must cover every level up to max_depth, like T
    F = jnp.ones((2 ** (max_depth + 1) - 1))
    M = jnp.zeros(n_samples)
    gamma = jnp.ones((1, n_features))
    Cn = jnp.zeros((1, n_labels + 1, 2 * n_features))

    h = 0
    while h < max_depth:
        Cn, M = ol(X, y, T, F, M, h, Cn, n_labels)
        SD, gamma, F = ohc(Cn, gamma, F, h, n_labels)
        T, Cn = ons(SD, T, F, Cn, h, max_depth)
        h += 1

    # label the leaves: for each level-(h-1) node, fetch the per-label counts
    # on both sides of its split feature and take the majority label
    n_leaves = 2 ** h
    psi = jnp.zeros((n_leaves, n_labels))
    for i in range(2 ** (h - 1)):
        t1 = oaa(Cn[i, 1:], 2 * SD[i : i + 1]).squeeze()
        t2 = oaa(Cn[i, 1:], 2 * SD[i : i + 1] + 1).squeeze()
        psi = psi.at[2 * i, :].set(t1)
        psi = psi.at[2 * i + 1, :].set(t2)
    T = T.at[n_leaves - 1 :].set(jnp.argmax(psi, axis=1))
    return T, F


def oblivious_DT_inference(X, T, max_height):
    n_samples, n_features = X.shape
    Tidx = jnp.zeros((n_samples))
    i = 0
    while i < max_height:
        # walk one level down: the node's split feature selects the child
        Tval = oaa(T, Tidx)
        Dval = oaae(X, Tval)
        Tidx = Tidx * 2 + Dval + 1
        i += 1
    # the leaf slots of T hold the predicted labels
    Tval = oaa(T, Tidx)
    return Tval


oaa = oblivious_array_access
oaae = oaa_elementwise
ol = oblivious_learning
ohc = oblivious_heuristic_computation
ons = oblivious_node_split
odtt = oblivious_DT_training
odti = oblivious_DT_inference
```
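To pin down the semantics of the two access primitives, a small cleartext check of mine, assuming the `tree.py` definitions above:

```python
import jax.numpy as jnp
from sml.tree.tree import oaa, oaae

# oaa: read array[index] without a data-dependent access pattern
print(oaa(jnp.array([10.0, 20.0, 30.0, 40.0]), jnp.array([2])))  # [30.]

# oaae: per-row variant, row i yields array[i, index_array[i]]
rows = jnp.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
print(oaae(rows, jnp.array([2, 0])))  # [3. 4.]
```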
```python
# sml/tree/emulations/tree_emul.py
import time

import jax.numpy as jnp
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

import sml.utils.emulation as emulation
from sml.tree.tree import odtt, odti

MAX_DEPTH = 2
CONFIG_FILE = "3pc.json"


def emul_tree(mode=emulation.Mode.MULTIPROCESS):
    def proc_wrapper(max_depth=2, n_labels=3):
        def proc(X, y):
            T, F = odtt(X, y, max_depth=max_depth, n_labels=n_labels)
            result = odti(X, T, max_depth)
            return result

        return proc

    def load_data():
        iris = load_iris()
        iris_data, iris_label = jnp.array(iris.data), jnp.array(iris.target)
        # sorted_features: n_samples * n_features_in
        n_samples, n_features_in = iris_data.shape
        n_labels = len(jnp.unique(iris_label))
        # binarize continuous features: one 0/1 feature per candidate threshold
        # (the midpoints between consecutive sorted values); the `+ 1 - 1`
        # casts the boolean comparison to an integer
        sorted_features = jnp.sort(iris_data, axis=0)
        new_threshold = (sorted_features[:-1, :] + sorted_features[1:, :]) / 2
        new_features = jnp.greater_equal(iris_data[:, :], new_threshold[:, jnp.newaxis, :]) + 1 - 1
        new_features = new_features.transpose([1, 0, 2]).reshape(n_samples, -1)
        X, y = new_features, iris_label
        return X, y

    try:
        # bandwidth and latency only work for docker mode
        emulator = emulation.Emulator(CONFIG_FILE, mode, bandwidth=300, latency=20)
        emulator.up()

        # load mock data
        X, y = load_data()
        n_samples = y.shape[0]
        n_labels = jnp.unique(y).shape[0]

        # compare with sklearn
        clf = DecisionTreeClassifier(max_depth=MAX_DEPTH, criterion='gini', splitter='best', random_state=None)
        start = time.time()
        clf = clf.fit(X, y)
        score_plain = clf.score(X, y)
        end = time.time()
        print(f"Running time in SKlearn: {end - start:.2f}s")

        # mark these data to be protected in SPU
        X_spu, y_spu = emulator.seal(X, y)

        # run
        proc = proc_wrapper(MAX_DEPTH, n_labels)
        start = time.time()
        result = emulator.run(proc)(X_spu, y_spu)
        end = time.time()
        score_encrypted = jnp.sum(result == y) / n_samples
        print(f"Running time in SPU: {end - start:.2f}s")

        # print acc
        print(f"Accuracy in SKlearn: {score_plain:.2f}")
        print(f"Accuracy in SPU: {score_encrypted:.2f}")
    finally:
        emulator.down()


if __name__ == "__main__":
    emul_tree(emulation.Mode.MULTIPROCESS)
```

The current problems are:
Thanks for the algorithm implementation,
Hi, I've opened a PR; it hasn't been reviewed yet.
Sorry, I've been a bit busy lately. I'll try to finish the review this week~
# Pull Request

## What problem does this PR solve?

Issue Number: Fixed #212

## Possible side effects?

- Performance:

  **test**:

  ```
  No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
  [2023-10-13 18:32:00.502] [info] [thread_pool.cc:30] Create a fixed thread pool with size 19
  Accuracy in SKlearn: 0.96
  Accuracy in SPU: 0.96
  .
  ----------------------------------------------------------------------
  Ran 1 test in 78.026s

  OK
  ```

  **emulation**:

  ```
  Running time in SPU: 77.37s
  Accuracy in SKlearn: 0.96
  Accuracy in SPU: 0.96
  [2023-10-13 18:35:38,091] Shutdown multiprocess cluster...
  ```

- Backward compatibility:
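As a side note, since `odtt`/`odti` are plain JAX functions, they can be smoke-tested in cleartext before emulation. A toy sketch of mine (hypothetical data, assuming the `sml.tree.tree` module from this thread):

```python
import jax.numpy as jnp
from sml.tree.tree import odtt, odti

# toy data: two binary features, two classes; the label equals feature 0
X = jnp.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = jnp.array([0, 0, 1, 1])

T, F = odtt(X, y, max_depth=2, n_labels=2)
pred = odti(X, T, 2)
print(pred)  # expected to reproduce y: [0. 0. 1. 1.]
```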
Task description
Detailed requirements:
Skill requirements:
Operation instructions: