-
Notifications
You must be signed in to change notification settings - Fork 528
/
Copy pathtransfer_to_ascend.py
143 lines (129 loc) · 5.45 KB
/
transfer_to_ascend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
import json
import logging
from deepmd.env import tf
from typing import Optional
from deepmd.common import j_loader
from deepmd.env import tf, GLOBAL_ENER_FLOAT_PRECISION
from deepmd.utils.argcheck import normalize
from deepmd.utils.compat import update_deepmd_input
from deepmd.utils.errors import GraphTooLargeError, GraphWithoutTensorError
from deepmd.utils.graph import get_tensor_by_name
from deepmd.entrypoints.freeze import freeze
from deepmd.entrypoints.train import train, get_rcut, get_min_nbor_dist
from deepmd.entrypoints.transfer import transfer
log = logging.getLogger(__name__)
def mix_precision(
input: str,
output: str,
checkpoint_folder: str,
training_script: str,
mpi_log: str,
log_path: Optional[str],
log_level: int,
**kwargs
):
"""convert trained model to Ascend mix-precision model.
Generate a Ascend_transfer mix-precision model.
Parameters
----------
input : str
frozen model file to compress
output : str
compressed model filename
checkpoint_folder : str
trining checkpoint folder for freezing
training_script : str
training script of the input frozen model
mpi_log : str
mpi logging mode for training
log_path : Optional[str]
if speccified log will be written to this file
log_level : int
logging level
"""
try:
t_jdata = get_tensor_by_name(input, 'train_attr/training_script')
t_min_nbor_dist = get_tensor_by_name(input, 'train_attr/min_nbor_dist')
jdata = json.loads(t_jdata)
except GraphWithoutTensorError as e:
if training_script == None:
raise RuntimeError(
"The input transfering model: %s has no training script or min_nbor_dist information, "
"which is not supported by the transfer_to_ascend interface. "
"Please consider using the --training-script command within the transfer_to_ascend interface to provide the training script of the input transfering model. "
"Note that the input transfering script must contain the correct path to the training data." % input
) from e
elif not os.path.exists(training_script):
raise RuntimeError(
"The input training script %s (%s) does not exist! Please check the path of the training script. " % (input, os.path.abspath(input))
) from e
else:
log.info("stage 0: compute the min_nbor_dist")
jdata = j_loader(training_script)
jdata = update_deepmd_input(jdata)
t_min_nbor_dist = get_min_nbor_dist(jdata, get_rcut(jdata))
_check_transfer_model_type(input)
tf.constant(t_min_nbor_dist,
name = 'train_attr/min_nbor_dist',
dtype = GLOBAL_ENER_FLOAT_PRECISION)
jdata["model"]["descriptor"]["precision"] = "float16"
jdata["model"]["fitting_net"]["precision"] = "float16"
jdata["model"]["transfered_from_model"] = True
jdata["training"]["save_ckpt"] = os.path.join("model-transfer", "model.ckpt")
jdata = update_deepmd_input(jdata)
jdata = normalize(jdata)
# check the descriptor info of the input file
# move to the specific Descriptor class
# stage 1: training or refining the model with tabulation
log.info("\n\n")
log.info("stage 1: generate the mix-precision model")
control_file = "ascend-transfer.json"
with open(control_file, "w") as fp:
json.dump(jdata, fp, indent=4)
try:
train(
INPUT=control_file,
init_model=None,
restart=None,
init_frz_model=input,
output=control_file,
mpi_log=mpi_log,
log_level=log_level,
log_path=log_path,
is_compress=False,
skip_neighbor_stat=True,
)
except GraphTooLargeError as e:
raise RuntimeError(
"The uniform step size of the tabulation's first table is %f, "
"which is too small. This leads to a very large graph size, "
"exceeding protobuf's limitation (2 GB). You should try to "
"increase the step size." % step
) from e
# reset the graph, otherwise the size limitation will be only 2 GB / 2 = 1 GB
tf.reset_default_graph()
# stage 2: freeze the mix-precision model
log.info("\n\n")
log.info("stage 2: freeze the mix-precision model")
try:
freeze(checkpoint_folder=checkpoint_folder, output=output, node_names=None)
except GraphTooLargeError as e:
raise RuntimeError(
"The uniform step size of the tabulation's first table is %f, "
"which is too small. This leads to a very large graph size, "
"exceeding protobuf's limitation (2 GB). You should try to "
"increase the step size." % step
) from e
# stage 3: transfer the mix-precision model
log.info("\n\n")
log.info("stage 3: transfer the mix-precision model")
transfer(old_model=input, raw_model=output, output=output)
def _check_transfer_model_type(model_file):
try:
t_model_type = bytes.decode(get_tensor_by_name(model_file, 'model_type'))
except GraphWithoutTensorError as e:
# Compatible with the upgraded model, which has no 'model_type' info
t_model_type = None
if t_model_type == "ascend_transfer_model":
raise RuntimeError("The input model %s has already been transfered to ascend mix-precision model! Please do not transfer the model repeatedly. " % model_file)