@misc{Authors14,
author = {Authors},
title = {The frobnicatable foo filter},
note = {Face and Gesture submission ID 324. Supplied as additional material {\tt fg324.pdf}},
year = 2014
}
@misc{Authors14b,
author = {Authors},
title = {Frobnication tutorial},
note = {Supplied as additional material {\tt tr.pdf}},
year = 2014
}
@article{Alpher02,
author = {FirstName Alpher},
title = {Frobnication},
journal = {Journal of Foo},
volume = 12,
number = 1,
pages = {234--778},
year = 2002
}
@article{Alpher03,
author = {FirstName Alpher and FirstName Fotheringham-Smythe},
title = {Frobnication revisited},
journal = {Journal of Foo},
volume = 13,
number = 1,
pages = {234--778},
year = 2003
}
@article{Alpher04,
author = {FirstName Alpher and FirstName Fotheringham-Smythe and FirstName Gamow},
title = {Can a machine frobnicate?},
journal = {Journal of Foo},
volume = 14,
number = 1,
pages = {234--778},
year = 2004
}
@inproceedings{firstkdpaper,
author = {Buciluǎ, Cristian and Caruana, Rich and Niculescu-Mizil, Alexandru},
title = {Model Compression},
year = {2006},
isbn = {1595933395},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/1150402.1150464},
doi = {10.1145/1150402.1150464},
abstract = {Often the best performing supervised learning models are ensembles of hundreds or thousands of base-level classifiers. Unfortunately, the space required to store this many classifiers, and the time required to execute them at run-time, prohibits their use in applications where test sets are large (e.g. Google), where storage space is at a premium (e.g. PDAs), and where computational power is limited (e.g. hearing aids). We present a method for "compressing" large, complex ensembles into smaller, faster models, usually without significant loss in performance.},
booktitle = {Proceedings of the 12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
pages = {535--541},
numpages = {7},
keywords = {supervised learning, model compression},
location = {Philadelphia, PA, USA},
series = {KDD '06}
}
@inproceedings{hintonfirstkd,
title = {Distilling the Knowledge in a Neural Network},
author = {Geoffrey Hinton and Oriol Vinyals and Jeffrey Dean},
year = {2015},
URL = {http://arxiv.org/abs/1503.02531},
booktitle = {NIPS Deep Learning and Representation Learning Workshop}
}
@misc{visualtask04,
title = {Resolution-Based Distillation for Efficient Histology Image Classification},
author = {Joseph DiPalma and Arief A. Suriawinata and Laura J. Tafe and Lorenzo Torresani and Saeed Hassanpour},
year = {2021},
eprint = {2101.04170},
archiveprefix = {arXiv},
primaryclass = {eess.IV}
}
@misc{visualtask02,
title = {Knowledge Distillation with Feature Maps for Image Classification},
author = {Wei-Chun Chen and Chia-Che Chang and Chien-Yu Lu and Che-Rung Lee},
year = {2018},
eprint = {1812.00660},
archiveprefix = {arXiv},
primaryclass = {cs.LG}
}
@misc{visualtask01,
title = {Learning without Forgetting},
author = {Zhizhong Li and Derek Hoiem},
year = {2017},
eprint = {1606.09282},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@misc{visualtask03,
title = {Label Refinery: Improving ImageNet Classification through Label Progression},
author = {Hessam Bagherinezhad and Maxwell Horton and Mohammad Rastegari and Ali Farhadi},
year = {2018},
eprint = {1805.02641},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@misc{segment01,
title = {Knowledge Adaptation for Efficient Semantic Segmentation},
author = {Tong He and Chunhua Shen and Zhi Tian and Dong Gong and Changming Sun and Youliang Yan},
year = {2019},
eprint = {1903.04688},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@article{facial01,
title = {MobileFAN: Transferring deep hidden representation for face alignment},
journal = {Pattern Recognition},
volume = {100},
pages = {107114},
year = {2020},
issn = {0031-3203},
doi = {10.1016/j.patcog.2019.107114},
url = {https://www.sciencedirect.com/science/article/pii/S0031320319304157},
author = {Yang Zhao and Yifan Liu and Chunhua Shen and Yongsheng Gao and Shengwu Xiong},
keywords = {Face alignment, Knowledge distillation, Lightweight model},
abstract = {Facial landmark detection is a crucial prerequisite for many face analysis applications. Deep learning-based methods currently dominate the approach of addressing the facial landmark detection. However, such works generally introduce a large number of parameters, resulting in high memory cost. In this paper, we aim for a lightweight as well as effective solution to facial landmark detection. To this end, we propose an effective lightweight model, namely Mobile Face Alignment Network (MobileFAN), using a simple backbone MobileNetV2 as the encoder and three deconvolutional layers as the decoder. The proposed MobileFAN, with only 8% of the model size and lower computational cost, achieves superior or equivalent performance compared with state-of-the-art models. Moreover, by transferring the geometric structural information of a face graph from a large complex model to our proposed MobileFAN through feature-aligned distillation and feature-similarity distillation, the performance of MobileFAN is further improved in effectiveness and efficiency for face alignment. Extensive experiment results on three challenging facial landmark estimation benchmarks including COFW, 300W and WFLW show the superiority of our proposed MobileFAN against state-of-the-art methods.}
}
@article{facial02,
title={Low-Resolution Face Recognition in the Wild via Selective Knowledge Distillation},
volume={28},
ISSN={1941-0042},
url={http://dx.doi.org/10.1109/TIP.2018.2883743},
DOI={10.1109/tip.2018.2883743},
number={4},
journal={IEEE Transactions on Image Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Ge, Shiming and Zhao, Shengwei and Li, Chenyu and Li, Jia},
year={2019},
month={Apr},
pages={2051--2062}
}
@misc{lanedetect01,
title = {Learning Lightweight Lane Detection CNNs by Self Attention Distillation},
author = {Yuenan Hou and Zheng Ma and Chunxiao Liu and Chen Change Loy},
year = {2019},
eprint = {1908.00821},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@inproceedings{nlp01,
title = {Patient Knowledge Distillation for {BERT} Model Compression},
author = {Sun, Siqi and Cheng, Yu and Gan, Zhe and Liu, Jingjing},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
month = nov,
year = {2019},
address = {Hong Kong, China},
publisher = {Association for Computational Linguistics},
url = {https://www.aclweb.org/anthology/D19-1441},
doi = {10.18653/v1/D19-1441},
pages = {4323--4332},
abstract = {Pre-trained language models such as BERT have proven to be highly effective for natural language processing (NLP) tasks. However, the high demand for computing resources in training such models hinders their application in practice. In order to alleviate this resource hunger in large-scale model training, we propose a Patient Knowledge Distillation approach to compress an original large model (teacher) into an equally-effective lightweight shallow network (student). Different from previous knowledge distillation methods, which only use the output from the last layer of the teacher network for distillation, our student model patiently learns from multiple intermediate layers of the teacher model for incremental knowledge extraction, following two strategies: (i) PKD-Last: learning from the last k layers; and (ii) PKD-Skip: learning from every k layers. These two patient distillation schemes enable the exploitation of rich information in the teacher{'}s hidden layers, and encourage the student model to patiently learn from and imitate the teacher through a multi-layer distillation process. Empirically, this translates into improved results on multiple NLP tasks with a significant gain in training efficiency, without sacrificing model accuracy.}
}
@misc{nlp02,
title = {TinyBERT: Distilling BERT for Natural Language Understanding},
author = {Xiaoqi Jiao and Yichun Yin and Lifeng Shang and Xin Jiang and Xiao Chen and Linlin Li and Fang Wang and Qun Liu},
year = {2020},
eprint = {1909.10351},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
@misc{featurebased03_relu,
title = {A Comprehensive Overhaul of Feature Distillation},
author = {Byeongho Heo and Jeesoo Kim and Sangdoo Yun and Hyojin Park and Nojun Kwak and Jin Young Choi},
year = {2019},
eprint = {1904.01866},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@misc{featurebased01,
title = {FitNets: Hints for Thin Deep Nets},
author = {Adriana Romero and Nicolas Ballas and Samira Ebrahimi Kahou and Antoine Chassang and Carlo Gatta and Yoshua Bengio},
year = {2015},
eprint = {1412.6550},
archiveprefix = {arXiv},
primaryclass = {cs.LG}
}
@misc{featurebased02_AT,
title = {Paying More Attention to Attention: Improving the Performance of Convolutional Neural Networks via Attention Transfer},
author = {Sergey Zagoruyko and Nikos Komodakis},
year = {2017},
eprint = {1612.03928},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@misc{featurebased04_mmd,
title = {Like What You Like: Knowledge Distill via Neuron Selectivity Transfer},
author = {Zehao Huang and Naiyan Wang},
year = {2017},
eprint = {1707.01219},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@misc{featurebased05_kl,
title = {Knowledge Distillation from Internal Representations},
author = {Gustavo Aguilar and Yuan Ling and Yu Zhang and Benjamin Yao and Xing Fan and Chenlei Guo},
year = {2020},
eprint = {1910.03723},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
@misc{featurebased06_meal,
title = {MEAL: Multi-Model Ensemble via Adversarial Learning},
author = {Zhiqiang Shen and Zhankui He and Xiangyang Xue},
year = {2019},
eprint = {1812.02425},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@inproceedings{relbase01,
author = {J. {Yim} and D. {Joo} and J. {Bae} and J. {Kim}},
booktitle = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
title = {A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning},
year = {2017},
pages = {7130--7138},
doi = {10.1109/CVPR.2017.754}
}
@misc{relbase02,
title = {Better and Faster: Knowledge Transfer from Multiple Self-supervised Learning Tasks via Graph Distillation for Video Classification},
author = {Chenrui Zhang and Yuxin Peng},
year = {2018},
eprint = {1804.10069},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@inproceedings{explainkd01_phuong,
title = {Towards Understanding Knowledge Distillation},
author = {Phuong, Mary and Lampert, Christoph},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
pages = {5142--5151},
year = {2019},
editor = {Kamalika Chaudhuri and Ruslan Salakhutdinov},
volume = {97},
series = {Proceedings of Machine Learning Research},
month = {09--15 Jun},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v97/phuong19a/phuong19a.pdf},
url = {http://proceedings.mlr.press/v97/phuong19a.html},
abstract = {Knowledge distillation, i.e., one classifier being trained on the outputs of another classifier, is an empirically very successful technique for knowledge transfer between classifiers. It has even been observed that classifiers learn much faster and more reliably if trained with the outputs of another classifier as soft labels, instead of from ground truth data. So far, however, there is no satisfactory theoretical explanation of this phenomenon. In this work, we provide the first insights into the working mechanisms of distillation by studying the special case of linear and deep linear classifiers. Specifically, we prove a generalization bound that establishes fast convergence of the expected risk of a distillation-trained linear classifier. From the bound and its proof we extract three key factors that determine the success of distillation: * data geometry – geometric properties of the data distribution, in particular class separation, has a direct influence on the convergence speed of the risk; * optimization bias – gradient descent optimization finds a very favorable minimum of the distillation objective; and * strong monotonicity – the expected risk of the student classifier always decreases when the size of the training set grows.}
}
@misc{explainkd02_vconcepts,
title = {Explaining Knowledge Distillation by Quantifying the Knowledge},
author = {Xu Cheng and Zhefan Rao and Yilan Chen and Quanshi Zhang},
year = {2020},
eprint = {2003.03622},
archiveprefix = {arXiv},
primaryclass = {cs.LG}
}
@misc{quantifying,
title = {Quantifying Layerwise Information Discarding of Neural Networks},
author = {Haotian Ma and Yinqing Zhang and Fan Zhou and Quanshi Zhang},
year = {2019},
eprint = {1906.04109},
archiveprefix = {arXiv},
primaryclass = {cs.LG}
}
@misc{complexitygap,
title = {On the Efficacy of Knowledge Distillation},
author = {Jang Hyun Cho and Bharath Hariharan},
year = {2019},
eprint = {1910.01348},
archiveprefix = {arXiv},
primaryclass = {cs.LG}
}
@misc{teacherfree,
title = {Revisiting Knowledge Distillation via Label Smoothing Regularization},
author = {Li Yuan and Francis E. H. Tay and Guilin Li and Tao Wang and Jiashi Feng},
year = {2021},
eprint = {1909.11723},
archiveprefix = {arXiv},
primaryclass = {cs.CV}
}
@misc{labelsmoothingnoise,
title = {Does label smoothing mitigate label noise?},
author = {Michal Lukasik and Srinadh Bhojanapalli and Aditya Krishna Menon and Sanjiv Kumar},
year = {2020},
eprint = {2003.02819},
archiveprefix = {arXiv},
primaryclass = {cs.LG}
}
@misc{logitable01,
title={Knowledge Distillation in Generations: More Tolerant Teachers Educate Better Students},
author={Chenglin Yang and Lingxi Xie and Siyuan Qiao and Alan Yuille},
year={2018},
eprint={1805.05551},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{logitable02,
title={Learning Metrics from Teachers: Compact Networks for Image Embedding},
author={Lu Yu and Vacit Oguz Yazici and Xialei Liu and Joost van de Weijer and Yongmei Cheng and Arnau Ramisa},
year={2019},
eprint={1904.03624},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{logitable03,
title={Relational Knowledge Distillation},
author={Wonpyo Park and Dongju Kim and Yan Lu and Minsu Cho},
year={2019},
eprint={1904.05068},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{logitable04,
title={Improved Knowledge Distillation via Teacher Assistant},
author={Seyed-Iman Mirzadeh and Mehrdad Farajtabar and Ang Li and Nir Levine and Akihiro Matsukawa and Hassan Ghasemzadeh},
year={2019},
eprint={1902.03393},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{logitable05,
title={Ensemble Distribution Distillation},
author={Andrey Malinin and Bruno Mlodozeniec and Mark Gales},
year={2019},
eprint={1905.00076},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@misc{logitable06,
title={Noise as a Resource for Learning in Knowledge Distillation},
author={Elahe Arani and Fahad Sarfraz and Bahram Zonooz},
year={2020},
eprint={1910.05057},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{logitable07,
title={Self-training with Noisy Student improves ImageNet classification},
author={Qizhe Xie and Minh-Thang Luong and Eduard Hovy and Quoc V. Le},
year={2020},
eprint={1911.04252},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{logitable08,
title={Preparing Lessons: Improve Knowledge Distillation with Better Supervision},
author={Tiancheng Wen and Shenqi Lai and Xueming Qian},
year={2020},
eprint={1911.07471},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{logitable09,
title={MarginDistillation: distillation for margin-based softmax},
author={David Svitov and Sergey Alyamkin},
year={2020},
eprint={2003.02586},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{logitable10,
title={An Embarrassingly Simple Approach for Knowledge Distillation},
author={Mengya Gao and Yujun Shen and Quanquan Li and Junjie Yan and Liang Wan and Dahua Lin and Chen Change Loy and Xiaoou Tang},
year={2019},
eprint={1812.01819},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@inproceedings{logitable11,
title={Conditional Teacher-student Learning},
ISBN={9781479981311},
url={http://dx.doi.org/10.1109/ICASSP.2019.8683438},
DOI={10.1109/icassp.2019.8683438},
booktitle={ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
publisher={IEEE},
author={Meng, Zhong and Li, Jinyu and Zhao, Yong and Gong, Yifan},
year={2019},
month={May}
}
@misc{logitable12,
title={Snapshot Distillation: Teacher-Student Optimization in One Generation},
author={Chenglin Yang and Lingxi Xie and Chi Su and Alan L. Yuille},
year={2018},
eprint={1812.00123},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{featurebased07_ab,
title={Knowledge Transfer via Distillation of Activation Boundaries Formed by Hidden Neurons},
author={Byeongho Heo and Minsik Lee and Sangdoo Yun and Jin Young Choi},
year={2018},
eprint={1811.03233},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{featurebased08_resKD,
title={Residual Knowledge Distillation},
author={Mengya Gao and Yujun Shen and Quanquan Li and Chen Change Loy},
year={2020},
eprint={2002.09168},
archivePrefix={arXiv},
primaryClass={cs.LG}
}