cite.bib
@inproceedings{pietruszka-etal-2022-sparsifying,
title = "Sparsifying Transformer Models with Trainable Representation Pooling",
author = "Pietruszka, Micha{\l} and
Borchmann, {\L}ukasz and
Garncarek, {\L}ukasz",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-long.590",
pages = "8616--8633",
    abstract = "We propose a novel method to sparsify attention in the Transformer model by learning to select the most-informative token representations during the training process, thus focusing on the task-specific parts of an input. A reduction of quadratic time and memory complexity to sublinear was achieved due to a robust trainable top-$k$ operator. Our experiments on a challenging long document summarization task show that even our simple baseline performs comparably to the current SOTA, and with trainable pooling we can retain its top quality, while being $1.8\times$ faster during training, $4.5\times$ faster during inference, and up to $13\times$ more computationally efficient in the decoder.",
}
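
The abstract describes pooling a long sequence down to its most-informative token representations via a trainable top-$k$ operator before attention. Below is a minimal PyTorch sketch of that general idea, not the paper's operator: the `TopKPooling` module name, the linear scorer, and the score-gating trick are illustrative assumptions, and unlike the robust trainable top-$k$ operator in the paper, the hard selection here is not itself differentiable.

```python
# Illustrative sketch only (not the paper's exact operator): score token
# representations with a learned linear layer and keep the top-k of them,
# shrinking the sequence that subsequent attention layers must process.
import torch
import torch.nn as nn

class TopKPooling(nn.Module):
    def __init__(self, hidden_dim: int, k: int):
        super().__init__()
        self.scorer = nn.Linear(hidden_dim, 1)  # learned per-token relevance score
        self.k = k

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, hidden_dim)
        scores = self.scorer(x).squeeze(-1)            # (batch, seq_len)
        topk = torch.topk(scores, k=self.k, dim=-1)    # indices/values of kept tokens
        idx = topk.indices.unsqueeze(-1).expand(-1, -1, x.size(-1))
        selected = torch.gather(x, 1, idx)             # (batch, k, hidden_dim)
        # Gate by the (sigmoided) scores so the scorer still receives gradients;
        # the discrete top-k choice itself is non-differentiable in this sketch.
        return selected * torch.sigmoid(topk.values).unsqueeze(-1)

# Usage: pool a batch of 2 sequences of 512 token vectors down to 64 each.
pool = TopKPooling(hidden_dim=768, k=64)
out = pool(torch.randn(2, 512, 768))
print(out.shape)  # torch.Size([2, 64, 768])
```

Reducing the sequence length from 512 to 64 before attention is what cuts the quadratic cost; the paper's contribution is making the selection step itself trainable end to end.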