cite.bib

@article{lamport,
author = {Lamport, Leslie},
title = {Time, clocks, and the ordering of events in a distributed system},
year = {1978},
issue_date = {July 1978},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {21},
number = {7},
issn = {0001-0782},
url = {https://doi.org/10.1145/359545.359563},
doi = {10.1145/359545.359563},
abstract = {The concept of one event happening before another in a distributed system is examined, and is shown to define a partial ordering of the events. A distributed algorithm is given for synchronizing a system of logical clocks which can be used to totally order the events. The use of the total ordering is illustrated with a method for solving synchronization problems. The algorithm is then specialized for synchronizing physical clocks, and a bound is derived on how far out of synchrony the clocks can become.},
journal = {Commun. ACM},
month = {jul},
pages = {558–565},
numpages = {8},
keywords = {multiprocess systems, distributed systems, computer networks, clock synchronization}
}

@ARTICLE{lateness,
  author={Isaacs, Katherine E. and Gamblin, Todd and Bhatele, Abhinav and Schulz, Martin and Hamann, Bernd and Bremer, Peer-Timo},
  journal={IEEE Transactions on Parallel and Distributed Systems}, 
  title={Ordering Traces Logically to Identify Lateness in Message Passing Programs}, 
  year={2016},
  volume={27},
  number={3},
  pages={829-840},
  keywords={Visualization;Partitioning algorithms;Merging;Message passing;Delays;Trace analysis;performance;Trace analysis;performance},
  doi={10.1109/TPDS.2015.2417531}}


@inproceedings{trace-vis-task-dependencies,
author = {Haugen, Blake and Richmond, Stephen and Kurzak, Jakub and Steed, Chad A. and Dongarra, Jack},
title = {Visualizing execution traces with task dependencies},
year = {2015},
isbn = {9781450340137},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2835238.2835240},
doi = {10.1145/2835238.2835240},
abstract = {Task-based scheduling has emerged as one method to reduce the complexity of parallel computing. When using task-based schedulers, developers must frame their computation as a series of tasks with various data dependencies. The scheduler can take these tasks, along with their input and output dependencies, and schedule the task in parallel across a node or cluster. While these schedulers simplify the process of parallel software development, they can obfuscate the performance characteristics of the execution of an algorithm.The execution trace has been used for many years to give developers a visual representation of how their computations are performed. These methods can be employed to visualize when and where each of the tasks in a task-based algorithm is scheduled. In addition, the task dependencies can be used to create a directed acyclic graph (DAG) that can also be visualized to demonstrate the dependencies of the various tasks that make up a workload. The work presented here aims to combine these two data sets and extend execution trace visualization to better suit task-based workloads.This paper presents a brief description of task-based schedulers and the performance data they produce. It will then describe an interactive extension to the current trace visualization methods that combines the trace and DAG data sets. This new tool allows users to gain a greater understanding of how their tasks are scheduled. It also provides a simplified way for developers to evaluate and debug the performance of their scheduler.},
booktitle = {Proceedings of the 2nd Workshop on Visual Performance Analysis},
articleno = {2},
numpages = {8},
keywords = {task-based scheduling, execution trace, data movement, DAG},
location = {Austin, Texas},
series = {VPA '15}
}

@misc{litgpt-2023,
  author       = {Lightning AI},
  title        = {LitGPT},
  howpublished = {\url{https://github.com/Lightning-AI/litgpt}},
  year         = {2023},
}

@InProceedings{rabenseifner-2004,
author="Rabenseifner, Rolf",
editor="Bubak, Marian
and van Albada, Geert Dick
and Sloot, Peter M. A.
and Dongarra, Jack",
title="Optimization of Collective Reduction Operations",
booktitle="Computational Science - ICCS 2004",
year="2004",
publisher="Springer Berlin Heidelberg",
address="Berlin, Heidelberg",
pages="1--9",
abstract="A 5-year-profiling in production mode at the University of Stuttgart has shown that more than 40{\%} of the execution time of Message Passing Interface (MPI) routines is spent in the collective communication routines MPI{\_}Allreduce and MPI{\_}Reduce. Although MPI implementations are now available for about 10 years and all vendors are committed to this Message Passing Interface standard, the vendors' and publicly available reduction algorithms could be accelerated with new algorithms by a factor between 3 (IBM, sum) and 100 (Cray T3E, maxloc) for long vectors. This paper presents five algorithms optimized for different choices of vector size and number of processes. The focus is on bandwidth dominated protocols for power-of-two and non-power-of-two number of processes, optimizing the load balance in communication and computation.",
isbn="978-3-540-24685-5"
}

@Inproceedings{Zhang2023,
 author = {Zhen Zhang and Shuai Zheng and Yida Wang and Justin Chiu and George Karypis and Trishul Chilimbi and Mu Li and Xin Jin},
 title = {MiCS: Near linear scaling for training gigantic model on public cloud},
 year = {2023},
 url = {https://www.amazon.science/publications/mics-near-linear-scaling-for-training-gigantic-model-on-public-cloud},
 booktitle = {VLDB 2023},
}

@inproceedings{wang2024zero,
title={Ze{RO}++: Extremely Efficient Collective Communication for Large Model Training},
author={Guanhua Wang and Heyang Qin and Sam Ade Jacobs and Xiaoxia Wu and Connor Holmes and Zhewei Yao and Samyam Rajbhandari and Olatunji Ruwase and Feng Yan and Lei Yang and Yuxiong He},
booktitle={The Twelfth International Conference on Learning Representations},
year={2024},
url={https://openreview.net/forum?id=gx2BT0a9MQ}
}

@inproceedings{black-etal-2022-gpt,
    title = "{GPT}-{N}eo{X}-20{B}: An Open-Source Autoregressive Language Model",
    author = "Black, Sidney  and
      Biderman, Stella  and
      Hallahan, Eric  and
      Anthony, Quentin  and
      Gao, Leo  and
      Golding, Laurence  and
      He, Horace  and
      Leahy, Connor  and
      McDonell, Kyle  and
      Phang, Jason  and
      Pieler, Michael  and
      Prashanth, Usvsn Sai  and
      Purohit, Shivanshu  and
      Reynolds, Laria  and
      Tow, Jonathan  and
      Wang, Ben  and
      Weinbach, Samuel",
    editor = "Fan, Angela  and
      Ilic, Suzana  and
      Wolf, Thomas  and
      Gall{\'e}, Matthias",
    booktitle = "Proceedings of BigScience Episode {\#}5 -- Workshop on Challenges {\&} Perspectives in Creating Large Language Models",
    month = may,
    year = "2022",
    address = "virtual+Dublin",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.bigscience-1.9",
    doi = "10.18653/v1/2022.bigscience-1.9",
    pages = "95--136",
}

@misc{luo2019adaptive,
      title={Adaptive Gradient Methods with Dynamic Bound of Learning Rate},
      author={Liangchen Luo and Yuanhao Xiong and Yan Liu and Xu Sun},
      year={2019},
      eprint={1902.09843},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{keskar2017improving,
      title={Improving Generalization Performance by Switching from Adam to SGD},
      author={Nitish Shirish Keskar and Richard Socher},
      year={2017},
      eprint={1712.07628},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@article{zhuang2020adabelief,
  title={AdaBelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients},
  author={Zhuang, Juntang and Tang, Tommy and Ding, Yifan and Tatikonda, Sekhar C and Dvornek, Nicha and Papademetris, Xenophon and Duncan, James},
  journal={Advances in Neural Information Processing Systems},
  volume={33},
  year={2020}
}

@misc{he2016identity,
      title={Identity Mappings in Deep Residual Networks},
      author={Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},
      year={2016},
      eprint={1603.05027},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@InProceedings{tang2021skfac,
    author    = {Tang, Zedong and Jiang, Fenlong and Gong, Maoguo and Li, Hao and Wu, Yue and Yu, Fan and Wang, Zidong and Wang, Min},
    title     = {SKFAC: Training Neural Networks With Faster Kronecker-Factored Approximate Curvature},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2021},
    pages     = {13479-13487}
}

@inproceedings{
zhang2023eva,
title={Eva: Practical Second-order Optimization with Kronecker-vectorized Approximation},
author={Lin Zhang and Shaohuai Shi and Bo Li},
booktitle={The Eleventh International Conference on Learning Representations },
year={2023},
url={https://openreview.net/forum?id=_Mic8V96Voy}
}

@misc{wang2017stochastic,
      title={Stochastic Quasi-Newton Methods for Nonconvex Stochastic Optimization},
      author={Xiao Wang and Shiqian Ma and Donald Goldfarb and Wei Liu},
      year={2017},
      eprint={1607.01231},
      archivePrefix={arXiv},
      primaryClass={math.OC}
}

@misc{bollapragada2018progressive,
      title={A Progressive Batching L-BFGS Method for Machine Learning},
      author={Raghu Bollapragada and Dheevatsa Mudigere and Jorge Nocedal and Hao-Jun Michael Shi and Ping Tak Peter Tang},
      year={2018},
      eprint={1802.05374},
      archivePrefix={arXiv},
      primaryClass={math.OC}
}

@misc{berahas2016multibatch,
      title={A Multi-Batch L-BFGS Method for Machine Learning},
      author={Albert S. Berahas and Jorge Nocedal and Martin Takáč},
      year={2016},
      eprint={1605.06049},
      archivePrefix={arXiv},
      primaryClass={math.OC}
}

@misc{erdogdu2015convergence,
      title={Convergence rates of sub-sampled Newton methods},
      author={Murat A. Erdogdu and Andrea Montanari},
      year={2015},
      eprint={1508.02810},
      archivePrefix={arXiv},
      primaryClass={stat.ML}
}

@misc{perlmutter,
  author = {NERSC},
  title = {Perlmutter System Architecture},
  year = {},
  publisher = {},
  journal = {},
  howpublished = {\url{https://docs.nersc.gov/systems/perlmutter/architecture/}},
  commit = {}
}

@inproceedings{heo2021adamp,
    title={AdamP: Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights},
    author={Heo, Byeongho and Chun, Sanghyuk and Oh, Seong Joon and Han, Dongyoon and Yun, Sangdoo and Kim, Gyuwan and Uh, Youngjung and Ha, Jung-Woo},
    year={2021},
    booktitle={International Conference on Learning Representations (ICLR)},
}

@software{torchvision2016,
    title        = {TorchVision: PyTorch's Computer Vision library},
    author       = {TorchVision maintainers and contributors},
    year         = 2016,
    journal      = {GitHub repository},
    publisher    = {GitHub},
    howpublished = {\url{https://github.com/pytorch/vision}}
}

@misc{mozaffari2023mkor,
      title={MKOR: Momentum-Enabled Kronecker-Factor-Based Optimizer Using Rank-1 Updates},
      author={Mohammad Mozaffari and Sikan Li and Zhao Zhang and Maryam Mehri Dehnavi},
      year={2023},
      eprint={2306.01685},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{shi2023distributed,
      title={A Distributed Data-Parallel PyTorch Implementation of the Distributed Shampoo Optimizer for Training Neural Networks At-Scale},
      author={Hao-Jun Michael Shi and Tsung-Hsien Lee and Shintaro Iwasaki and Jose Gallego-Posada and Zhijing Li and Kaushik Rangadurai and Dheevatsa Mudigere and Michael Rabbat},
      year={2023},
      eprint={2309.06497},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{shi2021accelerating,
      title={Accelerating Distributed K-FAC with Smart Parallelism of Computing and Communication Tasks},
      author={Shaohuai Shi and Lin Zhang and Bo Li},
      year={2021},
      eprint={2107.06533},
      archivePrefix={arXiv},
      primaryClass={cs.DC}
}

@inproceedings{ueno2020rich,
author = {Ueno, Yuichiro and Osawa, Kazuki and Tsuji, Yohei and Naruse, Akira and Yokota, Rio},
title = {Rich Information is Affordable: A Systematic Performance Analysis of Second-Order Optimization Using K-FAC},
year = {2020},
isbn = {9781450379984},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3394486.3403265},
doi = {10.1145/3394486.3403265},
abstract = {Rich information matrices from first and second-order derivatives have many potential applications in both theoretical and practical problems in deep learning. However, computing these information matrices is extremely expensive and this enormous cost is currently limiting its application to important problems regarding generalization, hyperparameter tuning, and optimization of deep neural networks. One of the most challenging use cases of information matrices is their use as a preconditioner for the optimizers, since the information matrices need to be updated every step. In this work, we conduct a step-by-step performance analysis when computing the Fisher information matrix during training of ResNet-50 on ImageNet, and show that the overhead can be reduced to the same amount as the cost of performing a single SGD step. We also show that the resulting Fisher preconditioned optimizer can converge in 1/3 the number of epochs compared to SGD, while achieving the same Top-1 validation accuracy. This is the first work to achieve such accuracy with K-FAC while reducing the training time to match that of SGD.},
booktitle = {Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
pages = {2145–2153},
numpages = {9},
keywords = {distributed training, information matrix, performance optimization},
location = {Virtual Event, CA, USA},
series = {KDD '20}
}


@misc{osawa2020scalable,
      title={Scalable and Practical Natural Gradient for Large-Scale Deep Learning},
      author={Kazuki Osawa and Yohei Tsuji and Yuichiro Ueno and Akira Naruse and Chuan-Sheng Foo and Rio Yokota},
      year={2020},
      eprint={2002.06015},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{osawa2019largescale,
      title={Large-Scale Distributed Second-Order Optimization Using Kronecker-Factored Approximate Curvature for Deep Convolutional Neural Networks},
      author={Kazuki Osawa and Yohei Tsuji and Yuichiro Ueno and Akira Naruse and Rio Yokota and Satoshi Matsuoka},
      year={2019},
      eprint={1811.12019},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@inproceedings{pauloski-2020-kfac,
    author = {Pauloski, J. Gregory and Zhang, Zhao and Huang, Lei and Xu, Weijia and Foster, Ian T.},
    title = {Convolutional {N}eural {N}etwork {T}raining with {D}istributed {K}-{FAC}},
    year = {2020},
    isbn = {9781728199986},
    publisher = {IEEE Press},
    booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
    articleno = {94},
    numpages = {14},
    location = {Atlanta, Georgia},
    series = {SC '20},
    doi = {10.5555/3433701.3433826}
}

@inproceedings{pauloski-kaisa-2021,
	doi = {10.1145/3458817.3476152},

	url = {https://doi.org/10.1145%2F3458817.3476152},

	year = 2021,
	month = {nov},

	publisher = {{ACM}
},

	author = {J. Gregory Pauloski and Qi Huang and Lei Huang and Shivaram Venkataraman and Kyle Chard and Ian Foster and Zhao Zhang},

	title = {{KAISA}},

	booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}
}

@InProceedings{agarwal-ggt-2019,
  title = 	 {Efficient Full-Matrix Adaptive Regularization},
  author =       {Agarwal, Naman and Bullins, Brian and Chen, Xinyi and Hazan, Elad and Singh, Karan and Zhang, Cyril and Zhang, Yi},
  booktitle = 	 {Proceedings of the 36th International Conference on Machine Learning},
  pages = 	 {102--110},
  year = 	 {2019},
  editor = 	 {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
  volume = 	 {97},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {09--15 Jun},
  publisher =    {PMLR},
  pdf = 	 {http://proceedings.mlr.press/v97/agarwal19b/agarwal19b.pdf},
  url = 	 {https://proceedings.mlr.press/v97/agarwal19b.html},
  abstract = 	 {Adaptive regularization methods pre-multiply a descent direction by a preconditioning matrix. Due to the large number of parameters of machine learning problems, full-matrix preconditioning methods are prohibitively expensive. We show how to modify full-matrix adaptive regularization in order to make it practical and effective. We also provide a novel theoretical analysis for adaptive regularization in <em>non-convex</em> optimization settings. The core of our algorithm, termed GGT, consists of the efficient computation of the inverse square root of a low-rank matrix. Our preliminary experiments show improved iteration-wise convergence rates across synthetic tasks and standard deep learning benchmarks, and that the more carefully-preconditioned steps sometimes lead to a better solution.}
}


@inproceedings{hessian-free-rnn,
author = {Martens, James and Sutskever, Ilya},
title = {Learning Recurrent Neural Networks with Hessian-Free Optimization},
year = {2011},
isbn = {9781450306195},
publisher = {Omnipress},
address = {Madison, WI, USA},
abstract = {In this work we resolve the long-outstanding problem of how to effectively train recurrent neural networks (RNNs) on complex and difficult sequence modeling problems which may contain long-term data dependencies. Utilizing recent advances in the Hessian-free optimization approach (Martens, 2010), together with a novel damping scheme, we successfully train RNNs on two sets of challenging problems. First, a collection of pathological synthetic datasets which are known to be impossible for standard optimization approaches (due to their extremely long-term dependencies), and second, on three natural and highly complex real-world sequence datasets where we find that our method significantly outperforms the previous state-of-the-art method for training neural sequence models: the Long Short-term Memory approach of Hochreiter and Schmidhuber (1997). Additionally, we offer a new interpretation of the generalized Gauss-Newton matrix of Schraudolph (2002) which is used within the HF approach of Martens.},
booktitle = {Proceedings of the 28th International Conference on International Conference on Machine Learning},
pages = {1033–1040},
numpages = {8},
location = {Bellevue, Washington, USA},
series = {ICML'11}
}


@misc{grosse2016kfacconvolution,
      title={A Kronecker-factored approximate Fisher matrix for convolution layers},
      author={Roger Grosse and James Martens},
      year={2016},
      eprint={1602.01407},
      archivePrefix={arXiv},
      primaryClass={stat.ML}
}

@misc{zhu2019anisotropic,
      title={The Anisotropic Noise in Stochastic Gradient Descent: Its Behavior of Escaping from Sharp Minima and Regularization Effects},
      author={Zhanxing Zhu and Jingfeng Wu and Bing Yu and Lei Wu and Jinwen Ma},
      year={2019},
      eprint={1803.00195},
      archivePrefix={arXiv},
      primaryClass={stat.ML}
}

@misc{sagun2018empirical,
      title={Empirical Analysis of the Hessian of Over-Parametrized Neural Networks},
      author={Levent Sagun and Utku Evci and V. Ugur Guney and Yann Dauphin and Leon Bottou},
      year={2018},
      eprint={1706.04454},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{jastrzebski2018factors,
      title={Three Factors Influencing Minima in SGD},
      author={Stanislaw Jastrzebski and Zachary Kenton and Devansh Arpit and Nicolas Ballas and Asja Fischer and Yoshua Bengio and Amos Storkey},
      year={2018},
      eprint={1711.04623},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@article{duchi:jmlr2011,
  author  = {John Duchi and Elad Hazan and Yoram Singer},
  title   = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},
  journal = {Journal of Machine Learning Research},
  year    = {2011},
  volume  = {12},
  number  = {61},
  pages   = {2121--2159},
  url     = {http://jmlr.org/papers/v12/duchi11a.html}
}

@article{martens:jmlr2020,
author = {Martens, James},
title = {New Insights and Perspectives on the Natural Gradient Method},
year = {2020},
issue_date = {January 2020},
publisher = {JMLR.org},
volume = {21},
number = {1},
issn = {1532-4435},
abstract = {Natural gradient descent is an optimization method traditionally motivated from the perspective of information geometry, and works well for many applications as an alternative to stochastic gradient descent. In this paper we critically analyze this method and its properties, and show how it can be viewed as a type of 2nd-order optimization method, with the Fisher information matrix acting as a substitute for the Hessian. In many important cases, the Fisher information matrix is shown to be equivalent to the Generalized Gauss-Newton matrix, which both approximates the Hessian, but also has certain properties that favor its use over the Hessian. This perspective turns out to have significant implications for the design of a practical and robust natural gradient optimizer, as it motivates the use of techniques like trust regions and Tikhonov regularization. Additionally, we make a series of contributions to the understanding of natural gradient and 2nd-order methods, including: a thorough analysis of the convergence speed of stochastic natural gradient descent (and more general stochastic 2nd-order methods) as applied to convex quadratics, a critical examination of the oft-used "empirical" approximation of the Fisher matrix, and an analysis of the (approximate) parameterization invariance property possessed by natural gradient methods (which we show also holds for certain other curvature matrices, but notably not the Hessian).},
journal = {J. Mach. Learn. Res.},
month = {jan},
articleno = {146},
numpages = {76},
keywords = {neural networks, convergence rate, parameterization invariance, natural gradient methods, 2nd-order optimization}
}


@inproceedings{kunstner:neurips2019,
 author = {Kunstner, Frederik and Hennig, Philipp and Balles, Lukas},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
 pages = {},
 publisher = {Curran Associates, Inc.},
 title = {Limitations of the empirical Fisher approximation for natural gradient descent},
 url = {https://proceedings.neurips.cc/paper_files/paper/2019/file/46a558d97954d0692411c861cf78ef79-Paper.pdf},
 volume = {32},
 year = {2019}
}


@inproceedings{tonga,
 author = {Roux, Nicolas and Manzagol, Pierre-antoine and Bengio, Yoshua},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {J. Platt and D. Koller and Y. Singer and S. Roweis},
 pages = {},
 publisher = {Curran Associates, Inc.},
 title = {Topmoumoute Online Natural Gradient Algorithm},
 url = {https://proceedings.neurips.cc/paper_files/paper/2007/file/9f61408e3afb633e50cdf1b20de6f466-Paper.pdf},
 volume = {20},
 year = {2007}
}

@inproceedings{Desjardins:nips2015,
 author = {Desjardins, Guillaume and Simonyan, Karen and Pascanu, Razvan and kavukcuoglu, koray},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
 pages = {},
 publisher = {Curran Associates, Inc.},
 title = {Natural Neural Networks},
 url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/2de5d16682c3c35007e4e92982f1a2ba-Paper.pdf},
 volume = {28},
 year = {2015}
}


@article{Park2000AdaptiveNG,
  title={Adaptive natural gradient learning algorithms for various stochastic models},
  author={Hyeyoung Park and Shun‐ichi Amari and Kenji Fukumizu},
  journal={Neural networks : the official journal of the International Neural Network Society},
  year={2000},
  volume={13 7},
  pages={
          755-64
        },
  url={https://api.semanticscholar.org/CorpusID:6471036}
}

@article{ngd-og,
    author = {Amari, Shun-ichi},
    title = "{Natural Gradient Works Efficiently in Learning}",
    journal = {Neural Computation},
    volume = {10},
    number = {2},
    pages = {251-276},
    year = {1998},
    month = {02},
    abstract = "{When a parameter space has a certain underlying structure, the ordinary gradient of a function does not represent its steepest direction, but the natural gradient does. Information geometry is used for calculating the natural gradients in the parameter space of perceptrons, the space of matrices (for blind source separation), and the space of linear dynamical systems (for blind source deconvolution). The dynamical behavior of natural gradient online learning is analyzed and is proved to be Fisher efficient, implying that it has asymptotically the same performance as the optimal batch estimation of parameters. This suggests that the plateau phenomenon, which appears in the backpropagation learning algorithm of multilayer perceptrons, might disappear or might not be so serious when the natural gradient is used. An adaptive method of updating the learning rate is proposed and analyzed.}",
    issn = {0899-7667},
    doi = {10.1162/089976698300017746},
    url = {https://doi.org/10.1162/089976698300017746},
    eprint = {https://direct.mit.edu/neco/article-pdf/10/2/251/813415/089976698300017746.pdf},
}


@InProceedings{botev-practical-17,
  title = 	 {Practical {G}auss-{N}ewton Optimisation for Deep Learning},
  author =       {Aleksandar Botev and Hippolyt Ritter and David Barber},
  booktitle = 	 {Proceedings of the 34th International Conference on Machine Learning},
  pages = 	 {557--565},
  year = 	 {2017},
  editor = 	 {Precup, Doina and Teh, Yee Whye},
  volume = 	 {70},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {06--11 Aug},
  publisher =    {PMLR},
  pdf = 	 {http://proceedings.mlr.press/v70/botev17a/botev17a.pdf},
  url = 	 {https://proceedings.mlr.press/v70/botev17a.html},
  abstract = 	 {We present an efficient block-diagonal approximation to the Gauss-Newton matrix for feedforward neural networks. Our resulting algorithm is competitive against state-of-the-art first-order optimisation methods, with sometimes significant improvement in optimisation performance. Unlike first-order methods, for which hyperparameter tuning of the optimisation parameters is often a laborious process, our approach can provide good performance even when used with default settings. A side result of our work is that for piecewise linear transfer functions, the network objective function can have no differentiable local maxima, which may partially explain why such transfer functions facilitate effective optimisation.}
}


@InProceedings{krylov-subspace-descent,
  title = 	 {Krylov Subspace Descent for Deep Learning},
  author = 	 {Vinyals, Oriol and Povey, Daniel},
  booktitle = 	 {Proceedings of the Fifteenth International Conference on Artificial Intelligence and Statistics},
  pages = 	 {1261--1268},
  year = 	 {2012},
  editor = 	 {Lawrence, Neil D. and Girolami, Mark},
  volume = 	 {22},
  series = 	 {Proceedings of Machine Learning Research},
  address = 	 {La Palma, Canary Islands},
  month = 	 {21--23 Apr},
  publisher =    {PMLR},
  pdf = 	 {http://proceedings.mlr.press/v22/vinyals12/vinyals12.pdf},
  url = 	 {https://proceedings.mlr.press/v22/vinyals12.html},
  abstract = 	 {In this paper, we propose a second order optimization method to learn models where both the dimensionality of the parameter space and the number of training samples is high.  In our method, we construct on each iteration a Krylov subspace formed by the gradient and an approximation to the Hessian matrix, and then use a subset of the training data samples to optimize over this subspace.  As with the Hessian Free (HF) method of Martens (2010), the Hessian matrix is never explicitly constructed, and is computed using a subset of data.  In practice, as in HF, we typically use a positive definite substitute for the Hessian matrix such as the Gauss-Newton matrix.  We investigate the effectiveness of our proposed method on deep neural networks, and compare its performance to widely used methods such as stochastic gradient descent, conjugate gradient descent and L-BFGS, and also to HF. Our method leads to faster convergence than either L-BFGS or HF, and generally performs better than either of them in cross-validation accuracy.  It is also simpler and more general than HF, as it does not require a positive semidefinite approximation of the Hessian matrix to work well nor the setting of a damping parameter.  The chief drawback versus HF is the need for memory to store a basis for the Krylov subspace.}
}


@inproceedings{hessian-free-optimization,
  added-at = {2011-07-08T14:11:15.000+0200},
  author = {Martens, James},
  biburl = {https://www.bibsonomy.org/bibtex/2af0029f21446a26c04f2e4650ec1fbf1/gromgull},
  booktitle = {ICML},
  editor = {Fürnkranz, Johannes and Joachims, Thorsten},
  ee = {http://www.icml2010.org/papers/458.pdf},
  interhash = {1d6577ca73270732c2cc1e3c2cce6cdb},
  intrahash = {af0029f21446a26c04f2e4650ec1fbf1},
  keywords = {machinelearning neural-networks optimisation recurrent-neural-networks},
  pages = {735-742},
  publisher = {Omnipress},
  timestamp = {2011-07-08T14:11:15.000+0200},
  title = {Deep learning via Hessian-free optimization.},
  url = {http://dblp.uni-trier.de/db/conf/icml/icml2010.html#Martens10},
  year = 2010
}


@article{schraudolphGGN,
author = {Schraudolph, Nicol N.},
title = {Fast Curvature Matrix-Vector Products for Second-Order Gradient Descent},
year = {2002},
issue_date = {July 2002},
publisher = {MIT Press},
address = {Cambridge, MA, USA},
volume = {14},
number = {7},
issn = {0899-7667},
url = {https://doi.org/10.1162/08997660260028683},
doi = {10.1162/08997660260028683},
abstract = {We propose a generic method for iteratively approximating various second-order gradient steps--Newton, Gauss-Newton, Levenberg-Marquardt, and natural gradient--in linear time per iteration, using special curvature matrix-vector products that can be computed in O(n). Two recent acceleration techniques for on-line learning, matrix momentum and stochastic meta-descent (SMD), implement this approach. Since both were originally derived by very different routes, this offers fresh insight into their operation, resulting in further improvements to SMD.},
journal = {Neural Comput.},
month = {jul},
pages = {1723–1738},
numpages = {16}
}


@inproceedings{loshchilov2017sgdr,
title={{SGDR}: Stochastic Gradient Descent with Warm Restarts},
author={Ilya Loshchilov and Frank Hutter},
booktitle={International Conference on Learning Representations},
year={2017},
url={https://openreview.net/forum?id=Skq89Scxx}
}

@misc{grafting,
      title={Disentangling Adaptive Gradient Methods from Learning Rates},
      author={Naman Agarwal and Rohan Anil and Elad Hazan and Tomer Koren and Cyril Zhang},
      year={2020},
      eprint={2002.11803},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@inproceedings{adapative-optimizer-bad-sgd-good,
 author = {Wilson, Ashia C and Roelofs, Rebecca and Stern, Mitchell and Srebro, Nati and Recht, Benjamin},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {I. Guyon and U. Von Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
 pages = {},
 publisher = {Curran Associates, Inc.},
 title = {The Marginal Value of Adaptive Gradient Methods in Machine Learning},
 url = {https://proceedings.neurips.cc/paper_files/paper/2017/file/81b3833e2504647f9d794f7d7b9bf341-Paper.pdf},
 volume = {30},
 year = {2017}
}

@inproceedings{dense-net,
  added-at = {2018-09-04T11:33:01.000+0200},
  author = {Huang, Gao and Liu, Zhuang and van der Maaten, Laurens and Weinberger, Kilian Q.},
  biburl = {https://www.bibsonomy.org/bibtex/24ea2e82bd87f8102b9f1f14a98b4dc53/nosebrain},
  booktitle = {CVPR},
  ee = {http://doi.ieeecomputersociety.org/10.1109/CVPR.2017.243},
  interhash = {39c8ce8d8104d4c557d508eb421fb90c},
  intrahash = {4ea2e82bd87f8102b9f1f14a98b4dc53},
  isbn = {978-1-5386-0457-1},
  keywords = {classification densenet image},
  pages = {2261-2269},
  publisher = {IEEE Computer Society},
  timestamp = {2018-09-04T11:41:32.000+0200},
  title = {Densely Connected Convolutional Networks},
  url = {http://dblp.uni-trier.de/db/conf/cvpr/cvpr2017.html#HuangLMW17},
  year = 2017
}


@incollection{ssd,
	doi = {10.1007/978-3-319-46448-0_2},

	url = {https://doi.org/10.1007%2F978-3-319-46448-0_2},

	year = 2016,
	publisher = {Springer International Publishing},

	pages = {21--37},

	author = {Wei Liu and Dragomir Anguelov and Dumitru Erhan and Christian Szegedy and Scott Reed and Cheng-Yang Fu and Alexander C. Berg},

	title = {{SSD}: Single Shot {MultiBox} Detector},

	booktitle = {Computer Vision {\textendash} {ECCV} 2016}
}


@incollection{alexnet,
  added-at = {2016-11-14T12:05:24.000+0100},
  author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
  biburl = {https://www.bibsonomy.org/bibtex/2886c491fe45049fee3c9660df30bb5c4/albinzehe},
  booktitle = {Advances in Neural Information Processing Systems 25},
  editor = {Pereira, F. and Burges, C. J. C. and Bottou, L. and Weinberger, K. Q.},
  interhash = {74bbb5dea5afb1b088bd10e317f1f0d2},
  intrahash = {886c491fe45049fee3c9660df30bb5c4},
  keywords = {cnn deeplearning ma-zehe neuralnet},
  pages = {1097--1105},
  publisher = {Curran Associates, Inc.},
  timestamp = {2016-11-14T12:05:24.000+0100},
  title = {ImageNet Classification with Deep Convolutional Neural Networks},
  url = {http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf},
  year = 2012
}

@article{sgd-nesterov,
  title={A method for solving the convex programming problem with convergence rate $\mathcal{O}(1/k^2)$},
  author={Yurii Nesterov},
  journal={Proceedings of the USSR Academy of Sciences},
  year={1983},
  volume={269},
  pages={543-547},
  url={https://api.semanticscholar.org/CorpusID:145918791}
}

@article{sgd-momentum,
title = {Some methods of speeding up the convergence of iteration methods},
journal = {USSR Computational Mathematics and Mathematical Physics},
volume = {4},
number = {5},
pages = {1-17},
year = {1964},
issn = {0041-5553},
doi = {https://doi.org/10.1016/0041-5553(64)90137-5},
url = {https://www.sciencedirect.com/science/article/pii/0041555364901375},
author = {B.T. Polyak},
abstract = {For the solution of the functional equation P (x) = 0 (1) (where P is an operator, usually linear, from B into B, and B is a Banach space) iteration methods are generally used. These consist of the construction of a series x0, …, xn, …, which converges to the solution (see, for example [1]). Continuous analogues of these methods are also known, in which a trajectory x(t), 0 ⩽ t ⩽ ∞ is constructed, which satisfies the ordinary differential equation in B and is such that x(t) approaches the solution of (1) as t → ∞ (see [2]). We shall call the method a k-step method if for the construction of each successive iteration xn+1 we use k previous iterations xn, …, xn−k+1. The same term will also be used for continuous methods if x(t) satisfies a differential equation of the k-th order or k-th degree. Iteration methods which are more widely used are one-step (e.g. methods of successive approximations). They are generally simple from the calculation point of view but often converge very slowly. This is confirmed both by the evaluation of the speed of convergence and by calculation in practice (for more details see below). Therefore the question of the rate of convergence is most important. Some multistep methods, which we shall consider further, which are only slightly more complicated than the corresponding one-step methods, make it possible to speed up the convergence substantially. Note that all the methods mentioned below are applicable also to the problem of minimizing the differentiable functional (x) in Hilbert space, so long as this problem reduces to the solution of the equation grad (x) = 0.}
}

@article{sgd,
author = {Herbert Robbins and Sutton Monro},
title = {{A Stochastic Approximation Method}},
volume = {22},
journal = {The Annals of Mathematical Statistics},
number = {3},
publisher = {Institute of Mathematical Statistics},
pages = {400 -- 407},
year = {1951},
doi = {10.1214/aoms/1177729586},
URL = {https://doi.org/10.1214/aoms/1177729586}
}


@inproceedings{mask-rcnn,
  added-at = {2021-07-07T12:16:11.000+0200},
  author = {He, Kaiming and Gkioxari, Georgia and Doll{\'{a}}r, Piotr and Girshick, Ross B.},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://www.bibsonomy.org/bibtex/2d2deec4bb1449a5f55dcc9086b669e37/pkoch},
  booktitle = {{IEEE} International Conference on Computer Vision, {ICCV} 2017, Venice,               Italy, October 22-29, 2017},
  doi = {10.1109/ICCV.2017.322},
  interhash = {3743d2a88223517f9adc496b9ad099bc},
  intrahash = {d2deec4bb1449a5f55dcc9086b669e37},
  keywords = {instance mask mask-rcnn segmentation},
  pages = {2980--2988},
  publisher = {{IEEE} Computer Society},
  timestamp = {2021-07-07T12:16:11.000+0200},
  title = {Mask {R-CNN}},
  url = {https://doi.org/10.1109/ICCV.2017.322},
  year = 2017
}


@article{iyer2020wideminima,
  title={Wide-minima Density Hypothesis and the Explore-Exploit Learning Rate Schedule},
  author={Iyer, Nikhil and Thejas, V and Kwatra, Nipun and Ramjee, Ramachandran and Sivathanu, Muthian},
  journal={arXiv preprint arXiv:2003.03977},
  year={2020}
}

@misc{mscoco,
      title={Microsoft COCO: Common Objects in Context},
      author={Tsung-Yi Lin and Michael Maire and Serge Belongie and Lubomir Bourdev and Ross Girshick and James Hays and Pietro Perona and Deva Ramanan and C. Lawrence Zitnick and Piotr Dollár},
      year={2015},
      eprint={1405.0312},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{deeplabv3,
      title={Rethinking Atrous Convolution for Semantic Image Segmentation},
      author={Liang-Chieh Chen and George Papandreou and Florian Schroff and Hartwig Adam},
      year={2017},
      eprint={1706.05587},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{faster_rcnn,
 author = {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
 pages = {},
 publisher = {Curran Associates, Inc.},
 title = {Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks},
 url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/14bfa6bb14875e45bba028a21ed38046-Paper.pdf},
 volume = {28},
 year = {2015}
}


@misc{shampoo-scalable,
      title={Scalable Second Order Optimization for Deep Learning},
      author={Rohan Anil and Vineet Gupta and Tomer Koren and Kevin Regan and Yoram Singer},
      year={2021},
      eprint={2002.09018},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}


@InProceedings{shampoo-icml,
  title = 	 {Shampoo: Preconditioned Stochastic Tensor Optimization},
  author =       {Gupta, Vineet and Koren, Tomer and Singer, Yoram},
  booktitle = 	 {Proceedings of the 35th International Conference on Machine Learning},
  pages = 	 {1842--1850},
  year = 	 {2018},
  editor = 	 {Dy, Jennifer and Krause, Andreas},
  volume = 	 {80},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {10--15 Jul},
  publisher =    {PMLR},
  pdf = 	 {http://proceedings.mlr.press/v80/gupta18a/gupta18a.pdf},
  url = 	 {https://proceedings.mlr.press/v80/gupta18a.html},
  abstract = 	 {Preconditioned gradient methods are among the most general and powerful tools in optimization. However, preconditioning requires storing and manipulating prohibitively large matrices. We describe and analyze a new structure-aware preconditioning algorithm, called Shampoo, for stochastic optimization over tensor spaces. Shampoo maintains a set of preconditioning matrices, each of which operates on a single dimension, contracting over the remaining dimensions. We establish convergence guarantees in the stochastic convex setting, the proof of which builds upon matrix trace inequalities. Our experiments with state-of-the-art deep learning models show that Shampoo is capable of converging considerably faster than commonly used optimizers. Surprisingly, although it involves a more complex update rule, Shampoo’s runtime per step is comparable in practice to that of simple gradient methods such as SGD, AdaGrad, and Adam.}
}


@article{alpa,
  author    = {Lianmin Zheng and
               Zhuohan Li and
               Hao Zhang and
               Yonghao Zhuang and
               Zhifeng Chen and
               Yanping Huang and
               Yida Wang and
               Yuanzhong Xu and
               Danyang Zhuo and
               Joseph E. Gonzalez and
               Ion Stoica},
  title     = {Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed
               Deep Learning},
  journal   = {CoRR},
  volume    = {abs/2201.12023},
  year      = {2022},
  url       = {https://arxiv.org/abs/2201.12023},
  eprinttype = {arXiv},
  eprint    = {2201.12023},
  timestamp = {Wed, 02 Feb 2022 15:00:01 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2201-12023.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{jangda2022breaking,
      title={Breaking the Computation and Communication Abstraction Barrier in Distributed Machine Learning Workloads},
      author={Abhinav Jangda and Jun Huang and Guodong Liu and Amir Hossein Nodehi Sabet and Saeed Maleki and Youshan Miao and Madanlal Musuvathi and Todd Mytkowicz and Olli Sarikivi},
      year={2022},
      eprint={2105.05720},
      archivePrefix={arXiv},
      primaryClass={cs.DC}
}

@misc{afhq-dataset,
      title={StarGAN v2: Diverse Image Synthesis for Multiple Domains},
      author={Yunjey Choi and Youngjung Uh and Jaejun Yoo and Jung-Woo Ha},
      year={2020},
      eprint={1912.01865},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@article{object-detection-survey,
	doi = {10.1109/access.2019.2939201},

	url = {https://doi.org/10.1109%2Faccess.2019.2939201},

	year = 2019,
	publisher = {Institute of Electrical and Electronics Engineers ({IEEE})},

	volume = {7},

	pages = {128837--128868},

	author = {Licheng Jiao and Fan Zhang and Fang Liu and Shuyuan Yang and Lingling Li and Zhixi Feng and Rong Qu},

	title = {A Survey of Deep Learning-Based Object Detection},

	journal = {{IEEE} Access}
}

@ARTICLE{image-segmentation-survey,
  author={Minaee, Shervin and Boykov, Yuri and Porikli, Fatih and Plaza, Antonio and Kehtarnavaz, Nasser and Terzopoulos, Demetri},
  journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
  title={Image Segmentation Using Deep Learning: A Survey},
  year={2022},
  volume={44},
  number={7},
  pages={3523-3542},
  doi={10.1109/TPAMI.2021.3059968}}

@misc{group-norm,
      title={Group Normalization},
      author={Yuxin Wu and Kaiming He},
      year={2018},
      eprint={1803.08494},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{improved-diffusion,
      title={Improved Denoising Diffusion Probabilistic Models},
      author={Alex Nichol and Prafulla Dhariwal},
      year={2021},
      eprint={2102.09672},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@article{ring-all-reduce,
author = {Patarasuk, Pitch and Yuan, Xin},
title = {Bandwidth Optimal All-Reduce Algorithms for Clusters of Workstations},
year = {2009},
issue_date = {February, 2009},
publisher = {Academic Press, Inc.},
address = {USA},
volume = {69},
number = {2},
issn = {0743-7315},
url = {https://doi.org/10.1016/j.jpdc.2008.09.002},
doi = {10.1016/j.jpdc.2008.09.002},
abstract = {We consider an efficient realization of the all-reduce operation with large data sizes in cluster environments, under the assumption that the reduce operator is associative and commutative. We derive a tight lower bound of the amount of data that must be communicated in order to complete this operation and propose a ring-based algorithm that only requires tree connectivity to achieve bandwidth optimality. Unlike the widely used butterfly-like all-reduce algorithm that incurs network contention in SMP/multi-core clusters, the proposed algorithm can achieve contention-free communication in almost all contemporary clusters, including SMP/multi-core clusters and Ethernet switched clusters with multiple switches. We demonstrate that the proposed algorithm is more efficient than other algorithms on clusters with different nodal architectures and networking technologies when the data size is sufficiently large.},
journal = {J. Parallel Distrib. Comput.},
month = {feb},
pages = {117–124},
numpages = {8},
keywords = {Collective communication, All-reduce, Tree topology, Cluster of workstations}
}


@misc{ddpm,
  doi = {10.48550/ARXIV.2006.11239},

  url = {https://arxiv.org/abs/2006.11239},

  author = {Ho, Jonathan and Jain, Ajay and Abbeel, Pieter},

  keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Denoising Diffusion Probabilistic Models},

  publisher = {arXiv},

  year = {2020},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{stable-diffusion,
  doi = {10.48550/ARXIV.2112.10752},

  url = {https://arxiv.org/abs/2112.10752},

  author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Björn},

  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {High-Resolution Image Synthesis with Latent Diffusion Models},

  publisher = {arXiv},

  year = {2021},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{dall-e-2,
  doi = {10.48550/ARXIV.2204.06125},

  url = {https://arxiv.org/abs/2204.06125},

  author = {Ramesh, Aditya and Dhariwal, Prafulla and Nichol, Alex and Chu, Casey and Chen, Mark},

  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Hierarchical Text-Conditional Image Generation with CLIP Latents},

  publisher = {arXiv},

  year = {2022},

  copyright = {Creative Commons Attribution 4.0 International}
}

@misc{unet-arch,
  doi = {10.48550/ARXIV.1505.04597},

  url = {https://arxiv.org/abs/1505.04597},

  author = {Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas},

  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {U-Net: Convolutional Networks for Biomedical Image Segmentation},

  publisher = {arXiv},

  year = {2015},

  copyright = {arXiv.org perpetual, non-exclusive license}
}

@techreport{summa,
author = {van de Geijn, Robert A. and Watts, Jerrell},
title = {SUMMA: Scalable Universal Matrix Multiplication Algorithm},
year = {1995},
publisher = {University of Texas at Austin},
address = {USA},
abstract = {In this paper, we give a straight forward, highly efficient, scalable implementation of common matrix multiplication operations. The algorithms are much simpler than previously published methods, yield better performance, and require less work space. MPI implementations are given, as are performance results on the Intel Paragon system.}
}

@ARTICLE{agarwal-3d,
  author={Agarwal, R. C. and Balle, S. M. and Gustavson, F. G. and Joshi, M. and Palkar, P.},
  journal={IBM Journal of Research and Development},
  title={A three-dimensional approach to parallel matrix multiplication},
  year={1995},
  volume={39},
  number={5},
  pages={575-582},
  doi={10.1147/rd.395.0575}}

@InProceedings{oxford_flowers_102,
  author       = "Maria-Elena Nilsback and Andrew Zisserman",
  title        = "Automated Flower Classification over a Large Number of Classes",
  booktitle    = "Indian Conference on Computer Vision, Graphics and Image Processing",
  month        = "Dec",
  year         = "2008",
}

@misc{yellick-graph,
  doi = {10.48550/ARXIV.2005.03300},

  url = {https://arxiv.org/abs/2005.03300},

  author = {Tripathy, Alok and Yelick, Katherine and Buluc, Aydin},

  keywords = {Machine Learning (cs.LG), Distributed, Parallel, and Cluster Computing (cs.DC), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Reducing Communication in Graph Neural Network Training},

  publisher = {arXiv},

  year = {2020},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{you-3d,
  doi = {10.48550/ARXIV.2105.14450},

  url = {https://arxiv.org/abs/2105.14450},

  author = {Bian, Zhengda and Xu, Qifan and Wang, Boxiang and You, Yang},

  keywords = {Distributed, Parallel, and Cluster Computing (cs.DC), Machine Learning (cs.LG), Performance (cs.PF), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Maximizing Parallelism in Distributed Training for Huge Neural Networks},

  publisher = {arXiv},

  year = {2021},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@inproceedings{you-2.5d,
	doi = {10.1145/3545008.3545087},

	url = {https://doi.org/10.1145%2F3545008.3545087},

	year = 2022,
	month = {aug},

	publisher = {{ACM}},

	author = {Boxiang Wang and Qifan Xu and Zhengda Bian and Yang You},

	title = {Tesseract: Parallelize the Tensor Parallelism Efficiently},

	booktitle = {Proceedings of the 51st International Conference on Parallel Processing}
}

@misc{you-2d,
  doi = {10.48550/ARXIV.2104.05343},

  url = {https://arxiv.org/abs/2104.05343},

  author = {Xu, Qifan and Li, Shenggui and Gong, Chaoyu and You, Yang},

  keywords = {Machine Learning (cs.LG), Distributed, Parallel, and Cluster Computing (cs.DC), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {An Efficient 2D Method for Training Super-Large Deep Learning Models},

  publisher = {arXiv},

  year = {2021},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{spatial-parallel-cnn-dryden,
  doi = {10.48550/ARXIV.1903.06681},

  url = {https://arxiv.org/abs/1903.06681},

  author = {Dryden, Nikoli and Maruyama, Naoya and Benson, Tom and Moon, Tim and Snir, Marc and Van Essen, Brian},

  keywords = {Distributed, Parallel, and Cluster Computing (cs.DC), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Improving Strong-Scaling of CNN Training by Exploiting Finer-Grained Parallelism},

  publisher = {arXiv},

  year = {2019},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@inproceedings{channel-filter-parallel-cnn-dryden,
author = {Dryden, Nikoli and Maruyama, Naoya and Moon, Tim and Benson, Tom and Snir, Marc and Van Essen, Brian},
title = {Channel and Filter Parallelism for Large-Scale CNN Training},
year = {2019},
isbn = {9781450362290},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3295500.3356207},
doi = {10.1145/3295500.3356207},
abstract = {Accelerating large-scale CNN training is needed to keep training times reasonable as datasets grow larger and models become more complex. Existing frameworks primarily scale using data-parallelism, but this is limited by the mini-batch size, which cannot grow arbitrarily. We introduce three algorithms that partition channel or filter data to exploit parallelism beyond the sample dimension. Further, they partition the parameters of convolutional layers, replacing global all reduces with segmented allreduces---smaller, concurrent allreduces among disjoint processor sets. These algorithms enable strong scaling, reduced communication overhead, and reduced memory pressure, enabling training of very wide CNNs.We demonstrate improved strong and weak scaling, including up to 4.1x reductions in training time for residual networks and 4x reductions in allreduce overhead. We also show that wider models provide improved accuracy on ImageNet. We study the current limitations of our algorithms and provide a direction for future optimizations of large-scale deep learning frameworks.},
booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
articleno = {10},
numpages = {20},
keywords = {algorithms, CNN, deep learning, convolution, scaling},
location = {Denver, Colorado},
series = {SC '19}
}


@misc{fb-moe,
  doi = {10.48550/ARXIV.2112.10684},

  url = {https://arxiv.org/abs/2112.10684},

  author = {Artetxe, Mikel and Bhosale, Shruti and Goyal, Naman and Mihaylov, Todor and Ott, Myle and Shleifer, Sam and Lin, Xi Victoria and Du, Jingfei and Iyer, Srinivasan and Pasunuru, Ramakanth and Anantharaman, Giri and Li, Xian and Chen, Shuohui and Akin, Halil and Baines, Mandeep and Martin, Louis and Zhou, Xing and Koura, Punit Singh and O'Horo, Brian and Wang, Jeff and Zettlemoyer, Luke and Diab, Mona and Kozareva, Zornitsa and Stoyanov, Ves},

  keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Efficient Large Scale Language Modeling with Mixtures of Experts},

  publisher = {arXiv},

  year = {2021},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{hetu-moe,
  doi = {10.48550/ARXIV.2203.14685},

  url = {https://arxiv.org/abs/2203.14685},

  author = {Nie, Xiaonan and Zhao, Pinxue and Miao, Xupeng and Zhao, Tong and Cui, Bin},

  keywords = {Distributed, Parallel, and Cluster Computing (cs.DC), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {HetuMoE: An Efficient Trillion-scale Mixture-of-Expert Distributed Training System},

  publisher = {arXiv},

  year = {2022},

  copyright = {Creative Commons Attribution 4.0 International}
}


@misc{fairscale,
  author =       {FairScale authors},
  title =        {FairScale:  A general purpose modular PyTorch library for high performance and large scale training},
  howpublished = {\url{https://github.com/facebookresearch/fairscale}},
  year =         {2021}
}

@misc{se-moe,
  doi = {10.48550/ARXIV.2205.10034},

  url = {https://arxiv.org/abs/2205.10034},

  author = {Shen, Liang and Wu, Zhihua and Gong, WeiBao and Hao, Hongxiang and Bai, Yangfan and Wu, HuaChao and Wu, Xinxuan and Xiong, Haoyi and Yu, Dianhai and Ma, Yanjun},

  keywords = {Distributed, Parallel, and Cluster Computing (cs.DC), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {SE-MoE: A Scalable and Efficient Mixture-of-Experts Distributed Training and Inference System},

  publisher = {arXiv},

  year = {2022},

  copyright = {arXiv.org perpetual, non-exclusive license}
}

@misc{ds-moe-systems,
  doi = {10.48550/ARXIV.2201.05596},

  url = {https://arxiv.org/abs/2201.05596},

  author = {Rajbhandari, Samyam and Li, Conglong and Yao, Zhewei and Zhang, Minjia and Aminabadi, Reza Yazdani and Awan, Ammar Ahmad and Rasley, Jeff and He, Yuxiong},

  keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Distributed, Parallel, and Cluster Computing (cs.DC), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale},

  publisher = {arXiv},

  year = {2022},

  copyright = {Creative Commons Attribution 4.0 International}
}

@misc{ds-moe-ml,
  doi = {10.48550/ARXIV.2109.10465},

  url = {https://arxiv.org/abs/2109.10465},

  author = {Kim, Young Jin and Awan, Ammar Ahmad and Muzio, Alexandre and Salinas, Andres Felipe Cruz and Lu, Liyang and Hendy, Amr and Rajbhandari, Samyam and He, Yuxiong and Awadalla, Hany Hassan},

  keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Scalable and Efficient MoE Training for Multitask Multilingual Models},

  publisher = {arXiv},

  year = {2021},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{scaling-laws,
  doi = {10.48550/ARXIV.2001.08361},

  url = {https://arxiv.org/abs/2001.08361},

  author = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B. and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario},

  keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Scaling Laws for Neural Language Models},

  publisher = {arXiv},

  year = {2020},

  copyright = {arXiv.org perpetual, non-exclusive license}
}

@misc{t5-transformer,
  doi = {10.48550/ARXIV.1910.10683},

  url = {https://arxiv.org/abs/1910.10683},

  author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},

  keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},

  publisher = {arXiv},

  year = {2019},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{g-shard,
  doi = {10.48550/ARXIV.2006.16668},

  url = {https://arxiv.org/abs/2006.16668},

  author = {Lepikhin, Dmitry and Lee, HyoukJoong and Xu, Yuanzhong and Chen, Dehao and Firat, Orhan and Huang, Yanping and Krikun, Maxim and Shazeer, Noam and Chen, Zhifeng},

  keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding},

  publisher = {arXiv},

  year = {2020},

  copyright = {arXiv.org perpetual, non-exclusive license}
}

@misc{og-moe,
  doi = {10.48550/ARXIV.1701.06538},

  url = {https://arxiv.org/abs/1701.06538},

  author = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},

  keywords = {Machine Learning (cs.LG), Computation and Language (cs.CL), Neural and Evolutionary Computing (cs.NE), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},

  publisher = {arXiv},

  year = {2017},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{switch-transformer,
  doi = {10.48550/ARXIV.2101.03961},

  url = {https://arxiv.org/abs/2101.03961},

  author = {Fedus, William and Zoph, Barret and Shazeer, Noam},

  keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity},

  publisher = {arXiv},

  year = {2021},

  copyright = {arXiv.org perpetual, non-exclusive license}
}

@inproceedings{
keskar2017on,
title={On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima},
author={Nitish Shirish Keskar and Dheevatsa Mudigere and Jorge Nocedal and Mikhail Smelyanskiy and Ping Tak Peter Tang},
booktitle={International Conference on Learning Representations},
year={2017},
url={https://openreview.net/forum?id=H1oyRlYgg}
}
@INPROCEEDINGS{guo2008hpcIOarch,
  author={Guo, Yu-Feng and Li, Qiong and Liu, Guang-Ming and Cao, Yue-Sheng and Zhang, Lei},
  booktitle={Fifth International Conference on Information Technology: New Generations (itng 2008)},
  title={A Distributed Shared Parallel IO System for HPC},
  year={2008},
  volume={},
  number={},
  pages={229-234},
  doi={10.1109/ITNG.2008.7}}

@inproceedings{cao2017bbarch,
author = {Cao, Lei and Settlemyer, Bradley W. and Bent, John},
title = {To Share or Not to Share: Comparing Burst Buffer Architectures},
year = {2017},
isbn = {9781510838222},
publisher = {Society for Computer Simulation International},
address = {San Diego, CA, USA},
abstract = {Modern high performance computing platforms employ burst buffers to overcome the I/O bottleneck that limits the scale and efficiency of large-scale parallel computations. Currently there are two competing burst buffer architectures. One is to treat burst buffers as a dedicated shared resource, The other is to integrate burst buffer hardware into each compute node. In this paper we examine the design tradeoffs associated with local and shared, dedicated burst buffer architectures through modeling. By seeding our simulation with realistic workloads, we are able to systematically evaluate the resulting performance of both designs. Our studies validate previous results indicating that storage systems without parity protection can reduce overall time to solution, and further determine that shared burst buffer organizations can result in a 3.5\texttimes{} greater average application I/O throughput compared to local burst buffer configurations.},
booktitle = {Proceedings of the 25th High Performance Computing Symposium},
articleno = {4},
numpages = {10},
keywords = {storage systems, file systems, I/O, checkpoint-restart, burst buffers},
location = {Virginia Beach, Virginia},
series = {HPC '17}
}


@online{lcsierrasystem,
  title={Using LC's Sierra Systems},
  url={hpc.llnl.gov/documentation/tutorials/using-lc-s-sierra-systems}
}

@inproceedings{Landsteiner2016ArchitectureAD,
  title={Architecture and Design of Cray DataWarp},
  author={Benjamin R. Landsteiner and Dave Henseler and Douglas Petesch and Nicholas J. Wright},
  year={2016}
}

@INPROCEEDINGS{teng2016burstbuffer,
  author={Wang, Teng and Mohror, Kathryn and Moody, Adam and Sato, Kento and Yu, Weikuan},
  booktitle={SC '16: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  title={An Ephemeral Burst-Buffer File System for Scientific Applications},
  year={2016},
  volume={},
  number={},
  pages={807-818},
  doi={10.1109/SC.2016.68}}

@inproceedings{
chen2021gans,
title={{\{}GAN{\}}s Can Play Lottery Tickets Too},
author={Xuxi Chen and Zhenyu Zhang and Yongduo Sui and Tianlong Chen},
booktitle={International Conference on Learning Representations},
year={2021},
url={https://openreview.net/forum?id=1AoMhc_9jER}
}

@misc{lth-nlp-rl,
  doi = {10.48550/ARXIV.1906.02768},

  url = {https://arxiv.org/abs/1906.02768},

  author = {Yu, Haonan and Edunov, Sergey and Tian, Yuandong and Morcos, Ari S.},

  keywords = {Machine Learning (stat.ML), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Playing the lottery with rewards and multiple languages: lottery tickets in RL and NLP},

  publisher = {arXiv},

  year = {2019},

  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{DBLP:journals/corr/abs-2007-12223,
  author    = {Tianlong Chen and
               Jonathan Frankle and
               Shiyu Chang and
               Sijia Liu and
               Yang Zhang and
               Zhangyang Wang and
               Michael Carbin},
  title     = {The Lottery Ticket Hypothesis for Pre-trained {BERT} Networks},
  journal   = {CoRR},
  volume    = {abs/2007.12223},
  year      = {2020},
  url       = {https://arxiv.org/abs/2007.12223},
  eprinttype = {arXiv},
  eprint    = {2007.12223},
  timestamp = {Wed, 29 Jul 2020 15:36:39 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2007-12223.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{butterfly-matrices,
  doi = {10.48550/ARXIV.1903.05895},

  url = {https://arxiv.org/abs/1903.05895},

  author = {Dao, Tri and Gu, Albert and Eichhorn, Matthew and Rudra, Atri and Ré, Christopher},

  keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Learning Fast Algorithms for Linear Transforms Using Butterfly Factorizations},

  publisher = {arXiv},

  year = {2019},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@inproceedings{Dao2020Kaleidoscope,
title={Kaleidoscope: An Efficient, Learnable Representation For All Structured Linear Maps},
author={Tri Dao and Nimit Sohoni and Albert Gu and Matthew Eichhorn and Amit Blonder and Megan Leszczynski and Atri Rudra and Christopher Ré},
booktitle={International Conference on Learning Representations},
year={2020},
url={https://openreview.net/forum?id=BkgrBgSYDS}
}

@inproceedings{column-vector-sparse-encoding,
author = {Chen, Zhaodong and Qu, Zheng and Liu, Liu and Ding, Yufei and Xie, Yuan},
title = {Efficient Tensor Core-Based GPU Kernels for Structured Sparsity under Reduced Precision},
year = {2021},
isbn = {9781450384421},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3458817.3476182},
doi = {10.1145/3458817.3476182},
abstract = {The success of DNN comes at the expense of excessive memory/computation cost, which can be addressed by exploiting reduced precision and sparsity jointly. Existing sparse GPU kernels, however, fail to achieve practical speedup over cuBLASHgemm under half-precision. Those for fine-grained sparsity suffer from low data reuse, and others for coarse-grained sparsity are limited by the wrestling between kernel performance and model quality under different grain sizes. We propose column-vector-sparse-encoding that has a smaller grain size under the same reuse rate compared with block sparsity. Column-vector-sparse-encoding can be applied to both SpMM & SDDMM, two major sparse DNN operations. We also introduce the Tensor-Core-based 1D Octet Tiling that has efficient memory access and computation patterns under small grain size. Based on these, we design SpMM and SDDMM kernels and achieve 1.71-7.19x speedup over cuSPARSE. Practical speedup is achieved over cuBLASHgemm under >70% and >90% sparsity with 4x1 grain size and half-precision.},
booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
articleno = {78},
numpages = {14},
keywords = {tensor core, sparse matrices, GPGPU, neural networks},
location = {St. Louis, Missouri},
series = {SC '21}
}


@misc{open-ai-block-sparse,
  url = {https://cdn.openai.com/blocksparse/blocksparsepaper.pdf},

  author = {Gray, Scott and Radford, Alex and Kingma, Diedrick P.},

  title = {GPU Kernels for Block-Sparse Weights},

  publisher = {OpenAI},

  year = {2017},

}

@misc{state-of-sparsity,
  doi = {10.48550/ARXIV.1902.09574},

  url = {https://arxiv.org/abs/1902.09574},

  author = {Gale, Trevor and Elsen, Erich and Hooker, Sara},

  keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {The State of Sparsity in Deep Neural Networks},

  publisher = {arXiv},

  year = {2019},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{sparse-design-principles,
  doi = {10.48550/ARXIV.1803.08601},

  url = {https://arxiv.org/abs/1803.08601},

  author = {Yang, Carl and Buluc, Aydin and Owens, John D.},

  keywords = {Distributed, Parallel, and Cluster Computing (cs.DC), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Design Principles for Sparse Matrix Multiplication on the GPU},

  publisher = {arXiv},

  year = {2018},

  copyright = {arXiv.org perpetual, non-exclusive license}
}

@inproceedings{adaptive-sparse-tiling,
author = {Hong, Changwan and Sukumaran-Rajam, Aravind and Nisa, Israt and Singh, Kunal and Sadayappan, P.},
title = {Adaptive Sparse Tiling for Sparse Matrix Multiplication},
year = {2019},
isbn = {9781450362252},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3293883.3295712},
doi = {10.1145/3293883.3295712},
abstract = {Tiling is a key technique for data locality optimization and is widely used in high-performance implementations of dense matrix-matrix multiplication for multicore/manycore CPUs and GPUs. However, the irregular and matrix-dependent data access pattern of sparse matrix multiplication makes it challenging to use tiling to enhance data reuse. In this paper, we devise an adaptive tiling strategy and apply it to enhance the performance of two primitives: SpMM (product of sparse matrix and dense matrix) and SDDMM (sampled dense-dense matrix multiplication). In contrast to studies that have resorted to non-standard sparse-matrix representations to enhance performance, we use the standard Compressed Sparse Row (CSR) representation, within which intra-row reordering is performed to enable adaptive tiling. Experimental evaluation using an extensive set of matrices from the Sparse Suite collection demonstrates significant performance improvement over currently available state-of-the-art alternatives.},
booktitle = {Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming},
pages = {300–314},
numpages = {15},
keywords = {tiling, multicore/manycore, GPU, sampled dense-dense matrix multiplication, SDDMM, SpMM, sparse matrix-matrix multiplication},
location = {Washington, District of Columbia},
series = {PPoPP '19}
}


@inproceedings{sputnik,
  author    = {Trevor Gale and Matei Zaharia and Cliff Young and Erich Elsen},
  title     = {Sparse {GPU} Kernels for Deep Learning},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, {SC} 2020},
  year      = {2020},
}

@inproceedings{GPFS,
  title={IBM Spectrum Scale
(formerly GPFS)},
  author={Dino Quintero and JLuis Bolinches and JPuneet Chaudhary and Willard Davis and Steve Duersch and Carlos Henrique Fachim and Andrei Socoliuc and Olaf Weiser},
  year={2020}
}


@article{ior,
title={IOR},
url="https://ior.readthedocs.io/en/latest/index.html"
}

@inproceedings{io500,
  title={Establishing the IO-500 Benchmark},
  author={Julian M. Kunkel and John Bent and Jay Lofstead and George S. Markomanolis},
  year={2017}
}

@INPROCEEDINGS{ior-emulation,
  author={Shan, Hongzhang and Antypas, Katie and Shalf, John},
  booktitle={SC '08: Proceedings of the 2008 ACM/IEEE Conference on Supercomputing},
  title={Characterizing and predicting the I/O performance of HPC applications using a parameterized synthetic benchmark},
  year={2008},
  volume={},
  number={},
  pages={1-12},
  doi={10.1109/SC.2008.5222721}}

@article{penntreebank,
    title = "Building a Large Annotated Corpus of {E}nglish: The {P}enn {T}reebank",
    author = "Marcus, Mitchell P.  and
      Santorini, Beatrice  and
      Marcinkiewicz, Mary Ann",
    journal = "Computational Linguistics",
    volume = "19",
    number = "2",
    year = "1993",
    address = "Cambridge, MA",
    publisher = "MIT Press",
    url = "https://aclanthology.org/J93-2004",
    pages = "313--330",
}

@misc{layer-norm,
  doi = {10.48550/ARXIV.1607.06450},

  url = {https://arxiv.org/abs/1607.06450},

  author = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},

  keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Layer Normalization},

  publisher = {arXiv},

  year = {2016},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@article{ntk,
  doi = {10.48550/ARXIV.1806.07572},

  url = {https://arxiv.org/abs/1806.07572},

  author = {Jacot, Arthur and Gabriel, Franck and Hongler, Clément},

  keywords = {Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), Probability (math.PR), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Mathematics, FOS: Mathematics},

  title = {Neural Tangent Kernel: Convergence and Generalization in Neural Networks},

  publisher = {arXiv},

  year = {2018},

  copyright = {arXiv.org perpetual, non-exclusive license}
}

@misc{xu2015empirical,
      title={Empirical Evaluation of Rectified Activations in Convolutional Network},
      author={Bing Xu and Naiyan Wang and Tianqi Chen and Mu Li},
      year={2015},
      eprint={1505.00853},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{gelu,
      title={Gaussian Error Linear Units (GELUs)},
      author={Dan Hendrycks and Kevin Gimpel},
      year={2020},
      eprint={1606.08415},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@article{rethink,
  author    = {Chiyuan Zhang and
               Samy Bengio and
               Moritz Hardt and
               Benjamin Recht and
               Oriol Vinyals},
  title     = {Understanding deep learning requires rethinking generalization},
  journal   = {CoRR},
  volume    = {abs/1611.03530},
  year      = {2016},
  url       = {http://arxiv.org/abs/1611.03530},
  eprinttype = {arXiv},
  eprint    = {1611.03530},
  timestamp = {Mon, 13 Aug 2018 16:47:02 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/ZhangBHRV16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{implicit-bias,
  doi = {10.48550/ARXIV.1710.10345},

  url = {https://arxiv.org/abs/1710.10345},

  author = {Soudry, Daniel and Hoffer, Elad and Nacson, Mor Shpigel and Gunasekar, Suriya and Srebro, Nathan},

  keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {The Implicit Bias of Gradient Descent on Separable Data},

  publisher = {arXiv},

  year = {2017},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@article{pl-condition,
  author    = {Chaoyue Liu and
               Libin Zhu and
               Mikhail Belkin},
  title     = {Toward a theory of optimization for over-parameterized systems of
               non-linear equations: the lessons of deep learning},
  journal   = {CoRR},
  volume    = {abs/2003.00307},
  year      = {2020},
  url       = {https://arxiv.org/abs/2003.00307},
  eprinttype = {arXiv},
  eprint    = {2003.00307},
  timestamp = {Mon, 07 Sep 2020 12:51:34 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2003-00307.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{provably,
  doi = {10.48550/ARXIV.1810.02054},

  url = {https://arxiv.org/abs/1810.02054},

  author = {Du, Simon S. and Zhai, Xiyu and Poczos, Barnabas and Singh, Aarti},

  keywords = {Machine Learning (cs.LG), Optimization and Control (math.OC), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Mathematics, FOS: Mathematics},

  title = {Gradient Descent Provably Optimizes Over-parameterized Neural Networks},

  publisher = {arXiv},

  year = {2018},

  copyright = {arXiv.org perpetual, non-exclusive license}
}

@misc{openwebtext,
  title={OpenWebText Corpus},
  author={Aaron Gokaslan and Vanya Cohen},
  howpublished={\url{http://Skylion007.github.io/OpenWebTextCorpus}},
  year={2019}
}

@article{cifar-10,
title= {CIFAR-10 (Canadian Institute for Advanced Research)},
journal= {},
author= {Alex Krizhevsky and Vinod Nair and Geoffrey Hinton},
year= {},
url= {http://www.cs.toronto.edu/~kriz/cifar.html},
abstract= {The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images.},
keywords= {Dataset},
terms= {}
}

@inproceedings{LeCun1995LearningAF,
  title={Learning algorithms for classification: A comparison on handwritten digit recognition},
  author={Yann LeCun and Lawrence D. Jackel and L{\'e}on Bottou and Corinna Cortes and John S. Denker and Harris Drucker and Isabelle Guyon and Urs Muller and E. Sackinger and Patrice Y. Simard and Vladimir Naumovich Vapnik},
  year={1995}
}

@article{belkin:double-descent,
author = {Mikhail Belkin  and Daniel Hsu  and Siyuan Ma  and Soumik Mandal },
title = {Reconciling modern machine-learning practice and the classical bias-variance trade-off},
journal = {Proceedings of the National Academy of Sciences},
volume = {116},
number = {32},
pages = {15849-15854},
year = {2019},
doi = {10.1073/pnas.1903070116},

URL = {https://www.pnas.org/doi/abs/10.1073/pnas.1903070116},
eprint = {https://www.pnas.org/doi/pdf/10.1073/pnas.1903070116}

}


@article{sparsity-in-dl,
  author={Torsten Hoefler and Dan Alistarh and Tan Ben-Nun and Nikoli Dryden and Alexandra Peste},
  title={{Sparsity in Deep Learning: Pruning and growth for efficient inference and training in neural networks}},
  journal={Journal of Machine Learning Research},
  year={2021},
  month={Sep.},
  pages={1-124},
  volume={22},
  number={241},
  source={http://www.unixer.de/~htor/publications/},
}

@inproceedings{NEURIPS2020_b6af2c97,
 author = {Chen, Tianlong and Frankle, Jonathan and Chang, Shiyu and Liu, Sijia and Zhang, Yang and Wang, Zhangyang and Carbin, Michael},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
 pages = {15834--15846},
 publisher = {Curran Associates, Inc.},
 title = {The Lottery Ticket Hypothesis for Pre-trained BERT Networks},
 url = {https://proceedings.neurips.cc/paper/2020/file/b6af2c9703f203a2794be03d443af2e3-Paper.pdf},
 volume = {33},
 year = {2020}
}


@inproceedings{prasanna-etal-2020-bert,
    title = "{W}hen {BERT} {P}lays the {L}ottery, {A}ll {T}ickets {A}re {W}inning",
    author = "Prasanna, Sai  and
      Rogers, Anna  and
      Rumshisky, Anna",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.emnlp-main.259",
    doi = "10.18653/v1/2020.emnlp-main.259",
    pages = "3208--3229",
    abstract = "Large Transformer-based models were shown to be reducible to a smaller number of self-attention heads and layers. We consider this phenomenon from the perspective of the lottery ticket hypothesis, using both structured and magnitude pruning. For fine-tuned BERT, we show that (a) it is possible to find subnetworks achieving performance that is comparable with that of the full model, and (b) similarly-sized subnetworks sampled from the rest of the model perform worse. Strikingly, with structured pruning even the worst possible subnetworks remain highly trainable, indicating that most pre-trained BERT weights are potentially useful. We also study the {``}good{''} subnetworks to see if their success can be attributed to superior linguistic knowledge, but find them unstable, and not explained by meaningful self-attention patterns.",
}

@article{gohil2020one,
title={One ticket to win them all: generalizing lottery ticket initializations across datasets and optimizers },
author={Varun Gohil and S. Deepak Narayanan and Atishay Jain},
journal={ReScience C},
year={2020},
volume={6},
number={2},
note={Accepted at NeurIPS 2019 Reproducibility Challenge},
url={https://openreview.net/forum?id=SklFHaqG6S}
}

@inproceedings{brix-etal-2020-successfully,
    title = "Successfully Applying the Stabilized Lottery Ticket Hypothesis to the Transformer Architecture",
    author = "Brix, Christopher  and
      Bahar, Parnia  and
      Ney, Hermann",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.acl-main.360",
    doi = "10.18653/v1/2020.acl-main.360",
    pages = "3909--3915",
    abstract = "Sparse models require less memory for storage and enable a faster inference by reducing the necessary number of FLOPs. This is relevant both for time-critical and on-device computations using neural networks. The stabilized lottery ticket hypothesis states that networks can be pruned after none or few training iterations, using a mask computed based on the unpruned converged model. On the transformer architecture and the WMT 2014 English-to-German and English-to-French tasks, we show that stabilized lottery ticket pruning performs similar to magnitude pruning for sparsity levels of up to 85{\%}, and propose a new combination of pruning techniques that outperforms all other techniques for even higher levels of sparsity. Furthermore, we confirm that the parameter{'}s initial sign and not its specific value is the primary factor for successful training, and show that magnitude pruning cannot be used to find winning lottery tickets.",
}

@inproceedings{NEURIPS2019-1113d7a7,
 author = {Zhou, Hattie and Lan, Janice and Liu, Rosanne and Yosinski, Jason},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d Alch{e}-Buc and E. Fox and R. Garnett},
 pages = {},
 publisher = {Curran Associates, Inc.},
 title = {Deconstructing Lottery Tickets: Zeros, Signs, and the Supermask},
 url = {https://proceedings.neurips.cc/paper/2019/file/1113d7a76ffceca1bb350bfe145467c6-Paper.pdf},
 volume = "32",
 year = "2019"
}

@inproceedings{Frankle2020The,
title={The Early Phase of Neural Network Training},
author={Jonathan Frankle and David J. Schwab and Ari S. Morcos},
booktitle={International Conference on Learning Representations},
year={2020},
url={https://openreview.net/forum?id=Hkl1iRNFwS}
}

@inproceedings{FrankleD0C20,
  title = {Linear Mode Connectivity and the Lottery Ticket Hypothesis},
  author = {Jonathan Frankle and Gintare Karolina Dziugaite and Daniel Roy 0001 and Michael Carbin},
  year = {2020},
  url = {http://proceedings.mlr.press/v119/frankle20a.html},
  researchr = {https://researchr.org/publication/FrankleD0C20},
  cites = {0},
  citedby = {0},
  pages = {3259-3269},
  booktitle = {Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event},
  volume = {119},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
}

@article{Maene2021TowardsUI,
  title={Towards Understanding Iterative Magnitude Pruning: Why Lottery Tickets Win},
  author={Jaron Maene and Mingxiao Li and Marie-Francine Moens},
  journal={ArXiv},
  year={2021},
  volume={abs/2106.06955}
}

@inproceedings{You2020Drawing,
title={Drawing Early-Bird Tickets: Toward More Efficient Training of Deep Networks},
author={Haoran You and Chaojian Li and Pengfei Xu and Yonggan Fu and Yue Wang and Xiaohan Chen and Richard G. Baraniuk and Zhangyang Wang and Yingyan Lin},
booktitle={International Conference on Learning Representations},
year={2020},
url={https://openreview.net/forum?id=BJxsrgStvr}
}

@INPROCEEDINGS{5961514,
  author={Satopaa, Ville and Albrecht, Jeannie and Irwin, David and Raghavan, Barath},
  booktitle={2011 31st International Conference on Distributed Computing Systems Workshops},
  title={Finding a "Kneedle" in a Haystack: Detecting Knee Points in System Behavior},
  year={2011},
  volume={},
  number={},
  pages={166-171},
  doi={10.1109/ICDCSW.2011.20}}

@misc{chen2021earlybert,
title={Early{\{}BERT{\}}: Efficient {\{}BERT{\}} Training via Early-bird Lottery Tickets},
author={Xiaohan Chen and Yu Cheng and Shuohang Wang and Zhe Gan and Zhangyang Wang and Jingjing Liu},
year={2021},
url={https://openreview.net/forum?id=I-VfjSBzi36}
}

@inproceedings{Renda2020Comparing,
title={Comparing Rewinding and Fine-tuning in Neural Network Pruning},
author={Alex Renda and Jonathan Frankle and Michael Carbin},
booktitle={International Conference on Learning Representations},
year={2020},
url={https://openreview.net/forum?id=S1gSj0NKvB}
}

@inproceedings{NEURIPS2020-46a4378f,
 author = {Tanaka, Hidenori and Kunin, Daniel and Yamins, Daniel L and Ganguli, Surya},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
 pages = {6377--6389},
 publisher = {Curran Associates, Inc.},
 title = {Pruning neural networks without any data by iteratively conserving synaptic flow},
 url = {https://proceedings.neurips.cc/paper/2020/file/46a4378f835dc8040c8057beb6a2da52-Paper.pdf},
 volume = {33},
 year = {2020}
}

@inproceedings{liu2018rethinking,
title={Rethinking the Value of Network Pruning},
author={Zhuang Liu and Mingjie Sun and Tinghui Zhou and Gao Huang and Trevor Darrell},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=rJlnB3C5Ym},
}

@INPROCEEDINGS{gainaru2015schdulingIO,
  author={Gainaru, Ana and Aupy, Guillaume and Benoit, Anne and Cappello, Franck and Robert, Yves and Snir, Marc},
  booktitle={2015 IEEE International Parallel and Distributed Processing Symposium},
  title={Scheduling the I/O of HPC Applications Under Congestion},
  year={2015},
  volume={},
  number={},
  pages={1013-1022},
  doi={10.1109/IPDPS.2015.116}}

@INPROCEEDINGS{tokio1,
  author={Lockwood, Glenn K. and Snyder, Shane and Byna, Suren and Carns, Philip and Wright, Nicholas J.},
  booktitle={2019 IEEE/ACM Fourth International Parallel Data Systems Workshop (PDSW)},
  title={Understanding Data Motion in the Modern HPC Data Center},
  year={2019},
  volume={},
  number={},
  pages={74-83},
  doi={10.1109/PDSW49588.2019.00012}}

@INPROCEEDINGS{tokio2,
  author={Wang, Teng and Byna, Suren and Lockwood, Glenn K. and Snyder, Shane and Carns, Philip and Kim, Sunggon and Wright, Nicholas J.},
  booktitle={2019 19th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGRID)},
  title={A Zoom-in Analysis of I/O Logs to Detect Root Causes of I/O Performance Bottlenecks},
  year={2019},
  volume={},
  number={},
  pages={102-111},
  doi={10.1109/CCGRID.2019.00021}}

@online{pydarshan,
title={PyDarshan Documentation — PyDarshan 3.3.1.0 documentation},
url = {https://www.mcs.anl.gov/research/projects/darshan/docs/pydarshan/index.html}
}

@INPROCEEDINGS{9150354,
  author={Wang, Chen and Sun, Jinghan and Snir, Marc and Mohror, Kathryn and Gonsiorowski, Elsa},
  booktitle={2020 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)},
  title={Recorder 2.0: Efficient Parallel I/O Tracing and Analysis},
  year={2020},
  volume={},
  number={},
  pages={1-8},
  doi={10.1109/IPDPSW50202.2020.00176}}

@article{10.1145/2027066.2027068,
author = {Carns, Philip and Harms, Kevin and Allcock, William and Bacon, Charles and Lang, Samuel and Latham, Robert and Ross, Robert},
title = {Understanding and Improving Computational Science Storage Access through Continuous Characterization},
year = {2011},
issue_date = {October 2011},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {7},
number = {3},
issn = {1553-3077},
url = {https://doi.org/10.1145/2027066.2027068},
doi = {10.1145/2027066.2027068},
abstract = {Computational science applications are driving a demand for increasingly powerful storage systems. While many techniques are available for capturing the I/O behavior of individual application trial runs and specific components of the storage system, continuous characterization of a production system remains a daunting challenge for systems with hundreds of thousands of compute cores and multiple petabytes of storage. As a result, these storage systems are often designed without a clear understanding of the diverse computational science workloads they will support.In this study, we outline a methodology for scalable, continuous, systemwide I/O characterization that combines storage device instrumentation, static file system analysis, and a new mechanism for capturing detailed application-level behavior. This methodology allows us to identify both system-wide trends and application-specific I/O strategies. We demonstrate the effectiveness of our methodology by performing a multilevel, two-month study of Intrepid, a 557-teraflop IBM Blue Gene/P system. During that time, we captured application-level I/O characterizations from 6,481 unique jobs spanning 38 science and engineering projects. We used the results of our study to tune example applications, highlight trends that impact the design of future storage systems, and identify opportunities for improvement in I/O characterization methodology.},
journal = {ACM Trans. Storage},
month = {oct},
articleno = {8},
numpages = {26},
keywords = {parallel file systems, I/O characterization}
}


@INPROCEEDINGS{sato2014checkpointing2,
  author={Sato, Kento and Mohror, Kathryn and Moody, Adam and Gamblin, Todd and Supinski, Bronis R. de and Maruyama, Naoya and Matsuoka, Satoshi},
  booktitle={2014 14th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing},
  title={A User-Level InfiniBand-Based File System and Checkpoint Strategy for Burst Buffers},
  year={2014},
  volume={},
  number={},
  pages={21-30},
  doi={10.1109/CCGrid.2014.24}}

@INPROCEEDINGS{nicolae2019veloc,
  author={Nicolae, Bogdan and Moody, Adam and Gonsiorowski, Elsa and Mohror, Kathryn and Cappello, Franck},
  booktitle={2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  title={VeloC: Towards High Performance Adaptive Asynchronous Checkpointing at Large Scale},
  year={2019},
  volume={},
  number={},
  pages={911-920},
  doi={10.1109/IPDPS.2019.00099}}

@INPROCEEDINGS{sato2012checkpointing,
  author={Sato, Kento and Maruyama, Naoya and Mohror, Kathryn and Moody, Adam and Gamblin, Todd and de Supinski, Bronis R. and Matsuoka, Satoshi},
  booktitle={SC '12: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis},
  title={Design and modeling of a non-blocking checkpointing system},
  year={2012},
  volume={},
  number={},
  pages={1-10},
  doi={10.1109/SC.2012.46}}

@INPROCEEDINGS{ovsyannikov2016nerscBB,
  author={Ovsyannikov, Andrey and Romanus, Melissa and Van Straalen, Brian and Weber, Gunther H. and Trebotich, David},
  booktitle={2016 1st Joint International Workshop on Parallel Data Storage and data Intensive Scalable Computing Systems (PDSW-DISCS)},
  title={Scientific Workflows at DataWarp-Speed: Accelerated Data-Intensive Science Using NERSC's Burst Buffer},
  year={2016},
  volume={},
  number={},
  pages={1-6},
  doi={10.1109/PDSW-DISCS.2016.005}}

@INPROCEEDINGS{pottier2020modelBB,
  author={Pottier, Loïc and da Silva, Rafael Ferreira and Casanova, Henri and Deelman, Ewa},
  booktitle={2020 IEEE International Conference on Cluster Computing (CLUSTER)},
  title={Modeling the Performance of Scientific Workflow Executions on HPC Platforms with Burst Buffers},
  year={2020},
  volume={},
  number={},
  pages={92-103},
  doi={10.1109/CLUSTER49012.2020.00019}}

@article{bhimji2016nerscBB,
title = {Accelerating Science with the NERSC Burst Buffer Early User Program},
author = {Bhimji, Wahid and Bard, Debbie and Romanus, Melissa and Paul, David and Ovsyannikov, Andrey and Friesen, Brian and Bryson, Matt and Correa, Joaquin and Lockwood, Glenn K. and Tsulaia, Vakho and Byna, Suren and Farrell, Steve and Gursoy, Doga and Daley, Chris and Beckner, Vince and Van Straalen, Brian and Trebotich, David and Tull, Craig and Weber, Gunther H. and Wright, Nicholas J. and Antypas, Katie and Prabhat, none},
abstractNote = {NVRAM-based Burst Buffers are an important part of the emerging HPC storage landscape. The National Energy Research Scientific Computing Center (NERSC) at Lawrence Berkeley National Laboratory recently installed one of the first Burst Buffer systems as part of its new Cori supercomputer, collaborating with Cray on the development of the DataWarp software. NERSC has a diverse user base comprised of over 6500 users in 700 different projects spanning a wide variety of scientific computing applications. The use-cases of the Burst Buffer at NERSC are therefore also considerable and diverse. We describe here performance measurements and lessons learned from the Burst Buffer Early User Program at NERSC, which selected a number of research projects to gain early access to the Burst Buffer and exercise its capability to enable new scientific advancements. To the best of our knowledge this is the first time a Burst Buffer has been stressed at scale by diverse, real user workloads and therefore these lessons will be of considerable benefit to shaping the developing use of Burst Buffers at HPC centers.},
doi = {},
url = {https://www.osti.gov/biblio/1393591}, journal = {},
place = {United States},
year = {2016},
month = {1}
}


@inproceedings{Hjelm2017libhioOI,
  title={libhio: Optimizing IO on Cray XC Systems With DataWarp},
  author={Nathan T. Hjelm},
  year={2017}
}

@misc{schuhmann2021laion400m,
      title={LAION-400M: Open Dataset of CLIP-Filtered 400 Million Image-Text Pairs},
      author={Christoph Schuhmann and Richard Vencu and Romain Beaumont and Robert Kaczmarczyk and Clayton Mullis and Aarush Katta and Theo Coombes and Jenia Jitsev and Aran Komatsuzaki},
      year={2021},
      eprint={2111.02114},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{shafahi2019adversarial,
      title={Adversarial Training for Free!},
      author={Ali Shafahi and Mahyar Najibi and Amin Ghiasi and Zheng Xu and John Dickerson and Christoph Studer and Larry S. Davis and Gavin Taylor and Tom Goldstein},
      year={2019},
      eprint={1904.12843},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{rice2020overfitting,
      title={Overfitting in adversarially robust deep learning},
      author={Leslie Rice and Eric Wong and J. Zico Kolter},
      year={2020},
      eprint={2002.11569},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{wong2020fast,
      title={Fast is better than free: Revisiting adversarial training},
      author={Eric Wong and Leslie Rice and J. Zico Kolter},
      year={2020},
      eprint={2001.03994},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@article{gradcam2019,
   title={Grad-CAM: Visual Explanations from Deep Networks via Gradient-Based Localization},
   volume={128},
   ISSN={1573-1405},
   url={http://dx.doi.org/10.1007/s11263-019-01228-7},
   DOI={10.1007/s11263-019-01228-7},
   number={2},
   journal={International Journal of Computer Vision},
   publisher={Springer Science and Business Media LLC},
   author={Selvaraju, Ramprasaath R. and Cogswell, Michael and Das, Abhishek and Vedantam, Ramakrishna and Parikh, Devi and Batra, Dhruv},
   year={2019},
   month={Oct},
   pages={336–359}
}

@misc{singla2021salient,
      title={Salient ImageNet: How to discover spurious features in Deep Learning?},
      author={Sahil Singla and Soheil Feizi},
      year={2021},
      eprint={2110.04301},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@article{singla2020understanding,
  author    = {Sahil Singla and
               Besmira Nushi and
               Shital Shah and
               Ece Kamar and
               Eric Horvitz},
  title     = {Understanding Failures of Deep Networks via Robust Feature Extraction},
  journal   = {CoRR},
  volume    = {abs/2012.01750},
  year      = {2020},
  url       = {https://arxiv.org/abs/2012.01750},
  eprinttype = {arXiv},
  eprint    = {2012.01750},
  timestamp = {Fri, 04 Dec 2020 12:07:23 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2012-01750.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{structured-pruning-bad,
  author    = {Zhuang Liu and
               Mingjie Sun and
               Tinghui Zhou and
               Gao Huang and
               Trevor Darrell},
  title     = {Rethinking the Value of Network Pruning},
  journal   = {CoRR},
  volume    = {abs/1810.05270},
  year      = {2018},
  url       = {http://arxiv.org/abs/1810.05270},
  eprinttype = {arXiv},
  eprint    = {1810.05270},
  timestamp = {Tue, 30 Oct 2018 20:39:56 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1810-05270.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{frankle2018the,
title={The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks},
author={Jonathan Frankle and Michael Carbin},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=rJl-b3RcF7},
}


@article{Martin:JCP2006,
title = {A bandwidth-optimized WENO scheme for the effective direct numerical simulation of compressible turbulence},
journal = {Journal of Computational Physics},
volume = {220},
number = {1},
pages = {270-289},
year = {2006},
issn = {0021-9991},
doi = {https://doi.org/10.1016/j.jcp.2006.05.009},
url = {https://www.sciencedirect.com/science/article/pii/S0021999106002312},
author = {M.P. Martín and E.M. Taylor and M. Wu and V.G. Weirs},
keywords = {Direct numerical simulation, Shock capturing, Bandwidth optimization, Turbulence, Compressible flow},
}

@inproceedings{elmo,
    title = "Deep Contextualized Word Representations",
    author = "Peters, Matthew E.  and
      Neumann, Mark  and
      Iyyer, Mohit  and
      Gardner, Matt  and
      Clark, Christopher  and
      Lee, Kenton  and
      Zettlemoyer, Luke",
    booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
    month = jun,
    year = "2018",
    address = "New Orleans, Louisiana",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/N18-1202",
    doi = "10.18653/v1/N18-1202",
    pages = "2227--2237",
}

@article{vit,
  title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
  author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and  Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil},
  journal={ICLR},
  year={2021}
}

@article{coat-net,
  author    = {Zihang Dai and
               Hanxiao Liu and
               Quoc V. Le and
               Mingxing Tan},
  title     = {CoAtNet: Marrying Convolution and Attention for All Data Sizes},
  journal   = {CoRR},
  volume    = {abs/2106.04803},
  year      = {2021},
  url       = {https://arxiv.org/abs/2106.04803},
  eprinttype = {arXiv},
  eprint    = {2106.04803},
  timestamp = {Tue, 15 Jun 2021 16:35:15 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2106-04803.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@ARTICLE{mpi4py,
  author={Dalcin, Lisandro and Fang, Yao-Lung L.},
  journal={Computing in Science   Engineering},
  title={mpi4py: Status Update After 12 Years of Development},
  year={2021},
  volume={23},
  number={4},
  pages={47-54},
  doi={10.1109/MCSE.2021.3083216}}

@article{Rumelhart:1986backprop,
  added-at = {2019-05-21T10:10:49.000+0200},
  author = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.},
  biburl = {https://www.bibsonomy.org/bibtex/2a392597c4f9cff2cd3c96c2191fa1eb6/sxkdz},
  doi = {10.1038/323533a0},
  interhash = {c354bc293fa9aa7caffc66d40a014903},
  intrahash = {a392597c4f9cff2cd3c96c2191fa1eb6},
  journal = {Nature},
  keywords = {imported},
  number = 6088,
  pages = {533--536},
  timestamp = {2019-05-21T10:10:49.000+0200},
  title = {{Learning Representations by Back-propagating Errors}},
  url = {http://www.nature.com/articles/323533a0},
  volume = 323,
  year = 1986
}

@inproceedings{rannc,
  author    = {Masahiro Tanaka and
               Kenjiro Taura and
               Toshihiro Hanawa and
               Kentaro Torisawa},
  title     = {Automatic Graph Partitioning for Very Large-scale Deep Learning},
  booktitle = {35th {IEEE} International Parallel and Distributed Processing Symposium,
               {IPDPS} 2021, Portland, OR, USA, May 17-21, 2021},
  pages     = {1004--1013},
  publisher = {{IEEE}},
  year      = {2021},
  url       = {https://doi.org/10.1109/IPDPS49936.2021.00109},
  doi       = {10.1109/IPDPS49936.2021.00109},
  timestamp = {Fri, 02 Jul 2021 14:10:40 +0200},
  biburl    = {https://dblp.org/rec/conf/ipps/TanakaTHT21.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@software{gpt-neo,
  author       = {Black, Sid and
                  Leo, Gao and
                  Wang, Phil and
                  Leahy, Connor and
                  Biderman, Stella},
  title        = {{GPT-Neo: Large Scale Autoregressive Language
                   Modeling with Mesh-Tensorflow}},
  month        = mar,
  year         = 2021,
  note         = {{If you use this software, please cite it using
                   these metadata.}},
  publisher    = {Zenodo},
  version      = {1.0},
  doi          = {10.5281/zenodo.5297715},
  url          = {https://doi.org/10.5281/zenodo.5297715}
}

@article{the_pile,
  author    = {Leo Gao and
               Stella Biderman and
               Sid Black and
               Laurence Golding and
               Travis Hoppe and
               Charles Foster and
               Jason Phang and
               Horace He and
               Anish Thite and
               Noa Nabeshima and
               Shawn Presser and
               Connor Leahy},
  title     = {The Pile: An 800GB Dataset of Diverse Text for Language Modeling},
  journal   = {CoRR},
  volume    = {abs/2101.00027},
  year      = {2021},
  url       = {https://arxiv.org/abs/2101.00027},
  eprinttype = {arXiv},
  eprint    = {2101.00027},
  timestamp = {Thu, 21 Jan 2021 14:42:30 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2101-00027.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{zero_infinity,
author = {Rajbhandari, Samyam and Ruwase, Olatunji and Rasley, Jeff and Smith, Shaden and He, Yuxiong},
title = {ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning},
year = {2021},
isbn = {9781450384421},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3458817.3476205},
doi = {10.1145/3458817.3476205},
articleno = {59},
numpages = {14},
location = {St. Louis, Missouri},
series = {SC '21}
}

@article{zero_offload,
  author    = {Jie Ren and
               Samyam Rajbhandari and
               Reza Yazdani Aminabadi and
               Olatunji Ruwase and
               Shuangyan Yang and
               Minjia Zhang and
               Dong Li and
               Yuxiong He},
  title     = {ZeRO-Offload: Democratizing Billion-Scale Model Training},
  journal   = {CoRR},
  volume    = {abs/2101.06840},
  year      = {2021},
  url       = {https://arxiv.org/abs/2101.06840},
  archivePrefix = {arXiv},
  eprint    = {2101.06840},
  timestamp = {Mon, 03 May 2021 16:42:27 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2101-06840.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{zero_3D,
  author = {Microsoft},
  title = {3D parallelism with MegatronLM and ZeRO Redundancy Optimizer},
  year = {2021},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM-v1.1.5-3D_parallelism}},
  commit = {1bee84f6eb75ed7e39e34601bfdd66d79cafe99a}
}

@misc{bloom176b,
  author = {BigScience},
  title = {BigScience Large Open-science Open-access Multilingual Language Model},
  year = {2022},
  publisher = {huggingface},
  howpublished = {\url{https://huggingface.co/bigscience/bloom}},
  commit = {1bee84f6eb75ed7e39e34601bfdd66d79cafe99a}
}


@article{megatronlm-2,
  author    = {Deepak Narayanan and
               Mohammad Shoeybi and
               Jared Casper and
               Patrick LeGresley and
               Mostofa Patwary and
               Vijay Korthikanti and
               Dmitri Vainbrand and
               Prethvi Kashinkunti and
               Julie Bernauer and
               Bryan Catanzaro and
               Amar Phanishayee and
               Matei Zaharia},
  title     = {Efficient Large-Scale Language Model Training on {GPU} Clusters},
  journal   = {CoRR},
  volume    = {abs/2104.04473},
  year      = {2021},
  url       = {https://arxiv.org/abs/2104.04473},
  archivePrefix = {arXiv},
  eprint    = {2104.04473},
  timestamp = {Tue, 13 Apr 2021 16:46:17 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2104-04473.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{snli-te-dataset,
    title = "A large annotated corpus for learning natural language inference",
    author = "Bowman, Samuel R.  and
      Angeli, Gabor  and
      Potts, Christopher  and
      Manning, Christopher D.",
    booktitle = "Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing",
    month = sep,
    year = "2015",
    address = "Lisbon, Portugal",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D15-1075",
    doi = "10.18653/v1/D15-1075",
    pages = "632--642",
}


@article{squad-qa-dataset,
  author    = {Pranav Rajpurkar and
               Robin Jia and
               Percy Liang},
  title     = {Know What You Don't Know: Unanswerable Questions for SQuAD},
  journal   = {CoRR},
  volume    = {abs/1806.03822},
  year      = {2018},
  url       = {http://arxiv.org/abs/1806.03822},
  archivePrefix = {arXiv},
  eprint    = {1806.03822},
  timestamp = {Mon, 13 Aug 2018 16:48:21 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1806-03822.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{text-clf-transformer,
  author    = {Shervin Minaee and
               Nal Kalchbrenner and
               Erik Cambria and
               Narjes Nikzad and
               Meysam Chenaghlu and
               Jianfeng Gao},
  title     = {Deep Learning Based Text Classification: {A} Comprehensive Review},
  journal   = {CoRR},
  volume    = {abs/2004.03705},
  year      = {2020},
  url       = {https://arxiv.org/abs/2004.03705},
  archivePrefix = {arXiv},
  eprint    = {2004.03705},
  timestamp = {Tue, 14 Apr 2020 16:40:34 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2004-03705.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}


@article{roberta,
  author    = {Yinhan Liu and
               Myle Ott and
               Naman Goyal and
               Jingfei Du and
               Mandar Joshi and
               Danqi Chen and
               Omer Levy and
               Mike Lewis and
               Luke Zettlemoyer and
               Veselin Stoyanov},
  title     = {RoBERTa: {A} Robustly Optimized {BERT} Pretraining Approach},
  journal   = {CoRR},
  volume    = {abs/1907.11692},
  year      = {2019},
  url       = {http://arxiv.org/abs/1907.11692},
  archivePrefix = {arXiv},
  eprint    = {1907.11692},
  timestamp = {Thu, 01 Aug 2019 08:59:33 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1907-11692.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{gpt-3,
  author    = {Tom B. Brown and
               Benjamin Mann and
               Nick Ryder and
               Melanie Subbiah and
               Jared Kaplan and
               Prafulla Dhariwal and
               Arvind Neelakantan and
               Pranav Shyam and
               Girish Sastry and
               Amanda Askell and
               Sandhini Agarwal and
               Ariel Herbert{-}Voss and
               Gretchen Krueger and
               Tom Henighan and
               Rewon Child and
               Aditya Ramesh and
               Daniel M. Ziegler and
               Jeffrey Wu and
               Clemens Winter and
               Christopher Hesse and
               Mark Chen and
               Eric Sigler and
               Mateusz Litwin and
               Scott Gray and
               Benjamin Chess and
               Jack Clark and
               Christopher Berner and
               Sam McCandlish and
               Alec Radford and
               Ilya Sutskever and
               Dario Amodei},
  title     = {Language Models are Few-Shot Learners},
  journal   = {CoRR},
  volume    = {abs/2005.14165},
  year      = {2020},
  url       = {https://arxiv.org/abs/2005.14165},
  archivePrefix = {arXiv},
  eprint    = {2005.14165},
  timestamp = {Wed, 03 Jun 2020 11:36:54 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2005-14165.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@InProceedings{pmlr-v37-ioffe15,
  title =    {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift},
  author =   {Ioffe, Sergey and Szegedy, Christian},
  booktitle =    {Proceedings of the 32nd International Conference on Machine Learning},
  pages =    {448--456},
  year =   {2015},
  editor =   {Bach, Francis and Blei, David},
  volume =   {37},
  series =   {Proceedings of Machine Learning Research},
  address =    {Lille, France},
  month =    {07--09 Jul},
  publisher =    {PMLR},
  pdf =    {http://proceedings.mlr.press/v37/ioffe15.pdf},
  url =    {
http://proceedings.mlr.press/v37/ioffe15.html
}
}

@inproceedings{micikevicius2018mixed,
title={Mixed Precision Training},
author={Paulius Micikevicius and Sharan Narang and Jonah Alben and Gregory Diamos and Erich Elsen and David Garcia and Boris Ginsburg and Michael Houston and Oleksii Kuchaiev and Ganesh Venkatesh and Hao Wu},
booktitle={International Conference on Learning Representations},
year={2018},
url={https://openreview.net/forum?id=r1gs9JgRZ},
}

@inproceedings{distbelief,
title = {Large Scale Distributed Deep Networks},
author  = {Jeffrey Dean and Greg S. Corrado and Rajat Monga and Kai Chen and Matthieu Devin and Quoc V. Le and Mark Z. Mao and Marc’Aurelio Ranzato and Andrew Senior and Paul Tucker and Ke Yang and Andrew Y. Ng},
year  = {2012},
booktitle = {NIPS}
}

@article{MLPerf2020,
  author    = {Peter Mattson and
               Christine Cheng and
               Cody Coleman and
               Greg Diamos and
               Paulius Micikevicius and
               David A. Patterson and
               Hanlin Tang and
               Gu{-}Yeon Wei and
               Peter Bailis and
               Victor Bittorf and
               David Brooks and
               Dehao Chen and
               Debojyoti Dutta and
               Udit Gupta and
               Kim M. Hazelwood and
               Andrew Hock and
               Xinyuan Huang and
               Bill Jia and
               Daniel Kang and
               David Kanter and
               Naveen Kumar and
               Jeffery Liao and
               Guokai Ma and
               Deepak Narayanan and
               Tayo Oguntebi and
               Gennady Pekhimenko and
               Lillian Pentecost and
               Vijay Janapa Reddi and
               Taylor Robie and
               Tom St. John and
               Carole{-}Jean Wu and
               Lingjie Xu and
               Cliff Young and
               Matei Zaharia},
  title     = {MLPerf Training Benchmark},
  journal   = {CoRR},
  volume    = {abs/1910.01500},
  year      = {2019},
  url       = {http://arxiv.org/abs/1910.01500},
  archivePrefix = {arXiv},
  eprint    = {1910.01500},
  timestamp = {Mon, 04 Nov 2019 08:16:51 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1910-01500.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{wikitext-103,
  author    = {Stephen Merity and
               Caiming Xiong and
               James Bradbury and
               Richard Socher},
  title     = {Pointer Sentinel Mixture Models},
  journal   = {CoRR},
  volume    = {abs/1609.07843},
  year      = {2016},
  url       = {http://arxiv.org/abs/1609.07843},
  archivePrefix = {arXiv},
  eprint    = {1609.07843},
  timestamp = {Thu, 21 Mar 2019 11:19:44 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/MerityXBS16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{SGDMomentum,
title = {On the momentum term in gradient descent learning algorithms},
journal = {Neural Networks},
volume = {12},
number = {1},
pages = {145-151},
year = {1999},
issn = {0893-6080},
doi = {https://doi.org/10.1016/S0893-6080(98)00116-6},
url = {https://www.sciencedirect.com/science/article/pii/S0893608098001166},
author = {Ning Qian},
keywords = {Momentum, Gradient descent learning algorithm, Damped harmonic oscillator, Critical damping, Learning rate, Speed of convergence},
abstract = {A momentum term is usually included in the simulations of connectionist learning algorithms. Although it is well known that such a term greatly improves the speed of learning, there have been few rigorous studies of its mechanisms. In this paper, I show that in the limit of continuous time, the momentum parameter is analogous to the mass of Newtonian particles that move through a viscous medium in a conservative force field. The behavior of the system near a local minimum is equivalent to a set of coupled and damped harmonic oscillators. The momentum term improves the speed of convergence by bringing some eigen components of the system closer to critical damping. Similar results can be obtained for the discrete time case used in computer simulations. In particular, I derive the bounds for convergence on learning-rate and momentum parameters, and demonstrate that the momentum term can increase the range of learning rate over which the system converges. The optimal condition for convergence is also analyzed.}
}

@article{weight_pred,
  title     = {Efficient and Robust Parallel {DNN} Training through Model Parallelism
               on Multi-GPU Platform},
  journal   = {CoRR},
  volume    = {abs/1809.02839},
  year      = {2018},
  note      = {Withdrawn.},
  url       = {http://arxiv.org/abs/1809.02839},
  archivePrefix = {arXiv},
  eprint    = {1809.02839},
  timestamp = {Fri, 04 Jan 2019 10:57:56 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1809-02839.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{amoebanet,
  author    = {Esteban Real and
               Alok Aggarwal and
               Yanping Huang and
               Quoc V. Le},
  title     = {Regularized Evolution for Image Classifier Architecture Search},
  journal   = {CoRR},
  volume    = {abs/1802.01548},
  year      = {2018},
  url       = {http://arxiv.org/abs/1802.01548},
  archivePrefix = {arXiv},
  eprint    = {1802.01548},
  timestamp = {Mon, 13 Aug 2018 16:48:15 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1802-01548.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{amoebanet-aaai,
  title={Regularized Evolution for Image Classifier Architecture Search},
  volume={33},
  url={https://ojs.aaai.org/index.php/AAAI/article/view/4405},
  DOI={10.1609/aaai.v33i01.33014780},
  abstractNote={&lt;p&gt;The effort devoted to hand-crafting neural network image classifiers has motivated the use of architecture search to discover them automatically. Although evolutionary algorithms have been repeatedly applied to neural network topologies, the image classifiers thus discovered have remained inferior to human-crafted ones. Here, we evolve an image classifier— &lt;em&gt;AmoebaNet-A&lt;/em&gt;—that surpasses hand-designs for the first time. To do this, we modify the tournament selection evolutionary algorithm by introducing an age property to favor the younger genotypes. Matching size, AmoebaNet-A has comparable accuracy to current state-of-the-art ImageNet models discovered with more complex architecture-search methods. Scaled to larger size, AmoebaNet-A sets a new state-of-theart 83.9% top-1 / 96.6% top-5 ImageNet accuracy. In a controlled comparison against a well known reinforcement learning algorithm, we give evidence that evolution can obtain results faster with the same hardware, especially at the earlier stages of the search. This is relevant when fewer compute resources are available. Evolution is, thus, a simple method to effectively discover high-quality architectures.&lt;/p&gt;},
  number={01},
  journal={Proceedings of the AAAI Conference on Artificial Intelligence},
  author={Real, Esteban and Aggarwal, Alok and Huang, Yanping and Le, Quoc V.},
  year={2019},
  month={Jul.},
  pages={4780-4789}
}

@inproceedings{GEMS,
  author = {Jain, Arpan and Awan, Ammar Ahmad and Aljuhani, Asmaa M. and Hashmi, Jahanzeb Maqbool and Anthony, Quentin G. and Subramoni, Hari and Panda, Dhableswar K. and Machiraju, Raghu and Parwani, Anil},
  title = {GEMS: GPU-ENabled MEmory-Aware Model-Parallelism SYstem for Distributed DNN Training},
  year = {2020},
  isbn = {9781728199986},
  publisher = {IEEE Press},
  abstract = {Data-parallelism has become an established paradigm to train DNNs that fit inside GPU memory on large-scale HPC systems. However, model-parallelism is required to train out-of-core DNNs. In this paper, we deal with emerging requirements brought forward by very large DNNs being trained using high-resolution images common in digital pathology. To address these, we propose, design, and implement GEMS; a GPU-Enabled Memory-Aware Model-Parallelism System. We present several design schemes like GEMS-MAST, GEMS-MASTER, and GEMS-Hybrid that offer excellent speedups over state-of-the-art systems like Mesh-TensorFlow and FlexFlow. Furthermore, we combine model-parallelism and data-parallelism to train a 1000-layer ResNet-1k model using 1,024 Volta V100 GPUs with 97.32% scaling-efficiency. For the real-world histopathology whole-slide-image (WSI) of 100,000 x 100,000 pixels, we train custom ResNet-110-v2 on image tiles of size 1024 x 1024 and reduce the training time from seven hours to 28 minutes.},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  articleno = {45},
  numpages = {15},
  keywords = {MPI, TensorFlow, model parallelism, DNN, eager execution, keras},
  location = {Atlanta, Georgia},
  series = {SC '20}
}


@inproceedings {projectadam,
author = {Trishul Chilimbi and Yutaka Suzue and Johnson Apacible and Karthik Kalyanaraman},
title = {Project Adam: Building an Efficient and Scalable Deep Learning Training System},
booktitle = {11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14)},
year = {2014},
isbn = { 978-1-931971-16-4},
address = {Broomfield, CO},
pages = {571--582},
url = {https://www.usenix.org/conference/osdi14/technical-sessions/presentation/chilimbi},
publisher = {{USENIX} Association},
month = oct,
}

@misc{niu2011hogwild,
      title={HOGWILD!: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent},
      author={Feng Niu and Benjamin Recht and Christopher Re and Stephen J. Wright},
      year={2011},
      eprint={1106.5730},
      archivePrefix={arXiv},
      primaryClass={math.OC}
}

@inproceedings{niu2011hogwild-nips,
  author = {Recht, Benjamin and Re, Christopher and Wright, Stephen and Niu, Feng},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {J. Shawe-Taylor and R. Zemel and P. Bartlett and F. Pereira and K. Q. Weinberger},
  pages = {},
  publisher = {Curran Associates, Inc.},
  title = {Hogwild!: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent},
  url = {https://proceedings.neurips.cc/paper/2011/file/218a0aefd1d1a4be65601cc6ddc1520e-Paper.pdf},
  volume = {24},
  year = {2011}
}


@ARTICLE{MoDNN,
  author={X. {Chen} and D. Z. {Chen} and Y. {Han} and X. S. {Hu}},
  journal={IEEE Transactions on Parallel and Distributed Systems},
  title={moDNN: Memory Optimal Deep Neural Network Training on Graphics Processing Units},
  year={2019},
  volume={30},
  number={3},
  pages={646-661},
  doi={10.1109/TPDS.2018.2866582}}

@inproceedings{vDNN,
author = {Rhu, Minsoo and Gimelshein, Natalia and Clemons, Jason and Zulfiqar, Arslan and Keckler, Stephen W.},
title = {VDNN: Virtualized Deep Neural Networks for Scalable, Memory-Efficient Neural Network Design},
year = {2016},
publisher = {IEEE Press},
abstract = {The most widely used machine learning frameworks require users to carefully tune their memory usage so that the deep neural network (DNN) fits into the DRAM capacity of a GPU. This restriction hampers a researcher's flexibility to study different machine learning algorithms, forcing them to either use a less desirable network architecture or parallelize the processing across multiple GPUs. We propose a runtime memory manager that virtualizes the memory usage of DNNs such that both GPU and CPU memory can simultaneously be utilized for training larger DNNs. Our virtualized DNN (vDNN) reduces the average GPU memory usage of AlexNet by up to 89%, OverFeat by 91%, and GoogLeNet by 95%, a significant reduction in memory requirements of DNNs. Similar experiments on VGG-16, one of the deepest and memory hungry DNNs to date, demonstrate the memory-efficiency of our proposal. vDNN enables VGG-16 with batch size 256 (requiring 28 GB of memory) to be trained on a single NVIDIA Titan X GPU card containing 12 GB of memory, with 18% performance loss compared to a hypothetical, oracular GPU with enough memory to hold the entire DNN.},
booktitle = {The 49th Annual IEEE/ACM International Symposium on Microarchitecture},
articleno = {18},
numpages = {13},
location = {Taipei, Taiwan},
series = {MICRO-49}
}

@inproceedings{KingmaAdam2014,
  author    = {Diederik P. Kingma and
               Jimmy Ba},
  editor    = {Yoshua Bengio and
               Yann LeCun},
  title     = {Adam: {A} Method for Stochastic Optimization},
  booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
               San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
  year      = {2015},
  url       = {http://arxiv.org/abs/1412.6980},
  timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{adamw,
  author    = {Ilya Loshchilov and
               Frank Hutter},
  title     = {Fixing Weight Decay Regularization in Adam},
  journal   = {CoRR},
  volume    = {abs/1711.05101},
  year      = {2017},
  url       = {http://arxiv.org/abs/1711.05101},
  eprinttype = {arXiv},
  eprint    = {1711.05101},
  timestamp = {Mon, 13 Aug 2018 16:48:18 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1711-05101.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{mesh_tf,
 author = {Shazeer, Noam and Cheng, Youlong and Parmar, Niki and Tran, Dustin and Vaswani, Ashish and Koanantakool, Penporn and Hawkins, Peter and Lee, HyoukJoong and Hong, Mingsheng and Young, Cliff and Sepassi, Ryan and Hechtman, Blake},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett},
 pages = {},
 publisher = {Curran Associates, Inc.},
 title = {Mesh-TensorFlow: Deep Learning for Supercomputers},
 url = {https://proceedings.neurips.cc/paper/2018/file/3a37abdeefe1dab1b30f7c5c7e581b93-Paper.pdf},
 volume = {31},
 year = {2018}
}

@misc{abadi2016tensorflow,
title={TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems},
author={Martín Abadi and Ashish Agarwal and Paul Barham and Eugene Brevdo and Zhifeng Chen and Craig Citro and Greg S. Corrado and Andy Davis and Jeffrey Dean and Matthieu Devin and Sanjay Ghemawat and Ian Goodfellow and Andrew Harp and Geoffrey Irving and Michael Isard and Yangqing Jia and Rafal Jozefowicz and Lukasz Kaiser and Manjunath Kudlur and Josh Levenberg and Dan Mane and Rajat Monga and Sherry Moore and Derek Murray and Chris Olah and Mike Schuster and Jonathon Shlens and Benoit Steiner and Ilya Sutskever and Kunal Talwar and Paul Tucker and Vincent Vanhoucke and Vijay Vasudevan and Fernanda Viegas and Oriol Vinyals and Pete Warden and Martin Wattenberg and Martin Wicke and Yuan Yu and Xiaoqiang Zheng},
year={2016},
eprint={1603.04467},
archivePrefix={arXiv},
primaryClass={cs.DC}
}

@inproceedings{book-corpus,
title = {Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books},
author = {Yukun Zhu and Ryan Kiros and Richard Zemel and Ruslan Salakhutdinov and Raquel Urtasun and Antonio Torralba and Sanja Fidler},
booktitle = {arXiv preprint arXiv:1506.06724},
year = {2015}
}

@inproceedings{tensorflowosdi2016,
  author = {Abadi, Mart\'{\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and Kudlur, Manjunath and Levenberg, Josh and Monga, Rajat and Moore, Sherry and Murray, Derek G. and Steiner, Benoit and Tucker, Paul and Vasudevan, Vijay and Warden, Pete and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang},
  title = {TensorFlow: A System for Large-Scale Machine Learning},
  year = {2016},
  isbn = {9781931971331},
  publisher = {USENIX Association},
  address = {USA},
  abstract = {TensorFlow is a machine learning system that operates at large scale and in heterogeneous environments. Tensor-Flow uses dataflow graphs to represent computation, shared state, and the operations that mutate that state. It maps the nodes of a dataflow graph across many machines in a cluster, and within a machine across multiple computational devices, including multicore CPUs, general-purpose GPUs, and custom-designed ASICs known as Tensor Processing Units (TPUs). This architecture gives flexibility to the application developer: whereas in previous "parameter server" designs the management of shared state is built into the system, TensorFlow enables developers to experiment with novel optimizations and training algorithms. TensorFlow supports a variety of applications, with a focus on training and inference on deep neural networks. Several Google services use TensorFlow in production, we have released it as an open-source project, and it has become widely used for machine learning research. In this paper, we describe the TensorFlow dataflow model and demonstrate the compelling performance that TensorFlow achieves for several real-world applications.},
  booktitle = {Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation},
  pages = {265–283},
  numpages = {19},
  location = {Savannah, GA, USA},
  series = {OSDI'16}
}


@misc{sergeev2018horovod,
      title={Horovod: fast and easy distributed deep learning in TensorFlow},
      author={Alexander Sergeev and Mike Del Balso},
      year={2018},
      eprint={1802.05799},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@article{kim2020torchgpipe,
    title={torchgpipe: On-the-fly Pipeline Parallelism for Training Giant Models},
    author={Chiheon Kim and Heungsub Lee and Myungryong Jeong and Woonhyuk Baek and Boogeon Yoon and Ildoo Kim and Sungbin Lim and Sungwoong Kim},
    year={2020},
    eprint={2004.09910},
    archivePrefix={arXiv}
}

@misc{you2018imagenet,
      title={ImageNet Training in Minutes},
      author={Yang You and Zhao Zhang and Cho-Jui Hsieh and James Demmel and Kurt Keutzer},
      year={2018},
      eprint={1709.05011},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{you2018imagenet-icpp,
  author = {You, Yang and Zhang, Zhao and Hsieh, Cho-Jui and Demmel, James and Keutzer, Kurt},
  title = {ImageNet Training in Minutes},
  year = {2018},
  isbn = {9781450365109},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3225058.3225069},
  doi = {10.1145/3225058.3225069},
  abstract = {In this paper, we investigate large scale computers' capability of speeding up deep neural networks (DNN) training. Our approach is to use large batch size, powered by the Layer-wise Adaptive Rate Scaling (LARS) algorithm, for efficient usage of massive computing resources. Our approach is generic, as we empirically evaluate the effectiveness on two neural networks: AlexNet and ResNet-50 trained with the ImageNet-1k dataset while preserving the state-of-the-art test accuracy. Compared to the baseline of a previous study from a group of researchers at Facebook, our approach shows higher test accuracy on batch sizes that are larger than 16K. Using 2,048 Intel Xeon Platinum 8160 processors, we reduce the 100-epoch AlexNet training time from hours to 11 minutes. With 2,048 Intel Xeon Phi 7250 Processors, we reduce the 90-epoch ResNet-50 training time from hours to 20 minutes. Our implementation is open source and has been released in the Intel distribution of Caffe v1.0.7.},
  booktitle = {Proceedings of the 47th International Conference on Parallel Processing},
  articleno = {1},
  numpages = {10},
  keywords = {Fast Deep Neural Networks Training, Distributed Machine Learning},
  location = {Eugene, OR, USA},
  series = {ICPP 2018}
}

@misc{you2017large,
      title={Large Batch Training of Convolutional Networks},
      author={Yang You and Igor Gitman and Boris Ginsburg},
      year={2017},
      eprint={1708.03888},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{jia2018data-arxiv,
      title={Beyond Data and Model Parallelism for Deep Neural Networks},
      author={Zhihao Jia and Matei Zaharia and Alex Aiken},
      year={2018},
      eprint={1807.05358},
      archivePrefix={arXiv},
      primaryClass={cs.DC}
}

@inproceedings{chen2016revisiting,
title = {Revisiting Distributed Synchronous SGD},
author  = {Jianmin Chen and Rajat Monga and Samy Bengio and Rafal Jozefowicz},
year  = {2016},
URL = {https://arxiv.org/abs/1604.00981},
booktitle = {International Conference on Learning Representations Workshop Track}
}

@misc{chen2016training,
      title={Training Deep Nets with Sublinear Memory Cost},
      author={Tianqi Chen and Bing Xu and Chiyuan Zhang and Carlos Guestrin},
      year={2016},
      eprint={1604.06174},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}


@misc{wahib2020scaling,
    title={Scaling Distributed Deep Learning Workloads beyond the Memory Capacity with KARMA},
    author={Mohamed Wahib and Haoyu Zhang and Truong Thao Nguyen and Aleksandr Drozd and Jens Domke and Lingqi Zhang and Ryousei Takano and Satoshi Matsuoka},
    year={2020},
    eprint={2008.11421},
    archivePrefix={arXiv},
    primaryClass={cs.DC}
}

@inproceedings{wahib2020scaling-sc,
  author = {Wahib, Mohamed and Zhang, Haoyu and Nguyen, Truong Thao and Drozd, Aleksandr and Domke, Jens and Zhang, Lingqi and Takano, Ryousei and Matsuoka, Satoshi},
  title = {Scaling Distributed Deep Learning Workloads beyond the Memory Capacity with KARMA},
  year = {2020},
  isbn = {9781728199986},
  publisher = {IEEE Press},
  abstract = {The dedicated memory of hardware accelerators can be insufficient to store all weights and/or intermediate states of large deep learning models. Although model parallelism is a viable approach to reduce the memory pressure issue, significant modification of the source code and considerations for algorithms are required. An alternative solution is to use out-of-core methods instead of, or in addition to, data parallelism.We propose a performance model based on the concurrency analysis of out-of-core training behavior, and derive a strategy that combines layer swapping and redundant recomputing. We achieve an average of 1.52x speedup in six different models over the state-of-the-art out-of-core methods. We also introduce the first method to solve the challenging problem of out-of-core multi-node training by carefully pipelining gradient exchanges and performing the parameter updates on the host. Our data parallel out-of-core solution can outperform complex hybrid model parallelism in training large models, e.g. Megatron-LM and Turning-NLG.},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  articleno = {19},
  numpages = {15},
  keywords = {deep neural networks, out-of-core, GPUs},
  location = {Atlanta, Georgia},
  series = {SC '20}
}

@misc{you2019large,
    title={Large Batch Optimization for Deep Learning: Training BERT in 76 minutes },
    author={Yang You and Jing Li and Sashank Reddi and Jonathan Hseu and Sanjiv Kumar and Srinadh Bhojanapalli and Xiaodan Song and James Demmel and Kurt Keutzer and Cho-Jui Hsieh},
    year={2019},
    eprint={1904.00962},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

@misc{ying2018image,
    title={Image Classification at Supercomputer Scale},
    author={Chris Ying and Sameer Kumar and Dehao Chen and Tao Wang and Youlong Cheng},
    year={2018},
    eprint={1811.06992},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

@INPROCEEDINGS{summit2019,
  author={J. {Yin} and S. {Gahlot} and N. {Laanait} and K. {Maheshwari} and J. {Morrison} and S. {Dash} and M. {Shankar}},
  booktitle={2019 IEEE/ACM Third Workshop on Deep Learning on Supercomputers (DLS)},
  title={Strategies to Deploy and Scale Deep Learning on the Summit Supercomputer},
  year={2019},
  volume={},
  number={},
  pages={84-94},
  doi={10.1109/DLS49591.2019.00016}}

  @misc{oyama2020case,
      title={The Case for Strong Scaling in Deep Learning: Training Large 3D CNNs with Hybrid Parallelism},
      author={Yosuke Oyama and Naoya Maruyama and Nikoli Dryden and Erin McCarthy and Peter Harrington and Jan Balewski and Satoshi Matsuoka and Peter Nugent and Brian Van Essen},
      year={2020},
      eprint={2007.12856},
      archivePrefix={arXiv},
      primaryClass={cs.DC}
}

@misc{flexflow,
    title={Beyond Data and Model Parallelism for Deep Neural Networks},
    author={Zhihao Jia and Matei Zaharia and Alex Aiken},
    year={2018},
    eprint={1807.05358},
    archivePrefix={arXiv},
    primaryClass={cs.DC}
}

@inproceedings{jia2018data,
  author = {Jia, Zhihao and Zaharia, Matei and Aiken, Alex},
  booktitle = {Proceedings of Machine Learning and Systems},
  editor = {A. Talwalkar and V. Smith and M. Zaharia},
  pages = {1--13},
  title = {Beyond Data and Model Parallelism for Deep Neural Networks.},
  url = {https://proceedings.mlsys.org/paper/2019/file/c74d97b01eae257e44aa9d5bade97baf-Paper.pdf},
  volume = {1},
  year = {2019}
}

@article{DBLP:journals/corr/SchulmanWDRK17,
  author    = {John Schulman and
               Filip Wolski and
               Prafulla Dhariwal and
               Alec Radford and
               Oleg Klimov},
  title     = {Proximal Policy Optimization Algorithms},
  journal   = {CoRR},
  volume    = {abs/1707.06347},
  year      = {2017},
  url       = {http://arxiv.org/abs/1707.06347},
  archivePrefix = {arXiv},
  eprint    = {1707.06347},
  timestamp = {Mon, 13 Aug 2018 16:47:34 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/SchulmanWDRK17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@InProceedings{pmlr-v37-schulman15,
title =    {Trust Region Policy Optimization},
author =   {John Schulman and Sergey Levine and Pieter Abbeel and Michael Jordan and Philipp Moritz},
booktitle =    {Proceedings of the 32nd International Conference on Machine Learning},
pages =    {1889--1897},
year =   {2015},
editor =   {Francis Bach and David Blei},
volume =   {37},
series =   {Proceedings of Machine Learning Research},
address =    {Lille, France},
month =    {07--09 Jul},
publisher =    {PMLR},
pdf =    {http://proceedings.mlr.press/v37/schulman15.pdf},
url =    {
http://proceedings.mlr.press/v37/schulman15.html
},

}

@misc{mnih2013playing,
      title={Playing Atari with Deep Reinforcement Learning},
      author={Volodymyr Mnih and Koray Kavukcuoglu and David Silver and Alex Graves and Ioannis Antonoglou and Daan Wierstra and Martin Riedmiller},
      year={2013},
      eprint={1312.5602},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@article{Purwins_2019,
   title={Deep Learning for Audio Signal Processing},
   volume={13},
   ISSN={1941-0484},
   url={http://dx.doi.org/10.1109/JSTSP.2019.2908700},
   DOI={10.1109/jstsp.2019.2908700},
   number={2},
   journal={IEEE Journal of Selected Topics in Signal Processing},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Purwins, Hendrik and Li, Bo and Virtanen, Tuomas and Schluter, Jan and Chang, Shuo-Yiin and Sainath, Tara},
   year={2019},
   month={May},
   pages={206–219}
}

@misc{paszke2019pytorch,
      title={PyTorch: An Imperative Style, High-Performance Deep Learning Library},
      author={Adam Paszke and Sam Gross and Francisco Massa and Adam Lerer and James Bradbury and Gregory Chanan and Trevor Killeen and Zeming Lin and Natalia Gimelshein and Luca Antiga and Alban Desmaison and Andreas Köpf and Edward Yang and Zach DeVito and Martin Raison and Alykhan Tejani and Sasank Chilamkurthy and Benoit Steiner and Lu Fang and Junjie Bai and Soumith Chintala},
      year={2019},
      eprint={1912.01703},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@inproceedings{paszke2019pytorch-nips,
  author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
  pages = {},
  publisher = {Curran Associates, Inc.},
  title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library},
  url = {https://proceedings.neurips.cc/paper/2019/file/bdbca288fee7f92f2bfa9f7012727740-Paper.pdf},
  volume = {32},
  year = {2019}
}

@misc{sinha2017introspection,
      title={Introspection: Accelerating Neural Network Training By Learning Weight Evolution},
      author={Abhishek Sinha and Mausoom Sarkar and Aahitagni Mukherjee and Balaji Krishnamurthy},
      year={2017},
      eprint={1704.04959},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{chen2019efficient,
      title={Efficient and Robust Parallel DNN Training through Model Parallelism on Multi-GPU Platform},
      author={Chi-Chung Chen and Chia-Lin Yang and Hsiang-Yun Cheng},
      year={2019},
      eprint={1809.02839},
      archivePrefix={arXiv},
      primaryClass={cs.DC}
}

@misc{mirhoseini2017device,
      title={Device Placement Optimization with Reinforcement Learning},
      author={Azalia Mirhoseini and Hieu Pham and Quoc V. Le and Benoit Steiner and Rasmus Larsen and Yuefeng Zhou and Naveen Kumar and Mohammad Norouzi and Samy Bengio and Jeff Dean},
      year={2017},
      eprint={1706.04972},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@ARTICLE{Rosenblatt58theperceptron,
    author = {F. Rosenblatt},
    title = {The Perceptron: A Probabilistic Model for Information Storage and Organization in The Brain},
    journal = {Psychological Review},
    year = {1958},
    pages = {65--386}
}

@misc{nikolentzos2019message,
      title={Message Passing Attention Networks for Document Understanding},
      author={Giannis Nikolentzos and Antoine J. -P. Tixier and Michalis Vazirgiannis},
      year={2019},
      eprint={1908.06267},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@article{nikolentzos2019message-aaai,
  title={Message Passing Attention Networks for Document Understanding},
  volume={34},
  url={https://ojs.aaai.org/index.php/AAAI/article/view/6376},
  DOI={10.1609/aaai.v34i05.6376},
  abstractNote={&lt;p&gt;Graph neural networks have recently emerged as a very effective framework for processing graph-structured data. These models have achieved state-of-the-art performance in many tasks. Most graph neural networks can be described in terms of message passing, vertex update, and readout functions. In this paper, we represent documents as word co-occurrence networks and propose an application of the message passing framework to NLP, the Message Passing Attention network for Document understanding (MPAD). We also propose several hierarchical variants of MPAD. Experiments conducted on 10 standard text classification datasets show that our architectures are competitive with the state-of-the-art. Ablation studies reveal further insights about the impact of the different components on performance. Code is publicly available at: https://github.com/giannisnik/mpad.&lt;/p&gt;},
  number={05},
  journal={Proceedings of the AAAI Conference on Artificial Intelligence},
  author={Nikolentzos, Giannis and Tixier, Antoine and Vazirgiannis, Michalis},
  year={2020},
  month={Apr.},
  pages={8544-8551}
}

@article{nmt,
title = {Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
author  = {Yonghui Wu and Mike Schuster and Zhifeng Chen and Quoc V. Le and Mohammad Norouzi and Wolfgang Macherey and Maxim Krikun and Yuan Cao and Qin Gao and Klaus Macherey and Jeff Klingner and Apurva Shah and Melvin Johnson and Xiaobing Liu and Łukasz Kaiser and Stephan Gouws and Yoshikiyo Kato and Taku Kudo and Hideto Kazawa and Keith Stevens and George Kurian and Nishant Patil and Wei Wang and Cliff Young and Jason Smith and Jason Riesa and Alex Rudnick and Oriol Vinyals and Greg Corrado and Macduff Hughes and Jeffrey Dean},
year  = {2016},
URL = {http://arxiv.org/abs/1609.08144},
journal = {CoRR},
volume  = {abs/1609.08144}
}


@misc{vijayanarasimhan2017sfmnet,
      title={SfM-Net: Learning of Structure and Motion from Video},
      author={Sudheendra Vijayanarasimhan and Susanna Ricco and Cordelia Schmid and Rahul Sukthankar and Katerina Fragkiadaki},
      year={2017},
      eprint={1704.07804},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{zhao2019object,
      title={Object Detection with Deep Learning: A Review},
      author={Zhong-Qiu Zhao and Peng Zheng and Shou-tao Xu and Xindong Wu},
      year={2019},
      eprint={1807.05511},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@ARTICLE{zhao2019object-tnnls,
  author={Zhao, Zhong-Qiu and Zheng, Peng and Xu, Shou-Tao and Wu, Xindong},
  journal={IEEE Transactions on Neural Networks and Learning Systems},
  title={Object Detection With Deep Learning: A Review},
  year={2019},
  volume={30},
  number={11},
  pages={3212-3232},
  doi={10.1109/TNNLS.2018.2876865}
}

@misc{tao2020hierarchical,
      title={Hierarchical Multi-Scale Attention for Semantic Segmentation},
      author={Andrew Tao and Karan Sapra and Bryan Catanzaro},
      year={2020},
      eprint={2005.10821},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{dryden2019,
author = {Dryden, Nikoli and Maruyama, Naoya and Moon, Tim and Benson, Tom and Snir, Marc and Van Essen, Brian},
title = {Channel and Filter Parallelism for Large-Scale CNN Training},
year = {2019},
isbn = {9781450362290},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3295500.3356207},
doi = {10.1145/3295500.3356207},
booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
articleno = {10},
numpages = {20},
keywords = {convolution, CNN, deep learning, algorithms, scaling},
location = {Denver, Colorado},
series = {SC '19}
}


@article{BiT-resnet,
  author    = {Alexander Kolesnikov and
               Lucas Beyer and
               Xiaohua Zhai and
               Joan Puigcerver and
               Jessica Yung and
               Sylvain Gelly and
               Neil Houlsby},
  title     = {Large Scale Learning of General Visual Representations for Transfer},
  journal   = {CoRR},
  volume    = {abs/1912.11370},
  year      = {2019},
  url       = {http://arxiv.org/abs/1912.11370},
  archivePrefix = {arXiv},
  eprint    = {1912.11370},
  timestamp = {Fri, 03 Jan 2020 16:10:45 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1912-11370.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@InProceedings{BiT-resnet-eccv,
  author="Kolesnikov, Alexander
  and Beyer, Lucas
  and Zhai, Xiaohua
  and Puigcerver, Joan
  and Yung, Jessica
  and Gelly, Sylvain
  and Houlsby, Neil",
  editor="Vedaldi, Andrea
  and Bischof, Horst
  and Brox, Thomas
  and Frahm, Jan-Michael",
  title="Big Transfer (BiT): General Visual Representation Learning",
  booktitle="Computer Vision -- ECCV 2020",
  year="2020",
  publisher="Springer International Publishing",
  address="Cham",
  pages="491--507",
  isbn="978-3-030-58558-7"
}

@article{yanyou-large-batch,
  author    = {Priya Goyal and
               Piotr Doll{\'{a}}r and
               Ross B. Girshick and
               Pieter Noordhuis and
               Lukasz Wesolowski and
               Aapo Kyrola and
               Andrew Tulloch and
               Yangqing Jia and
               Kaiming He},
  title     = {Accurate, Large Minibatch {SGD:} Training ImageNet in 1 Hour},
  journal   = {CoRR},
  volume    = {abs/1706.02677},
  year      = {2017},
  url       = {http://arxiv.org/abs/1706.02677},
  archivePrefix = {arXiv},
  eprint    = {1706.02677},
  timestamp = {Mon, 13 Aug 2018 16:49:10 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/GoyalDGNWKTJH17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{nccl,
  author = {NVIDIA},
  title = {NCCL},
  year = {},
  publisher = {},
  journal = {},
  howpublished = {\url{https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/overview.html}},
  commit = {}
}

@misc{cerebas-gpt-pruning,
  author = {Cerebras-AI},
  title = {Creating Sparse GPT-3 Models with Iterative Pruning},
  year = {2022},
  publisher = {Cerebras-AI},
  journal = {},
  howpublished = {\url{https://www.cerebras.net/blog/creating-sparse-gpt-3-models-with-iterative-pruning}},
  commit = {}
}

@misc{deepspeed-extreme-3d,
  author = {Microsoft},
  title = {Deepspeed: Extreme-scale model training for everyone},
  year = {},
  publisher = {},
  journal = {},
  howpublished = {\url{https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/}},
  commit = {}
}

@misc{gpuDirect,
  author = {NVIDIA},
  title = {GPUDirect},
  year = {},
  publisher = {},
  journal = {},
  howpublished = {\url{https://docs.nvidia.com/cuda/gpudirect-rdma/index.html}},
  commit = {}
}

@misc{nsys,
  author = {NVIDIA},
  title = {NVIDIA Nsight Systems},
  year = {},
  publisher = {},
  journal = {},
  howpublished = {\url{https://developer.nvidia.com/nsight-systems}},
  commit = {}
}

@misc{osu-5.8,
  author = {Ohio State University},
  title = {OSU Micro-Benchmarks 5.8},
  year = {},
  publisher = {},
  journal = {},
  howpublished = {\url{http://mvapich.cse.ohio-state.edu/benchmarks/}},
  commit = {}
}

@misc{nvtx,
  author = {NVIDIA},
  title = {NVIDIA Nsight Systems},
  year = {},
  publisher = {},
  journal = {},
  howpublished = {\url{https://github.com/NVIDIA/NVTX}},
  commit = {}
}

@misc{nvlink,
  author = {NVIDIA},
  title = {NVLink and NVSwitch: The Building Blocks of Advanced Multi-GPU Communication},
  year = {},
  publisher = {},
  journal = {},
  howpublished = {\url{https://www.nvidia.com/en-us/data-center/nvlink/}},
  commit = {}
}

@misc{gloo,
  author = {Facebook},
  title = {Gloo},
  year = {},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/facebookincubator/gloo}},
  commit = {66a5b347f6610cb057d09d64dfdb6984508a8c69}
}

@misc{rhu2016vdnn,
      title={vDNN: Virtualized Deep Neural Networks for Scalable, Memory-Efficient Neural Network Design},
      author={Minsoo Rhu and Natalia Gimelshein and Jason Clemons and Arslan Zulfiqar and Stephen W. Keckler},
      year={2016},
      eprint={1602.08124},
      archivePrefix={arXiv},
      primaryClass={cs.DC}
}

@misc{huang2019gpipe,
      title={GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism},
      author={Yanping Huang and Youlong Cheng and Ankur Bapna and Orhan Firat and Mia Xu Chen and Dehao Chen and HyoukJoong Lee and Jiquan Ngiam and Quoc V. Le and Yonghui Wu and Zhifeng Chen},
      year={2019},
      eprint={1811.06965},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{huang2019gpipe_nips,
  author = {Huang, Yanping and Cheng, Youlong and Bapna, Ankur and Firat, Orhan and Chen, Dehao and Chen, Mia and Lee, HyoukJoong and Ngiam, Jiquan and Le, Quoc V and Wu, Yonghui and Chen, zhifeng},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
  pages = {},
  publisher = {Curran Associates, Inc.},
  title = {GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism},
  url = {https://proceedings.neurips.cc/paper/2019/file/093f65e080a295f8076b1c5722a46aa2-Paper.pdf},
  volume = {32},
  year = {2019}
}

@article{transformer,
  author    = {Ashish Vaswani and
               Noam Shazeer and
               Niki Parmar and
               Jakob Uszkoreit and
               Llion Jones and
               Aidan N. Gomez and
               Lukasz Kaiser and
               Illia Polosukhin},
  title     = {Attention Is All You Need},
  journal   = {CoRR},
  volume    = {abs/1706.03762},
  year      = {2017},
  url       = {http://arxiv.org/abs/1706.03762},
  archivePrefix = {arXiv},
  eprint    = {1706.03762},
  timestamp = {Mon, 13 Aug 2018 16:48:37 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/VaswaniSPUJGKP17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{wu2016googles,
      title={Google's Neural Machine Translation System: Bridging the Gap
      between Human and Machine Translation},
      author={Yonghui Wu and Mike Schuster and Zhifeng Chen and Quoc V. Le
      and Mohammad Norouzi and Wolfgang Macherey and Maxim Krikun and Yuan
      Cao and Qin Gao and Klaus Macherey and Jeff Klingner and Apurva Shah
      and Melvin Johnson and Xiaobing Liu and Łukasz Kaiser and Stephan Gouws
      and Yoshikiyo Kato and Taku Kudo and Hideto Kazawa and Keith Stevens
      and George Kurian and Nishant Patil and Wei Wang and Cliff Young and
      Jason Smith and Jason Riesa and Alex Rudnick and Oriol Vinyals and Greg
      Corrado and Macduff Hughes and Jeffrey Dean},
      year={2016},
      eprint={1609.08144},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@article{wait-free-backprop,
  author    = {Hao Zhang and
               Zeyu Zheng and
               Shizhen Xu and
               Wei Dai and
               Qirong Ho and
               Xiaodan Liang and
               Zhiting Hu and
               Jinliang Wei and
               Pengtao Xie and
               Eric P. Xing},
  title     = {Poseidon: An Efficient Communication Architecture for Distributed
               Deep Learning on {GPU} Clusters},
  journal   = {CoRR},
  volume    = {abs/1706.03292},
  year      = {2017},
  url       = {http://arxiv.org/abs/1706.03292},
  archivePrefix = {arXiv},
  eprint    = {1706.03292},
  timestamp = {Wed, 12 Dec 2018 16:25:50 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/ZhangZXDHLHWXX17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}


@inproceedings{wait-free-backprop-usenix,
  author = {Hao Zhang and Zeyu Zheng and Shizhen Xu and Wei Dai and Qirong Ho and Xiaodan Liang and Zhiting Hu and Jinliang Wei and Pengtao Xie and Eric P. Xing},
  title = {Poseidon: An Efficient Communication Architecture for Distributed Deep Learning on {GPU} Clusters},
  booktitle = {2017 {USENIX} Annual Technical Conference ({USENIX} {ATC} 17)},
  year = {2017},
  isbn = {978-1-931971-38-6},
  address = {Santa Clara, CA},
  pages = {181--193},
  url = {https://www.usenix.org/conference/atc17/technical-sessions/presentation/zhang},
  publisher = {{USENIX} Association},
  month = jul,
}


@article{pytorchdist-vldb,
  author = {Li, Shen and Zhao, Yanli and Varma, Rohan and Salpekar, Omkar and Noordhuis, Pieter and Li, Teng and Paszke, Adam and Smith, Jeff and Vaughan, Brian and Damania, Pritam and Chintala, Soumith},
  title = {PyTorch Distributed: Experiences on Accelerating Data Parallel Training},
  year = {2020},
  issue_date = {August 2020},
  publisher = {VLDB Endowment},
  volume = {13},
  number = {12},
  issn = {2150-8097},
  url = {https://doi.org/10.14778/3415478.3415530},
  doi = {10.14778/3415478.3415530},
  abstract = {This paper presents the design, implementation, and evaluation of the PyTorch distributed data parallel module. Py-Torch is a widely-adopted scientific computing package used in deep learning research and applications. Recent advances in deep learning argue for the value of large datasets and large models, which necessitates the ability to scale out model training to more computational resources. Data parallelism has emerged as a popular solution for distributed training thanks to its straightforward principle and broad applicability. In general, the technique of distributed data parallelism replicates the model on every computational resource to generate gradients independently and then communicates those gradients at each iteration to keep model replicas consistent. Despite the conceptual simplicity of the technique, the subtle dependencies between computation and communication make it non-trivial to optimize the distributed training efficiency. As of v1.5, PyTorch natively provides several techniques to accelerate distributed data parallel, including bucketing gradients, overlapping computation with communication, and skipping gradient synchronization. Evaluations show that, when configured appropriately, the PyTorch distributed data parallel module attains near-linear scalability using 256 GPUs.},
  journal = {Proc. VLDB Endow.},
  month = aug,
  pages = {3005–3018},
  numpages = {14}
}

@misc{megatron-turing-nlg-530b,
  doi = {10.48550/ARXIV.2201.11990},

  url = {https://arxiv.org/abs/2201.11990},

  author = {Smith, Shaden and Patwary, Mostofa and Norick, Brandon and LeGresley, Patrick and Rajbhandari, Samyam and Casper, Jared and Liu, Zhun and Prabhumoye, Shrimai and Zerveas, George and Korthikanti, Vijay and Zhang, Elton and Child, Rewon and Aminabadi, Reza Yazdani and Bernauer, Julie and Song, Xia and Shoeybi, Mohammad and He, Yuxiong and Houston, Michael and Tiwary, Saurabh and Catanzaro, Bryan},

  keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model},

  publisher = {arXiv},

  year = {2022},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{megatronlm,
      title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},
      author={Mohammad Shoeybi and Mostofa Patwary and Raul Puri and Patrick LeGresley and Jared Casper and Bryan Catanzaro},
      year={2020},
      eprint={1909.08053},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@InProceedings{pmlr-v28-coates13,
title = {Deep learning with COTS HPC systems},
author = {Adam Coates and Brody Huval and Tao Wang and David Wu and Bryan Catanzaro and Ng Andrew},
pages = {1337--1345},
year = {2013},
editor = {Sanjoy Dasgupta and David McAllester},
volume = {28},
number = {3},
series = {Proceedings of Machine Learning Research},
address = {Atlanta, Georgia, USA},
month = {17--19 Jun},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v28/coates13.pdf},
url = {http://proceedings.mlr.press/v28/coates13.html},
}

@misc{ericson2017performance,
      title={On the Performance of Network Parallel Training in Artificial
      Neural Networks},
      author={Ludvig Ericson and Rendani Mbuvha},
      year={2017},
      eprint={1701.05130},
      archivePrefix={arXiv},
      primaryClass={cs.AI}
}

@InProceedings{narayanan2019pipedream,
author = {Narayanan, Deepak and Harlap, Aaron and Phanishayee, Amar and
Seshadri, Vivek and Devanur, Nikhil and Granger, Greg and Gibbons, Phil and
Zaharia, Matei},
title = {PipeDream: Generalized Pipeline Parallelism for DNN Training},
booktitle = {ACM Symposium on Operating Systems Principles (SOSP 2019)},
year = {2019},
month = {October},
url =
{https://www.microsoft.com/en-us/research/publication/pipedream-generalized-pipeline-parallelism-for-dnn-training/},
}

@inproceedings{pipemare,
 author = {Yang, Bowen and Zhang, Jian and Li, Jonathan  and Re, Christopher and Aberger, Christopher and De Sa, Christopher},
 booktitle = {Proceedings of Machine Learning and Systems},
 editor = {A. Smola and A. Dimakis and I. Stoica},
 pages = {269--296},
 title = {PipeMare: Asynchronous Pipeline Parallel DNN Training},
 url = {https://proceedings.mlsys.org/paper/2021/file/6c8349cc7260ae62e3b1396831a8398f-Paper.pdf},
 volume = {3},
 year = {2021}
}

@article{liu1987dynamical,
  title={Dynamical behavior of epidemiological models with nonlinear incidence rates},
  author={Liu, Wei-min and Hethcote, Herbert W and Levin, Simon A},
  journal={Journal of mathematical biology},
  volume={25},
  pages={359--380},
  year={1987},
  publisher={Springer}
}

@misc{vaswaniTransformer,
      title={Attention Is All You Need},
      author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
      year={2017},
      eprint={1706.03762},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{MSRzero,
      title={ZeRO: Memory Optimizations Toward Training Trillion Parameter Models},
      author={Samyam Rajbhandari and Jeff Rasley and Olatunji Ruwase and Yuxiong He},
      year={2020},
      eprint={1910.02054},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@inproceedings{sc2020zero,
  author = {Rajbhandari, Samyam and Rasley, Jeff and Ruwase, Olatunji and He, Yuxiong},
  title = {ZeRO: Memory Optimizations toward Training Trillion Parameter Models},
  year = {2020},
  isbn = {9781728199986},
  publisher = {IEEE Press},
  abstract = {Large deep learning models offer significant accuracy gains, but training billions to trillions of parameters is challenging. Existing solutions such as data and model parallelisms exhibit fundamental limitations to fit these models into limited device memory, while obtaining computation, communication and development efficiency. We develop a novel solution, Zero Redundancy Optimizer (ZeRO), to optimize memory, vastly improving training speed while increasing the model size that can be efficiently trained. ZeRO eliminates memory redundancies in data- and model-parallel training while retaining low communication volume and high computational granularity, allowing us to scale the model size proportional to the number of devices with sustained high efficiency. Our analysis on memory requirements and communication volume demonstrates: ZeRO has the potential to scale beyond 1 Trillion parameters using today's hardware.We implement and evaluate ZeRO: it trains large models of over 100B parameter with super-linear speedup on 400 GPUs, achieving throughput of 15 Petaflops. This represents an 8x increase in model size and 10x increase in achievable performance over state-of-the-art. In terms of usability, ZeRO can train large models of up to 13B parameters (e.g., larger than Megatron GPT 8.3B and T5 11B) without requiring model parallelism which is harder for scientists to apply. Last but not the least, researchers have used the system breakthroughs of ZeRO to create Turing-NLG, the world's largest language model at the time (17B parameters) with record breaking accuracy.},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  articleno = {20},
  numpages = {16},
  location = {Atlanta, Georgia},
  series = {SC '20}
}

@misc{resnet,
      title={Deep Residual Learning for Image Recognition},
      author={Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},
      year={2015},
      eprint={1512.03385},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@INPROCEEDINGS{resnetCVPR,
  author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle={2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  title={Deep Residual Learning for Image Recognition},
  year={2016},
  volume={},
  number={},
  pages={770-778},
  doi={10.1109/CVPR.2016.90}
}

@misc{vgg16,
      title={Very Deep Convolutional Networks for Large-Scale Image Recognition},
      author={Karen Simonyan and Andrew Zisserman},
      year={2015},
      eprint={1409.1556},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{vgg16-iclr,
  author    = {Karen Simonyan and
                Andrew Zisserman},
  editor    = {Yoshua Bengio and
                Yann LeCun},
  title     = {Very Deep Convolutional Networks for Large-Scale Image Recognition},
  booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
                San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
  year      = {2015},
  url       = {http://arxiv.org/abs/1409.1556},
  timestamp = {Wed, 17 Jul 2019 10:40:54 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/SimonyanZ14a.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{wide-resnet,
  doi = {10.48550/ARXIV.1605.07146},

  url = {https://arxiv.org/abs/1605.07146},

  author = {Zagoruyko, Sergey and Komodakis, Nikos},

  keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences, FOS: Computer and information sciences},

  title = {Wide Residual Networks},

  publisher = {arXiv},

  year = {2016},

  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{gpt-j,
  author = {Wang, Ben and Komatsuzaki, Aran},
  title = {{GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model}},
  howpublished = {\url{https://github.com/kingoflolz/mesh-transformer-jax}},
  year = 2021,
  month = May
}

@article{gpt-2,
  title={Language Models are Unsupervised Multitask Learners},
  author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  year={2019}
}

@inproceedings{bert,
    title = "{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding",
    author = "Devlin, Jacob  and
      Chang, Ming-Wei  and
      Lee, Kenton  and
      Toutanova, Kristina",
    booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
    month = jun,
    year = "2019",
    address = "Minneapolis, Minnesota",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/N19-1423",
    doi = "10.18653/v1/N19-1423",
    pages = "4171--4186",
}

@article{olah2017feature,
  author = {Olah, Chris and Mordvintsev, Alexander and Schubert, Ludwig},
  title = {Feature Visualization},
  journal = {Distill},
  year = {2017},
  note = {https://distill.pub/2017/feature-visualization},
  doi = {10.23915/distill.00007}
}

@article{mnistlecun2010,
  title={MNIST handwritten digit database},
  author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
  journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
  volume={2},
  year={2010}
}

@article{ILSVRC15,
Author = {Olga Russakovsky and Jia Deng and Hao Su and Jonathan Krause and Sanjeev Satheesh and Sean Ma and Zhiheng Huang and Andrej Karpathy and Aditya Khosla and Michael Bernstein and Alexander C. Berg and Li Fei-Fei},
Title = {{ImageNet Large Scale Visual Recognition Challenge}},
Year = {2015},
journal   = {International Journal of Computer Vision (IJCV)},
doi = {10.1007/s11263-015-0816-y},
volume={115},
number={3},
pages={211-252}
}

@inproceedings{lenet,
 author = {LeCun, Yann and Boser, Bernhard and Denker, John and Henderson, Donnie and Howard, R. and Hubbard, Wayne and Jackel, Lawrence},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {D. Touretzky},
 pages = {396--404},
 publisher = {Morgan-Kaufmann},
 title = {Handwritten Digit Recognition with a Back-Propagation Network},
 url = {https://proceedings.neurips.cc/paper/1989/file/53c3bce66e43be4f209556518c2fcb54-Paper.pdf},
 volume = {2},
 year = {1990}
}


@inproceedings{lockwood:sc2018,
  author = {Lockwood, Glenn K. and Snyder, Shane and Wang, Teng and Byna, Suren and Carns, Philip and Wright, Nicholas J.},
  title = {A Year in the Life of a Parallel File System},
  year = {2018},
  publisher = {IEEE Press},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis},
  articleno = {Article 74},
  numpages = {13},
  location = {Dallas, Texas},
  series = {SC ’18}
}

@article{kunen2015kripke,
  Author = {Kunen, AJ and Bailey, TS and Brown, PN},
  Journal = {Lawrence Livermore National Laboratory {(LLNL)}, Livermore, CA, Tech. Rep},
  Title = {{KRIPKE}-A massively parallel transport mini-app},
  Year = {2015}
}

@incollection{hypre,
  Author = {R.D. Falgout and J.E. Jones and U.M. Yang},
  Booktitle = {Numerical Solution of Partial Differential Equations on Parallel Computers},
  Editor = {A.M. Bruaset and A. Tveito},
  Pages = {267-294},
  Publisher = {Springer-Verlag},
  Title = {The Design and Implementation of hypre, a Library of Parallel High Performance Preconditioners},
  Volume = 51,
  Year = 2006
}

@techreport{RAJA,
  Author = {Hornung, Rich D. and Keasler, Jeff A.},
  Institution = {Lawrence Livermore National Laboratory},
  Month = sep,
  Number = {LLNL-TR-661403},
  Title = {{The RAJA Portability Layer: Overview and Status}},
  Year = {2014}
}

@InProceedings{jain:springer2016,
  author="Jain, Nikhil and Bohm, Eric and Mikida, Eric and Mandal, Subhasish and Kim, Minjung and Jindal, Prateek and Li, Qi and Ismail-Beigi, Sohrab and Martyna, Glenn J. and Kale, Laxmikant V.",
  editor="Kunkel, Julian M. and Balaji, Pavan and Dongarra, Jack",
  title="OpenAtom: Scalable Ab-Initio Molecular Dynamics with Diverse Capabilities",
  booktitle="High Performance Computing",
  year="2016",
  publisher="Springer International Publishing",
  address="Cham",
  pages="139--158",
}

@article{settles2012active,
  title={Active learning},
  author={Settles, Burr},
  journal={Synthesis Lectures on Artificial Intelligence and Machine Learning},
  volume={6},
  number={1},
  pages={1--114},
  year={2012},
  publisher={Morgan \& Claypool Publishers}
}

@article{sukhija:2014,
  Author = {Sukhija, Nitin and Malone, Brandon and Srivastava, Srishti and Banicescu, Ioana and Ciorba, Florina M.},
  Journal = {Parallel and Cloud Computing},
  Month = oct,
  Number = {4},
  Pages = {66--81},
  Title = {A Learning-based Selection for Portfolio Scheduling of Scientific Applications on Heterogeneous Computing Systems},
  Volume = {3},
  Year = {2014}
}

@inproceedings{tallent:2015,
  Author = {Tallent, Nathan R. and Vishnu, Abhinav and Van Dam, Hubertus and Daily, Jeff and Kerbyson, Darren J. and Hoisie, Adolfy},
  Booktitle = {Proceedings of the 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  Pages = {130--139},
  Publisher = {ACM},
  Series = {PPoPP 2015},
  Title = {Diagnosing the Causes and Severity of One-sided Message Contention},
  Url = {http://doi.acm.org/10.1145/2688500.2688516},
  Year = {2015}
}

@article{randomforests,
  Author = {Leo Breiman},
  Journal = {Machine Learning},
  Number = {1},
  Pages = {5-32},
  Title = {Random Forests},
  Volume = {45},
  Year = {2001}
}

@inproceedings{Fang:ICPP2011,
  Author = {J. Fang and A. L. Varbanescu and H. Sips},
  Booktitle = {2011 International Conference on Parallel Processing},
  Doi = {10.1109/ICPP.2011.45},
  Month = {Sept},
  Pages = {216-225},
  Title = {A Comprehensive Performance Comparison of CUDA and OpenCL},
  Year = {2011},
}

@article{ClintWhaley:2001cl,
  Author = {Clint Whaley, R and Petitet, Antoine and Dongarra, Jack J},
  Journal = {Parallel Computing},
  Month = jan,
  Number = {1-2},
  Pages = {3--35},
  Title = {{Automated empirical optimizations of software and the ATLAS project}},
  Volume = {27},
  Year = {2001}
}

@article{FFTW3,
  Author = {Frigo, M and Johnson, S G},
  Journal = {Proceedings of the IEEE},
  Number = {2},
  Pages = {216--231},
  Title = {{The Design and Implementation of FFTW3}},
  Volume = {93},
  Year = {2005}
}

@article{OSKI,
  Author = {Vuduc, Richard and Demmel, James W and Yelick, Katherine A},
  Journal = {Journal of Physics: Conference Series},
  Month = aug,
  Number = {1},
  Pages = {521--530},
  Title = {{OSKI: A library of automatically tuned sparse matrix kernels}},
  Volume = {16},
  Year = {2005}
}

@inproceedings{Ansel:2014gj,
  Address = {New York, New York, USA},
  Author = {Ansel, Jason and others},
  Booktitle = {Proceedings of the 23rd International Conference on Parallel Architectures and Compilation Techniques},
  Pages = {303--316},
  Title = {{OpenTuner}},
  Year = {2014}
}

@inproceedings{Orio,
  Author = {Hartono, Albert and Norris, Boyana and Sadayappan, P},
  Booktitle = {IEEE International Symposium on Parallel {\&} Distributed Processing},
  Month = may,
  Pages = {1--11},
  Title = {{Annotation-based empirical performance tuning using Orio}},
  Year = {2009}
}

@inproceedings{hollings:2002wg,
  Author = {Tapus, C and Chung, I-Hsin and Hollingsworth, J K},
  Booktitle = {Supercomputing 2002 (SC'02)},
  Month = nov,
  Pages = {44},
  Title = {{Active Harmony: Towards Automated Performance Tuning}},
  Year = {2002}
}

@inproceedings{hollings:1998,
  Author = {Hollingsworth, J K and Keleher, P J},
  Booktitle = {Proceedings of the 7th International Symposium on High Performance Distributed Computing},
  Month = jul,
  Pages = {180--188},
  Title = {{Prediction and adaptation in Active Harmony}},
  Year = {1998}
}

@inproceedings{Calotoiu:2013ewa,
  Author = {Calotoiu, Alexandru and Hoefler, Torsten and Poke, Marius and Wolf, Felix},
  Booktitle = {Supercomputing 2013 (SC'13)},
  Month = nov,
  Pages = {1--12},
  Title = {{Using automated performance modeling to find scalability bugs in complex codes}},
  Year = {2013}
}

@inproceedings{Bhattacharyya:2014fr,
  Address = {New York, New York, USA},
  Author = {Bhattacharyya, Arnamoy and Hoefler, Torsten},
  Booktitle = {The 23rd International Conference on Parallel Architecuters and Compilation Techniques},
  Pages = {393--404},
  Title = {{PEMOGEN}},
  Year = {2014}
}

@inproceedings{Hoefler:SC11,
  Author = {Hoefler, T and Gropp, W and Kramer, W and Snir, M},
  Booktitle = {Supercomputing 2011 (SC'11)},
  Pages = {1--12},
  Title = {{Performance modeling for systematic performance tuning}},
  Year = {2011}
}

@inproceedings{Bergstra:2012iz,
  Author = {Bergstra, J and Pinto, N and Cox, D},
  Booktitle = {Proceedings of Innovative Parallel Computing},
  Month = may,
  Pages = {1--9},
  Title = {{Machine learning for predictive auto-tuning with boosted regression trees}},
  Year = {2012}
}

@inproceedings{Didona:2015il,
  Address = {New York, New York, USA},
  Author = {Didona, Diego and Romano, Paolo},
  Booktitle = {the 6th ACM/SPEC International Conference},
  Pages = {341--344},
  Title = {{Hybrid Machine Learning/Analytical Models for Performance Prediction}},
  Year = {2015}
}

@inproceedings{Agakov:CGO2006,
  Author = {Agakov, F and others},
  Booktitle = {Proceedings of the International Symposium on Code Generation and Optimization},
  Month = mar,
  Pages = {295--305},
  Title = {{Using Machine Learning to Focus Iterative Optimization}},
  Year = {2006}
}

@inproceedings{Wu:HPCA2015,
  Author = {Wu, Gene and Greathouse, Joseph L and Lyashevsky, Alexander and Jayasena, Nuwan and Chiou, Derek},
  Booktitle = {Proceedings of the 21st IEEE International Symposium on High Performance Computer Architecture},
  Month = feb,
  Pages = {564--576},
  Title = {{GPGPU performance and power estimation using machine learning}},
  Year = {2015}
}

@inproceedings{Song:2013df,
  Author = {Song, S and Su, C and Rountree, B},
  Booktitle = {Proceedings of the 27th IEEE International Symposium on Parallel {\&} Distributed Processing},
  Month = may,
  Pages = {673--686},
  Title = {{A simplified and accurate model of power-performance efficiency on emergent GPU architectures}},
  Year = {2013}
}

@inproceedings{MatthewCurtisMaury:2007wi,
  Author = {Curtis-Maury, Matthew A and others},
  Booktitle = {Proceedings of the IEEE Conference on Cluster Computing},
  Month = sep,
  Pages = {488--495},
  Title = {{Identifying energy-efficient concurrency levels using machine learning}},
  Year = {2007}
}

@inproceedings{Su:wc2012,
  Author = {Su, Chunyi and Li, Dong and Nikolopoulos, Dimitrios S and Cameron, Kirk W and de Supinski, Bronis R and Leon, Edgar A},
  Booktitle = {Proceedings of the IEEE International Symposium on Workload Characterization},
  Month = nov,
  Pages = {164--173},
  Title = {{Model-based, memory-centric performance and power optimization on NUMA multiprocessors}},
  Year = {2012}
}

@inproceedings{beckingsale:ipdps2017,
  Author = {D. Beckingsale and O. Pearce and I. Laguna and T. Gamblin},
  Booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  Month = {May},
  Pages = {307-316},
  Title = {Apollo: Reusable Models for Fast, Dynamic Tuning of Input-Dependent Code},
  Year = {2017}
}

@inproceedings{yoo2011automated,
  Author = {Yoo, W. and Larson, K. and Kim, S. and Ahn, W. and Campbell, R and Baugh, L.},
  Booktitle = {Proc. of USENIX Workshop on Hot topics in parallelism. USENIX Association},
  Title = {Automated Fingerprinting of Performance Pathologies using Performance Monitoring Units (PMUs)},
  Year = {2011}
}

@article{Bhowmick06,
  Author = {Sanjukta Bhowmick and Victor Eijkhouta and Yoav Freund and Erika Fuentes and David Keye},
  Journal = {Int. J. High Perf. Comput. Appl},
  Title = {Application of Machine Learning to the Selection of Sparse Linear Solvers},
  Year = {2006}
}

@inproceedings{ganapathi2009case,
  title={A case for machine learning to optimize multicore performance},
  author={Ganapathi, Archana and Datta, Kaushik and Fox, Armando and Patterson, David},
  booktitle={Proceedings of the First {USENIX} conference on {Hot} topics in parallelism},
  year={2009},
  organization={USENIX Association}
}

@inproceedings{duplyakin2016active,
  title={Active Learning in Performance Analysis},
  author={Duplyakin, Dmitry and Brown, Jed and Ricci, Robert},
  booktitle={Cluster Computing (CLUSTER), 2016 {IEEE} International Conference on},
  pages={182--191},
  year={2016},
  organization={IEEE}
}

@article{NorrisBJ14,
  Author = {Boyana Norris and Sa{-}Lin Bernstein and Ramya Nair and Elizabeth R. Jessup},
  Biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/NorrisBNJ14},
  Journal = {CoRR},
  Title = {Lighthouse: {A} User-Centered Web Service for Linear Algebra Software},
  Url = {http://arxiv.org/abs/1408.1363},
  Volume = {abs/1408.1363},
  Year = {2014},
}

@book{kullback1997information,
  title={Information theory and statistics},
  author={Kullback, Solomon},
  year={1997},
  publisher={Courier Corporation}
}

@inproceedings{vaswani2017attention,
  title={Attention is all you need},
  author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  booktitle={Advances in neural information processing systems},
  pages={5998--6008},
  year={2017}
}

@misc{dewancker2015bayesian,
  title={Bayesian optimization primer},
  author={Dewancker, Ian and McCourt, Michael and Clark, Scott},
  year={2015}
}

@book{mockus2012bayesian,
  title={Bayesian approach to global optimization: theory and applications},
  author={Mockus, Jonas},
  volume={37},
  year={2012},
  publisher={Springer Science \& Business Media}
}

@techreport{arnold:ibm-tr00,
  Author = {Matthew Arnold and Peter F. Sweeney},
  Institution = {IBM Research Division},
  Keywords = {cct,sampling,profiling,java},
  Month = {July 7},
  Number = {RC 21789 (98099)},
  Title = {Approximating the Calling Context Tree via Sampling},
  Year = {2000}
}

@inproceedings{tallent:ics11,
  Author = {N. Tallent and J. Mellor-Crummey and M. Franco and R. Landrum and L. Adhianto},
  Booktitle = ICS11,
  Month = jun,
  Title = {{Scalable Fine-grained Call Path Tracing}},
  Year = 2011
}

@ARTICLE{decision-trees,
  author={S. R. {Safavian} and D. {Landgrebe}},
  journal={IEEE Transactions on Systems, Man, and Cybernetics},
  title={A survey of decision tree classifier methodology},
  year={1991},
  volume={21},
  number={3},
  pages={660-674},
}

@article{Cortes:1995,
  Author = {Corinna Cortes and Vladimir Vapnik},
  Issue_Date = {Sept. 1995},
  Journal = {Mach. Learn.},
  Month = {September},
  Number = {3},
  Numpages = {25},
  Pages = {273--297},
  Title = {Support-Vector Networks},
  Volume = {20},
  Year = {1995}
}

@article{Smola:2004,
  Author = {Smola, Alex J. and Sch\"{o}lkopf, Bernhard},
  Issue_Date = {August 2004},
  Journal = {Statistics and Computing},
  Month = {August},
  Number = {3},
  Numpages = {24},
  Pages = {199--222},
  Title = {A Tutorial on Support Vector Regression},
  Volume = {14},
  Year = {2004}
}

@article{KalchbrennerGB14,
  author    = {Nal Kalchbrenner and
               Edward Grefenstette and
               Phil Blunsom},
  title     = {A Convolutional Neural Network for Modelling Sentences},
  journal   = {CoRR},
  volume    = {abs/1404.2188},
  year      = {2014},
  url       = {http://arxiv.org/abs/1404.2188},
  archivePrefix = {arXiv},
  eprint    = {1404.2188},
}

@ARTICLE{Bronstein:spm2017,
  author={M. M. {Bronstein} and J. {Bruna} and Y. {LeCun} and A. {Szlam} and P. {Vandergheynst}},
  journal={IEEE Signal Processing Magazine},
  title={Geometric Deep Learning: Going beyond Euclidean data},
  year={2017},
  volume={34},
  number={4},
  pages={18-42},
}

@incollection{Cuturi2013,
  title = {Sinkhorn Distances: Lightspeed Computation of Optimal Transport},
  author = {Cuturi, Marco},
  booktitle = {Advances in Neural Information Processing Systems 26},
  editor = {C. J. C. Burges and L. Bottou and M. Welling and Z. Ghahramani and K. Q. Weinberger},
  pages = {2292--2300},
  year = {2013},
  publisher = {Curran Associates, Inc.},
}

@article{Vallisneri2008,
  title = {Use and abuse of the Fisher information matrix in the assessment of gravitational-wave parameter-estimation prospects},
  author = {Vallisneri, Michele},
  journal = {Phys. Rev. D},
  volume = {77},
  issue = {4},
  pages = {042001},
  numpages = {20},
  year = {2008},
  month = {Feb},
  publisher = {American Physical Society},
}

@article{dietterich2000experimental,
    Author = {Dietterich, Thomas G},
    Journal = {Machine learning},
    Number = {2},
    Pages = {139--157},
    Publisher = {Springer},
    Title = {An experimental comparison of three methods for constructing ensembles of decision trees: Bagging, boosting, and randomization},
    Volume = {40},
    Year = {2000}}

@article{friedman2002stochastic,
    Author = {Friedman, Jerome H},
    Journal = {Computational Statistics \& Data Analysis},
    Number = {4},
    Pages = {367--378},
    Publisher = {Elsevier},
    Title = {Stochastic gradient boosting},
    Volume = {38},
    Year = {2002}}

@article{geurts2006extremely,
    Author = {Geurts, Pierre and Ernst, Damien and Wehenkel, Louis},
    Journal = {Machine learning},
    Number = {1},
    Pages = {3--42},
    Publisher = {Springer},
    Title = {Extremely randomized trees},
    Volume = {63},
    Year = {2006}}

@article{natekin2013gradient,
    Author = {Natekin, Alexey and Knoll, Alois},
    Journal = {Frontiers in neurorobotics},
    Publisher = {Frontiers Media SA},
    Title = {Gradient boosting machines, a tutorial},
    Volume = {7},
    Year = {2013}}

@article{TaiSM15,
  author    = {Kai Sheng Tai and
               Richard Socher and
               Christopher D. Manning},
  title     = {Improved Semantic Representations From Tree-Structured Long Short-Term
               Memory Networks},
  journal   = {CoRR},
  volume    = {abs/1503.00075},
  year      = {2015},
  url       = {http://arxiv.org/abs/1503.00075},
  eprint    = {1503.00075},
}

@article{KipfW16,
  author    = {Thomas N. Kipf and
               Max Welling},
  title     = {Semi-Supervised Classification with Graph Convolutional Networks},
  journal   = {CoRR},
  volume    = {abs/1609.02907},
  year      = {2016},
  url       = {http://arxiv.org/abs/1609.02907},
  eprint    = {1609.02907},
}

@InProceedings{Xiao_2015_CVPR,
author = {Xiao, Tianjun and Xu, Yichong and Yang, Kuiyuan and Zhang, Jiaxing and Peng, Yuxin and Zhang, Zheng},
title = {The Application of Two-Level Attention Models in Deep Convolutional Neural Network for Fine-Grained Image Classification},
booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2015}
}

@techreport{Hornung:0AUN2GzA,
    Author = {Hornung, R D and Keasler, J A},
    Institution = {Lawrence Livermore National Laboratory},
    Month = sep,
    Number = {LLNL-TR-661403},
    Title = {{The RAJA Portability Layer: Overview and Status}},
    Year = {2014}}

@inproceedings{RadfordMC15,
  author    = {Alec Radford and
               Luke Metz and
               Soumith Chintala},
  editor    = {Yoshua Bengio and
               Yann LeCun},
  title     = {Unsupervised Representation Learning with Deep Convolutional Generative
               Adversarial Networks},
  booktitle = {4th International Conference on Learning Representations, {ICLR} 2016,
               San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings},
  year      = {2016},
}

@inproceedings{PeKePa03,
    Author = {Petrini, Fabrizio and Kerbyson, Darren J. and Pakin, Scott},
    Booktitle = {Proceedings of the 2003 ACM/IEEE conference on Supercomputing (SC'03)},
    Location = {Phoenix, AZ, USA},
    Title = {{The Case of the Missing Supercomputer Performance: Achieving Optimal Performance on the 8,192 Processors of ASCI Q}},
    Year = {2003}
}

@INPROCEEDINGS{kim:dragonfly,
    author={J. Kim and W. J. Dally and S. Scott and D. Abts},
    booktitle={2008 International Symposium on Computer Architecture},
    title={Technology-Driven, Highly-Scalable Dragonfly Topology},
    year={2008},
    publisher = {IEEE Computer Society},
}

@misc{nersc:cori_system,
  title = {Cori System},
  url = {https://docs.nersc.gov/systems/cori/},
  publisher = {NERSC},
}

@misc{alcf:theta,
  title = {Theta System},
  url = {https://www.alcf.anl.gov/alcf-resources/theta},
  publisher = {ALCF},
}

@misc{nersc:perlmutter,
  title = {Perlmutter Architecture},
  url = {https://docs.nersc.gov/systems/perlmutter/architecture/},
  publisher = {NERSC},
}

@misc{uva:rivanna,
  title = {Rivanna},
  url = {https://www.rc.virginia.edu/userinfo/rivanna/overview/},
  publisher = {University of Virginia Research Computing},
}

@INPROCEEDINGS{groves:perf_variability_on_dragonfly,
    author={T. {Groves} and Y. {Gu} and N. J. {Wright}},
    booktitle={2017 IEEE International Conference on Cluster Computing (CLUSTER)},
    title={Understanding Performance Variability on the Aries Dragonfly Network},
    year={2017},
    volume={},
    number={},
    pages={809-813},
}

@article{prettenhofer2014gradient,
  title={Gradient boosted regression trees in scikit-learn},
  author={Prettenhofer, Peter and Louppe, Gilles},
  year={2014}
}

@article{freund2003efficient,
  title={An efficient boosting algorithm for combining preferences},
  author={Freund, Yoav and Iyer, Raj and Schapire, Robert E and Singer, Yoram},
  journal={Journal of machine learning research},
  volume={4},
  number={Nov},
  pages={933--969},
  year={2003}
}

@article{srivastava2013improving,
  title={Improving neural networks with dropout},
  author={Srivastava, Nitish},
  journal={University of Toronto},
  volume={182},
  number={566},
  pages={7},
  year={2013}
}

@inproceedings{Chunduri:2017,
  Address = {New York, NY, USA},
  Author = {Chunduri, Sudheer and Harms, Kevin and Parker, Scott and Morozov, Vitali and Oshin, Samuel and Cherukuri, Naveen and Kumaran, Kalyan},
  Booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  Location = {Denver, Colorado},
  Publisher = {ACM},
  Series = {SC '17},
  Title = {Run-to-run Variability on Xeon Phi Based Cray XC Systems},
  Year = {2017},
}

@inproceedings{jha:cluster2018,
    Author = {S. {Jha} and J. {Brandt} and A. {Gentile} and Z. {Kalbarczyk} and R. {Iyer}},
    Booktitle = {2018 IEEE International Conference on Cluster Computing (CLUSTER)},
    Doi = {10.1109/CLUSTER.2018.00072},
    Month = {Sep.},
    Pages = {562-570},
    Title = {Characterizing Supercomputer Traffic Networks Through Link-Level Analysis},
    Year = {2018},
}

@inproceedings{skinner2005understanding,
  Author = {Skinner, David and Kramer, William},
  Booktitle = {Workload Characterization Symposium, 2005. Proceedings of the IEEE International},
  Organization = {IEEE},
  Pages = {137--149},
  Title = {Understanding the causes of performance variability in HPC workloads},
  Year = {2005}
}

@article{tuncer:tpds2019,
  author={O. {Tuncer} and E. {Ates} and Y. {Zhang} and A. {Turk} and J. {Brandt} and V. J. {Leung} and M. {Egele} and A. K. {Coskun}},
  journal={IEEE Transactions on Parallel and Distributed Systems},
  title={Online Diagnosis of Performance Variation in HPC Systems Using Machine Learning},
  year={2019},
  volume={30},
  pages={883-896}
}

@article{sandri2010analysis,
  title={Analysis and correction of bias in total decrease in node impurity measures for tree-based algorithms},
  author={Sandri, Marco and Zuccolotto, Paola},
  journal={Statistics and Computing},
  volume={20},
  number={4},
  pages={393--407},
  year={2010},
  publisher={Springer}
}

@misc{ariescounters,
    Howpublished = {\url{http://docs.cray.com/books/S-0045-20/S-0045-20.pdf}},
    Title = {Aries Hardware Counters (S-0045-20)},
    Year = {2017}
}

@article{milc00,
  Author = {Claude Bernard and Tom Burch and Thomas A. DeGrand and Carleton DeTar and Steven Gottlieb and Urs M. Heller and James E. Hetrick and Kostas Orginos and Bob Sugar and Doug Toussaint},
  Journal = {Physical Review D},
  Number = {61},
  Title = {{Scaling tests of the improved Kogut-Susskind quark action}},
  Year = 2000}

@INPROCEEDINGS{vite:ipdps18,
    author={S. {Ghosh} and M. {Halappanavar} and A. {Tumeo} and A. {Kalyanaraman} and H. {Lu} and D. {Chavarrià-Miranda} and A. {Khan} and A. {Gebremedhin}},
    booktitle={2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
    title={Distributed Louvain Algorithm for Graph Community Detection},
    year={2018},
    pages={885-895},
    doi={10.1109/IPDPS.2018.00098},
    ISSN={1530-2075},
    month={May}
}

@INPROCEEDINGS{minivite:pmbs18,
    author={S. {Ghosh} and M. {Halappanavar} and A. {Tumeo} and A. {Kalyanaraman} and A. H. {Gebremedhin}},
    booktitle={2018 IEEE/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
    title={MiniVite: A Graph Analytics Benchmarking Tool for Massively Parallel Systems},
    year={2018},
    pages={51-56},
    doi={10.1109/PMBS.2018.8641631},
    month={Nov},
}

@article{friedman2001greedy,
  Author = {Friedman, Jerome H.},
  Journal = {The Annals of Statistics},
  Number = {5},
  Pages = {pp. 1189-1232},
  Publisher = {Institute of Mathematical Statistics},
  Title = {Greedy Function Approximation: A Gradient Boosting Machine},
  Volume = {29},
  Year = {2001}
}

@phdthesis{ugal,
  Author = {Arjun Singh},
  Note = {\url{http://cva.stanford.edu/publications/2005/thesis_arjuns.pdf}},
  School = {Dept. of Electrical Engineering, Stanford University},
  Title = {Load-Balanced Routing in Interconnection Networks},
  Year = {2005}
}

@inproceedings{wolski:sc97,
  author = {Wolski, Rich and Spring, Neil and Peterson, Chris},
  title = {Implementing a Performance Forecasting System for Metacomputing: The Network Weather Service},
  year = {1997},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/509593.509600},
  doi = {10.1145/509593.509600},
  booktitle = {Proceedings of the 1997 ACM/IEEE Conference on Supercomputing},
  location = {San Jose, CA},
  series = {SC ’97}
}

@inproceedings{hoefler:sc2010,
  author = {Hoefler, Torsten and Schneider, Timo and Lumsdaine, Andrew},
  title = {Characterizing the Influence of System Noise on Large-Scale Applications by Simulation},
  year = {2010},
  publisher = {IEEE Computer Society},
  address = {USA},
  url = {https://doi.org/10.1109/SC.2010.12},
  booktitle = {Proceedings of the 2010 ACM/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis},
  series = {SC ’10}
}

@article{agelastos:parco2016,
  author = "Anthony Agelastos and Benjamin Allan and Jim Brandt and Ann Gentile and Sophia Lefantzi and Steve Monk and Jeff Ogden and Mahesh Rajan and Joel Stevenson",
  title = "Continuous whole-system monitoring toward rapid understanding of production HPC applications and systems",
  journal = "Parallel Computing",
  volume = "58",
  pages = "90 - 106",
  year = "2016",
  issn = "0167-8191",
  doi = "https://doi.org/10.1016/j.parco.2016.05.009",
  url = "http://www.sciencedirect.com/science/article/pii/S0167819116300394",
}

@incollection{hoppe:2020,
  author = {Hoppe, Dennis and Zhong, Li and Andersson, Stefan and Moise, Diana},
  title = {On the Detection and Interpretation of Performance Variations of HPC Applications},
  year = {2020},
  month = {03},
  pages = {41-56},
  booktitle = {Sustained Simulation Performance 2018 and 2019},
  publisher = {Springer},
  editor = {M. Resch and Y. Kovalenko and W. Bez and E. Focht and H. Kobayashi}
}

@INPROCEEDINGS{chunduri:pmbs2019,
  author={S. {Chunduri} and E. {Jennings} and K. {Harms} and C. {Knight} and S. {Parker}},
  booktitle={2019 IEEE/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
  title={A Generalized Statistics-Based Model for Predicting Network-Induced Variability},
  year={2019},
  pages={59-72},
}

@inproceedings{MONET:NSDI:2020,
 author = {S. Jha and  A. Patke and B. Lim and  J. Brandt and A. Gentile and G. Bauer and M. Showerman and L. Kaplan and Z. Kalbarczyk and W. T. Kramer and R. Iyer},
 booktitle = {17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)},
 month = {Feb},
 title = {Measuring Congestion in High-Performance Datacenter Interconnects},
 year = {2020}
}

@INPROCEEDINGS{wyatt:ipdps2020,
  author={M. R. {Wyatt} and S. {Herbein} and K. {Shoga} and T. {Gamblin} and M. {Taufer}},
  booktitle={2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  title={CanarIO: Sounding the Alarm on IO-Related Performance Degradation},
  year={2020},
  volume={},
  number={},
  pages={73-83},
}

@INPROCEEDINGS{di:dsn2019,
  author={S. {Di} and H. {Guo} and E. {Pershey} and M. {Snir} and F. {Cappello}},
  booktitle={2019 49th Annual IEEE/IFIP International Conference on Dependable Systems and Networks (DSN)},
  title={Characterizing and Understanding HPC Job Failures Over The 2K-Day Life of IBM BlueGene/Q System},
  year={2019},
  volume={},
  number={},
  pages={473-484},
}

@inproceedings{nowak1999,
    Author = {Nowak, P. F. and Nemanic, M. K.},
    Booktitle = {Proceedings of the International Conference on Mathematics and Computation, Reactor Physics and Environmental Analysis in Nuclear Applications},
    Month = sep,
    Title = {Radiation Transport Calculations on Unstructured Grids Using a Spatially Decomposed and Threaded Algorithm},
    Year = 1999
}

@inproceedings{mou:aaai2016,
  author = {Mou, Lili and Li, Ge and Zhang, Lu and Wang, Tao and Jin, Zhi},
  title = {Convolutional Neural Networks over Tree Structures for Programming Language Processing},
  year = {2016},
  publisher = {AAAI Press},
  booktitle = {Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence},
  pages = {1287–1293},
  numpages = {7},
  location = {Phoenix, Arizona},
  series = {AAAI’16}
}

@INPROCEEDINGS{subramoni:cluster2013,
  author={H. {Subramoni} and D. {Bureddy} and K. {Kandalla} and K. {Schulz} and B. {Barth} and J. {Perkins} and M. {Arnold} and D. K. {Panda}},
  booktitle={2013 IEEE International Conference on Cluster Computing (CLUSTER)},
  title={Design of network topology aware scheduling services for large InfiniBand clusters},
  year={2013},
}

@INPROCEEDINGS{lu:globecom2019,
  author={J. {Lu} and P. {Li} and K. {Wang} and H. {Feng} and E. {Guo} and X. {Wang} and S. {Guo}},
  booktitle={2019 IEEE Global Communications Conference (GLOBECOM)},
  title={Topology-Aware Job Scheduling for Machine Learning Cluster},
  year={2019},
}

@INPROCEEDINGS{ldms,
  author={A. {Agelastos} and B. {Allan} and J. {Brandt} and P. {Cassella} and J. {Enos} and J. {Fullop} and A. {Gentile} and S. {Monk} and N. {Naksinehaboon} and J. {Ogden} and M. {Rajan} and M. {Showerman} and J. {Stevenson} and N. {Taerat} and T. {Tucker}},
  booktitle={SC '14: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  title={The Lightweight Distributed Metric Service: A Scalable Infrastructure for Continuous Monitoring of Large Scale Computing Systems and Applications},
  year={2014},
  volume={},
  number={},
  pages={154-165},
}

@article{graham1982gprof,
    Author = {Graham, Susan L and Kessler, Peter B and Mckusick, Marshall K},
    Date-Modified = {2018-12-08 06:15:33 +0000},
    Journal = {SIGPLAN Not.},
    Number = {6},
    Pages = {120--126},
    Publisher = {ACM},
    Title = {Gprof: A call graph execution profiler},
    Volume = {17},
    Year = {1982}}
}

@article{mellorcrummey+:jsc02,
    Author = {J. Mellor-Crummey and R. Fowler and G. Marin},
    Date-Added = {2012-01-16 03:42:52 -0800},
    Date-Modified = {2012-01-16 03:42:52 -0800},
    Journal = {The Journal of Supercomputing},
    Pages = {81-101},
    Title = {{HPCView: A tool for top-down analysis of node performance}},
    Volume = 23,
    Year = 2002
}

@misc{gregg:flame2015,
    Author = {Gregg, Brendan},
    Date-Added = {2019-04-11 00:12:41 -0700},
    Date-Modified = {2019-04-11 00:42:39 -0700},
    Howpublished = {Online},
    Note = {http://www.brendangregg.com/Slides/FreeBSD2014\_FlameGraphs.pdf},
    Title = {Flame graphs},
    Year = {2015}
}

@inproceedings{huck+:perfdmf2005,
    Author = {Kevin Huck and Allen D. Malony and R Bell and L Li and A Morris},
    Booktitle = {International Conference on Parallel Processing (ICPP'05)},
    Date-Added = {2019-04-11 00:06:10 -0700},
    Date-Modified = {2019-04-11 00:07:27 -0700},
    Title = {PerfDMF: Design and implementation of a parallel performance data management framework},
    Year = {2005}
}

@inproceedings{mckenney:differential1995,
    Author = {P. E. {McKenney}},
    Booktitle = {MASCOTS '95. Proceedings of the Third International Workshop on Modeling, Analysis, and Simulation of Computer and Telecommunication Systems},
    Date-Modified = {2019-04-11 00:08:35 -0700},
    Doi = {10.1109/MASCOT.1995.378681},
    Month = {Jan},
    Pages = {237-241},
    Title = {Differential profiling},
    Year = {1995},
    Bdsk-Url-1 = {https://doi.org/10.1109/MASCOT.1995.378681}
}

@inproceedings{schulz+:differential07,
    Address = {Berlin, Heidelberg},
    Author = {Schulz, Martin and de Supinski, Bronis R.},
    Booktitle = {Euro-Par 2007 Parallel Processing},
    Date-Modified = {2019-04-11 00:09:03 -0700},
    Editor = {Kermarrec, Anne-Marie and Boug{\'e}, Luc and Priol, Thierry},
    Isbn = {978-3-540-74466-5},
    Pages = {97--106},
    Publisher = {Springer Berlin Heidelberg},
    Title = {Practical Differential Profiling},
    Year = {2007}
}

@inproceedings{tallent+:sc09,
    Author = {Nathan R. Tallent and John M. Mellor-Crummey and Laksono Adhianto and Michael W. Fagan and Mark Krentel},
    Booktitle = super09,
    Date-Added = {2012-01-16 03:17:15 -0800},
    Date-Modified = {2012-01-16 03:17:15 -0800},
    Month = nov,
    Title = {{Diagnosing performance bottlenecks in emerging petascale applications}},
    Year = 2011
}

@inproceedings{tallent+:sc10,
    Author = {Nathan R. Tallent and Laksono Adhianto and John M. Mellor-Crummey},
    Booktitle = super10,
    Date-Added = {2012-01-16 03:17:15 -0800},
    Date-Modified = {2012-01-16 03:17:15 -0800},
    Month = nov,
    Title = {Scalable Identification of Load Imbalance in Parallel Executions Using Call Path Profiles},
    Year = 2010
}

@article{shende:tau2006,
  title={The TAU parallel performance system},
  author={Shende, Sameer S and Malony, Allen D},
  journal={The International Journal of High Performance Computing Applications},
  volume={20},
  number={2},
  pages={287--311},
  year={2006},
  publisher={Sage Publications Sage CA: Thousand Oaks, CA}
}

@inproceedings{bell:paraprof2003,
  title={Paraprof: A portable, extensible, and scalable tool for parallel performance profile analysis},
  author={Bell, Robert and Malony, Allen D and Shende, Sameer},
  booktitle={European Conference on Parallel Processing},
  pages={17--26},
  year={2003},
  organization={Springer}
}

@inproceedings{huck:perfexplorer2005,
  title={Perfexplorer: A performance data mining framework for large-scale parallel computing},
  author={Huck, Kevin A and Malony, Allen D},
  booktitle={SC'05: Proceedings of the 2005 ACM/IEEE conference on Supercomputing},
  pages={41--41},
  year={2005},
  organization={IEEE}
}

@misc{openspeedshop,
  author = {The Open|SpeedShop Team.},
  note={Accessed November 2020},
  title = { Open|SpeedShop for Linux},
  howpublished = {https://openspeedshop.org/}
}

@misc{team2013r,
  title={R: A language and environment for statistical computing},
  author={Team, R Core and others},
  year={2013},
  publisher={Vienna, Austria}
}

@book{mckinney:pandas,
    Author = {Wes McKinney},
    Date-Added = {2019-04-10 23:58:27 -0700},
    Date-Modified = {2019-04-10 23:58:56 -0700},
    Isbn = {1491957662},
    Publisher = {O'Reilly Media},
    Title = {{Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython}},
    Year = {2017},
}

@InProceedings{mckinney:pandas2,
  author    = { Wes McKinney },
  title     = { Data Structures for Statistical Computing in Python },
  booktitle = { Proceedings of the 9th Python in Science Conference },
  pages     = { 51 - 56 },
  year      = { 2010 },
  editor    = { St\'efan van der Walt and Jarrod Millman }
}

@INPROCEEDINGS{song:icpp2004,
  author={F. {Song} and F. {Wolf} and N. {Bhatia} and J. {Dongarra} and S. {Moore}},
  booktitle={International Conference on Parallel Processing, 2004. ICPP 2004.},
  title={An algebra for cross-experiment performance analysis},
  year={2004},
  volume={},
  number={},
  pages={63-72 vol.1},
  doi={10.1109/ICPP.2004.1327905}
}

@inproceedings{acun:sc14,
    Author = {Acun, Bilge and Gupta, Abhishek and Jain, Nikhil and Langer, Akhil and Menon, Harshitha and Mikida, Eric and Ni, Xiang and Robson, Michael and Sun, Yanhua and Totoni, Ehsan and Wesolowski, Lukasz and Kale, Laxmikant},
    Series = {SC},
    Title = {{Parallel Programming with Migratable Objects: Charm++ in Practice}},
    Year = {2014}
}

@inproceedings{Kaiser.PGAS.14,
 author = {Kaiser, Hartmut and Heller, Thomas and Adelstein-Lelbach, Bryce and Serio, Adrian and Fey, Dietmar},
 title = {HPX: A Task Based Programming Model in a Global Address Space},
 booktitle = {Proceedings of the 8th International Conference on Partitioned Global Address Space Programming Models},
 series = {PGAS '14},
 year = {2014},
 isbn = {978-1-4503-3247-7},
 location = {Eugene, OR, USA},
 pages = {6:1--6:11},
 articleno = {6},
 numpages = {11},
 @url = {http://doi.acm.org/10.1145/2676870.2676883},
 @doi = {10.1145/2676870.2676883},
 acmid = {2676883},
 publisher = {ACM},
 address = {New York, NY, USA},
}

@MISC{OpenMP4,
  title = {{OpenMP Application Program Interface. Version 4.0. July 2013}},
  key = {OpenMP4},
  year = {2013}
}

@ARTICLE{Duran.2011.PPL,
  author = {Duran, Alejandro and Ayguad{\'e}, Eduard and Badia, Rosa M and Labarta,
  Jes{\'u}s and Martinell, Luis and Martorell, Xavier and Planas, Judit},
  title = {{OmpSs}: A Proposal for Programming Heterogeneous Multi-core Architectures},
  journal = {Parallel Processing Letters},
  month = jun,
  year = {2011},
  volume = {21},
  pages = {173--193},
  number = {2},
  publisher = {World Scientific}
}

@INPROCEEDINGS{Bauer.2012.SC,
  author = {Bauer, Michael and Treichler, Sean and Slaughter, Elliott and Aiken,
  Alex},
  title = {Legion: Expressing Locality and Independence with Logical Regions},
  booktitle = {Proceedings of the 2012 ACM/IEEE International Conference on High
  Performance Computing, Networking, Storage and Analysis},
  year = {2012},
  series = {SC '12},
  pages = {66:1--66:11},
  address = {Los Alamitos, CA, USA},
  publisher = {IEEE Computer Society},
  acmid = {2389086},
  articleno = {66},
  isbn = {978-1-4673-0804-5},
  location = {Salt Lake City, Utah},
  numpages = {11},
}

@misc{modin,
  author = {Modin Developers},
  note={Accessed November 2020},
  title = { Modin: Speed up your Pandas workflows by changing a single line of code},
  howpublished = {https://modin.readthedocs.io/en/latest/}
}

@inproceedings{CharmppOOPSLA93,
    Author = {{Kal\'{e}}, L.V. and Krishnan, S.},
    Booktitle = {{Proceedings of OOPSLA'93}},
    Editor = {Paepcke, A.},
    Fulleditor = {Paepcke, Andreas},
    Month = {September},
    Pages = {91--108},
    Publisher = {{ACM Press}},
    Title = {{CHARM++: A Portable Concurrent Object Oriented System Based on C++}},
    Year = {1993}
}

@misc{charm4py,
  title = {Charm4py},
  howpublished = {https://charm4py.readthedocs.io}
}

@INPROCEEDINGS{charmpy,
  author={J. J. {Galvez} and K. {Senthil} and L. {Kale}},
  booktitle={2018 IEEE International Conference on Cluster Computing (CLUSTER)},
  title={CharmPy: A Python Parallel Programming Model},
  year={2018},
  volume={},
  number={},
  pages={423-433},
  doi={10.1109/CLUSTER.2018.00059}}

@Article{Jones2008,
  author={Jones, Kate E.
    and Patel, Nikkita G.
      and Levy, Marc A.
      and Storeygard, Adam
      and Balk, Deborah
      and Gittleman, John L.
      and Daszak, Peter},
  title={Global trends in emerging infectious diseases},
  journal={Nature},
  year={2008},
  month={Feb},
  day={01},
  volume={451},
  number={7181},
  pages={990-993},
  abstract={Emerging infectious diseases are a major threat to health: AIDS, SARS, drug-resistant bacteria and Ebola virus are among the more recent examples. By identifying emerging disease 'hotspots', the thinking goes, it should be possible to spot health risks at an early stage and prepare containment strategies. An analysis of over 300 examples of disease emerging between 1940 and 2004 suggests that these hotspots can be accurately mapped based on socio-economic, environmental and ecological factors. The data show that the surveillance effort, and much current research spending, is concentrated in developed economies, yet the risk maps point to developing countries as the more likely source of new diseases.},
  issn={1476-4687},
  doi={10.1038/nature06536},
  url={https://doi.org/10.1038/nature06536}
}

@article{hethcote2000mathematics,
  title={The mathematics of infectious diseases},
  author={Hethcote, Herbert W},
  journal={SIAM review},
  volume={42},
  number={4},
  pages={599--653},
  year={2000},
  publisher={SIAM}
}


@article{babel_k-partitioning_1998,
  title = {The k-partitioning problem},
  volume = {47},
  issn = {1432-2994, 1432-5217},
  url = {http://link.springer.com/10.1007/BF01193837},
  doi = {10.1007/BF01193837},
  abstract = {The k-partitioning problem is defined as follows: Given a set of items \{I1,12,..., In\} where i t e m / j is of weight wj {\textgreater} O, find a partition S1, \$2,..., Sm of this set with IsA = k such that the maximum weight of all subsets Si is minimal, k-partitioning is strongly related to the classical multiproeessor scheduling problem of minimizing the makespan on identical machines. This paper provides suitable tools for the construction of algorithms which solve exactly the problem. Several approximation algorithms are presented for this NP-hard problem. The worst-case behavior of the algorithms is analyzed. The best of these algorithms achieves a performance bound of 4]3.},
  language = {en},
  number = {1},
  urldate = {2022-02-04},
  journal = {Mathematical Methods of Operations Research},
  author = {Babel, Luitpold and Kellerer, Hans and Kotov, Vladimir},
  month = feb,
  year = {1998},
  pages = {59--82},
  annote = {Outlines k-partitioning problem (Dividing a list of numbers into partitions of size k so as to minimize the largest sum of elements in a set \$min\_\{P\} max\_\{p {\textbackslash}in P\} {\textbackslash}sum\_\{n {\textbackslash}in p\} n\$) and a few algorithms for solving it approximately. Mostly focuses on the quality of the worst-case result relative to the scheduling problem (which is similar but lacks the size constraint). Does not consider computational complexity of the algorithms.},
  annote = {Since we already have starting partitions, I think two-way swaps are probably a reasonable thing to try first.},
  file = {Babel et al. - 1998 - Thek-partitioning problem.pdf:C\:\\Users\\bitqu\\Zotero\\storage\\EXL7SZ7I\\Babel et al. - 1998 - Thek-partitioning problem.pdf:application/pdf},
}

@article{cuDNN,
  author    = {Sharan Chetlur and
               Cliff Woolley and
               Philippe Vandermersch and
               Jonathan Cohen and
               John Tran and
               Bryan Catanzaro and
               Evan Shelhamer},
  title     = {cuDNN: Efficient Primitives for Deep Learning},
  journal   = {CoRR},
  volume    = {abs/1410.0759},
  year      = {2014},
  url       = {http://arxiv.org/abs/1410.0759},
  archivePrefix = {arXiv},
  eprint    = {1410.0759},
  timestamp = {Mon, 13 Aug 2018 16:48:28 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/ChetlurWVCTCS14.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@misc{jeh2019miopen,
    title={MIOpen: An Open Source Library For Deep Learning Primitives},
    author={Jehandad Khan and Paul Fultz and Artem Tamazov and Daniel Lowell and Chao Liu and Michael Melesse and Murali Nandhimandalam and Kamil Nasyrov and Ilya Perminov and Tejash Shah and Vasilii Filippov and Jing Zhang and Jing Zhou and Bragadeesh Natarajan and Mayank Daga},
    year={2019},
    eprint={1910.00078},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

@inproceedings{gamma-iccad2019,
  author = {Kao, Sheng-Chun and Krishna, Tushar},
  title = {GAMMA: Automating the HW Mapping of DNN Models on Accelerators via Genetic Algorithm},
  year = {2020},
  isbn = {9781450380263},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3400302.3415639},
  doi = {10.1145/3400302.3415639},
  booktitle = {Proceedings of the 39th International Conference on Computer-Aided Design},
  articleno = {44},
  numpages = {9},
  keywords = {ML accelerator, reconfigurable device, genetic algorithm},
  location = {Virtual Event, USA},
  series = {ICCAD '20}
}

@ARTICLE{maestro-micro2020,
  author={H. {Kwon} and P. {Chatarasi} and V. {Sarkar} and T. {Krishna} and M. {Pellauer} and A. {Parashar}},
  journal={IEEE Micro},
  title={MAESTRO: A Data-Centric Approach to Understand Reuse, Performance, and Hardware Cost of DNN Mappings},
  year={2020},
  volume={40},
  number={3},
  pages={20-29},
  doi={10.1109/MM.2020.2985963}
}

@inproceedings{lift-cgo2017,
  author = {Steuwer, Michel and Remmelg, Toomas and Dubach, Christophe},
  title = {Lift: A Functional Data-Parallel IR for High-Performance GPU Code Generation},
  year = {2017},
  isbn = {9781509049318},
  publisher = {IEEE Press},
  booktitle = {Proceedings of the 2017 International Symposium on Code Generation and Optimization},
  pages = {74–85},
  numpages = {12},
  location = {Austin, USA},
  series = {CGO '17}
}

@inproceedings{cnn-fpga-isfpg2017,
  author = {Ma, Yufei and Cao, Yu and Vrudhula, Sarma and Seo, Jae-sun},
  title = {Optimizing Loop Operation and Dataflow in FPGA Acceleration of Deep Convolutional Neural Networks},
  year = {2017},
  isbn = {9781450343541},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3020078.3021736},
  doi = {10.1145/3020078.3021736},
  booktitle = {Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  pages = {45–54},
  numpages = {10},
  keywords = {hardware acceleration, convolutional neural networks, FPGA},
  location = {Monterey, California, USA},
  series = {FPGA '17}
}

@INPROCEEDINGS{neurocube-isca2016,
  author={D. {Kim} and J. {Kung} and S. {Chai} and S. {Yalamanchili} and S. {Mukhopadhyay}},
  booktitle={2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)},
  title={Neurocube: A Programmable Digital Neuromorphic Architecture with High-Density 3D Memory},
  year={2016},
  volume={},
  number={},
  pages={380-392},
  doi={10.1109/ISCA.2016.41}
}

@INPROCEEDINGS{eyeriss-isca2016,
  author={Y. {Chen} and J. {Emer} and V. {Sze}},
  booktitle={2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)},
  title={Eyeriss: A Spatial Architecture for Energy-Efficient Dataflow for Convolutional Neural Networks},
  year={2016},
  volume={},
  number={},
  pages={367-379},
  doi={10.1109/ISCA.2016.40}
}

@ARTICLE{eyeriss-ssc2017,
  author={Y. {Chen} and T. {Krishna} and J. S. {Emer} and V. {Sze}},
  journal={IEEE Journal of Solid-State Circuits},
  title={Eyeriss: An Energy-Efficient Reconfigurable Accelerator for Deep Convolutional Neural Networks},
  year={2017},
  volume={52},
  number={1},
  pages={127-138},
  doi={10.1109/JSSC.2016.2616357}
}

@INPROCEEDINGS{hypar-hpca2019,
  author={L. {Song} and J. {Mao} and Y. {Zhuo} and X. {Qian} and H. {Li} and Y. {Chen}},
  booktitle={2019 IEEE International Symposium on High Performance Computer Architecture (HPCA)},
  title={HyPar: Towards Hybrid Parallelism for Deep Learning Accelerator Array},
  year={2019},
  volume={},
  number={},
  pages={56-68},
  doi={10.1109/HPCA.2019.00027}
}

@book{CharmAppsBook:2013,
  editor = {Kale, Laxmikant V. and Bhatele, Abhinav},
  title = "{Parallel Science and Engineering Applications: The Charm++ Approach}",
  year = {2013},
  month = nov,
  publisher = {Taylor \& Francis Group, CRC Press},
  isbn = {9781466504127}
}

@inproceedings{gamblin:sc15,
  author = {T. Gamblin and M. LeGendre and M. R. Collette and G. L. Lee and A. Moody and B. R. de Supinski and S. Futral},
  booktitle = {SC15: International Conference for High-Performance Computing, Networking, Storage and Analysis},
  title = {The Spack package manager: bringing order to HPC software chaos},
  year = {2015},
  issn = {2167-4337},
  doi = {10.1145/2807591.2807623},
  url = {https://doi.ieeecomputersociety.org/10.1145/2807591.2807623},
  publisher = {IEEE Computer Society},
  address = {Los Alamitos, CA, USA},
  month = {nov}
}

@inproceedings{seal:simutools2010,
  author = {Aaby, Brandon G. and Perumalla, Kalyan S. and Seal, Sudip K.},
  title = {Efficient Simulation of Agent-Based Models on Multi-GPU and Multi-Core Clusters},
  year = {2010},
  publisher = {ICST (Institute for Computer Sciences, Social-Informatics and Telecommunications Engineering)},
  address = {Brussels, BEL},
  url = {https://doi.org/10.4108/ICST.SIMUTOOLS2010.8822},
  booktitle = {Proceedings of the 3rd International ICST Conference on Simulation Tools and Techniques},
  location = {Torremolinos, Malaga, Spain},
  series = {SIMUTools '10}
}

@article{grefenstette2013fred,
  title={FRED (A Framework for Reconstructing Epidemic Dynamics): an open-source software system for modeling infectious diseases and control strategies using census-based populations},
  author={Grefenstette, John J and Brown, Shawn T and Rosenfeld, Roni and DePasse, Jay and Stone, Nathan TB and Cooley, Phillip C and Wheaton, William D and Fyshe, Alona and Galloway, David D and Sriram, Anuroop and others},
  journal={BMC public health},
  volume={13},
  number={1},
  pages={1--14},
  year={2013},
  publisher={BioMed Central}
}

@article{parker2011distributed,
  title={A distributed platform for global-scale agent-based models of disease transmission},
  author={Parker, Jon and Epstein, Joshua M},
  journal={ACM Transactions on Modeling and Computer Simulation (TOMACS)},
  volume={22},
  number={1},
  pages={1--25},
  year={2011},
  publisher={ACM New York, NY, USA}
}

@article{perumalla2012discrete,
  title={Discrete event modeling and massively parallel execution of epidemic outbreak phenomena},
  author={Perumalla, Kalyan S and Seal, Sudip K},
  journal={Simulation},
  volume={88},
  number={7},
  pages={768--783},
  year={2012},
  publisher={Sage Publications Sage UK: London, England}
}

@article{eubank2004modelling,
  title={Modelling disease outbreaks in realistic urban social networks},
  author={Eubank, Stephen and Guclu, Hasan and Kumar, VS Anil and Marathe, Madhav V and Srinivasan, Aravind and Toroczkai, Zoltan and Wang, Nan},
  journal={Nature},
  volume={429},
  number={6988},
  pages={180--184},
  year={2004},
  publisher={Nature Publishing Group}
}

@article{longini2005containing,
  title={Containing pandemic influenza at the source},
  author={Longini, Ira M and Nizam, Azhar and Xu, Shufu and Ungchusak, Kumnuan and Hanshaoworakul, Wanna and Cummings, Derek AT and Halloran, M Elizabeth},
  journal={Science},
  volume={309},
  number={5737},
  pages={1083--1087},
  year={2005},
  publisher={American Association for the Advancement of Science}
}

@article{ferguson2003planning,
  title={Planning for smallpox outbreaks},
  author={Ferguson, Neil M and Keeling, Matt J and Edmunds, W John and Gani, Raymond and Grenfell, Bryan T and Anderson, Roy M and Leach, Steve},
  journal={Nature},
  volume={425},
  number={6959},
  pages={681--685},
  year={2003},
  publisher={Nature Publishing Group}
}

@manual{google2020protobuf,
  title={Protocol Buffers},
  author={Google},
  howpublished="ver. 3.14.0",
  year={2020},
  url={https://github.com/protocolbuffers/protobuf/releases/tag/v3.14.0}
}

@manual{ppl2020charm,
  title={Charm++},
  author={Parallel Programing Laboratory},
  howpublished="ver 6.10.2",
  year={2020},
  url={https://github.com/UIUC-PPL/charm/releases/tag/v6.10.2}
}

@inproceedings{barrett2008episimdemics,
  title={EpiSimdemics: an efficient algorithm for simulating the spread of infectious disease over large realistic social networks},
  author={Barrett, Christopher L and Bisset, Keith R and Eubank, Stephen G and Feng, Xizhou and Marathe, Madhav V},
  booktitle={SC'08: Proceedings of the 2008 ACM/IEEE Conference on Supercomputing},
  pages={1--12},
  year={2008},
  organization={IEEE}
}

@inproceedings{yeom2014overcoming,
  title={Overcoming the scalability challenges of epidemic simulations on blue waters},
  author={Yeom, Jae-Seung and Bhatele, Abhinav and Bisset, Keith and Bohm, Eric and Gupta, Abhishek and Kale, Laxmikant V and Marathe, Madhav and Nikolopoulos, Dimitrios S and Schulz, Martin and Wesolowski, Lukasz},
  booktitle={2014 IEEE 28th International Parallel and Distributed Processing Symposium},
  pages={755--764},
  year={2014},
  organization={IEEE}
}

@inproceedings{bhatele2017massively,
  title={Massively parallel simulations of spread of infectious diseases over realistic social networks},
  author={Bhatele, Abhinav and Yeom, Jae-Seung and Jain, Nikhil and Kuhlman, Chris J and Livnat, Yarden and Bisset, Keith R and Kale, Laxmikant V and Marathe, Madhav V},
  booktitle={2017 17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGRID)},
  pages={689--694},
  year={2017},
  organization={IEEE}
}

@article{bennun2019demystifying,
  author = {Ben-Nun, Tal and Hoefler, Torsten},
  title = {Demystifying Parallel and Distributed Deep Learning: An In-Depth Concurrency Analysis},
  year = {2019},
  issue_date = {September 2019},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  volume = {52},
  number = {4},
  issn = {0360-0300},
  url = {https://doi.org/10.1145/3320060},
  doi = {10.1145/3320060},
  abstract = {Deep Neural Networks (DNNs) are becoming an important tool in modern computing applications. Accelerating their training is a major challenge and techniques range from distributed algorithms to low-level circuit design. In this survey, we describe the problem from a theoretical perspective, followed by approaches for its parallelization. We present trends in DNN architectures and the resulting implications on parallelization strategies. We then review and model the different types of concurrency in DNNs: from the single operator, through parallelism in network inference and training, to distributed deep learning. We discuss asynchronous stochastic optimization, distributed system architectures, communication schemes, and neural architecture search. Based on those approaches, we extrapolate potential directions for parallelism in deep learning.},
  journal = {ACM Comput. Surv.},
  month = aug,
  articleno = {65},
  numpages = {43},
  keywords = {Deep learning, parallel algorithms, distributed computing}
}

@article{pouyanfar2018dlsurvey,
  author = {Pouyanfar, Samira and Sadiq, Saad and Yan, Yilin and Tian, Haiman and Tao, Yudong and Reyes, Maria Presa and Shyu, Mei-Ling and Chen, Shu-Ching and Iyengar, S. S.},
  title = {A Survey on Deep Learning: Algorithms, Techniques, and Applications},
  year = {2018},
  issue_date = {January 2019},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  volume = {51},
  number = {5},
  issn = {0360-0300},
  url = {https://doi.org/10.1145/3234150},
  doi = {10.1145/3234150},
  abstract = {The field of machine learning is witnessing its golden era as deep learning slowly becomes the leader in this domain. Deep learning uses multiple layers to represent the abstractions of data to build computational models. Some key enabler deep learning algorithms such as generative adversarial networks, convolutional neural networks, and model transfers have completely changed our perception of information processing. However, there exists an aperture of understanding behind this tremendously fast-paced domain, because it was never previously represented from a multiscope perspective. The lack of core understanding renders these powerful methods as black-box machines that inhibit development at a fundamental level. Moreover, deep learning has repeatedly been perceived as a silver bullet to all stumbling blocks in machine learning, which is far from the truth. This article presents a comprehensive review of historical and recent state-of-the-art approaches in visual, audio, and text processing; social network analysis; and natural language processing, followed by the in-depth analysis on pivoting and groundbreaking advances in deep learning applications. It was also undertaken to review the issues faced in deep learning such as unsupervised learning, black-box models, and online learning and to illustrate how these challenges can be transformed into prolific future research avenues.},
  journal = {ACM Comput. Surv.},
  month = sep,
  articleno = {92},
  numpages = {36},
  keywords = {neural networks, machine learning, distributed processing, big data, Deep learning, survey}
}

@ARTICLE{lu2018dlbigdata,
  author={X. {Lu} and H. {Shi} and R. {Biswas} and M. H. {Javed} and D. K. {Panda}},
  journal={IEEE Transactions on Multi-Scale Computing Systems},
  title={DLoBD: A Comprehensive Study of Deep Learning over Big Data Stacks on HPC Clusters},
  year={2018},
  volume={4},
  number={4},
  pages={635-648},
  doi={10.1109/TMSCS.2018.2845886}
}

@ARTICLE{awan2020dlcommunication,
  author={A. A. {Awan} and A. {Jain} and C. {Chu} and H. {Subramoni} and D. K. {Panda}},
  journal={IEEE Micro},
  title={Communication Profiling and Characterization of Deep-Learning Workloads on Clusters With High-Performance Interconnects},
  year={2020},
  volume={40},
  number={1},
  pages={35-43},
  doi={10.1109/MM.2019.2949986}
}

@misc{tang2020communicationefficient,
  title={Communication-Efficient Distributed Deep Learning: A Comprehensive Survey},
  author={Zhenheng Tang and Shaohuai Shi and Xiaowen Chu and Wei Wang and Bo Li},
  year={2020},
  eprint={2003.06307},
  archivePrefix={arXiv},
  primaryClass={cs.DC}
}

@inproceedings{dryden2019channelparrallelism,
  author = {Dryden, Nikoli and Maruyama, Naoya and Moon, Tim and Benson, Tom and Snir, Marc and Van Essen, Brian},
  title = {Channel and Filter Parallelism for Large-Scale CNN Training},
  year = {2019},
  isbn = {9781450362290},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3295500.3356207},
  doi = {10.1145/3295500.3356207},
  abstract = {Accelerating large-scale CNN training is needed to keep training times reasonable as datasets grow larger and models become more complex. Existing frameworks primarily scale using data-parallelism, but this is limited by the mini-batch size, which cannot grow arbitrarily. We introduce three algorithms that partition channel or filter data to exploit parallelism beyond the sample dimension. Further, they partition the parameters of convolutional layers, replacing global all reduces with segmented allreduces---smaller, concurrent allreduces among disjoint processor sets. These algorithms enable strong scaling, reduced communication overhead, and reduced memory pressure, enabling training of very wide CNNs.We demonstrate improved strong and weak scaling, including up to 4.1x reductions in training time for residual networks and 4x reductions in allreduce overhead. We also show that wider models provide improved accuracy on ImageNet. We study the current limitations of our algorithms and provide a direction for future optimizations of large-scale deep learning frameworks.},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  articleno = {10},
  numpages = {20},
  keywords = {algorithms, scaling, deep learning, CNN, convolution},
  location = {Denver, Colorado},
  series = {SC '19}
}

@inproceedings{essen2015lbann,
  author    = {Brian Van Essen and
               Hyojin Kim and
               Roger A. Pearce and
               Kofi Boakye and
               Barry Chen},
  title     = {{LBANN:} livermore big artificial neural network {HPC} toolkit},
  booktitle = {Proceedings of the Workshop on Machine Learning in High-Performance
               Computing Environments, {MLHPC} 2015, Austin, Texas, USA, November
               15, 2015},
  pages     = {5:1--5:6},
  publisher = {{ACM}},
  year      = {2015},
  url       = {https://doi.org/10.1145/2834892.2834897},
  doi       = {10.1145/2834892.2834897},
  timestamp = {Tue, 06 Nov 2018 16:59:29 +0100},
  biburl    = {https://dblp.org/rec/conf/sc/EssenKPBC15.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@ARTICLE{sze2017efficientdnn,
  author={V. {Sze} and Y. {Chen} and T. {Yang} and J. S. {Emer}},
  journal={Proceedings of the IEEE},
  title={Efficient Processing of Deep Neural Networks: A Tutorial and Survey},
  year={2017},
  volume={105},
  number={12},
  pages={2295-2329},
  doi={10.1109/JPROC.2017.2761740}
}
@inproceedings{parunak1998agent,
  title={Agent-based modeling vs. equation-based modeling: A case study and users’ guide},
  author={Parunak, H Van Dyke and Savit, Robert and Riolo, Rick L},
  booktitle={International Workshop on Multi-Agent Systems and Agent-Based Simulation},
  pages={10--25},
  year={1998},
  organization={Springer}
}

@article{giordano2020modelling,
  title={Modelling the COVID-19 epidemic and implementation of population-wide interventions in Italy},
  author={Giordano, Giulia and Blanchini, Franco and Bruno, Raffaele and Colaneri, Patrizio and Di Filippo, Alessandro and Di Matteo, Angela and Colaneri, Marta},
  journal={Nature medicine},
  volume={26},
  number={6},
  pages={855--860},
  year={2020},
  publisher={Nature Publishing Group}
}

@article{tiwari2020modelling,
  title={Modelling and analysis of COVID-19 epidemic in India},
  author={Tiwari, Alok},
  journal={Journal of Safety Science and Resilience},
  volume={1},
  number={2},
  pages={135--140},
  year={2020},
  publisher={Elsevier}
}

@article{silva2020covid,
  title={COVID-ABS: An agent-based model of COVID-19 epidemic to simulate health and economic effects of social distancing interventions},
  author={Silva, Petr{\^o}nio CL and Batista, Paulo VC and Lima, H{\'e}lder S and Alves, Marcos A and Guimar{\~a}es, Frederico G and Silva, Rodrigo CP},
  journal={Chaos, Solitons \& Fractals},
  volume={139},
  pages={110088},
  year={2020},
  publisher={Elsevier}
}

@article{cuevas2020agent,
  title={An agent-based model to evaluate the COVID-19 transmission risks in facilities},
  author={Cuevas, Erik},
  journal={Computers in biology and medicine},
  volume={121},
  pages={103827},
  year={2020},
  publisher={Elsevier}
}

@article{chinazzi2020effect,
  title={The effect of travel restrictions on the spread of the 2019 novel coronavirus (COVID-19) outbreak},
  author={Chinazzi, Matteo and Davis, Jessica T and Ajelli, Marco and Gioannini, Corrado and Litvinova, Maria and Merler, Stefano and y Piontti, Ana Pastore and Mu, Kunpeng and Rossi, Luca and Sun, Kaiyuan and others},
  journal={Science},
  volume={368},
  number={6489},
  pages={395--400},
  year={2020},
  publisher={American Association for the Advancement of Science}
}

@article{prem2020effect,
  title={The effect of control strategies to reduce social mixing on outcomes of the COVID-19 epidemic in Wuhan, China: a modelling study},
  author={Prem, Kiesha and Liu, Yang and Russell, Timothy W and Kucharski, Adam J and Eggo, Rosalind M and Davies, Nicholas and Flasche, Stefan and Clifford, Samuel and Pearson, Carl AB and Munday, James D and others},
  journal={The Lancet Public Health},
  volume={5},
  number={5},
  pages={e261--e270},
  year={2020},
  publisher={Elsevier}
}

@article{anastassopoulou2020data,
  title={Data-based analysis, modelling and forecasting of the COVID-19 outbreak},
  author={Anastassopoulou, Cleo and Russo, Lucia and Tsakris, Athanasios and Siettos, Constantinos},
  journal={PloS one},
  volume={15},
  number={3},
  pages={e0230405},
  year={2020},
  publisher={Public Library of Science San Francisco, CA USA}
}


@Article{Naghshnejad2020,
  author={Naghshnejad, Mina
  and Singhal, Mukesh},
  title={A hybrid scheduling platform: a runtime prediction reliability aware scheduling platform to improve HPC scheduling performance},
  journal={The Journal of Supercomputing},
  year={2020},
  month={Jan},
  day={01},
  volume={76},
  number={1},
  pages={122-149},
  abstract={The performance of scheduling algorithms for HPC jobs highly depends on the accuracy of job runtime values. Prior research has established that neither user-provided runtimes nor system-generated runtime predictions are accurate. We propose a new scheduling platform that performs well in spite of runtime uncertainties. The key observation that we use for building our platform is the fact that two important classes of scheduling strategies (backfilling and plan based) differ in terms of sensitivity to runtime accuracy. We first confirm this observation by performing trace-based simulations to characterize the sensitivity of different scheduling strategies to job runtime accuracy. We then apply gradient boosting tree regression as a meta-learning approach to estimate the reliability of the system-generated job runtimes. The estimated prediction reliability of job runtimes is then used to choose a specific class of scheduling algorithm. Our hybrid scheduling platform uses a plan-based scheduling strategy for jobs with high expected runtime accuracy and backfills the remaining jobs on top of the planned jobs. While resource sharing is used to minimize fragmentation of resources, a specific ratio of CPU cores is reserved for backfilling of less predictable jobs to avoid starvation of these jobs. This ratio is adapted dynamically based on the resource requirement ratio of predictable jobs among recently submitted jobs. We perform extensive trace-driven simulations on real-world production traces to show that our hybrid scheduling platform outperforms both pure backfilling and pure plan-based scheduling algorithms.},
  issn={1573-0484},
  doi={10.1007/s11227-019-03004-3},
  url={https://doi.org/10.1007/s11227-019-03004-3}
}

@inproceedings{ml-scheduling-sc17,
  author = {Carastan-Santos, Danilo and de Camargo, Raphael Y.},
  title = {Obtaining Dynamic Scheduling Policies with Simulation and Machine Learning},
  year = {2017},
  isbn = {9781450351140},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3126908.3126955},
  doi = {10.1145/3126908.3126955},
  abstract = {Dynamic scheduling of tasks in large-scale HPC platforms is normally accomplished using ad-hoc heuristics, based on task characteristics, combined with some backfilling strategy. Defining heuristics that work efficiently in different scenarios is a difficult task, specially when considering the large variety of task types and platform architectures. In this work, we present a methodology based on simulation and machine learning to obtain dynamic scheduling policies. Using simulations and a workload generation model, we can determine the characteristics of tasks that lead to a reduction in the mean slowdown of tasks in an execution queue. Modeling these characteristics using a nonlinear function and applying this function to select the next task to execute in a queue improved the mean task slowdown in synthetic workloads. When applied to real workload traces from highly different machines, these functions still resulted in performance improvements, attesting the generalization capability of the obtained heuristics.},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  articleno = {32},
  numpages = {13},
  keywords = {high performance computing, scheduling, machine learning, simulation},
  location = {Denver, Colorado},
  series = {SC '17}
}

@INPROCEEDINGS{ml-backfilling-sc15,
  author={Gaussier, Eric and Glesser, David and Reis, Valentin and Trystram, Denis},
  booktitle={SC '15: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  title={Improving backfilling by using machine learning to predict running times},
  year={2015},
  volume={},
  number={},
  pages={1-10},
  doi={10.1145/2807591.2807646}
}

@inproceedings{rl-sched,
  author = {Zhang, Di and Dai, Dong and He, Youbiao and Bao, Forrest Sheng and Xie, Bing},
  title = {RLScheduler: An Automated HPC Batch Job Scheduler Using Reinforcement Learning},
  year = {2020},
  isbn = {9781728199986},
  publisher = {IEEE Press},
  abstract = {Today's high-performance computing (HPC) platforms are still dominated by batch jobs. Accordingly, effective batch job scheduling is crucial to obtain high system efficiency. Existing HPC batch job schedulers typically leverage heuristic priority functions to prioritize and schedule jobs. But, once configured and deployed by the experts, such priority functions can hardly adapt to the changes of job loads, optimization goals, or system settings, potentially leading to degraded system efficiency when changes occur. To address this fundamental issue, we present RLScheduler, an automated HPC batch job scheduler built on reinforcement learning. RLScheduler relies on minimal manual interventions or expert knowledge, but can learn high-quality scheduling policies via its own continuous 'trial and error'. We introduce a new kernel-based neural network structure and trajectory filtering mechanism in RLScheduler to improve and stabilize the learning process. Through extensive evaluations, we confirm that RLScheduler can learn high-quality scheduling policies towards various workloads and various optimization goals with relatively low computation cost. Moreover, we show that the learned models perform stably even when applied to unseen workloads, making them practical for production use.},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  articleno = {31},
  numpages = {15},
  location = {Atlanta, Georgia},
  series = {SC '20}
}

@misc{lsf_documentation,
  title={IBM Spectrum LSF Session Scheduler},
  url={https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=lsf-session-scheduler},
  journal={Platform LSF Session Scheduler},
  year={2021}
}

@misc{slurm_documentation,
  title={Slurm Workload Manager},
  url={https://slurm.schedmd.com/documentation.html},
  journal={Slurm Workload Manager - Documentation},
  year={2020}
}

@INPROCEEDINGS{flux,
  author={Ahn, Dong H. and Bass, Ned and Chu, Albert and Garlick, Jim and Grondona, Mark and Herbein, Stephen and Koning, Joseph and Patki, Tapasya and Scogland, Thomas R. W. and Springmeyer, Becky and Taufer, Michela},
  booktitle={2018 IEEE/ACM Workflows in Support of Large-Scale Science (WORKS)},
  title={Flux: Overcoming Scheduling Challenges for Exascale Workflows},
  year={2018},
  volume={},
  number={},
  pages={10-19},
  doi={10.1109/WORKS.2018.00007}
}

@inproceedings{tanash-pearc19,
  author = {Tanash, Mohammed and Dunn, Brandon and Andresen, Daniel and Hsu, William and Yang, Huichen and Okanlawon, Adedolapo},
  title = {Improving HPC System Performance by Predicting Job Resources via Supervised Machine Learning},
  year = {2019},
  isbn = {9781450372275},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3332186.3333041},
  doi = {10.1145/3332186.3333041},
  abstract = {High-Performance Computing (HPC) systems are resources utilized for data capture, sharing, and analysis. The majority of our HPC users come from other disciplines than Computer Science. HPC users including computer scientists have difficulties and do not feel proficient enough to decide the required amount of resources for their submitted jobs on the cluster. Consequently, users are encouraged to over-estimate resources for their submitted jobs, so their jobs will not be killing due insufficient resources. This process will waste and devour HPC resources; hence, this will lead to inefficient cluster utilization. We created a supervised machine learning model and integrated it into the Slurm resource manager simulator to predict the amount of required memory resources (Memory) and the required amount of time to run the computation. Our model involves using different machine learning algorithms. Our goal is to integrate and test the proposed supervised machine learning model on Slurm. We used over 10000 tasks selected from our HPC log files to evaluate the performance and the accuracy of our integrated model. The purpose of our work is to increase the performance of the Slurm by predicting the amount of require jobs memory resources and the time required for each particular job in order to improve the utilization of the HPC system using our integrated supervised machine learning model.Our results indicate that for larger jobs our model helps dramatically reduce computational turnaround time (from five days to ten hours for large jobs), substantially increased utilization of the HPC system, and decreased the average waiting time for the submitted jobs.},
  booktitle = {Proceedings of the Practice and Experience in Advanced Research Computing on Rise of the Machines (Learning)},
  articleno = {69},
  numpages = {8},
  keywords = {Performance, Slurm, HPC, Scheduling, User Modeling, Supervised Machine Learning},
  location = {Chicago, IL, USA},
  series = {PEARC '19}
}

@inproceedings{li-ross19,
  author = {Li, Boyang and Chunduri, Sudheer and Harms, Kevin and Fan, Yuping and Lan, Zhiling},
  title = {The Effect of System Utilization on Application Performance Variability},
  year = {2019},
  isbn = {9781450367554},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3322789.3328743},
  doi = {10.1145/3322789.3328743},
  abstract = {Application performance variability caused by network contention is a major issue on dragonfly based systems. This work-in-progress study makes two contributions. First, we analyze real workload logs and conduct application experiments on the production system Theta at Argonne to evaluate application performance variability. We find a strong correlation between system utilization and performance variability where a high system utilization (e.g., above 95%) can cause up to 21% degradation in application performance. Next, driven by this key finding, we investigate a scheduling policy to mitigate workload interference by leveraging the fact that production systems often exhibit diurnal utilization behavior and not all users are in a hurry for job completion. Preliminary results show that this scheduling design is capable of improving system productivity (measured by scheduling makespan) as well as improving user-level scheduling metrics such as user wait time and job slowdown.},
  booktitle = {Proceedings of the 9th International Workshop on Runtime and Operating Systems for Supercomputers},
  pages = {11–18},
  numpages = {8},
  keywords = {job scheduling, dragonfly network, system utilization, performance variability, application experiments},
  location = {Phoenix, AZ, USA},
  series = {ROSS '19}
}

@inproceedings{luo-ics2012,
  author = {Luo, Miao and Panda, Dhabaleswar K. and Ibrahim, Khaled Z. and Iancu, Costin},
  title = {Congestion Avoidance on Manycore High Performance Computing Systems},
  year = {2012},
  isbn = {9781450313162},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/2304576.2304594},
  doi = {10.1145/2304576.2304594},
  abstract = {Efficient communication is a requirement for application scalability on High Performance Computing systems. In this paper we argue for incorporating proactive congestion avoidance mechanisms into the design of communication layers on manycore systems. This is in contrast with the status quo which employs a reactive approach, emph{e.g.} congestion control mechanisms are activated only when resources have been exhausted. We present a core stateless optimization approach based on open loop end-point throttling, implemented for two UPC runtimes (Cray and Berkeley UPC) and validated on InfiniBand and the Cray Gemini networks. Microbenchmark results indicate that throttling the number of messages in flight per core can provide up to 4X performance improvements, while throttling the number of active cores per node can provide additional 40% and 6X performance improvement for UPC and MPI respectively. We evaluate inline (each task makes independent decisions) and proxy (server) congestion avoidance designs. Our runtime provides both performance and performance portability. We improve all-to-all collective performance by up to 4X and provide better performance than vendor provided MPI and UPC implementations. We also demonstrate performance improvements of up to 60% in application settings. Overall, our results indicate that modern systems accommodate only a surprisingly small number of messages in flight per node. As Exascale projections indicate that future systems are likely to contain hundreds to thousands of cores per node, we believe that their networks will be underprovisioned. In this situation, proactive congestion avoidance might become mandatory for performance improvement and portability.},
  booktitle = {Proceedings of the 26th ACM International Conference on Supercomputing},
  pages = {121–132},
  numpages = {12},
  keywords = {cray, high performance computing, infiniband, management, manycore, avoidance, multicore, congestion},
  location = {San Servolo Island, Venice, Italy},
  series = {ICS '12}
}

@article{patke-arxiv2020,
  author    = {Archit Patke and
               Saurabh Jha and
               Haoran Qiu and
               Jim M. Brandt and
               Ann C. Gentile and
               Joe Greenseid and
               Zbigniew Kalbarczyk and
               Ravishankar K. Iyer},
  title     = {Application-aware Congestion Mitigation forHigh-Performance Computing
               Systems},
  journal   = {CoRR},
  volume    = {abs/2012.07755},
  year      = {2020},
  url       = {https://arxiv.org/abs/2012.07755},
  archivePrefix = {arXiv},
  eprint    = {2012.07755},
  timestamp = {Sat, 02 Jan 2021 15:43:30 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2012-07755.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{easy-backfill-lifka,
  author = {Lifka, David A.},
  title = {The ANL/IBM SP Scheduling System},
  year = {1995},
  isbn = {3540601538},
  publisher = {Springer-Verlag},
  address = {Berlin, Heidelberg},
  booktitle = {Proceedings of the Workshop on Job Scheduling Strategies for Parallel Processing},
  pages = {295–303},
  numpages = {9},
  series = {IPPS '95}
}

@inproceedings{easy-backfill,
  author = {Skovira, Joseph and Chan, Waiman and Zhou, Honbo and Lifka, David A.},
  title = {The EASY - LoadLeveler API Project},
  year = {1996},
  isbn = {3540618643},
  publisher = {Springer-Verlag},
  address = {Berlin, Heidelberg},
  booktitle = {Proceedings of the Workshop on Job Scheduling Strategies for Parallel Processing},
  pages = {41–47},
  numpages = {7},
  series = {IPPS '96}
}

@software{kafka,
  author = {{Apache Software Foundation}},
  title = {Kafka},
  url = {https://kafka.apache.org/}
}

@software{spark,
  author = {{Apache Software Foundation}},
  title = {Spark},
  url = {https://spark.apache.org/}
}

@article{amg-proxy-app,
  title = {BoomerAMG: A parallel algebraic multigrid solver and preconditioner},
  journal = {Applied Numerical Mathematics},
  volume = {41},
  number = {1},
  pages = {155-177},
  year = {2002},
  note = {Developments and Trends in Iterative Methods for Large Systems of Equations - in memorium Rudiger Weiss},
  issn = {0168-9274},
  doi = {https://doi.org/10.1016/S0168-9274(01)00115-5},
  url = {https://www.sciencedirect.com/science/article/pii/S0168927401001155},
  author = {Van Emden Henson and Ulrike Meier Yang},
  keywords = {Algebraic multigrid, Parallel computing},
  abstract = {Driven by the need to solve linear systems arising from problems posed on extremely large, unstructured grids, there has been a recent resurgence of interest in algebraic multigrid (AMG). AMG is attractive in that it holds out the possibility of multigrid-like performance on unstructured grids. The sheer size of many modern physics and simulation problems has led to the development of massively parallel computers, and has sparked much research into developing algorithms for them. Parallelizing AMG is a difficult task, however. While much of the AMG method parallelizes readily, the process of coarse-grid selection, in particular, is fundamentally sequential in nature. We have previously introduced a parallel algorithm [A.J. Cleary, R.D. Falgout, V.E. Henson, J.E. Jones, in: Proceedings of the Fifth International Symposium on Solving Irregularly Structured Problems in Parallel, Springer, New York, 1998] for the selection of coarse-grid points, based on modifications of certain parallel independent set algorithms and the application of heuristics designed to insure the quality of the coarse grids, and shown results from a prototype serial version of the algorithm. In this paper we describe an implementation of a parallel AMG code, using the algorithm of A.J. Cleary, R.D. Falgout, V.E. Henson, J.E. Jones [in: Proceedings of the Fifth International Symposium on Solving Irregularly Structured Problems in Parallel, Springer, New York, 1998] as well as other approaches to parallelizing the coarse-grid selection. We consider three basic coarsening schemes and certain modifications to the basic schemes, designed to address specific performance issues. We present numerical results for a broad range of problem sizes and descriptions, and draw conclusions regarding the efficacy of the method. Finally, we indicate the current directions of the research.}
}

@article{laghos-proxy-app,
  author = {Dobrev, Veselin A. and Kolev, Tzanio V. and Rieben, Robert N.},
  title = {High-Order Curvilinear Finite Element Methods for Lagrangian Hydrodynamics},
  journal = {SIAM Journal on Scientific Computing},
  volume = {34},
  number = {5},
  pages = {B606-B641},
  year = {2012},
  doi = {10.1137/120864672},
  URL = {
          https://doi.org/10.1137/120864672
  },
  eprint = {
          https://doi.org/10.1137/120864672
  }
}

@article{aksar-sc19,
  title = {A Machine Learning Approach to Understanding HPC Application Performance Variation.},
  author = {Schwaller, Benjamin and Aksar, Burak and Aaziz, Omar Raad and Ates, Emre and Brandt, James M. and Coskun, Ayse and Egele, Manuel and Leung, Vitus J.},
  abstractNote = {Abstract not provided.},
  doi = {},
  url = {https://www.osti.gov/biblio/1642784}, journal = {},
  place = {United States},
  year = {2019},
  month = {10}
}

@misc{swfft,
  author = {Adrian Pope et al},
  title = {Swfft},
  year = {2017},
  publisher = {CELS ANL},
  journal = {Git repository},
  howpublished = {https://git.cels.anl.gov/hacc/SWFFT}
}

@misc{pennant,
  author = {Charles R. Ferenbaugh},
  title = {Pennant},
  year = {2016},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {https://github.com/lanl/PENNANT}
}

@misc{sw4lite,
  title = {sw4lite},
  year = {2017},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {https://github.com/geodynamics/sw4lite}
}

@misc{oms,
  title = {OpenSM Monitoring Service},
  year = {2015},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {https://github.com/meier/opensm-client-server}
}

@INPROCEEDINGS{caliper,
  author={Boehme, David and Gamblin, Todd and Beckingsale, David and Bremer, Peer-Timo and Gimenez, Alfredo and LeGendre, Matthew and Pearce, Olga and Schulz, Martin},
  booktitle={SC '16: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  title={Caliper: Performance Introspection for HPC Software Stacks},
  year={2016},
  volume={},
  number={},
  pages={550-560},
  doi={10.1109/SC.2016.46}
}

@inproceedings{klusavcek2018evaluating,
  title={Evaluating the impact of soft walltimes on job scheduling performance},
  author={Klus{\'a}{\v{c}}ek, Dalibor and Chlumsk{\`y}, V{\'a}clav},
  booktitle={Workshop on Job Scheduling Strategies for Parallel Processing},
  pages={15--38},
  year={2018}
}

@INPROCEEDINGS{aaziz-cluster2018,
  author={Aaziz, Omar and Cook, Jonathan and Tanash, Mohammed},
  booktitle={2018 IEEE International Conference on Cluster Computing (CLUSTER)},
  title={Modeling Expected Application Runtime for Characterizing and Assessing Job Performance},
  year={2018},
  volume={},
  number={},
  pages={543-551},
  doi={10.1109/CLUSTER.2018.00070}
}

@misc{gotcha,
  title         = "Gotcha Developer’s Guide",
  url           = {http://doi.acm.org/10.1145/1219092.1219093},
}

@misc{darshan,
  title         = "Darshan",
  url           = {https://www.mcs.anl.gov/research/projects/darshan},
}

@misc{got,
  author        = "Linux Foundation",
  title        = "Dynamic Linking",
  url           = {https://refspecs.linuxfoundation.org/ELF/zSeries/lzsabi0_zSeries/x2251.html},
}

@misc{unifyfs,
  title = {UnifyFS: A Distributed Burst Buffer File System - 0.1.0},
  author = {Moody, Adam and Sikich, Danielle and Bass, Ned and Brim, Michael J. and Stanavige, Cameron and Sim, Hyogi and Moore, Joseph and Hutter, Tony and Boehm, Swen and Mohror, Kathryn and Ivanov, Dmitry and Wang, Teng and Steffen, Craig P. and USDOE National Nuclear Security Administration},
  url = {https://www.osti.gov//servlets/purl/1408515},
  doi = {10.11578/dc.20200519.19},
  url = {https://www.osti.gov/biblio/1408515},
  year = {2017},
  month = {10},
}

@INPROCEEDINGS{recorder,
  author={Wang, Chen and Sun, Jinghan and Snir, Marc and Mohror, Kathryn and Gonsiorowski, Elsa},
  booktitle={2020 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)},
  title={Recorder 2.0: Efficient Parallel I/O Tracing and Analysis},
  year={2020},
  pages={1-8},
  doi={10.1109/IPDPSW50202.2020.00176}
}

@article{hpctoolkit,
  title={HPCToolkit: Tools for performance analysis of optimized parallel programs},
  author={Adhianto, Laksono and Banerjee, Sinchan and Fagan, Mike and Krentel, Mark and Marin, Gabriel and Mellor-Crummey, John and Tallent, Nathan R},
  journal={Concurrency and Computation: Practice and Experience},
  volume={22},
  number={6},
  pages={685--701},
  year={2010},
  publisher={Wiley Online Library}
}

@article{tallent2009binary,
  title={Binary analysis for measurement and attribution of program performance},
  author={Tallent, Nathan R and Mellor-Crummey, John M and Fagan, Michael W},
  journal={ACM Sigplan Notices},
  volume={44},
  number={6},
  pages={441--452},
  year={2009},
  publisher={ACM New York, NY, USA}
}

@article{browne2000portable,
  title={A portable programming interface for performance evaluation on modern processors},
  author={Browne, Shirley and Dongarra, Jack and Garner, Nathan and Ho, George and Mucci, Philip},
  journal={The international journal of high performance computing applications},
  volume={14},
  number={3},
  pages={189--204},
  year={2000},
  publisher={Sage Publications Sage CA: Thousand Oaks, CA}
}

@InProceedings{scorep,
  author="Kn{\"u}pfer, Andreas and R{\"o}ssel, Christian and Mey, Dieter an and Biersdorff, Scott and Diethelm, Kai and Eschweiler, Dominic and Geimer, Markus and Gerndt, Michael and Lorenz, Daniel and Malony, Allen and Nagel, Wolfgang E. and Oleynik, Yury and Philippen, Peter and Saviankou, Pavel and Schmidl, Dirk and Shende, Sameer and Tsch{\"u}ter, Ronny and Wagner, Michael and Wesarg, Bert and Wolf, Felix", editor="Brunst, Holger and M{\"u}ller, Matthias S. and Nagel, Wolfgang E. and Resch, Michael M.",
  title="Score-P: A Joint Performance Measurement Run-Time Infrastructure for Periscope,Scalasca, TAU, and Vampir",
  booktitle="Tools for High Performance Computing 2011",
  year="2012",
  publisher="Springer Berlin Heidelberg",
  address="Berlin, Heidelberg",
  pages="79--91",
  abstract="This paper gives an overview about the Score-P performance measurement infrastructure which is being jointly developed by leading HPC performance tools groups. It motivates the advantages of the joint undertaking from both the developer and the user perspectives, and presents the design and components of the newly developed Score-P performance measurement infrastructure. Furthermore, it contains first evaluation results in comparison with existing performance tools and presents an outlook to the long-term cooperative development of the new system.",
  isbn="978-3-642-31476-6"
}

@inproceedings{timemory,
  title={TiMemory: modular performance analysis for HPC},
  author={Madsen, Jonathan R and Awan, Muaaz G and Brunie, Hugo and Deslippe, Jack and Gayatri, Rahul and Oliker, Leonid and Wang, Yunsong and Yang, Charlene and Williams, Samuel},
  booktitle={International Conference on High Performance Computing},
  pages={434--452},
  year={2020},
  organization={Springer}
}

@inproceedings{leko2008practical,
  title={Practical experiences with modern parallel performance analysis tools: an evaluation},
  author={Leko, Adam and Sherburne, Hans and Su, H and Golden, Bryan and George, Alan D},
  booktitle={Parallel and Distributed Processing, IPDPS 2008 IEEE Symposium},
  pages={14--18},
  year={2008}
}

@techreport{LULESH2:changes,
  author = {Ian Karlin and Jeff Keasler and Rob Neely},
  title = {LULESH 2.0 Updates and Changes},number = {LLNL-TR-641973},
  location = {Livermore, CA},
  pages = {1-9},
  year = {2013},
  month = {August}
}

@techreport{nethercote2004dynamic,
  title={Dynamic binary analysis and instrumentation},
  author={Nethercote, Nicholas},
  year={2004},
  institution={University of Cambridge, Computer Laboratory}
}

@inproceedings{froyd2005low,
  title={Low-overhead call path profiling of unmodified, optimized code},
  author={Froyd, Nathan and Mellor-Crummey, John and Fowler, Rob},
  booktitle={Proceedings of the 19th annual international conference on Supercomputing},
  pages={81--90},
  year={2005}
}

@incollection{knupfer2008vampir,
  title={The vampir performance analysis tool-set},
  author={Kn{\"u}pfer, Andreas and Brunst, Holger and Doleschal, Jens and Jurenz, Matthias and Lieber, Matthias and Mickler, Holger and M{\"u}ller, Matthias S and Nagel, Wolfgang E},
  booktitle={Tools for high performance computing},
  pages={139--155},
  year={2008},
  publisher={Springer}
}

@inproceedings{pdt:lindlan2000tool,
  title={A tool framework for static and dynamic analysis of object-oriented software with templates},
  author={Lindlan, Kathleen A and Cuny, Janice and Malony, Allen D and Shende, Sameer and Mohr, Bernd and Rivenburgh, Reid and Rasmussen, Craig},
  booktitle={SC'00: Proceedings of the 2000 ACM/IEEE Conference on Supercomputing},
  pages={49--49},
  year={2000},
  organization={IEEE}
}

@article{liu2014tool,
  title={A tool to analyze the performance of multithreaded programs on NUMA architectures},
  author={Liu, Xu and Mellor-Crummey, John},
  journal={ACM Sigplan Notices},
  volume={49},
  number={8},
  pages={259--272},
  year={2014},
  publisher={ACM New York, NY, USA}
}

@inproceedings{malony2014general,
  title={General hybrid parallel profiling},
  author={Malony, Allen D and Huck, Kevin A},
  booktitle={2014 22nd Euromicro International Conference on Parallel, Distributed, and Network-Based Processing},
  pages={204--212},
  year={2014},
  organization={IEEE}
}

@article{shende2003integration,
  title={Integration and application of TAU in parallel Java environments},
  author={Shende, Sameer and Malony, Allen D},
  journal={Concurrency and Computation: Practice and Experience},
  volume={15},
  number={3-5},
  pages={501--519},
  year={2003},
  publisher={Wiley Online Library}
}

@inproceedings{nataraj2007tauoversupermon,
  title={Tauoversupermon: Low-overhead online parallel performance monitoring},
  author={Nataraj, Aroon and Sottile, Matthew and Morris, Alan and Malony, Allen D and Shende, Sameer},
  booktitle={European Conference on Parallel Processing},
  pages={85--96},
  year={2007},
  organization={Springer}
}

@article{allamanis2018survey,
  title={A survey of machine learning for big code and naturalness},
  author={Allamanis, Miltiadis and Barr, Earl T and Devanbu, Premkumar and Sutton, Charles},
  journal={ACM Computing Surveys (CSUR)},
  volume={51},
  number={4},
  pages={81},
  year={2018},
  publisher={ACM}
}

@inproceedings{Nitin2021DIRECTA,
  title={DIRECT : A Transformer-based Model for Decompiled Identifier Renaming},
  author={Vikram Nitin and Anthony Saieva and Baishakhi Ray and Gail E. Kaiser},
  booktitle={NLP4PROG},
  year={2021}
}

@misc{codex-copilot-all-author,
  Author = {Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
  Title = {Evaluating Large Language Models Trained on Code},
  Year = {2021},
  Eprint = {arXiv:2107.03374},
}

@misc{codex-copilot-short-author,
  Author = {Mark Chen and et al},
  Title = {Evaluating Large Language Models Trained on Code},
  Year = {2021},
  Eprint = {arXiv:2107.03374},
}


@InProceedings{pmlr-v139-cummins21a,
  title =    {ProGraML: A Graph-based Program Representation for Data Flow Analysis and Compiler Optimizations},
  author =       {Cummins, Chris and Fisches, Zacharias V. and Ben-Nun, Tal and Hoefler, Torsten and O'Boyle, Michael F P and Leather, Hugh},
  booktitle =    {Proceedings of the 38th International Conference on Machine Learning},
  pages =    {2244--2253},
  year =   {2021},
  editor =   {Meila, Marina and Zhang, Tong},
  volume =   {139},
  series =   {Proceedings of Machine Learning Research},
  month =    {18--24 Jul},
  publisher =    {PMLR},
  pdf =    {http://proceedings.mlr.press/v139/cummins21a/cummins21a.pdf},
  url =    {https://proceedings.mlr.press/v139/cummins21a.html},
}

@inproceedings{neural-bug-finding_gupta_NEURIPS2019,
  author = {Gupta, Rahul and Kanade, Aditya and Shevade, Shirish},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
  pages = {},
  publisher = {Curran Associates, Inc.},
  title = {Neural Attribution for Semantic Bug-Localization in Student Programs},
  volume = {32},
  year = {2019}
}

@Article{android-malware-detection-ml-survey-2021,
  AUTHOR = {Senanayake, Janaka and Kalutarage, Harsha and Al-Kadri, Mhd Omar},
  TITLE = {Android Mobile Malware Detection Using Machine Learning: A Systematic Review},
  JOURNAL = {Electronics},
  VOLUME = {10},
  YEAR = {2021},
  NUMBER = {13},
  ARTICLE-NUMBER = {1606},
  URL = {https://www.mdpi.com/2079-9292/10/13/1606},
  ISSN = {2079-9292},
  DOI = {10.3390/electronics10131606}
}

@InProceedings{huang2021cosqa,
  author = {Huang, Junjie and Tang, Duyu and Shou, Linjun and Gong (YIMING), Ming and Xu, Ke and Jiang, Daxin and Zhou, Ming and Duan, Nan},
  title = {CoSQA: 20,000+ Web Queries for Code Search and Question Answering},
  booktitle = {ACL-IJCNLP 2021},
  year = {2021},
  month = {May},
  url = {https://www.microsoft.com/en-us/research/publication/cosqa-20-000-web-queries-for-code-search-and-question-answering/},
}

@INPROCEEDINGS{code-queries-nlp-yan-icsaer-2020,  author={Yan, Shuhan and Yu, Hang and Chen, Yuting and Shen, Beijun and Jiang, Lingxiao},  booktitle={2020 IEEE 27th International Conference on Software Analysis, Evolution and Reengineering (SANER)},   title={Are the Code Snippets What We Are Searching for? A Benchmark and an Empirical Study on Code Search with Natural-Language Queries},   year={2020},  volume={},  number={},  pages={344-354},  doi={10.1109/SANER48275.2020.9054840}}

@article{Mir2021Type4PyDS,
  title={Type4Py: Deep Similarity Learning-Based Type Inference for Python},
  author={Amir M. Mir and Evaldas Latoskinas and Sebastian Proksch and Georgios Gousios},
  journal={ArXiv},
  year={2021},
  volume={abs/2101.04470}
}

@misc{ast-tree-lstm-tanzima-arxiv-2021,
  Author = {Nathan Pinnow and Tarek Ramadan and Tanzima Z. Islam and Chase Phelps and Jayaraman J. Thiagarajan},
  Title = {Comparative Code Structure Analysis using Deep Learning for Performance Prediction},
  Year = {2021},
  Eprint = {arXiv:2102.07660},
}

@article{Wu2021ProtoTransformerAM,
  title={ProtoTransformer: A Meta-Learning Approach to Providing Student Feedback},
  author={Mike Wu and Noah D. Goodman and Chris Piech and Chelsea Finn},
  journal={ArXiv},
  year={2021},
  volume={abs/2107.14035}
}

@article{Zhu2021ASE,
  title={A syntax-guided edit decoder for neural program repair},
  author={Qihao Zhu and Zeyu Sun and Yuan-an Xiao and Wenjie Zhang and Kang Yuan and Yingfei Xiong and Lu Zhang},
  journal={Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering},
  year={2021}
}

@article{Bhatia2018NeuroSymbolicPC,
  title={Neuro-Symbolic Program Corrector for Introductory Programming Assignments},
  author={Sahil Bhatia and Pushmeet Kohli and Rishabh Singh},
  journal={2018 IEEE/ACM 40th International Conference on Software Engineering (ICSE)},
  year={2018},
  pages={60-70}
}

@article{saviankou2015cube,
  title={Cube v4: From performance report explorer to performance analysis tool},
  author={Saviankou, Pavel and Knobloch, Michael and Visser, Anke and Mohr, Bernd},
  journal={Procedia Computer Science},
  volume={51},
  pages={1343--1352},
  year={2015},
  publisher={Elsevier}
}

@inproceedings{calotoiu2016fast,
  title={Fast multi-parameter performance modeling},
  author={Calotoiu, Alexandru and Beckinsale, David and Earl, Christopher W and Hoefler, Torsten and Karlin, Ian and Schulz, Martin and Wolf, Felix},
  booktitle={2016 IEEE International Conference on Cluster Computing (CLUSTER)},
  pages={172--181},
  year={2016},
  organization={IEEE}
}

@inproceedings{shudler2015exascaling,
  title={Exascaling your library: Will your implementation meet your expectations?},
  author={Shudler, Sergei and Calotoiu, Alexandru and Hoefler, Torsten and Strube, Alexandre and Wolf, Felix},
  booktitle={Proceedings of the 29th ACM on International Conference on Supercomputing},
  pages={165--175},
  year={2015}
}

@inproceedings{calotoiu2013using,
  title={Using automated performance modeling to find scalability bugs in complex codes},
  author={Calotoiu, Alexandru and Hoefler, Torsten and Poke, Marius and Wolf, Felix},
  booktitle={Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis},
  pages={1--12},
  year={2013}
}

@inproceedings{adhianto2010effectively,
  title={Effectively presenting call path profiles of application performance},
  author={Adhianto, Laksono and Mellor-Crummey, John and Tallent, Nathan R},
  booktitle={2010 39th International Conference on Parallel Processing Workshops},
  pages={179--188},
  year={2010},
  organization={IEEE}
}

@article{vetter2005mpip,
  title={mpip: Lightweight, scalable mpi profiling},
  author={Vetter, Jeffrey and Chambreau, Chris},
  year={2005}
}

@inproceedings{richards2017quicksilver,
  title={Quicksilver: a proxy app for the Monte Carlo transport code mercury},
  author={Richards, David F and Bleile, Ryan C and Brantley, Patrick S and Dawson, Shawn A and McKinley, Michael Scott and O’Brien, Matthew J},
  booktitle={2017 IEEE International Conference on Cluster Computing (CLUSTER)},
  pages={866--873},
  year={2017},
  organization={IEEE}
}

@article{knobloch2020tools,
  title={Tools for gpu computing--debugging and performance analysis of heterogenous hpc applications},
  author={Knobloch, Michael and Mohr, Bernd},
  journal={Supercomputing Frontiers and Innovations},
  volume={7},
  number={1},
  pages={91--111},
  year={2020}
}

@article{mohr2014scalable,
  title={Scalable parallel performance measurement and analysis tools-state-of-the-art and future challenges},
  author={Mohr, Bernd},
  journal={Supercomputing frontiers and innovations},
  volume={1},
  number={2},
  pages={108--123},
  year={2014}
}

@misc{alphacode-2022,
  doi = {10.48550/ARXIV.2203.07814},
  url = {https://arxiv.org/abs/2203.07814},
  author = {Li, Yujia and Choi, David and Chung, Junyoung and Kushman, Nate and Schrittwieser, Julian and Leblond, Rémi and Eccles, Tom and Keeling, James and Gimeno, Felix and Lago, Agustin Dal and Hubert, Thomas and Choy, Peter and d'Autume, Cyprien de Masson and Babuschkin, Igor and Chen, Xinyun and Huang, Po-Sen and Welbl, Johannes and Gowal, Sven and Cherepanov, Alexey and Molloy, James and Mankowitz, Daniel J. and Robson, Esme Sutherland and Kohli, Pushmeet and de Freitas, Nando and Kavukcuoglu, Koray and Vinyals, Oriol},
  keywords = {Programming Languages (cs.PL), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {Competition-Level Code Generation with AlphaCode},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution 4.0 International}
}

@inproceedings{neural-code-comprehension_ben-nun_2018,
  author = {Ben-Nun, Tal and Jakobovits, Alice Shoshana and Hoefler, Torsten},
  title = {Neural Code Comprehension: A Learnable Representation of Code Semantics},
  year = {2018},
  publisher = {Curran Associates Inc.},
  address = {Red Hook, NY, USA},
  booktitle = {Proceedings of the 32nd International Conference on Neural Information Processing Systems},
  pages = {3589–3601},
  numpages = {13},
  location = {Montr\'{e}al, Canada},
  series = {NIPS'18}
}

@article{ir2vec_2020,
  author = {VenkataKeerthy, S. and Aggarwal, Rohit and Jain, Shalini and Desarkar, Maunendra Sankar and Upadrasta, Ramakrishna and Srikant, Y. N.},
  title = {IR2V<span class="smallcaps SmallerCapital">EC</span>: LLVM IR Based Scalable Program Embeddings},
  year = {2020},
  issue_date = {December 2020},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  volume = {17},
  number = {4},
  issn = {1544-3566},
  url = {https://doi.org/10.1145/3418463},
  doi = {10.1145/3418463},
  journal = {ACM Trans. Archit. Code Optim.},
  month = {dec},
  articleno = {32},
  numpages = {27},
  keywords = {compiler optimizations, heterogeneous systems, LLVM, intermediate representations, representation learning}
}

@INPROCEEDINGS{bert_regression_testing_2010,
  author={Jin, Wei and Orso, Alessandro and Xie, Tao},
  booktitle={2010 Third International Conference on Software Testing, Verification and Validation},
  title={Automated Behavioral Regression Testing},
  year={2010},
  volume={},
  number={},
  pages={137-146},
  doi={10.1109/ICST.2010.64}
}

@inbook{simrt_regression_testing_2014,
  author = {Yu, Tingting and Srisa-an, Witawas and Rothermel, Gregg},
  title = {SimRT: An Automated Framework to Support Regression Testing for Data Races},
  year = {2014},
  isbn = {9781450327565},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/2568225.2568294},
  booktitle = {Proceedings of the 36th International Conference on Software Engineering},
  pages = {48–59},
  numpages = {12}
}

@misc{reframe,
  title = {ReFrame},
  year = {2022},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/eth-cscs/reframe}}
}

@inproceedings{gumtree_2014,
  author = {Falleri, Jean-R\'{e}my and Morandat, Flor\'{e}al and Blanc, Xavier and Martinez, Matias and Monperrus, Martin},
  title = {Fine-Grained and Accurate Source Code Differencing},
  year = {2014},
  isbn = {9781450330138},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/2642937.2642982},
  doi = {10.1145/2642937.2642982},
  booktitle = {Proceedings of the 29th ACM/IEEE International Conference on Automated Software Engineering},
  pages = {313–324},
  numpages = {12},
  keywords = {ast, software evolution, program comprehension, tree differencing},
  location = {Vasteras, Sweden},
  series = {ASE '14}
}

@misc{gumtree_impl,
  title = {GumTree},
  year = {2022},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/GumTreeDiff/gumtree}}
}

@article{survey-edit-distance,
  title = {A survey on tree edit distance and related problems},
  journal = {Theoretical Computer Science},
  volume = {337},
  number = {1},
  pages = {217-239},
  year = {2005},
  issn = {0304-3975},
  doi = {https://doi.org/10.1016/j.tcs.2004.12.030},
  url = {https://www.sciencedirect.com/science/article/pii/S0304397505000174},
  author = {Philip Bille},
  keywords = {Tree matching, Tree edit distance, Tree alignment, Tree inclusion},
}

@inproceedings{huck2010detailedload,
  title={Detailed load balance analysis of large scale parallel applications},
  author={Huck, Kevin A and Labarta, Jesus},
  booktitle={2010 39th International Conference on Parallel Processing},
  pages={535--544},
  year={2010},
  organization={IEEE}
}

@inproceedings{derose2007detectingload,
  title={Detecting application load imbalance on high end massively parallel systems},
  author={DeRose, Luiz and Homer, Bill and Johnson, Dean},
  booktitle={European Conference on Parallel Processing},
  pages={150--159},
  year={2007},
  organization={Springer}
}

@inproceedings{bohme2012scalable,
  title={Scalable critical-path based performance analysis},
  author={B{\"o}hme, David and Wolf, Felix and de Supinski, Bronis R and Schulz, Martin and Geimer, Markus},
  booktitle={2012 IEEE 26th International Parallel and Distributed Processing Symposium},
  pages={1330--1340},
  year={2012},
  organization={IEEE}
}

@article{wolf2007automatic,
  title={Automatic analysis of inefficiency patterns in parallel applications},
  author={Wolf, Felix and Mohr, Bernd and Dongarra, Jack and Moore, Shirley},
  journal={Concurrency and Computation: Practice and Experience},
  volume={19},
  number={11},
  pages={1481--1496},
  year={2007},
  publisher={Wiley Online Library}
}

@inproceedings{tallent2009diagnosing,
  title={Diagnosing performance bottlenecks in emerging petascale applications},
  author={Tallent, Nathan R and Mellor-Crummey, John M and Adhianto, Laksono and Fagan, Michael W and Krentel, Mark},
  booktitle={Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis},
  pages={1--11},
  year={2009}
}

@inproceedings{benedict2009automatic,
  title={Automatic performance analysis of large scale simulations},
  author={Benedict, Shajulin and Brehm, Matthias and Gerndt, Michael and Guillen, Carla and Hesse, Wolfram and Petkov, Ventsislav},
  booktitle={European Conference on Parallel Processing},
  pages={199--207},
  year={2009},
  organization={Springer}
}

@inproceedings{coarfa2007scalability,
  title={Scalability analysis of SPMD codes using expectations},
  author={Coarfa, Cristian and Mellor-Crummey, John and Froyd, Nathan and Dotsenko, Yuri},
  booktitle={Proceedings of the 21st annual international conference on Supercomputing},
  pages={13--22},
  year={2007}
}

@article{miller1995paradyn,
  title={The Paradyn parallel performance measurement tool},
  author={Miller, Barton P and Callaghan, Mark D. and Cargille, Jonathan M and Hollingsworth, Jeffrey K and Irvin, R Bruce and Karavanic, Karen L and Kunchithapadam, Krishna and Newhall, Tia},
  journal={Computer},
  volume={28},
  number={11},
  pages={37--46},
  year={1995},
  publisher={IEEE}
}

@article{geimer2010scalasca,
  title={The Scalasca performance toolset architecture},
  author={Geimer, Markus and Wolf, Felix and Wylie, Brian JN and {\'A}brah{\'a}m, Erika and Becker, Daniel and Mohr, Bernd},
  journal={Concurrency and computation: Practice and experience},
  volume={22},
  number={6},
  pages={702--719},
  year={2010},
  publisher={Wiley Online Library}
}

@inproceedings{liu2015scaanalyzer,
  title={Scaanalyzer: A tool to identify memory scalability bottlenecks in parallel programs},
  author={Liu, Xu and Wu, Bo},
  booktitle={SC'15: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages={1--12},
  year={2015},
  organization={IEEE}
}

@inproceedings{zhao2020drcctprof,
  title={DrCCTProf: A fine-grained call path profiler for arm-based clusters},
  author={Zhao, Qidong and Liu, Xu and Chabbi, Milind},
  booktitle={SC20: International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages={1--16},
  year={2020},
  organization={IEEE}
}

@inproceedings{xapp_2015,
  author = {Ardalani, Newsha and Lestourgeon, Clint and Sankaralingam, Karthikeyan and Zhu, Xiaojin},
  title = {Cross-Architecture Performance Prediction (XAPP) Using CPU Code to Predict GPU Performance},
  year = {2015},
  isbn = {9781450340342},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/2830772.2830780},
  doi = {10.1145/2830772.2830780},
  abstract = {GPUs have become prevalent and more general purpose, but GPU programming remains challenging and time consuming for the majority of programmers. In addition, it is not always clear which codes will benefit from getting ported to GPU. Therefore, having a tool to estimate GPU performance for a piece of code before writing a GPU implementation is highly desirable. To this end, we propose Cross-Architecture Performance Prediction (XAPP), a machine-learning based technique that uses only single-threaded CPU implementation to predict GPU performance.Our paper is built on the two following insights: i) Execution time on GPU is a function of program properties and hardware characteristics. ii) By examining a vast array of previously implemented GPU codes along with their CPU counterparts, we can use established machine learning techniques to learn this correlation between program properties, hardware characteristics and GPU execution time. We use an adaptive two-level machine learning solution. Our results show that our tool is robust and accurate: we achieve 26.9% average error on a set of 24 real-world kernels. We also discuss practical usage scenarios for XAPP.},
  booktitle = {Proceedings of the 48th International Symposium on Microarchitecture},
  pages = {725–737},
  numpages = {13},
  keywords = {cross-platform prediction, GPU, machine learning, performance modeling},
  location = {Waikiki, Hawaii},
  series = {MICRO-48}
}

@InProceedings{papi,
  author="Terpstra, Dan
  and Jagode, Heike
  and You, Haihang
  and Dongarra, Jack",
  editor="M{\"u}ller, Matthias S.
  and Resch, Michael M.
  and Schulz, Alexander
  and Nagel, Wolfgang E.",
  title="Collecting Performance Data with PAPI-C",
  booktitle="Tools for High Performance Computing 2009",
  year="2010",
  publisher="Springer Berlin Heidelberg",
  address="Berlin, Heidelberg",
  pages="157--173",
  isbn="978-3-642-11261-4"
}

@InProceedings{pmlr-vae-calibrate,
  title =    {Simple and Effective VAE Training with Calibrated Decoders},
  author =       {Rybkin, Oleh and Daniilidis, Kostas and Levine, Sergey},
  booktitle =    {Proceedings of the 38th International Conference on Machine Learning},
  pages =    {9179--9189},
  year =   {2021},
  editor =   {Meila, Marina and Zhang, Tong},
  volume =   {139},
  series =   {Proceedings of Machine Learning Research},
  month =    {18--24 Jul},
  publisher =    {PMLR},
  pdf =    {http://proceedings.mlr.press/v139/rybkin21a/rybkin21a.pdf},
  url =    {https://proceedings.mlr.press/v139/rybkin21a.html},
}

@inproceedings{extrap,
  author={Calotoiu, Alexandru and Hoefler, Torsten and Poke, Marius and Wolf, Felix},
  month={November},
  title={Using Automated Performance Modeling to Find Scalability Bugs in Complex Codes},
  booktitle={Proc. of the ACM/IEEE Conference on Supercomputing (SC13), Denver, CO, USA},
  year={2013},
  pages={1--12},
  publisher={ACM},
  isbn={978-1-4503-2378-9},
  doi={10.1145/2503210.2503277}
}

@inproceedings{allamanis_codeduplication,
  author = {Allamanis, Miltiadis},
  title = {The Adverse Effects of Code Duplication in Machine Learning Models of Code},
  year = {2019},
  isbn = {9781450369954},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3359591.3359735},
  doi = {10.1145/3359591.3359735},
  abstract = {The field of big code relies on mining large corpora of code to perform some learning task towards creating better tools for software engineers. A significant threat to this approach was recently identified by Lopes et al. (2017) who found a large amount of near-duplicate code on GitHub. However, the impact of code duplication has not been noticed by researchers devising machine learning models for source code. In this work, we explore the effects of code duplication on machine learning models showing that reported performance metrics are sometimes inflated by up to 100% when testing on duplicated code corpora compared to the performance on de-duplicated corpora which more accurately represent how machine learning models of code are used by software engineers. We present a duplication index for widely used datasets, list best practices for collecting code corpora and evaluating machine learning models on them. Finally, we release tools to help the community avoid this problem in future research.},
  booktitle = {Proceedings of the 2019 ACM SIGPLAN International Symposium on New Ideas, New Paradigms, and Reflections on Programming and Software},
  pages = {143–153},
  numpages = {11},
  keywords = {code naturalness, big code, dataset collection, duplication, machine learning},
  location = {Athens, Greece},
  series = {Onward! 2019}
}

@dataset{xu_2022_code-llms-survey-dataset,
  author       = {Frank F. Xu and
                  Uri Alon and
                  Graham Neubig and
                  Vincent J. Hellendoorn},
  title        = {{A Systematic Evaluation of Large Language Models
                   of Code}},
  month        = feb,
  year         = 2022,
  note         = {https://arxiv.org/abs/2202.13169},
  publisher    = {Zenodo},
  doi          = {10.5281/zenodo.6363556},
  url          = {https://doi.org/10.5281/zenodo.6363556}
}

@InProceedings{namdPerfFGCS,
  author = "Laxmikant V. Kale and Gengbin Zheng and Chee Wai Lee and Sameer Kumar",
  title = "Scaling Applications to Massively Parallel Machines Using Projections Performance Analysis Tool",
  booktitle = "Future Generation Computer Systems Special Issue on: Large-Scale System Performance Modeling and Analysis",
  volume = 22,
  number = 3,
  pages = {347-358},
  month = "February",
  year =   2006
}

@InProceedings{vampir,
  author="Kn{\"u}pfer, Andreas and Brunst, Holger and Doleschal, Jens and Jurenz, Matthias and Lieber, Matthias and Mickler, Holger and M{\"u}ller, Matthias S. and Nagel, Wolfgang E.",
  editor="Resch, Michael and Keller, Rainer and Himmler, Valentin and Krammer, Bettina and Schulz, Alexander",
  title="The Vampir Performance Analysis Tool-Set",
  booktitle="Tools for High Performance Computing",
  year="2008",
  publisher="Springer Berlin Heidelberg",
  address="Berlin, Heidelberg",
  pages="139--155",
  isbn="978-3-540-68564-7"
}
@article{bissett_agent-based_2021,
	title = {Agent-{Based} {Computational} {Epidemiological} {Modeling}},
	volume = {101},
	issn = {0019-4964},
	url = {https://doi.org/10.1007/s41745-021-00260-2},
	doi = {10.1007/s41745-021-00260-2},
	abstract = {The study of epidemics is useful for not only understanding outbreaks and trying to limit their adverse effects, but also because epidemics are related to social phenomena such as government instability, crime, poverty, and inequality. One approach for studying epidemics is to simulate their spread through populations. In this work, we describe an integrated multi-dimensional approach to epidemic simulation, which encompasses: (1) a theoretical framework for simulation and analysis; (2) synthetic population (digital twin) generation; (3) (social contact) network construction methods from synthetic populations, (4) stylized network construction methods; and (5) simulation of the evolution of a virus or disease through a social network. We describe these aspects and end with a short discussion on simulation results that inform public policy.},
	language = {en},
	number = {3},
	urldate = {2023-03-07},
	journal = {Journal of the Indian Institute of Science},
	author = {Bissett, Keith R. and Cadena, Jose and Khan, Maleq and Kuhlman, Chris J.},
	month = jul,
	year = {2021},
	keywords = {Agent-based simulation, Computational epidemiology, Data-driven social network generation, Discrete dynamical systems, High performance computing, Large-scale stylized network construction, Synthetic populations},
	pages = {303--327},
	file = {Full Text PDF:C\:\\Users\\bitqu\\Zotero\\storage\\GCC2M9UU\\Bissett et al. - 2021 - Agent-Based Computational Epidemiological Modeling.pdf:application/pdf},
}

@article{gillespie1977exact,
  title={Exact stochastic simulation of coupled chemical reactions},
  author={Gillespie, Daniel T},
  journal={The journal of physical chemistry},
  volume={81},
  number={25},
  pages={2340--2361},
  year={1977},
  publisher={ACS Publications}
}

@article{chen2021prioritizing,
  title={Prioritizing allocation of COVID-19 vaccines based on social contacts increases vaccination effectiveness},
  author={Chen, Jiangzhuo and Hoops, Stefan and Marathe, Achla and Mortveit, Henning and Lewis, Bryan and Venkatramanan, Srinivasan and Haddadan, Arash and Bhattacharya, Parantapa and Adiga, Abhijin and Vullikanti, Anil and others},
  journal={MedRxiv},
  pages={2021--02},
  year={2021},
  publisher={Cold Spring Harbor Laboratory Press}
}


@article{jiang_method_2022,
	title = {A method to create a synthetic population with social networks for geographically-explicit agent-based models},
	volume = {2},
	issn = {2730-6852},
	url = {https://doi.org/10.1007/s43762-022-00034-1},
	doi = {10.1007/s43762-022-00034-1},
	abstract = {Geographically-explicit simulations have become crucial in understanding cities and are playing an important role in Urban Science. One such approach is that of agent-based modeling which allows us to explore how agents interact with the environment and each other (e.g., social networks), and how through such interactions aggregate patterns emerge (e.g., disease outbreaks, traffic jams). While the use of agent-based modeling has grown, one challenge remains, that of creating realistic, geographically-explicit, synthetic populations which incorporate social networks. To address this challenge, this paper presents a novel method to create a synthetic population which incorporates social networks using the New York Metro Area as a test area. To demonstrate the generalizability of our synthetic population method and data to initialize models, three different types of agent-based models are introduced to explore a variety of urban problems: traffic, disaster response, and the spread of disease. These use cases not only demonstrate how our geographically-explicit synthetic population can be easily utilized for initializing agent populations which can explore a variety of urban problems, but also show how social networks can be integrated into such populations and large-scale simulations.},
	language = {en},
	number = {1},
	urldate = {2023-03-21},
	journal = {Computational Urban Science},
	author = {Jiang, Na and Crooks, Andrew T. and Kavak, Hamdi and Burger, Annetta and Kennedy, William G.},
	month = feb,
	year = {2022},
	keywords = {Agent-based modeling, Disaster, Disease, New York, Synthetic population generation, Traffic dynamics},
	pages = {7},
	file = {Full Text PDF:C\:\\Users\\bitqu\\Zotero\\storage\\7ATREXJH\\Jiang et al. - 2022 - A method to create a synthetic population with soc.pdf:application/pdf},
}

@inproceedings{barrett_generation_2009,
	title = {Generation and analysis of large synthetic social contact networks},
	doi = {10.1109/WSC.2009.5429425},
	abstract = {We describe ¿first principles¿ based methods for developing synthetic urban and national scale social contact networks. Unlike simple random graph techniques, these methods use real world data sources and combine them with behavioral and social theories to synthesize networks. We develop a synthetic population for the United States modeling every individual in the population including household structure, demographics and a 24-hour activity sequence. The process involves collecting and manipulating public and proprietary data sets integrated into a common architecture for data exchange and then using these data sets to generate new relations. A social contact network is derived from the synthetic population based on physical co-location of interacting persons. We use graph measures to compare and contrast the structural characteristics of the social networks that span different urban regions. We then simulate diffusion processes on these networks and analyze similarities and differences in the structure of the networks.},
	booktitle = {Proceedings of the 2009 {Winter} {Simulation} {Conference} ({WSC})},
	author = {Barrett, Christopher L. and Beckman, Richard J. and Khan, Maleq and Kumar, V. S. Anil and Marathe, Madhav V. and Stretz, Paula E. and Dutta, Tridib and Lewis, Bryan},
	month = dec,
	year = {2009},
	note = {ISSN: 1558-4305},
	keywords = {Analytical models, Laboratories, Architecture, Demography, Diffusion processes, Diseases, Explosions, Network synthesis, Social network services, Turning},
	pages = {1003--1014},
	file = {IEEE Xplore Abstract Record:C\:\\Users\\bitqu\\Zotero\\storage\\Y2MMDX53\\5429425.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\bitqu\\Zotero\\storage\\A46J6GYP\\Barrett et al. - 2009 - Generation and analysis of large synthetic social .pdf:application/pdf},
}

@incollection{eschweiler2012open,
  title={Open trace format 2: The next generation of scalable trace formats and support libraries},
  author={Eschweiler, Dominic and Wagner, Michael and Geimer, Markus and Kn{\"u}pfer, Andreas and Nagel, Wolfgang E and Wolf, Felix},
  booktitle={Applications, Tools and Techniques on the Road to Exascale Computing},
  pages={481--490},
  year={2012},
  publisher={IOS Press}
}

@article{kokkos:tpds2022,
  author={Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah},
  journal={IEEE Transactions on Parallel and Distributed Systems},
  title={Kokkos 3: Programming Model Extensions for the Exascale Era},
  year={2022},
  volume={33},
  number={4},
  pages={805-817},
  doi={10.1109/TPDS.2021.3097283}}

@inproceedings{ahamoles,
  author    = {Ahn, Dong H. and Zhang, Xiaohua and Mast, Jeffrey and Herbein, Stephen and Di Natale, Francesco and Kirshner, Dan and Jacobs, Sam Ade and Karlin, Ian and Milroy, Daniel J. and De Supinski, Bronis and Van Essen, Brian and Allen, Jonathan and Lightstone, Felice C.},
  booktitle = {2022 IEEE 18th International Conference on e-Science (e-Science)},
  title     = {Scalable Composition and Analysis Techniques for Massive Scientific Workflows},
  year      = {2022},
  volume    = {},
  number    = {},
  pages     = {32-43},
  doi       = {10.1109/eScience55777.2022.00018}
}

@inproceedings{mummi,
  author    = {Ingólfsson, Helgi I et al.},
  booktitle = {Proceedings of the National Academy of Sciences of the United States of America},
  title     = {Machine learning-driven multiscale modeling reveals lipid-dependent dynamics of RAS signaling proteins.},
  year      = {2022},
  volume    = {119,1},
  doi       = {10.1073/pnas.2113297119}
}

@inproceedings{mummi_sc21,
  author = {Bhatia, Harsh and Di Natale, Francesco and Moon, Joseph Y. and Zhang, Xiaohua and Chavez, Joseph R. and Aydin, Fikret and Stanley, Chris and Oppelstrup, Tomas and Neale, Chris and Schumacher, Sara Kokkila and Ahn, Dong H. and Herbein, Stephen and Carpenter, Timothy S. and Gnanakaran, Sandrasegaram and Bremer, Peer-Timo and Glosli, James N. and Lightstone, Felice C. and Ing\'{o}lfsson, Helgi I.},
  title = {Generalizable Coordination of Large Multiscale Workflows: Challenges and Learnings at Scale},
  year = {2021},
  isbn = {9781450384421},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3458817.3476210},
  doi = {10.1145/3458817.3476210},
  abstract = {The advancement of machine learning techniques and the heterogeneous architectures of most current supercomputers are propelling the demand for large multiscale simulations that can automatically and autonomously couple diverse components and map them to relevant resources to solve complex problems at multiple scales. Nevertheless, despite the recent progress in workflow technologies, current capabilities are limited to coupling two scales. In the first-ever demonstration of using three scales of resolution, we present a scalable and generalizable framework that couples pairs of models using machine learning and in situ feedback. We expand upon the massively parallel Multiscale Machine-Learned Modeling Infrastructure (MuMMI), a recent, award-winning workflow, and generalize the framework beyond its original design. We discuss the challenges and learnings in executing a massive multiscale simulation campaign that utilized over 600,000 node hours on Summit and achieved more than 98% GPU occupancy for more than 83% of the time. We present innovations to enable several orders of magnitude scaling, including simultaneously coordinating 24,000 jobs, and managing several TBs of new data per day and over a billion files in total. Finally, we describe the generalizability of our framework and, with an upcoming open-source release, discuss how the presented framework may be used for new applications.},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  articleno = {10},
  numpages = {16},
  keywords = {heterogenous architecture, machine learning, adaptive simulations, massively parallel, cancer research, multiscale simulations},
  location = {St. Louis, Missouri},
  series = {SC '21}
}

@article{perumalla-simulation12,
    title = {Discrete event modeling and massively parallel execution of epidemic outbreak phenomena},
    author = {Kalyan S Perumalla and Sudip K Seal},
    year = 2012,
    journal = {SIMULATION},
    volume = 88,
    number = 7,
    pages = {768--783},
    doi = {10.1177/0037549711413001},
    url = {https://doi.org/10.1177/0037549711413001},
    eprint = {https://doi.org/10.1177/0037549711413001}
}

@article{Bershteyn:18,
    author = {Bershteyn, Anna and Gerardin, Jaline and Bridenbecker, Daniel and Lorton, Christopher W and Bloedow, Jonathan and Baker, Robert S and Chabot-Couture, Guillaume and Chen, Ye and Fischle, Thomas and Frey, Kurt and Gauld, Jillian S and Hu, Hao and Izzo, Amanda S and Klein, Daniel J and Lukacevic, Dejan and McCarthy, Kevin A and Miller, Joel C and Ouedraogo, Andre Lin and Perkins, T Alex and Steinkraus, Jeffrey and ten Bosch, Quirine A and Ting, Hung-Fu and Titova, Svetlana and Wagner, Bradley G and Welkhoff, Philip A and Wenger, Edward A and Wiswell, Christian N and for the Institute for Disease Modeling},
    title = {Implementation and applications of {EMOD}, an individual-based multi-disease modeling platform},
    journal = {Pathogens and Disease},
    volume = {76},
    number = {5},
    year = {2018},
    month = {07},
    issn = {2049-632X},
    doi = {10.1093/femspd/fty059},
    url = {https://doi.org/10.1093/femspd/fty059},
    note = {fty059},
    eprint = {https://academic.oup.com/femspd/article-pdf/76/5/fty059/25336697/fty059.pdf},
}

@inproceedings{bhattacharya2021ai,
  title={AI-Driven Agent-Based Models to Study the Role of Vaccine Acceptance in Controlling COVID-19 Spread in the US},
  author={Bhattacharya, Parantapa and Machi, Dustin and Chen, Jiangzhuo and Hoops, Stefan and Lewis, Bryan and Mortveit, Henning and Venkatramanan, Srinivasan and Wilson, Mandy L and Marathe, Achla and Porebski, Przemyslaw and others},
  booktitle={2021 IEEE International Conference on Big Data (Big Data)},
  pages={1566--1574},
  year={2021},
  organization={IEEE}
}

@article{bhattacharya2022data,
  title={Data-driven scalable pipeline using national agent-based models for real-time pandemic response and decision support},
  author={Bhattacharya, Parantapa and Chen, Jiangzhuo and Hoops, Stefan and Machi, Dustin and Lewis, Bryan and Venkatramanan, Srinivasan and Wilson, Mandy L and Klahn, Brian and Adiga, Aniruddha and Hurt, Benjamin and others},
  journal={The International Journal of High Performance Computing Applications},
  pages={10943420221127034},
  year={2022},
  publisher={SAGE Publications Sage UK: London, England}
}

@article {Cramer2021-hub-dataset,
	author = {Cramer, Estee Y and Huang, Yuxin and Wang, Yijin and Ray, Evan L and Cornell, Matthew and Bracher, Johannes and others and US COVID-19 Forecast Hub Consortium},
	title = {The United States COVID-19 Forecast Hub dataset},
	year = {2021},
	doi = {10.1101/2021.11.04.21265886},
	URL = {https://www.medrxiv.org/content/10.1101/2021.11.04.21265886v1},
	journal = {medRxiv}
}

@article{reich2019collaborative,
  title={A collaborative multiyear, multimodel assessment of seasonal influenza forecasting in the United States},
  author={Reich, Nicholas G and Brooks, Logan C and Fox, Spencer J and Kandula, Sasikiran and McGowan, Craig J and Moore, Evan and Osthus, Dave and Ray, Evan L and Tushar, Abhinav and Yamana, Teresa K and others},
  journal={Proceedings of the National Academy of Sciences},
  volume={116},
  number={8},
  pages={3146--3154},
  year={2019},
  publisher={National Acad Sciences}
}

@misc{cdc-covid19-smh,
  author = {{MIDAS Network}},
  title = {{COVID-19 Scenario Modeling Hub}},
  howpublished = {\url{https://covid19scenariomodelinghub.org}, last accessed Apr 6th, 2023}
}

@article{chen-2021-prioritizing,
 author = {Chen, Jiangzhuo and Hoops, Stefan and Marathe, Achla and Mortveit, Henning and Lewis, Bryan and Venkatramanan, Srinivasan and Haddadan, Arash and Bhattacharya, Parantapa and Adiga, Abhijin and Vullikanti, Anil and Srinivasan, Aravind and Wilson, Mandy L and Ehrlich, Gal and Fenster, Maier and Eubank, Stephen and Barrett, Christopher and Marathe, Madhav},
 title = {Prioritizing allocation of COVID-19 vaccines based on social contacts increases vaccination effectiveness},
 year = {2021},
 doi = {10.1101/2021.02.04.21251012},
 journal = {medRxiv preprint:2021.02.04.21251012}
}

@article{Gillespie_1976,
  author={Gillespie, Daniel T},
  title={A general method for numerically simulating the stochastic time evolution of coupled chemical reactions},
  volume={22},
  DOI={10.1016/0021-9991(76)90041-3},
  number={4},
  journal={Journal of Computational Physics},
  year={1976},
  month={Dec},
  pages={403–434}
}

@misc{code2vec,
  doi = {10.48550/ARXIV.1803.09473},
  url = {https://arxiv.org/abs/1803.09473},
  author = {Alon, Uri and Zilberstein, Meital and Levy, Omer and Yahav, Eran},
  keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Programming Languages (cs.PL), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {code2vec: Learning Distributed Representations of Code},
  publisher = {arXiv},
  year = {2018},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@misc{ml_for_code,
  title= {ML4Code},
  howpublished = {\url{https://ml4code.github.io/}},
  note = {Accessed: 2022}
}

@article{Gu2022AssembleFM,
  title={Assemble Foundation Models for Automatic Code Summarization},
  author={Jian Gu and Pasquale Salza and Harald C. Gall},
  journal={2022 IEEE International Conference on Software Analysis, Evolution and Reengineering (SANER)},
  year={2022},
  pages={935-946}
}

@article{Ahmed2022LearningCS,
  title={Learning code summarization from a small and local dataset},
  author={Toufique Ahmed and Prem Devanbu},
  journal={ArXiv},
  year={2022},
  volume={abs/2206.00804}
}

@article{Haque2022SemanticSM,
  title={Semantic Similarity Metrics for Evaluating Source Code Summarization},
  author={Sakib Haque and Zachary Eberhart and Aakash Bansal and Collin McMillan},
  journal={2022 IEEE/ACM 30th International Conference on Program Comprehension (ICPC)},
  year={2022},
  pages={36-47}
}

@article{Ahmad2020ATA,
  title={A Transformer-based Approach for Source Code Summarization},
  author={Wasi Uddin Ahmad and Saikat Chakraborty and Baishakhi Ray and Kai-Wei Chang},
  journal={ArXiv},
  year={2020},
  volume={abs/2005.00653}
}

@article{Richter2022CanWL,
  title={Can we learn from developer mistakes? Learning to localize and repair real bugs from real bug fixes},
  author={Cedric Richter and Heike Wehrheim},
  journal={ArXiv},
  year={2022},
  volume={abs/2207.00301}
}

@article{Kharkar2022LearningTR,
  title={Learning to Reduce False Positives in Analytic Bug Detectors},
  author={Anant Kharkar and Roshanak Zilouchian Moghaddam and Matthew Jin and Xiaoyu Liu and Xin Shi and Colin B. Clement and Neel Sundaresan},
  journal={2022 IEEE/ACM 44th International Conference on Software Engineering (ICSE)},
  year={2022},
  pages={1307-1316}
}

@article{Garg2022DeepDevPERFAD,
  title={DeepDev-PERF: a deep learning-based approach for improving software performance},
  author={Spandan Garg and Roshanak Zilouchian Moghaddam and Colin B. Clement and Neel Sundaresan and Chen Wu},
  journal={Proceedings of the 30th ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering},
  year={2022}
}

@misc{bart,
  doi = {10.48550/ARXIV.1910.13461},
  url = {https://arxiv.org/abs/1910.13461},
  author = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Ves and Zettlemoyer, Luke},
  keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
  publisher = {arXiv},
  year = {2019},
  copyright = {arXiv.org perpetual, non-exclusive license}
}

@article{Dderlein2022PilotingCA,
  title={Piloting Copilot and Codex: Hot Temperature, Cold Prompts, or Black Magic?},
  author={Jean-Baptiste D{\"o}derlein and Mathieu Acher and Djamel Eddine Khelladi and Beno{\^i}t Combemale},
  journal={ArXiv},
  year={2022},
  volume={abs/2210.14699}
}

@article{Barke2022GroundedCH,
  title={Grounded Copilot: How Programmers Interact with Code-Generating Models},
  author={Shraddha Barke and Michael B. James and Nadia Polikarpova},
  journal={ArXiv},
  year={2022},
  volume={abs/2206.15000}
}

@article{Sarkar2022WhatII,
  title={What is it like to program with artificial intelligence?},
  author={Advait Sarkar and Andrew D. Gordon and Carina Negreanu and Christian Poelitz and Sruti Srinivasa Ragavan and Benjamin G. Zorn},
  journal={ArXiv},
  year={2022},
  volume={abs/2208.06213}
}

@misc{uta:modeling,
  organization = {University of Texas at Austin COVID-19 Modeling Consortium},
  title = {UT Austin COVID-19 Modeling Consortium},
  url = {https://covid-19.tacc.utexas.edu/},
  publisher = {Texas Advanced Computing Center}
}

@misc{vdh:modeling,
  organization = {Virginia Department of Health},
  title = {COVID-19 Modeling},
  url = {https://www.vdh.virginia.gov/coronavirus/see-the-numbers/covid-19-modeling/},
  publisher = {Virginia Department of Health}
}

@article{law2019stumpy,
  author  = {Law, Sean M.},
  title   = {{STUMPY: A Powerful and Scalable Python Library for Time Series Data Mining}},
  journal = {{The Journal of Open Source Software}},
  volume  = {4},
  number  = {39},
  pages   = {1504},
  year    = {2019}
}

@INPROCEEDINGS{matrix_profile,
  author={Yeh, Chin-Chia Michael and Zhu, Yan and Ulanova, Liudmila and Begum, Nurjahan and Ding, Yifei and Dau, Hoang Anh and Silva, Diego Furtado and Mueen, Abdullah and Keogh, Eamonn},
  booktitle={2016 IEEE 16th International Conference on Data Mining (ICDM)},
  title={Matrix Profile I: All Pairs Similarity Joins for Time Series: A Unifying View That Includes Motifs, Discords and Shapelets},
  year={2016},
  volume={},
  number={},
  pages={1317-1322},
  doi={10.1109/ICDM.2016.0179}}

@inproceedings{xgboost,
  author    = {Chen, Tianqi and Guestrin, Carlos},
  title     = {XGBoost: A Scalable Tree Boosting System},
  year      = {2016},
  isbn      = {9781450342322},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/2939672.2939785},
  doi       = {10.1145/2939672.2939785},
  abstract  = {Tree boosting is a highly effective and widely used machine learning method. In this paper, we describe a scalable end-to-end tree boosting system called XGBoost, which is used widely by data scientists to achieve state-of-the-art results on many machine learning challenges. We propose a novel sparsity-aware algorithm for sparse data and weighted quantile sketch for approximate tree learning. More importantly, we provide insights on cache access patterns, data compression and sharding to build a scalable tree boosting system. By combining these insights, XGBoost scales beyond billions of examples using far fewer resources than existing systems.},
  booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  pages     = {785–794},
  numpages  = {10},
  keywords  = {large-scale machine learning},
  location  = {San Francisco, California, USA},
  series    = {KDD '16}
}

@inproceedings{prep_icpp2021,
  author = {Zhou, Longfang and Zhang, Xiaorong and Yang, Wenxiang and Han, Yongguo and Wang, Fang and Wu, Yadong and Yu, Jie},
  title = {PREP: Predicting Job Runtime with Job Running Path on Supercomputers},
  year = {2021},
  isbn = {9781450390682},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3472456.3473521},
  doi = {10.1145/3472456.3473521},
  abstract = {Supercomputers serve a lot of parallel jobs by scheduling jobs and allocating computing resources. One popular scheduling strategy is First Come First Serve (FCFS). However, there are always some idle resources not being effectively utilized, since they are not enough and are reserved for the head job in the waiting queue. To improve resource utilization, a common solution is to use backfilling, which allocates the reserved computing resources to a small, short job selected from the queue, on the premise of not delaying the original head job. Unfortunately, the estimated job runtime provided by users is often overestimated. Previous studies extract features from historical job logs and predict runtime based on machine learning. However, traditional features (e.g. CPU, user, submitting time, etc.) are insufficient to describe the characteristics of jobs. In this paper, we propose a novel runtime prediction framework called PREP. It explores a new feature named job running path, which encodes important implications about the job’s characteristics, such as the project it belongs to, data sets and parameters it uses, etc. As there is a strong correlation between job runtime and its running path. PREP groups jobs into separate clusters according to their running paths and trains a runtime prediction model for each job cluster. Final results demonstrate that adding the new feature can achieve high prediction accuracy of 88% and has a better prediction effect than other methods, such as Last-2 and IRPA.},
  booktitle = {Proceedings of the 50th International Conference on Parallel Processing},
  articleno = {16},
  numpages = {10},
  keywords = {backfilling, runtime prediction, machine learning., running path},
  location = {Lemont, IL, USA},
  series = {ICPP '21}
}

@inproceedings{fcfs-easy,
  author    = {Lelong, J{\'e}r{\^o}me
               and Reis, Valentin
               and Trystram, Denis},
  editor    = {Klus{\'a}{\v{c}}ek, Dalibor
               and Cirne, Walfredo
               and Desai, Narayan},
  title     = {Tuning EASY-Backfilling Queues},
  booktitle = {Job Scheduling Strategies for Parallel Processing},
  year      = {2018},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {43--61},
  abstract  = {EASY-Backfilling is a popular scheduling heuristic for allocating jobs in large scale High Performance Computing platforms. While its aggressive reservation mechanism is fast and prevents job starvation, it does not try to optimize any scheduling objective per se. We consider in this work the problem of tuning EASY using queue reordering policies. More precisely, we propose to tune the reordering using a simulation-based methodology. For a given system, we choose the policy in order to minimize the average waiting time. This methodology departs from the First-Come, First-Serve rule and introduces a risk on the maximum values of the waiting time, which we control using a queue thresholding mechanism. This new approach is evaluated through a comprehensive experimental campaign on five production logs. In particular, we show that the behavior of the systems under study is stable enough to learn a heuristic that generalizes in a train/test fashion. Indeed, the average waiting time can be reduced consistently (between 11{\%} to 42{\%} for the logs used) compared to EASY, with almost no increase in maximum waiting times. This work departs from previous learning-based approaches and shows that scheduling heuristics for HPC can be learned directly in a policy space.},
  isbn      = {978-3-319-77398-8}
}


@inproceedings{prionn_icpp2018,
  author = {Wyatt, Michael R. and Herbein, Stephen and Gamblin, Todd and Moody, Adam and Ahn, Dong H. and Taufer, Michela},
  title = {PRIONN: Predicting Runtime and IO Using Neural Networks},
  year = {2018},
  isbn = {9781450365109},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3225058.3225091},
  doi = {10.1145/3225058.3225091},
  abstract = {For job allocation decision, current batch schedulers have access to and use only information on the number of nodes and runtime because it is readily available at submission time from user job scripts. User-provided runtimes are typically inaccurate because users overestimate or lack understanding of job resource requirements. Beyond the number of nodes and runtime, other system resources, including IO and network, are not available but play a key role in system performance. There is the need for automatic, general, and scalable tools that provide accurate resource usage information to schedulers so that, by becoming resource-aware, they can better manage system resources.We tackle this need by presenting a tool for Predicting Runtime and IO using Neural Networks (PRIONN). PRIONN automates prediction of per-job runtime and IO resource usage, enabling IO-aware scheduling on HPC systems. The novelty of our tool is the input of whole job scripts into deep learning models that allows complete automation of runtime and IO resource predictions. We demonstrate the power of PRIONN with runtime and IO resource predictions applied to IO-aware scheduling for real HPC data. Specifically, we achieve over 75% mean and 98% median accuracy for runtime and IO predictions across 300,000 jobs from a real HPC machine. We combine our per-job runtime and IO predictions with queue and system simulations to predict future system IO usage accurately. We predict over 50% of IO bursts in advance on a real HPC system.},
  booktitle = {Proceedings of the 47th International Conference on Parallel Processing},
  articleno = {46},
  numpages = {12},
  keywords = {Convolutional Neural Network, IO-Aware Scheduler, IO Prediction},
  location = {Eugene, OR, USA},
  series = {ICPP '18}
}


@inproceedings{power_springer2019,
  author    = {Borghesi, Andrea
               and Bartolini, Andrea
               and Lombardi, Michele
               and Milano, Michela
               and Benini, Luca},
  editor    = {Kunkel, Julian M.
               and Balaji, Pavan
               and Dongarra, Jack},
  title     = {Predictive Modeling for Job Power Consumption in HPC Systems},
  booktitle = {High Performance Computing},
  year      = {2016},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {181--199},
  abstract  = {Power consumption is a critical aspect for next generation High Performance Computing systems: Supercomputers are expected to reach Exascale in 2023 but this will require a significant improvement in terms of energy efficiency. In this domain, power-capping can significant increase the final energy-efficiency by cutting cooling effort and worst-case design margins. A key aspect for an optimal implementation of power capping is the ability to estimate the power consumption of HPC applications before they run on the real system. In this paper we propose a Machine-Learning approach, based on the user and application resource request, to accurately predict the power consumption of typical supercomputer workloads. We demonstrate our method on real production workloads executed on the Eurora supercomputer hosted at CINECA computing center in Bologna and we provide useful insights to apply our technique in other installations.},
  isbn      = {978-3-319-41321-1}
}

@inproceedings{ml_pmbs2018,
  author    = {Malakar, Preeti and Balaprakash, Prasanna and Vishwanath, Venkatram and Morozov, Vitali and Kumaran, Kalyan},
  booktitle = {2018 IEEE/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
  title     = {Benchmarking Machine Learning Methods for Performance Modeling of Scientific Applications},
  year      = {2018},
  volume    = {},
  number    = {},
  pages     = {33-44},
  doi       = {10.1109/PMBS.2018.8641686}
}

@misc{ardalani2019static,
  title         = {A Static Analysis-based Cross-Architecture Performance Prediction Using Machine Learning},
  author        = {Newsha Ardalani and Urmish Thakker and Aws Albarghouthi and Karu Sankaralingam},
  year          = {2019},
  eprint        = {1906.07840},
  archiveprefix = {arXiv},
  primaryclass  = {cs.DC}
}

@inproceedings{xapp_micro2015,
  author = {Ardalani, Newsha and Lestourgeon, Clint and Sankaralingam, Karthikeyan and Zhu, Xiaojin},
  title = {Cross-Architecture Performance Prediction (XAPP) Using CPU Code to Predict GPU Performance},
  year = {2015},
  isbn = {9781450340342},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/2830772.2830780},
  doi = {10.1145/2830772.2830780},
  abstract = {GPUs have become prevalent and more general purpose, but GPU programming remains challenging and time consuming for the majority of programmers. In addition, it is not always clear which codes will benefit from getting ported to GPU. Therefore, having a tool to estimate GPU performance for a piece of code before writing a GPU implementation is highly desirable. To this end, we propose Cross-Architecture Performance Prediction (XAPP), a machine-learning based technique that uses only single-threaded CPU implementation to predict GPU performance.Our paper is built on the two following insights: i) Execution time on GPU is a function of program properties and hardware characteristics. ii) By examining a vast array of previously implemented GPU codes along with their CPU counterparts, we can use established machine learning techniques to learn this correlation between program properties, hardware characteristics and GPU execution time. We use an adaptive two-level machine learning solution. Our results show that our tool is robust and accurate: we achieve 26.9% average error on a set of 24 real-world kernels. We also discuss practical usage scenarios for XAPP.},
  booktitle = {Proceedings of the 48th International Symposium on Microarchitecture},
  pages = {725–737},
  numpages = {13},
  keywords = {machine learning, cross-platform prediction, GPU, performance modeling},
  location = {Waikiki, Hawaii},
  series = {MICRO-48}
}

@article{taskshaping_ipdps2022,
  place        = {Country unknown/Code not available},
  title        = {Dynamic Task Shaping for High Throughput Data Analysis Applications in High Energy Physics},
  url          = {https://par.nsf.gov/biblio/10356916},
  doi          = {10.1109/IPDPS53621.2022.00041},
  abstractnote = {Distributed data analysis frameworks are widely used for processing large datasets generated by instruments in scientific fields such as astronomy, genomics, and particle physics. Such frameworks partition petabyte-size datasets into chunks and execute many parallel tasks to search for common patterns, locate unusual signals, or compute aggregate properties. When well-configured, such frameworks make it easy to churn through large quantities of data on large clusters. However, configuring frameworks presents a challenge for end users, who must select a variety of parameters such as the blocking of the input data, the number of tasks, the resources allocated to each task, and the size of nodes on which they run. If poorly configured, the result may perform many orders of magnitude worse than optimal, or the application may even fail to make progress at all. Even if a good configuration is found through painstaking observations, the performance may change drastically when the input data or analysis kernel changes. This paper considers the problem of automatically configuring a data analysis application for high energy physics (TopEFT) built upon standard frameworks for physics analysis (Coffea) and distributed tasking (Work Queue). We observe the inherent variability within the application, demonstrate the problems of poor configuration, and then develop several techniques for automatically sizing tasks to meet goals of resource consumption, and overall application completion.},
  journal      = {IPDPS International Parallel and Distributed Processing Symposium},
  author       = {Tovar, Ben and Lyons, Ben and Mohrman, Kelci and Sly-Delgado, Barry and Lannon, Kevin and Thain, Douglas}
}

@inproceedings{tasks_ipdps2022,
  author    = {Nesi, Lucas Leandro and Schnorr, Lucas Mello and Legrand, Arnaud},
  booktitle = {2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  title     = {Multi-Phase Task-Based HPC Applications: Quickly Learning how to Run Fast},
  year      = {2022},
  volume    = {},
  number    = {},
  pages     = {357-367},
  doi       = {10.1109/IPDPS53621.2022.00042}
}

@techreport{kale:sc2011,
  author = {Kale, Laxmikant V. and Arya, Anshu and Bhatele, Abhinav and Gupta, Abhishek and Jain, Nikhil and Jetley, Pritish and Lifflander, Jonathan and Miller, Phil and Sun, Yanhua and Venkataraman, Ramprasad and Wesolowski, Lukasz and Zheng, Gengbin},
  title = {Charm++ for Productivity and Performance: A Submission to the 2011 {HPC} {C}lass {II} {C}hallenge},
  month = nov,
  year = 2011,
  institution = {Dept. of Computer Science, University of Illinois},
}

@article{adagrad:2011,
  author = {Duchi, John and Hazan, Elad and Singer, Yoram},
  title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},
  year = {2011},
  issue_date = {2/1/2011},
  publisher = {JMLR.org},
  volume = {12},
  number = {null},
  issn = {1532-4435},
  journal = {J. Mach. Learn. Res.},
  month = {jul},
  pages = {2121–2159},
  numpages = {39}
}

@inproceedings{kfac:2015,
  author = {Martens, James and Grosse, Roger},
  title = {Optimizing Neural Networks with Kronecker-Factored Approximate Curvature},
  year = {2015},
  publisher = {JMLR.org},
  booktitle = {Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37},
  pages = {2408–2417},
  numpages = {10},
  location = {Lille, France},
  series = {ICML'15}
}

@article{fsdp,
author = {Zhao, Yanli and Gu, Andrew and Varma, Rohan and Luo, Liang and Huang, Chien-Chin and Xu, Min and Wright, Less and Shojanazeri, Hamid and Ott, Myle and Shleifer, Sam and Desmaison, Alban and Balioglu, Can and Damania, Pritam and Nguyen, Bernard and Chauhan, Geeta and Hao, Yuchen and Mathews, Ajit and Li, Shen},
title = {PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel},
year = {2023},
issue_date = {August 2023},
publisher = {VLDB Endowment},
volume = {16},
number = {12},
issn = {2150-8097},
url = {https://doi.org/10.14778/3611540.3611569},
doi = {10.14778/3611540.3611569},
journal = {Proc. VLDB Endow.},
month = {aug},
pages = {3848–3860},
numpages = {13}
}

@book{Goodfellow-et-al-2016,
    title={Deep Learning},
    author={Ian Goodfellow and Yoshua Bengio and Aaron Courville},
    publisher={MIT Press},
    note={\url{http://www.deeplearningbook.org}},
    year={2016}
}

@misc{mlperf,
  title = "MLPerf Training Benchmarks v3.0",
  howpublished = {\url{https://mlcommons.org/en/training-normal-30}}
}

@misc{top500nov2022,
  author = {TOP500.org},
  title = {November 2022 TOP500},
  year = 2022,
  url = {https://www.top500.org/lists/top500/2022/11/},
  urldate = {2023-02-23}
}

@misc{top500jun2023,
  author = {TOP500.org},
  title = {June 2023 TOP500},
  year = 2023,
  url = {https://www.top500.org/lists/top500/2023/06/},
  urldate = {2023-08-12}
}

@misc{top500nov2023,
  author = {TOP500.org},
  title = {November 2023 TOP500},
  year = 2023,
  url = {https://www.top500.org/lists/top500/2023/06/},
  urldate = {2024-03-12}
}

@article{pennycook2019implications,
  title={Implications of a metric for performance portability},
  author={Pennycook, Simon J and Sewall, Jason D and Lee, Victor W},
  journal={Future Generation Computer Systems},
  volume={92},
  pages={947--958},
  year={2019},
  publisher={Elsevier}
}

@inproceedings{pennycook2016metric,
  author={Pennycook, Simon J and Sewall, Jason D and Lee, Victor W},
  title={A metric for performance portability},
  booktitle={Proceedings of the 7th International Workshop in Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems},
  year={2016},
  url = {https://arxiv.org/abs/1611.07409}
}

@inproceedings{pennycook2021revisiting,
  author={Pennycook, S. John and Sewall, Jason D.},
  booktitle={2021 International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  title={Revisiting a Metric for Performance Portability},
  year={2021},
  volume={},
  number={},
  pages={1-9},
  doi={10.1109/P3HPC54578.2021.00004}}

@inproceedings{sewall2020interpreting,
  author={Sewall, Jason and Pennycook, S. John and Jacobsen, Douglas and Deakin, Tom and McIntosh-Smith, Simon},
  booktitle={2020 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  title={Interpreting and Visualizing Performance Portability Metrics},
  year={2020},
  volume={},
  number={},
  pages={14-24},
  doi={10.1109/P3HPC51967.2020.00007}}

@inproceedings{daniel2019applying,
  author={Daniel, Daniela F. and Panetta, Jairo},
  booktitle={2019 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  title={On Applying Performance Portability Metrics},
  year={2019},
  volume={},
  number={},
  pages={50-59},
  doi={10.1109/P3HPC49587.2019.00010}}

@article{deakin2018evaluating,
  author={Deakin, Tom and Price, James and Martineau, Matt and McIntosh-Smith, Simon},
  title = {Evaluating Attainable Memory Bandwidth of Parallel Programming Models via BabelStream},
  year = {2018},
  issue_date = {January 2018},
  publisher = {Inderscience Publishers},
  address = {Geneva 15, CHE},
  volume = {17},
  number = {3},
  issn = {1742-7185},
  journal = {Int. J. Comput. Sci. Eng.},
  month = {jan},
  pages = {247–262},
  numpages = {16}
}

@inproceedings{hammond2022benchmarking,
  author={Hammond, Jeff R. and Deakin, Tom and Cownie, James and McIntosh-Smith, Simon},
  booktitle={2022 IEEE/ACM International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
  title={Benchmarking Fortran DO CONCURRENT on CPUs and GPUs Using BabelStream},
  year={2022},
  volume={},
  number={},
  pages={82-99},
  doi={10.1109/PMBS56514.2022.00013}
}

@inproceedings{anzt2020evaluating,
  author={Anzt, Hartwig and Tsai, Yuhsiang M. and Abdelfattah, Ahmad and Cojean, Terry and Dongarra, Jack},
  booktitle={2020 IEEE/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
  title={Evaluating the Performance of NVIDIA’s A100 Ampere GPU for Sparse and Batched Computations},
  year={2020},
  volume={},
  number={},
  pages={26-38},
  doi={10.1109/PMBS51919.2020.00009}}

@inproceedings{kuncham2021performance,
  author={Reddy Kuncham, Goutham Kalikrishna and Vaidya, Rahul and Barve, Mahesh},
  booktitle={2021 IEEE High Performance Extreme Computing Conference (HPEC)},
  title={Performance Study of GPU applications using SYCL and CUDA on Tesla V100 GPU},
  year={2021},
  volume={},
  number={},
  pages={1-7},
  doi={10.1109/HPEC49654.2021.9622813}}

@inproceedings{deakin2020evaluating,
  author = {Deakin, Tom and McIntosh-Smith, Simon},
  title = {Evaluating the Performance of HPC-Style SYCL Applications},
  year = {2020},
  isbn = {9781450375313},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3388333.3388643},
  doi = {10.1145/3388333.3388643},
  abstract = {SYCL is a parallel programming model for developing single-source programs for running on heterogeneous platforms. To this end, it allows for one code to be written which can run on a different architectures. For this study, we develop applications in SYCL which are representative of those often used in High-Performance Computing. Their performance is benchmarked on a variety of CPU and GPU architectures from multiple vendors, and compared to well optimised versions written in OpenCL and other parallel programming models.},
  booktitle = {Proceedings of the International Workshop on OpenCL},
  articleno = {12},
  numpages = {11},
  keywords = {performance portability, benchmarking, SYCL, GPGPUs},
  location = {Munich, Germany},
  series = {IWOCL '20}
}

@inproceedings{deakin2019performance,
  author={Deakin, Tom and McIntosh-Smith, Simon and Price, James and Poenaru, Andrei and Atkinson, Patrick and Popa, Codrin and Salmon, Justin},
  booktitle={2019 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  title={Performance Portability across Diverse Computer Architectures},
  year={2019},
  volume={},
  number={},
  pages={1-13},
  doi={10.1109/P3HPC49587.2019.00006}}

@inproceedings{deakin2020tracking,
  author={Deakin, Tom and Poenaru, Andrei and Lin, Tom and McIntosh-Smith, Simon},
  booktitle={2020 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  title={Tracking Performance Portability on the Yellow Brick Road to Exascale},
  year={2020},
  volume={},
  number={},
  pages={1-13},
  doi={10.1109/P3HPC51967.2020.00006}}

@inproceedings{deakin2021analyzing,
  title={Analyzing Reduction Abstraction Capabilities},
  author={Deakin, Tom and McIntosh-Smith, Simon and Pennycook, S John and Sewall, Jason},
  booktitle={2021 International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  pages={33--44},
  year={2021},
  organization={IEEE}
}

@inproceedings{deakin2022heterogeneous,
  author={Deakin, Tom and Cownie, James and Lin, Wei-Chen and McIntosh-Smith, Simon},
  booktitle={2022 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  title={Heterogeneous Programming for the Homogeneous Majority},
  year={2022},
  volume={},
  number={},
  pages={1-13},
  doi={10.1109/P3HPC56579.2022.00006}}

@inproceedings{davis2021performance,
  author="Davis, Joshua Hoke and Daley, Christopher and Pophale, Swaroop and Huber, Thomas and Chandrasekaran, Sunita and Wright, Nicholas J.",
  editor="Bhalachandra, Sridutt and Wienke, Sandra and Chandrasekaran, Sunita and Juckeland, Guido",
  title="Performance Assessment of OpenMP Compilers Targeting NVIDIA V100 GPUs",
  booktitle="Accelerator Programming Using Directives",
  year="2021",
  publisher="Springer International Publishing",
  address="Cham",
  pages="25--44",
  isbn="978-3-030-74224-9"
}

@article{tramm2014xsbench,
  title={XSBench-the development and verification of a performance abstraction for Monte Carlo reactor analysis},
  author={Tramm, John R and Siegel, Andrew R and Islam, Tanzima and Schulz, Martin},
  journal={The Role of Reactor Physics toward a Sustainable Future (PHYSOR)},
  year={2014}
}

@inproceedings{kwack2021evaluation,
  author={Kwack, JaeHyuk and Tramm, John and Bertoni, Colleen and Ghadar, Yasaman and Homerding, Brian and Rangel, Esteban and Knight, Christopher and Parker, Scott},
  booktitle={2021 International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  title={Evaluation of Performance Portability of Applications and Mini-Apps across AMD, Intel and NVIDIA GPUs},
  year={2021},
  volume={},
  number={},
  pages={45-56},
  doi={10.1109/P3HPC54578.2021.00008}}

@inproceedings{doerfert2023breaking,
  author = {Doerfert, Johannes and Jasper, Marc and Huber, Joseph and Abdelaal, Khaled and Georgakoudis, Giorgis and Scogland, Thomas and Parasyris, Konstantinos},
  title = {Breaking the Vendor Lock: Performance Portable Programming through OpenMP as Target Independent Runtime Layer},
  year = {2023},
  isbn = {9781450398688},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3559009.3569687},
  doi = {10.1145/3559009.3569687},
  booktitle = {Proceedings of the International Conference on Parallel Architectures and Compilation Techniques},
  pages = {494–504},
  numpages = {11},
  keywords = {OpenMP, AMDGPU, GPGPU, CUDA, performance portability, LLVM},
  location = {Chicago, Illinois},
  series = {PACT '22}
}

@book{Gebser2013,
  doi = {10.1007/978-3-031-01561-8},
  url = {https://doi.org/10.1007/978-3-031-01561-8},
  year = {2013},
  publisher = {Springer International Publishing},
  author = {Martin Gebser and Roland Kaminski and Benjamin Kaufmann and Torsten Schaub},
  title = {Answer Set Solving in Practice}
}

@article{DBLP:journals/corr/abs-1108-3281,
  author       = {Victor W. Marek and
                  Ilkka Niemel{\"{a}} and
                  Miroslaw Truszczynski},
  title        = {Origins of Answer-Set Programming - Some Background And Two Personal
                  Accounts},
  journal      = {CoRR},
  volume       = {abs/1108.3281},
  year         = {2011},
  url          = {http://arxiv.org/abs/1108.3281},
  eprinttype    = {arXiv},
  eprint       = {1108.3281},
  timestamp    = {Mon, 13 Aug 2018 16:48:10 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1108-3281.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{menon2022reliabuild,
author = {Menon, Harshitha and Parasyris, Konstantinos and Scogland, Tom and Gamblin, Todd},
title = {Searching for High-Fidelity Builds Using Active Learning},
year = {2022},
isbn = {9781450393034},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3524842.3528464},
doi = {10.1145/3524842.3528464},
abstract = {Modern software is incredibly complex. A typical application may comprise hundreds or thousands of reusable components. Automated package managers can help to maintain a consistent set of dependency versions, but ultimately the solvers in these systems rely on constraints generated by humans. At scale, small errors add up, and it becomes increasingly difficult to find high-fidelity configurations. We cannot test all configurations, because the space is combinatorial, so exhaustive exploration is infeasible.In this paper, we present Reliabuild, an auto-tuning framework that efficiently explores the build configuration space and learns which package versions are likely to result in a successful configuration. We implement two models in Reliabuild to rank the different configurations and use adaptive sampling to select good configurations with fewer samples. We demonstrate Reliabuild's effectiveness by evaluating 31,186 build configurations of 61 packages from the Extreme-scale Scientific Software Stack (E4S). Reliabuild selects good configurations efficiently. For example, Reliabuild selects 3\texttimes{} the number of good configurations in comparison to random sampling for several packages including Abyss, Bolt, libnrm, OpenMPI. Our framework is also able to select all the high-fidelity builds in half the number of samples required by random sampling for packages such as Chai, OpenMPI, py-petsc4py, and slepc. We further use the model to learn statistics about the compatibility of different packages, which will enable package solvers to better select high-fidelity build configurations automatically.},
booktitle = {Proceedings of the 19th International Conference on Mining Software Repositories},
pages = {179–190},
numpages = {12},
location = {Pittsburgh, Pennsylvania},
series = {MSR '22}
}

@inproceedings{gamblin+:sc22,
	address = {Dallas, Texas},
	author = {Todd Gamblin and Massimiliano Culpo and Gregory Becker and Sergei Shudler},
	booktitle = {Supercomputing 2022 (SC'22)},
	date-added = {2022-09-23 12:42:34 -0700},
	date-modified = {2022-09-30 16:15:18 -0700},
	month = {November 13-18},
	note = {LLNL-CONF-839332},
	title = {{Using Answer Set Programming for HPC Dependency Solving}},
	year = {2022},
	bdsk-url-1 = {http://tgamblin.github.io/pubs/spack-sc15.pdf}
}

@article{gebser+:aicomm11,
	author = {Gebser, Martin and Kaufmann, Benjamin and Kaminski, Roland and Ostrowski, Max and Schaub, Torsten and Schneider, Marius},
	date-added = {2020-11-15 23:36:57 -0800},
	date-modified = {2020-11-15 23:36:57 -0800},
	journal = {{AI Communications}},
	number = {2},
	pages = {107--124},
	publisher = {Citeseer},
	title = {Potassco: The Potsdam answer set solving collection},
	volume = {24},
	year = {2011}
}

@InProceedings{plingo,
  author="Hahn, Susana
  and Janhunen, Tomi
  and Kaminski, Roland
  and Romero, Javier
  and R{\"u}hling, Nicolas
  and Schaub, Torsten",
  editor="Governatori, Guido
  and Turhan, Anni-Yasmin",
  title="Plingo: A System for Probabilistic Reasoning in Clingo Based on {$LP^{MLN}$}",
  booktitle="Rules and Reasoning",
  year="2022",
  publisher="Springer International Publishing",
  address="Cham",
  pages="54--62",
  isbn="978-3-031-21541-4"
}

@inproceedings{plingo_lpmln,
  author = {Lee, Joohyung and Wang, Yi},
  title = {Weighted Rules under the Stable Model Semantics},
  year = {2016},
  publisher = {AAAI Press},
  abstract = {We introduce the concept of weighted rules under the stable model semantics following the log-linear models of Markov Logic. This provides versatile methods to overcome the deterministic nature of the stable model semantics, such as resolving inconsistencies in answer set programs, ranking stable models, associating probability to stable models, and applying statistical inference to computing weighted stable models. We also present formal comparisons with related formalisms, such as answer set programs, Markov Logic, ProbLog, and P-log.},
  booktitle = {Proceedings of the Fifteenth International Conference on Principles of Knowledge Representation and Reasoning},
  pages = {145–154},
  numpages = {10},
  location = {Cape Town, South Africa},
  series = {KR'16}
}

@article{plingo_plog,
  author = {Baral, Chitta and Gelfond, Michael and Rushton, Nelson},
  title = {Probabilistic Reasoning with Answer Sets},
  year = {2009},
  issue_date = {January 2009},
  publisher = {Cambridge University Press},
  address = {USA},
  volume = {9},
  number = {1},
  issn = {1471-0684},
  url = {https://doi.org/10.1017/S1471068408003645},
  doi = {10.1017/S1471068408003645},
  abstract = {This paper develops a declarative language, P-log, that combines logical and probabilistic arguments in its reasoning. Answer Set Prolog is used as the logical foundation, while causal Bayes nets serve as a probabilistic foundation. We give several non-trivial examples and illustrate the use of P-log for knowledge representation and updating of knowledge. We argue that our approach to updates is more appealing than existing approaches. We give sufficiency conditions for the coherency of P-log programs and show that Bayes nets can be easily mapped to coherent P-log programs.},
  journal = {Theory Pract. Log. Program.},
  month = {jan},
  pages = {57–144},
  numpages = {88},
  keywords = {Answer Set Prolog, Logic programming, probabilistic reasoning, answer sets}
}

@inproceedings{plingo_problog,
  author = {De Raedt, Luc and Kimmig, Angelika and Toivonen, Hannu},
  title = {ProbLog: A Probabilistic Prolog and Its Application in Link Discovery},
  year = {2007},
  publisher = {Morgan Kaufmann Publishers Inc.},
  address = {San Francisco, CA, USA},
  abstract = {We introduce ProbLog, a probabilistic extension of Prolog. A ProbLog program defines a distribution over logic programs by specifying for each clause the probability that it belongs to a randomly sampled program, and these probabilities are mutually independent. The semantics of ProbLog is then defined by the success probability of a query, which corresponds to the probability that the query succeeds in a randomly sampled program. The key contribution of this paper is the introduction of an effective solver for computing success probabilities. It essentially combines SLD-resolution with methods for computing the probability of Boolean formulae. Our implementation further employs an approximation algorithm that combines iterative deepening with binary decision diagrams. We report on experiments in the context of discovering links in real biological networks, a demonstration of the practical usefulness of the approach.},
  booktitle = {Proceedings of the 20th International Joint Conference on Artifical Intelligence},
  pages = {2468–2473},
  numpages = {6},
  location = {Hyderabad, India},
  series = {IJCAI'07}
}

@article{goldfarb2020practical,
  title={Practical quasi-newton methods for training deep neural networks},
  author={Goldfarb, Donald and Ren, Yi and Bahamou, Achraf},
  journal={Advances in Neural Information Processing Systems},
  volume={33},
  pages={2386--2396},
  year={2020}
}

@article{pennycook2021navigating,
  title={Navigating performance, portability, and productivity},
  author={Pennycook, S John and Sewall, Jason D and Jacobsen, Douglas W and Deakin, Tom and McIntosh-Smith, Simon},
  journal={Computing in Science \& Engineering},
  volume={23},
  number={5},
  pages={28--38},
  year={2021},
  publisher={IEEE}
}

@inproceedings{marowka2021toward,
  title={Toward a better performance portability metric},
  author={Marowka, Ami},
  booktitle={2021 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)},
  pages={181--184},
  year={2021},
  organization={IEEE}
}

@article{marowka2023comparison,
  title={A comparison of two performance portability metrics},
  author={Marowka, Ami},
  journal={Concurrency and Computation: Practice and Experience},
  pages={e7868},
  year={2023},
  publisher={Wiley Online Library}
}

@inproceedings{dufek2021case,
  title={Case study of using Kokkos and SYCL as performance-portable frameworks for Milc-Dslash benchmark on NVIDIA, AMD and Intel GPUs},
  author={Dufek, Amanda S and Gayatri, Rahulkumar and Mehta, Neil and Doerfler, Douglas and Cook, Brandon and Ghadar, Yasaman and DeTar, Carleton},
  booktitle={2021 International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  pages={57--67},
  year={2021},
  organization={IEEE}
}

@article{reguly2020productivity,
  title={Productivity, performance, and portability for computational fluid dynamics applications},
  author={Reguly, Istv{\'a}n Z and Mudalige, Gihan R},
  journal={Computers \& Fluids},
  volume={199},
  pages={104425},
  year={2020},
  publisher={Elsevier}
}

@inproceedings{reguly2019performance,
  title={Performance portability of multi-material kernels},
  author={Reguly, Istv{\'a}n Z},
  booktitle={2019 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  pages={26--35},
  year={2019},
  organization={IEEE}
}

@article{martineau2017assessing,
  title={Assessing the performance portability of modern parallel programming models using TeaLeaf},
  author={Martineau, Matthew and McIntosh-Smith, Simon and Gaudin, Wayne},
  journal={Concurrency and Computation: Practice and Experience},
  volume={29},
  number={15},
  pages={e4117},
  year={2017},
  publisher={Wiley Online Library}
}

@article{artigues2020evaluation,
  title={Evaluation of performance portability frameworks for the implementation of a particle-in-cell code},
  author={Artigues, Victor and Kormann, Katharina and Rampp, Markus and Reuter, Klaus},
  journal={Concurrency and Computation: Practice and Experience},
  volume={32},
  number={11},
  pages={e5640},
  year={2020},
  publisher={Wiley Online Library}
}

@inproceedings{sedova2018high,
  title={High-performance molecular dynamics simulation for biological and materials sciences: Challenges of performance portability},
  author={Sedova, Ada and Eblen, John D and Budiardja, Reuben and Tharrington, Arnold and Smith, Jeremy C},
  booktitle={2018 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  pages={1--13},
  year={2018},
  organization={IEEE}
}

@inproceedings{sabne2015evaluating,
  title={Evaluating performance portability of OpenACC},
  author={Sabne, Amit and Sakdhnagool, Putt and Lee, Seyong and Vetter, Jeffrey S},
  booktitle={Languages and Compilers for Parallel Computing: 27th International Workshop, LCPC 2014, Hillsboro, OR, USA, September 15-17, 2014, Revised Selected Papers 27},
  pages={51--66},
  year={2015},
  organization={Springer}
}

@techreport{doerfler2020su3_bench,
  title={su3\_bench: Lattice QCD SU (3) matrix-matrix multiply microbenchmark (su3\_bench) v1. 0},
  author={Doerfler, Douglas and Daley, Christopher},
  year={2020},
  institution={Lawrence Berkeley National Lab.(LBNL), Berkeley, CA (United States)}
}

@INPROCEEDINGS{hunold:cluster2020,
  author={Hunold, Sascha and Bhatele, Abhinav and Bosilca, George and Knees, Peter},
  booktitle={2020 IEEE International Conference on Cluster Computing (CLUSTER)},
  title={Predicting MPI Collective Communication Performance Using Machine Learning},
  year={2020},
  volume={},
  number={},
  pages={259-269},
  doi={10.1109/CLUSTER49012.2020.00036}
}

@ARTICLE{balaprakash:jproc2018,
  author={Balaprakash, Prasanna and Dongarra, Jack and Gamblin, Todd and Hall, Mary and Hollingsworth, Jeffrey K. and Norris, Boyana and Vuduc, Richard},
  journal={Proceedings of the IEEE},
  title={Autotuning in High-Performance Computing Applications},
  year={2018},
  volume={106},
  number={11},
  pages={2068-2083},
  doi={10.1109/JPROC.2018.2841200}
}

@INPROCEEDINGS{cho:ipdps2023,
  author={Cho, Younghyun and Demmel, James W. and King, Jacob and Li, Xiaoye S. and Liu, Yang and Luo, Hengrui},
  booktitle={2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  title={Harnessing the Crowd for Autotuning High-Performance Computing Applications},
  year={2023},
  volume={},
  number={},
  pages={635-645},
  doi={10.1109/IPDPS54959.2023.00069}
}

@INPROCEEDINGS{zhou:ipdpsw2020,
  author={Zhou, Wenju and Zhang, Jiepeng and Sun, Jingwei and Sun, Guangzhong},
  booktitle={2020 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)},
  title={Using Small-Scale History Data to Predict Large-Scale Performance of HPC Application},
  year={2020},
  volume={},
  number={},
  pages={787-795},
  doi={10.1109/IPDPSW50202.2020.00135}
}

@article{mossong2008social,
  title={Social contacts and mixing patterns relevant to the spread of infectious diseases},
  author={Mossong, Jo{\"e}l and Hens, Niel and Jit, Mark and Beutels, Philippe and Auranen, Kari and Mikolajczyk, Rafael and Massari, Marco and Salmaso, Stefania and Tomba, Gianpaolo Scalia and Wallinga, Jacco and others},
  journal={PLoS medicine},
  volume={5},
  number={3},
  pages={e74},
  year={2008},
  publisher={Public Library of Science}
}

@inproceedings{barrett2009generation,
  title={Generation and analysis of large synthetic social contact networks},
  author={Barrett, Christopher L and Beckman, Richard J and Khan, Maleq and Kumar, VS Anil and Marathe, Madhav V and Stretz, Paula E and Dutta, Tridib and Lewis, Bryan},
  booktitle={Proceedings of the 2009 Winter Simulation Conference (WSC)},
  pages={1003--1014},
  year={2009},
  organization={IEEE}
}

@inproceedings{boehm2018evaluating,
  title={Evaluating performance portability of accelerator programming models using SPEC ACCEL 1.2 benchmarks},
  author={Boehm, Swen and Pophale, Swaroop and Vergara Larrea, Ver{\'o}nica G and Hernandez, Oscar},
  booktitle={High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers 33},
  pages={711--723},
  year={2018},
  organization={Springer}
}

@inproceedings{gayatri2019case,
  title={A case study for performance portability using OpenMP 4.5},
  author={Gayatri, Rahulkumar and Yang, Charlene and Kurth, Thorsten and Deslippe, Jack},
  booktitle={Accelerator Programming Using Directives: 5th International Workshop, WACCPD 2018, Dallas, TX, USA, November 11-17, 2018, Proceedings 5},
  pages={75--95},
  year={2019},
  organization={Springer}
}

@article{tsai2023providing,
  title={Providing performance portable numerics for Intel GPUs},
  author={Tsai, Yu-Hsiang M and Cojean, Terry and Anzt, Hartwig},
  journal={Concurrency and Computation: Practice and Experience},
  volume={35},
  number={20},
  pages={e7400},
  year={2023},
  publisher={Wiley Online Library}
}

@inproceedings{ben2019stateful,
  title={Stateful dataflow multigraphs: A data-centric model for performance portability on heterogeneous architectures},
  author={Ben-Nun, Tal and de Fine Licht, Johannes and Ziogas, Alexandros N and Schneider, Timo and Hoefler, Torsten},
  booktitle={Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages={1--14},
  year={2019}
}

@inproceedings{harrell2018effective,
  title={Effective performance portability},
  author={Harrell, Stephen Lien and Kitson, Joy and Bird, Robert and Pennycook, Simon John and Sewall, Jason and Jacobsen, Douglas and Asanza, David Neill and Hsu, Abaigail and Carrillo, Hector Carrillo and Kim, Hessoo and others},
  booktitle={2018 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC)},
  pages={24--36},
  year={2018},
  organization={IEEE}
}

@inproceedings{brunst2022first,
  title={First experiences in performance benchmarking with the new SPEChpc 2021 suites},
  author={Brunst, Holger and Chandrasekaran, Sunita and Ciorba, Florina M and Hagerty, Nick and Henschel, Robert and Juckeland, Guido and Li, Junjie and Vergara, Veronica G Melesse and Wienke, Sandra and Zavala, Miguel},
  booktitle={2022 22nd IEEE International Symposium on Cluster, Cloud and Internet Computing (CCGrid)},
  pages={675--684},
  year={2022},
  organization={IEEE}
}

@ARTICLE{8907404,
  author={Beckingsale, D. A. and McFadden, M. J. and Dahm, J. P. S. and Pankajakshan, R. and Hornung, R. D.},
  journal={IBM Journal of Research and Development},
  title={Umpire: Application-focused management and coordination of complex hierarchical memory},
  year={2020},
  volume={64},
  number={3/4},
  pages={00:1-00:10},
  doi={10.1147/JRD.2019.2954403}
}

@misc{ecp-proxy-apps,
  title = {ECP Proxy Applications},
  howpublished = {https://proxyapps.exascaleproject.org/},
  note = {Accessed: 2023-09-30}
}

@misc{e4s,
  title = {The Extreme-scale Scientific Software Stack},
  howpublished = {https://e4s-project.github.io/index.html},
  note = {Accessed: 2023-09-30}
}

@misc{aurora,
  title = {Aurora},
  url = {https://www.alcf.anl.gov/aurora},
}

@INPROCEEDINGS{yang:sc05,
  author={Yang, L.T. and Xiaosong Ma and Mueller, F.},
  booktitle={SC '05: Proceedings of the 2005 ACM/IEEE Conference on Supercomputing},
  title={Cross-Platform Performance Prediction of Parallel Applications Using Partial Execution},
  year={2005},
  volume={},
  number={},
  pages={40-40},
  doi={10.1109/SC.2005.20}
}

@misc{cupti,
  title = {CUPTI},
  url = {https://docs.nvidia.com/cuda/cupti/index.html},
  note = {Accessed: 2023-09-30}
}

@misc{rocprofiler,
  title = {rocProfiler},
  url = {https://rocm.docs.amd.com/projects/rocprofiler/en/latest/rocprof.html},
  note = {Accessed: 2023-09-30}
}

@inproceedings{munshi2009opencl,
  title={The opencl specification},
  author={Munshi, Aaftab},
  booktitle={2009 IEEE Hot Chips 21 Symposium (HCS)},
  pages={1--314},
  year={2009},
  organization={IEEE}
}

@article{medina2014occa,
  title={OCCA: A unified approach to multi-threading languages},
  author={Medina, David S and St-Cyr, Amik and Warburton, Tim},
  journal={arXiv preprint arXiv:1403.0968},
  year={2014}
}

@inproceedings{zenker2016alpaka,
  title={Alpaka--an abstraction library for parallel kernel acceleration},
  author={Zenker, Erik and Worpitz, Benjamin and Widera, Ren{\'e} and Huebl, Axel and Juckeland, Guido and Kn{\"u}pfer, Andreas and Nagel, Wolfgang E and Bussmann, Michael},
  booktitle={2016 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)},
  pages={631--640},
  year={2016},
  organization={IEEE}
}

@techreport{heroux2013mantevo,
  title={Mantevo Suite 1.0.},
  author={Heroux, Michael Allen and Barrett, Richard Frederick and Willenbring, James Michael and Hammond, Simon David and Richards, David and Mohd-Yusof, Jamal and Herdman, Andrew},
  year={2013},
  institution={Sandia National Lab.(SNL-NM), Albuquerque, NM (United States)}
}

@misc{nersc-proxy-apps,
  title = {NERSC Proxy Suite},
  howpublished = {https://www.nersc.gov/research-and-development/nersc-proxy-suite/},
}

@misc{tirumala2023d4,
      title={D4: Improving LLM Pretraining via Document De-Duplication and Diversification},
      author={Kushal Tirumala and Daniel Simig and Armen Aghajanyan and Ari S. Morcos},
      year={2023},
      eprint={2308.12284},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{touvron2023llama,
      title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
      author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
      year={2023},
      eprint={2307.09288},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{ouyang2022training,
      title={Training language models to follow instructions with human feedback},
      author={Long Ouyang and Jeff Wu and Xu Jiang and Diogo Almeida and Carroll L. Wainwright and Pamela Mishkin and Chong Zhang and Sandhini Agarwal and Katarina Slama and Alex Ray and John Schulman and Jacob Hilton and Fraser Kelton and Luke Miller and Maddie Simens and Amanda Askell and Peter Welinder and Paul Christiano and Jan Leike and Ryan Lowe},
      year={2022},
      eprint={2203.02155},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{ziegler2020finetuning,
      title={Fine-Tuning Language Models from Human Preferences},
      author={Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul Christiano and Geoffrey Irving},
      year={2020},
      eprint={1909.08593},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{bai2022training,
      title={Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback},
      author={Yuntao Bai and Andy Jones and Kamal Ndousse and Amanda Askell and Anna Chen and Nova DasSarma and Dawn Drain and Stanislav Fort and Deep Ganguli and Tom Henighan and Nicholas Joseph and Saurav Kadavath and Jackson Kernion and Tom Conerly and Sheer El-Showk and Nelson Elhage and Zac Hatfield-Dodds and Danny Hernandez and Tristan Hume and Scott Johnston and Shauna Kravec and Liane Lovitt and Neel Nanda and Catherine Olsson and Dario Amodei and Tom Brown and Jack Clark and Sam McCandlish and Chris Olah and Ben Mann and Jared Kaplan},
      year={2022},
      eprint={2204.05862},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@inproceedings{wisdom_of_the_crowd,
  author = {Mileva, Yana Momchilova and Dallmeier, Valentin and Burger, Martin and Zeller, Andreas},
  title = {Mining Trends of Library Usage},
  year = {2009},
  isbn = {9781605586786},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/1595808.1595821},
  doi = {10.1145/1595808.1595821},
  abstract = {A library is available in multiple versions. Which one should I use? Has it been widely adopted already? Was it a good decision to switch to the newest version? We have mined hundreds of open-source projects for their library dependencies, and determined global trends in library usage. This wisdom of the crowds can be helpful for developers when deciding when to use which version of a library - by helping them avoid pitfalls experienced by other developers, and by showing important emerging trends in library usage.},
  booktitle = {Proceedings of the Joint International and Annual ERCIM Workshops on Principles of Software Evolution (IWPSE) and Software Evolution (Evol) Workshops},
  pages = {57–62},
  numpages = {6},
  keywords = {library versions usage, mining software archives, global usage trends},
  location = {Amsterdam, The Netherlands},
  series = {IWPSE-Evol '09}
}

@INPROCEEDINGS{perftrack,
  author={Karavanic, K.L. and May, J. and Mohror, K. and Miller, B. and Huck, K. and Knapp, R. and Pugh, B.},
  booktitle={SC '05: Proceedings of the 2005 ACM/IEEE Conference on Supercomputing},
  title={Integrating Database Technology with Comparison-based Parallel Performance Diagnosis: The PerfTrack Performance Experiment Management Tool},
  year={2005},
  volume={},
  number={},
  pages={39-39},
  doi={10.1109/SC.2005.36}
}

@article{rae_scaling_2022,
  title = {Scaling {{Language Models}}: {{Methods}}, {{Analysis}} \& {{Insights}} from {{Training Gopher}}},
  shorttitle = {Scaling {{Language Models}}},
  author = {Rae, Jack W. and Borgeaud, Sebastian and Cai, Trevor and Millican, Katie and Hoffmann, Jordan and Song, Francis and Aslanides, John and Henderson, Sarah and Ring, Roman and Young, Susannah and Rutherford, Eliza and Hennigan, Tom and Menick, Jacob and Cassirer, Albin and Powell, Richard and van den Driessche, George and Hendricks, Lisa Anne and Rauh, Maribeth and Huang, Po-Sen and Glaese, Amelia and Welbl, Johannes and Dathathri, Sumanth and Huang, Saffron and Uesato, Jonathan and Mellor, John and Higgins, Irina and Creswell, Antonia and McAleese, Nat and Wu, Amy and Elsen, Erich and Jayakumar, Siddhant and Buchatskaya, Elena and Budden, David and Sutherland, Esme and Simonyan, Karen and Paganini, Michela and Sifre, Laurent and Martens, Lena and Li, Xiang Lorraine and Kuncoro, Adhiguna and Nematzadeh, Aida and Gribovskaya, Elena and Donato, Domenic and Lazaridou, Angeliki and Mensch, Arthur and Lespiau, Jean-Baptiste and Tsimpoukelli, Maria and Grigorev, Nikolai and Fritz, Doug and Sottiaux, Thibault and Pajarskas, Mantas and Pohlen, Toby and Gong, Zhitao and Toyama, Daniel and {d'Autume}, Cyprien de Masson and Li, Yujia and Terzi, Tayfun and Mikulik, Vladimir and Babuschkin, Igor and Clark, Aidan and Casas, Diego de Las and Guy, Aurelia and Jones, Chris and Bradbury, James and Johnson, Matthew and Hechtman, Blake and Weidinger, Laura and Gabriel, Iason and Isaac, William and Lockhart, Ed and Osindero, Simon and Rimell, Laura and Dyer, Chris and Vinyals, Oriol and Ayoub, Kareem and Stanway, Jeff and Bennett, Lorrayne and Hassabis, Demis and Kavukcuoglu, Koray and Irving, Geoffrey},
  year = {2022},
  month = jan,
  journal = {arXiv:2112.11446 [cs]},
  eprint = {2112.11446},
  primaryclass = {cs},
  url = {http://arxiv.org/abs/2112.11446},
  urldate = {2022-02-22},
  archiveprefix = {arxiv},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language}
}

@article{kaplan2020scaling,
  title = {Scaling {{Laws}} for {{Neural Language Models}}},
  author = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B. and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario},
  year = {2020},
  month = jan,
  eprint = {2001.08361},
  primaryclass = {cs, stat},
  publisher = {{arXiv}},
  doi = {10.48550/arXiv.2001.08361},
  url = {http://arxiv.org/abs/2001.08361},
  urldate = {2022-09-20},
  archiveprefix = {arxiv},
  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
  journal = {arxiv:2001.08361[cs, stat]}
}

@article{raissi2019physics,
  title={Physics-informed neural networks: A deep learning framework for solving forward and inverse problems involving nonlinear partial differential equations},
  author={Raissi, Maziar and Perdikaris, Paris and Karniadakis, George E},
  journal={Journal of Computational physics},
  volume={378},
  pages={686--707},
  year={2019},
  publisher={Elsevier}
}

@article{chen2018neural,
  title={Neural ordinary differential equations},
  author={Chen, Ricky TQ and Rubanova, Yulia and Bettencourt, Jesse and Duvenaud, David K},
  journal={Advances in neural information processing systems},
  volume={31},
  year={2018}
}

@inproceedings{cossette2012seeking,
  title={Seeking the ground truth: a retroactive study on the evolution and migration of software libraries},
  author={Cossette, Bradley E and Walker, Robert J},
  booktitle={Proceedings of the ACM SIGSOFT 20th International Symposium on the Foundations of Software Engineering},
  pages={1--11},
  year={2012}
}

@misc{preston2013semantic,
  title={Semantic Versioning 2.0. 0},
  author={Preston-Werner, Tom},
  year={2013}
}

@article{decan2019package,
  title={What do package dependencies tell us about semantic versioning?},
  author={Decan, Alexandre and Mens, Tom},
  journal={IEEE Transactions on Software Engineering},
  year={2019},
  publisher={IEEE}
}

@inproceedings{dietrich2019dependency,
	author = {Dietrich, Jens and Pearce, David and Stringer, Jacob and Tahir, Amjed and Blincoe, Kelly},
	booktitle = {2019 IEEE/ACM 16th International Conference on Mining Software Repositories (MSR)},
	organization = {IEEE},
	pages = {349--359},
	title = {Dependency versioning in the wild},
	year = {2019}
}

@inproceedings{xu2019meditor,
  title={Meditor: inference and application of API migration edits},
  author={Xu, Shengzhe and Dong, Ziqi and Meng, Na},
  booktitle={2019 IEEE/ACM 27th International Conference on Program Comprehension (ICPC)},
  pages={335--346},
  year={2019},
  organization={IEEE}
}

@article{ouni2017search,
  title={Search-based software library recommendation using multi-objective optimization},
  author={Ouni, Ali and Kula, Raula Gaikovina and Kessentini, Marouane and Ishio, Takashi and German, Daniel M and Inoue, Katsuro},
  journal={Information and Software Technology},
  volume={83},
  pages={55--75},
  year={2017},
  publisher={Elsevier}
}

@inproceedings{sun2020req2lib,
  title={Req2Lib: A Semantic Neural Model for Software Library Recommendation},
  author={Sun, Zhensu and Liu, Yan and Cheng, Ziming and Yang, Chen and Che, Pengyu},
  booktitle={2020 IEEE 27th International Conference on Software Analysis, Evolution and Reengineering (SANER)},
  pages={542--546},
  year={2020},
  organization={IEEE}
}

@article{nguyen2020crossrec,
  title={CrossRec: Supporting software developers by recommending third-party libraries},
  author={Nguyen, Phuong T and Di Rocco, Juri and Di Ruscio, Davide and Di Penta, Massimiliano},
  journal={Journal of Systems and Software},
  volume={161},
  pages={110460},
  year={2020},
  publisher={Elsevier}
}

@inproceedings{nguyen2018mining,
  title={Mining Software Repositories to Support OSS Developers: A Recommender Systems Approach.},
  author={Nguyen, Phuong T and Di Rocco, Juri and Di Ruscio, Davide},
  booktitle={IIR},
  year={2018}
}

@article{nguyen2021recommending,
  title={Recommending Third-party Library Updates with LSTM Neural Networks},
  author={Nguyen, Phuong T and Di Rocco, Juri and Rubei, Riccardo and Di Sipio, Claudio and Di Ruscio, Davide},
  year={2021}
}

@article{he2020diversified,
  title={Diversified third-party library prediction for mobile app development},
  author={He, Qiang and Li, Bo and Chen, Feifei and Grundy, John and Xia, Xin and Yang, Yun},
  journal={IEEE Transactions on Software Engineering},
  year={2020},
  publisher={IEEE}
}

@article{li2023starcoder,
      title={StarCoder: may the source be with you!},
      author={Raymond Li and Loubna Ben Allal and Yangtian Zi and Niklas Muennighoff and Denis Kocetkov and Chenghao Mou and Marc Marone and Christopher Akiki and Jia Li and Jenny Chim and Qian Liu and Evgenii Zheltonozhskii and Terry Yue Zhuo and Thomas Wang and Olivier Dehaene and Mishig Davaadorj and Joel Lamy-Poirier and João Monteiro and Oleh Shliazhko and Nicolas Gontier and Nicholas Meade and Armel Zebaze and Ming-Ho Yee and Logesh Kumar Umapathi and Jian Zhu and Benjamin Lipkin and Muhtasham Oblokulov and Zhiruo Wang and Rudra Murthy and Jason Stillerman and Siva Sankalp Patel and Dmitry Abulkhanov and Marco Zocca and Manan Dey and Zhihan Zhang and Nour Fahmy and Urvashi Bhattacharyya and Wenhao Yu and Swayam Singh and Sasha Luccioni and Paulo Villegas and Maxim Kunakov and Fedor Zhdanov and Manuel Romero and Tony Lee and Nadav Timor and Jennifer Ding and Claire Schlesinger and Hailey Schoelkopf and Jan Ebert and Tri Dao and Mayank Mishra and Alex Gu and Jennifer Robinson and Carolyn Jane Anderson and Brendan Dolan-Gavitt and Danish Contractor and Siva Reddy and Daniel Fried and Dzmitry Bahdanau and Yacine Jernite and Carlos Muñoz Ferrandis and Sean Hughes and Thomas Wolf and Arjun Guha and Leandro von Werra and Harm de Vries},
      year={2023},
      eprint={2305.06161},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@article{mbpp,
  author       = {Jacob Austin and
                  Augustus Odena and
                  Maxwell I. Nye and
                  Maarten Bosma and
                  Henryk Michalewski and
                  David Dohan and
                  Ellen Jiang and
                  Carrie J. Cai and
                  Michael Terry and
                  Quoc V. Le and
                  Charles Sutton},
  title        = {Program Synthesis with Large Language Models},
  journal      = {CoRR},
  volume       = {abs/2108.07732},
  year         = {2021},
  url          = {https://arxiv.org/abs/2108.07732},
  eprinttype    = {arXiv},
  eprint       = {2108.07732},
  timestamp    = {Fri, 29 Apr 2022 17:42:58 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2108-07732.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@misc{lai2022ds1000,
      title={DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation},
      author={Yuhang Lai and Chengxi Li and Yiming Wang and Tianyi Zhang and Ruiqi Zhong and Luke Zettlemoyer and Scott Wen-tau Yih and Daniel Fried and Sida Wang and Tao Yu},
      year={2022},
      eprint={2211.11501},
      archivePrefix={arXiv},
      primaryClass={cs.SE}
}

@ARTICLE{multiple,
  author={Cassano, Federico and Gouwar, John and Nguyen, Daniel and Nguyen, Sydney and Phipps-Costin, Luna and Pinckney, Donald and Yee, Ming-Ho and Zi, Yangtian and Anderson, Carolyn Jane and Feldman, Molly Q and Guha, Arjun and Greenberg, Michael and Jangda, Abhinav},
  journal={IEEE Transactions on Software Engineering},
  title={MultiPL-E: A Scalable and Polyglot Approach to Benchmarking Neural Code Generation},
  year={2023},
  volume={49},
  number={7},
  pages={3675-3691},
  doi={10.1109/TSE.2023.3267446}
}

@article{lora,
  author       = {Edward J. Hu and
                  Yelong Shen and
                  Phillip Wallis and
                  Zeyuan Allen{-}Zhu and
                  Yuanzhi Li and
                  Shean Wang and
                  Weizhu Chen},
  title        = {LoRA: Low-Rank Adaptation of Large Language Models},
  journal      = {CoRR},
  volume       = {abs/2106.09685},
  year         = {2021},
  url          = {https://arxiv.org/abs/2106.09685},
  eprinttype    = {arXiv},
  eprint       = {2106.09685},
  timestamp    = {Tue, 29 Jun 2021 16:55:04 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2106-09685.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{gao2022pal,
  title={PAL: Program-aided Language Models},
  author={Gao, Luyu and Madaan, Aman and Zhou, Shuyan and Alon, Uri and Liu, Pengfei and Yang, Yiming and Callan, Jamie and Neubig, Graham},
  journal={arXiv preprint arXiv:2211.10435},
  year={2022}
}

@article{cobbe2021gsm8k,
  title={Training Verifiers to Solve Math Word Problems},
  author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
  journal={arXiv preprint arXiv:2110.14168},
  year={2021}
}

@techreport{dicosmo:edos,
	author = {Roberto {Di Cosmo}},
	date-added = {2021-11-27 15:58:40 -0800},
	date-modified = {2021-11-27 16:25:52 -0800},
	institution = {{INRIA}},
	month = {May 15},
	note = {hal-00697463},
	title = {{EDOS deliverable WP2-D2.1: Report on Formal Management of Software Dependencies}},
	year = {2005}
}

@inproceedings{mancinelli+:ase06-foss-distros,
	author = {F. {Mancinelli} and J. {Boender} and R. {di Cosmo} and J. {Vouillon} and B. {Durak} and X. {Leroy} and R. {Treinen}},
	booktitle = {21st IEEE/ACM International Conference on Automated Software Engineering (ASE'06)},
	date-added = {2021-11-28 07:41:31 -0800},
	date-modified = {2021-11-28 07:41:31 -0800},
	pages = {199-208},
	title = {Managing the Complexity of Large Free and Open Source Package-Based Software Distributions},
	year = {2006}
}

@misc{pip-new-resolver,
	author = {{Python Software Foundation}},
	date-added = {2021-11-28 07:08:20 -0800},
	date-modified = {2021-11-28 07:09:12 -0800},
	howpublished = {Online},
	month = {March 23},
	note = {https://pyfound.blogspot.com/2020/03/new-pip-resolver-to-roll-out-this-year.html},
	title = {{New pip resolver to roll out this year}},
	year = {2020}
}

@article{scikit-learn,
 title={Scikit-learn: Machine Learning in {P}ython},
 author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
 journal={Journal of Machine Learning Research},
 volume={12},
 pages={2825--2830},
 year={2011}
}

@article{yu2023codereval,
  title={CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models},
  author={Yu, Hao and Shen, Bo and Ran, Dezhi and Zhang, Jiaxin and Zhang, Qi and Ma, Yuchi and Liang, Guangtai and Li, Ying and Xie, Tao and Wang, Qianxiang},
  journal={arXiv preprint arXiv:2302.00288},
  year={2023}
}

@misc{liu2023verilogeval,
      title={VerilogEval: Evaluating Large Language Models for Verilog Code Generation},
      author={Mingjie Liu and Nathaniel Pinckney and Brucek Khailany and Haoxing Ren},
      year={2023},
      eprint={2309.07544},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{du2023classeval,
      title={ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation},
      author={Xueying Du and Mingwei Liu and Kaixin Wang and Hanlin Wang and Junwei Liu and Yixuan Chen and Jiayi Feng and Chaofeng Sha and Xin Peng and Yiling Lou},
      year={2023},
      eprint={2308.01861},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{tang2023biocoder,
      title={BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual Pragmatic Knowledge},
      author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein},
      year={2023},
      eprint={2308.16458},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{kadosh2023scope,
      title={Scope is all you need: Transforming LLMs for HPC Code},
      author={Tal Kadosh and Niranjan Hasabnis and Vy A. Vo and Nadav Schneider and Neva Krien and Abdul Wasay and Nesreen Ahmed and Ted Willke and Guy Tamir and Yuval Pinter and Timothy Mattson and Gal Oren},
      year={2023},
      eprint={2308.09440},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{kadosh2023quantifying,
      title={Quantifying OpenMP: Statistical Insights into Usage and Adoption},
      author={Tal Kadosh and Niranjan Hasabnis and Timothy Mattson and Yuval Pinter and Gal Oren},
      year={2023},
      eprint={2308.08002},
      archivePrefix={arXiv},
      primaryClass={cs.DC}
}

@misc{munley2023llm4vv,
      title={LLM4VV: Developing LLM-Driven Testsuite for Compiler Validation},
      author={Christian Munley and Aaron Jarmusch and Sunita Chandrasekaran},
      year={2023},
      eprint={2310.04963},
      archivePrefix={arXiv},
      primaryClass={cs.AI}
}

@misc{chen2023data,
      title={Data Race Detection Using Large Language Models},
      author={Le Chen and Xianzhong Ding and Murali Emani and Tristan Vanderbruggen and Pei-hung Lin and Chuanhua Liao},
      year={2023},
      eprint={2308.07505},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@InProceedings{chen2023lm4hpc,
  author="Chen, Le
  and Lin, Pei-Hung
  and Vanderbruggen, Tristan
  and Liao, Chunhua
  and Emani, Murali
  and de Supinski, Bronis",
  editor="McIntosh-Smith, Simon
  and Klemm, Michael
  and de Supinski, Bronis R.
  and Deakin, Tom
  and Klinkenberg, Jannis",
  title="LM4HPC: Towards Effective Language Model Application in High-Performance Computing",
  booktitle="OpenMP: Advanced Task-Based, Device and Compiler Programming",
  year="2023",
  publisher="Springer Nature Switzerland",
  address="Cham",
  pages="18--33",
  abstract="In recent years, language models (LMs), such as GPT-4, have been widely used in multiple domains, including natural language processing, visualization, and so on. However, applying them for analyzing and optimizing high-performance computing (HPC) software is still challenging due to the lack of HPC-specific support. In this paper, we design the LM4HPC framework to facilitate the research and development of HPC software analyses and optimizations using LMs. Tailored for supporting HPC datasets, AI models, and pipelines, our framework is built on top of a range of components from different levels of the machine learning software stack, with Hugging Face-compatible APIs. Using three representative tasks, we evaluated the prototype of our framework. The results show that LM4HPC can help users quickly evaluate a set of state-of-the-art models and generate insightful leaderboards.",
  isbn="978-3-031-40744-4"
}

@INPROCEEDINGS{azad:msr2023,
  author={Kalam Azad, Md Abul and Iqbal, Nafees and Hassan, Foyzul and Roy, Probir},
  booktitle={2023 IEEE/ACM 20th International Conference on Mining Software Repositories (MSR)},
  title={An Empirical Study of High Performance Computing (HPC) Performance Bugs},
  year={2023},
  volume={},
  number={},
  pages={194-206},
  doi={10.1109/MSR59073.2023.00037}
}

@article{Kocetkov2022TheStack,
  title={The Stack: 3 TB of permissively licensed source code},
  author={Kocetkov, Denis and Li, Raymond and Ben Allal, Loubna and Li, Jia and Mou,Chenghao and Muñoz Ferrandis, Carlos and Jernite, Yacine and Mitchell, Margaret and Hughes, Sean and Wolf, Thomas and Bahdanau, Dzmitry and von Werra, Leandro and de Vries, Harm},
  journal={Preprint},
  year={2022}
}

@misc{roziere2023code,
      title={Code Llama: Open Foundation Models for Code},
      author={Baptiste Rozière and Jonas Gehring and Fabian Gloeckle and Sten Sootla and Itai Gat and Xiaoqing Ellen Tan and Yossi Adi and Jingyu Liu and Tal Remez and Jérémy Rapin and Artyom Kozhevnikov and Ivan Evtimov and Joanna Bitton and Manish Bhatt and Cristian Canton Ferrer and Aaron Grattafiori and Wenhan Xiong and Alexandre Défossez and Jade Copet and Faisal Azhar and Hugo Touvron and Louis Martin and Nicolas Usunier and Thomas Scialom and Gabriel Synnaeve},
      year={2023},
      eprint={2308.12950},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@inproceedings{holtzman:iclr2020,
  title={The Curious Case of Neural Text Degeneration},
  author={Ari Holtzman and Jan Buys and Li Du and Maxwell Forbes and Yejin Choi},
  booktitle={International Conference on Learning Representations},
  year={2020},
  url={https://openreview.net/forum?id=rygGQyrFvH}
}

@book{snir1998mpi,
  title={MPI--the Complete Reference: The MPI core},
  author={Snir, M.},
  isbn={9780262692151},
  lccn={98025604},
  series={MPI: The Complete Reference},
  url={https://books.google.com/books?id=x79puJ2YkroC},
  year={1998},
  publisher={Mass}
}

@misc{cuda,
  author={NVIDIA and Vingelmann, Péter and Fitzek, Frank H.P.},
  title={CUDA, release: 10.2.89},
  year={2020},
  url={https://developer.nvidia.com/cuda-toolkit},
}

@misc{hip,
  title={HIP Documentation},
  year={2023},
  url={https://rocm.docs.amd.com/projects/HIP/en/latest/}
}

@misc{bigcode_leaderboard,
  url={https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard},
  title={Big Code Models Leaderboard - a Hugging Face Space by bigcode},
  year={2023},
  publisher={BigCode}
}

@misc{zeroshotreplication,
  title = {Zero-Shot Replication Framework},
  year = {2023},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/emrgnt-cmplxty/zero-shot-replication}},
  commit = {fb1b57256b3041d315f91ee12167a72df51dd0ac}
}

@software{together2023redpajama,
  author = {Together Computer},
  title = {RedPajama: an Open Dataset for Training Large Language Models},
  month = {October},
  year = 2023,
  url = {https://github.com/togethercomputer/RedPajama-Data}
}

@online{MosaicML2023Introducing,
    author    = {MosaicML NLP Team},
    title     = {Introducing MPT-7B: A New Standard for Open-Source,
    Commercially Usable LLMs},
    year      = {2023},
    url       = {www.mosaicml.com/blog/mpt-7b},
    note      = {Accessed: 2023-05-05},
    urldate   = {2023-05-05}
}

@article{allal2023santacoder,
  title={SantaCoder: don't reach for the stars!},
  author={Allal, Loubna Ben and Li, Raymond and Kocetkov, Denis and Mou, Chenghao and Akiki, Christopher and Ferrandis, Carlos Munoz and Muennighoff, Niklas and Mishra, Mayank and Gu, Alex and Dey, Manan and others},
  journal={arXiv preprint arXiv:2301.03988},
  year={2023}
}

@misc{openai2023gpt4,
      title={GPT-4 Technical Report},
      author={OpenAI},
      year={2023},
      eprint={2303.08774},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@online{replit3b1_5,
  author = {Replit},
  title = {Replit Code V-1.5 3B},
  year = 2023,
  url = {https://huggingface.co/replit/replit-code-v1_5-3b}
}

@online{phind-codellama-34b-v2,
  author = {Phind},
  title = {Phind-CodeLlama-34B-v2},
  year = 2023,
  url = {https://huggingface.co/Phind/Phind-CodeLlama-34B-v2}
}

@inproceedings{le-zuidema-2016-quantifying,
    title = "Quantifying the Vanishing Gradient and Long Distance Dependency Problem in Recursive Neural Networks and Recursive {LSTM}s",
    author = "Le, Phong  and
      Zuidema, Willem",
    editor = "Blunsom, Phil  and
      Cho, Kyunghyun  and
      Cohen, Shay  and
      Grefenstette, Edward  and
      Hermann, Karl Moritz  and
      Rimell, Laura  and
      Weston, Jason  and
      Yih, Scott Wen-tau",
    booktitle = "Proceedings of the 1st Workshop on Representation Learning for {NLP}",
    month = aug,
    year = "2016",
    address = "Berlin, Germany",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/W16-1610",
    doi = "10.18653/v1/W16-1610",
    pages = "87--93",
}

@inproceedings{huggingface,
  author = {Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Perric and Ma, Clara and Jernite, Yacine and Plu, Julien and Xu, Canwen and Le Scao, Teven and Gugger, Sylvain and Drame, Mariama and Lhoest, Quentin and Rush, Alexander M.},
  month = oct,
  pages = {38--45},
  publisher = {Association for Computational Linguistics},
  title = {{Transformers: State-of-the-Art Natural Language Processing}},
  url = {https://www.aclweb.org/anthology/2020.emnlp-demos.6},
  year = {2020}
}

@online{openai-api,
  author = {OpenAI},
  title = {OpenAI API},
  year = 2023,
  url = {https://platform.openai.com/docs/api-reference/}
}

@software{openai-api-python,
  author = {OpenAI},
  title = {OpenAI Python API library},
  year = 2023,
  url = {https://github.com/openai/openai-python}
}

@inproceedings{Cadena2016MultimodalAA,
  title={Multi-modal Auto-Encoders as Joint Estimators for Robotics Scene Understanding},
  author={C{\'e}sar Cadena and Anthony R. Dick and Ian D. Reid},
  booktitle={Robotics: Science and Systems},
  year={2016},
  url={https://api.semanticscholar.org/CorpusID:846232}
}

@article{geng2022multimodal,
  title={Multimodal Masked Autoencoders Learn Transferable Representations},
  author={Geng, Xinyang and Liu, Hao and Lee, Lisa and Schuurams, Dale and Levine, Sergey and Abbeel, Pieter},
  journal={arXiv preprint arXiv:2205.14204},
  year={2022}
}

@article{bachmann2022multimae,
  author    = {Roman Bachmann and David Mizrahi and Andrei Atanov and Amir Zamir},
  title     = {{MultiMAE}: Multi-modal Multi-task Masked Autoencoders},
  booktitle = {European Conference on Computer Vision},
  year      = {2022},
}

@inproceedings{cummins2021a,
  title={{ProGraML: A Graph-based Program Representation for Data Flow Analysis and Compiler Optimizations}},
  author={Cummins, Chris and Fisches, Zacharias and Ben-Nun, Tal and Hoefler, Torsten and O'Boyle, Michael and Leather, Hugh},
  booktitle = {Thirty-eighth International Conference on Machine Learning (ICML)},
  year={2021}
}

@inproceedings{menon2024buildcheck,
  title={Learning to Predict and Improve Build Successes in Package Ecosystems},
  author={Harshitha Menon and Daniel Nichols and Abhinav Bhatele and Todd Gamblin},
  booktitle={Mining Software Repositories (MSR)},
  year={2024}
}

@INPROCEEDINGS{he2021cvf,
  author={He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Dollár, Piotr and Girshick, Ross},
  booktitle={2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  title={Masked Autoencoders Are Scalable Vision Learners},
  year={2022},
  volume={},
  number={},
  pages={15979-15988},
  keywords={Training;Couplings;Computer vision;Computational modeling;Computer architecture;Data models;Pattern recognition;Representation learning; Self-& semi-& meta- & unsupervised learning},
  doi={10.1109/CVPR52688.2022.01553}
}

@inproceedings{girdhar2023imagebind,
  title={ImageBind: One Embedding Space To Bind Them All},
  author={Girdhar, Rohit and El-Nouby, Alaaeldin and Liu, Zhuang
and Singh, Mannat and Alwala, Kalyan Vasudev and Joulin, Armand and Misra, Ishan},
  booktitle={CVPR},
  year={2023}
}

@misc{zhu2023languagebind,
      title={LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment},
      author={Bin Zhu and Bin Lin and Munan Ning and Yang Yan and Jiaxi Cui and Wang HongFa and Yatian Pang and Wenhao Jiang and Junwu Zhang and Zongwei Li and Cai Wan Zhang and Zhifeng Li and Wei Liu and Li Yuan},
      year={2023},
      eprint={2310.01852},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{NEURIPS2022_b1efde53,
 author = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and Schulman, John and Hilton, Jacob and Kelton, Fraser and Miller, Luke and Simens, Maddie and Askell, Amanda and Welinder, Peter and Christiano, Paul F and Leike, Jan and Lowe, Ryan},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},
 pages = {27730--27744},
 publisher = {Curran Associates, Inc.},
 title = {Training language models to follow instructions with human feedback},
 url = {https://proceedings.neurips.cc/paper_files/paper/2022/file/b1efde53be364a73914f58805a001731-Paper-Conference.pdf},
 volume = {35},
 year = {2022}
}
@article{wei2023magicoder,
  title={Magicoder: Source Code Is All You Need},
  author={Wei, Yuxiang and Wang, Zhe and Liu, Jiawei and Ding, Yifeng and Zhang, Lingming},
  journal={arXiv preprint arXiv:2312.02120},
  year={2023}
}

@article{li2022competition,
  title={Competition-Level Code Generation with AlphaCode},
    author={Li, Yujia and Choi, David and Chung, Junyoung and Kushman, Nate and
    Schrittwieser, Julian and Leblond, R{\'e}mi and Eccles, Tom and
    Keeling, James and Gimeno, Felix and Dal Lago, Agustin and
    Hubert, Thomas and Choy, Peter and de Masson d'Autume, Cyprien and
    Babuschkin, Igor and Chen, Xinyun and Huang, Po-Sen and Welbl, Johannes and
    Gowal, Sven and Cherepanov, Alexey and Molloy, James and
    Mankowitz, Daniel and Sutherland Robson, Esme and Kohli, Pushmeet and
    de Freitas, Nando and Kavukcuoglu, Koray and Vinyals, Oriol},
  journal={arXiv preprint arXiv:2203.07814},
  year={2022}
}

@misc{zhao2023survey,
      title={A Survey of Large Language Models},
      author={Wayne Xin Zhao and Kun Zhou and Junyi Li and Tianyi Tang and Xiaolei Wang and Yupeng Hou and Yingqian Min and Beichen Zhang and Junjie Zhang and Zican Dong and Yifan Du and Chen Yang and Yushuo Chen and Zhipeng Chen and Jinhao Jiang and Ruiyang Ren and Yifan Li and Xinyu Tang and Zikang Liu and Peiyu Liu and Jian-Yun Nie and Ji-Rong Wen},
      year={2023},
      eprint={2303.18223},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@article{watts1998collective,
  title={Collective dynamics of ‘small-world’networks},
  author={Watts, Duncan J and Strogatz, Steven H},
  journal={nature},
  volume={393},
  number={6684},
  pages={440--442},
  year={1998},
  publisher={Nature Publishing Group}
}

@article{erdos1963asymmetric,
  title={Asymmetric graphs},
  author={Erdos, Paul and R{\'e}nyi, Alfr{\'e}d},
  journal={Acta Math. Acad. Sci. Hungar},
  volume={14},
  number={295-315},
  pages={15},
  year={1963}
}

@inproceedings{machi_scalable_2021,
	title = {Scalable {Epidemiological} {Workflows} to {Support} {COVID}-19 {Planning} and {Response}},
	doi = {10.1109/IPDPS49936.2021.00072},
	abstract = {The COVID-19 global outbreak represents the most significant epidemic event since the 1918 influenza pandemic. Simulations have played a crucial role in supporting COVID-19 planning and response efforts. Developing scalable workflows to provide policymakers quick responses to important questions pertaining to logistics, resource allocation, epidemic forecasts and intervention analysis remains a challenging computational problem. In this work, we present scalable high performance computing-enabled workflows for COVID-19 pandemic planning and response. The scalability of our methodology allows us to run fine-grained simulations daily, and to generate county-level forecasts and other counterfactual analysis for each of the 50 states (and DC), 3140 counties across the USA. Our workflows use a hybrid cloud/cluster system utilizing a combination of local and remote cluster computing facilities, and using over 20,000 CPU cores running for 6-9 hours every day to meet this objective. Our state (Virginia), state hospital network, our university, the DOD and the CDC use our models to guide their COVID-19 planning and response efforts. We began executing these pipelines March 25, 2020, and have delivered and briefed weekly updates to these stakeholders for over 30 weeks without interruption.},
	booktitle = {2021 {IEEE} {International} {Parallel} and {Distributed} {Processing} {Symposium} ({IPDPS})},
	author = {Machi, Dustin and Bhattacharya, Parantapa and Hoops, Stefan and Chen, Jiangzhuo and Mortveit, Henning and Venkatramanan, Srinivasan and Lewis, Bryan and Wilson, Mandy and Fadikar, Arindam and Maiden, Tom and Barrett, Christopher L. and Marathe, Madhav V.},
	month = may,
	year = {2021},
	note = {ISSN: 1530-2075},
	keywords = {Analytical models, Computational modeling, COVID-19, Scalability, Pandemics, Epidemic Modeling, HPC Workflow Development, Surveillance, Tools},
	pages = {639--650},
	file = {IEEE Xplore Abstract Record:C\:\\Users\\bitqu\\Zotero\\storage\\DPQG2HDV\\9460555.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\bitqu\\Zotero\\storage\\AFV3MQS8\\Machi et al. - 2021 - Scalable Epidemiological Workflows to Support COVI.pdf:application/pdf},
}

@article{truelove_projected_2022,
	title = {Projected resurgence of {COVID}-19 in the {United} {States} in {July}—{December} 2021 resulting from the increased transmissibility of the {Delta} variant and faltering vaccination},
	volume = {11},
	issn = {2050-084X},
	url = {https://doi.org/10.7554/eLife.73584},
	doi = {10.7554/eLife.73584},
	abstract = {In Spring 2021, the highly transmissible SARS-CoV-2 Delta variant began to cause increases in cases, hospitalizations, and deaths in parts of the United States. At the time, with slowed vaccination uptake, this novel variant was expected to increase the risk of pandemic resurgence in the US in summer and fall 2021. As part of the COVID-19 Scenario Modeling Hub, an ensemble of nine mechanistic models produced 6-month scenario projections for July–December 2021 for the United States. These projections estimated substantial resurgences of COVID-19 across the US resulting from the more transmissible Delta variant, projected to occur across most of the US, coinciding with school and business reopening. The scenarios revealed that reaching higher vaccine coverage in July–December 2021 reduced the size and duration of the projected resurgence substantially, with the expected impacts was largely concentrated in a subset of states with lower vaccination coverage. Despite accurate projection of COVID-19 surges occurring and timing, the magnitude was substantially underestimated 2021 by the models compared with the of the reported cases, hospitalizations, and deaths occurring during July–December, highlighting the continued challenges to predict the evolving COVID-19 pandemic. Vaccination uptake remains critical to limiting transmission and disease, particularly in states with lower vaccination coverage. Higher vaccination goals at the onset of the surge of the new variant were estimated to avert over 1.5 million cases and 21,000 deaths, although may have had even greater impacts, considering the underestimated resurgence magnitude from the model.},
	urldate = {2024-01-05},
	journal = {eLife},
	author = {Truelove, Shaun and Smith, Claire P and Qin, Michelle and Mullany, Luke C and Borchering, Rebecca K and Lessler, Justin and Shea, Katriona and Howerton, Emily and Contamin, Lucie and Levander, John and Kerr, Jessica and Hochheiser, Harry and Kinsey, Matt and Tallaksen, Kate and Wilson, Shelby and Shin, Lauren and Rainwater-Lovett, Kaitlin and Lemairtre, Joseph C and Dent, Juan and Kaminsky, Joshua and Lee, Elizabeth C and Perez-Saez, Javier and Hill, Alison and Karlen, Dean and Chinazzi, Matteo and Davis, Jessica T and Mu, Kunpeng and Xiong, Xinyue and Pastore y Piontti, Ana and Vespignani, Alessandro and Srivastava, Ajitesh and Porebski, Przemyslaw and Venkatramanan, Srinivasan and Adiga, Aniruddha and Lewis, Bryan and Klahn, Brian and Outten, Joseph and Orr, Mark and Harrison, Galen and Hurt, Benjamin and Chen, Jiangzhuo and Vullikanti, Anil and Marathe, Madhav and Hoops, Stefan and Bhattacharya, Parantapa and Machi, Dustin and Chen, Shi and Paul, Rajib and Janies, Daniel and Thill, Jean-Claude and Galanti, Marta and Yamana, Teresa K and Pei, Sen and Shaman, Jeffrey L and Healy, Jessica M and Slayton, Rachel B and Biggerstaff, Matthew and Johansson, Michael A and Runge, Michael C and Viboud, Cecile},
	editor = {Malagón, Talía and Franco, Eduardo and McVernon, Jodie},
	month = jun,
	year = {2022},
	note = {Publisher: eLife Sciences Publications, Ltd},
	keywords = {COVID-19, Delta variant, disease modeling, pandemic, SARS-CoV-2, scenario projection},
	pages = {e73584},
	file = {Full Text:C\:\\Users\\bitqu\\Zotero\\storage\\HL2J2GA7\\Truelove et al. - 2022 - Projected resurgence of COVID-19 in the United Sta.pdf:application/pdf},
}

@article{germann_mitigation_2006,
	title = {Mitigation strategies for pandemic influenza in the {United} {States}},
	volume = {103},
	url = {https://www.pnas.org/doi/full/10.1073/pnas.0601266103},
	doi = {10.1073/pnas.0601266103},
	abstract = {Recent human deaths due to infection by highly pathogenic (H5N1) avian influenza A virus have raised the specter of a devastating pandemic like that of 1917–1918, should this avian virus evolve to become readily transmissible among humans. We introduce and use a large-scale stochastic simulation model to investigate the spread of a pandemic strain of influenza virus through the U.S. population of 281 million individuals for R0 (the basic reproductive number) from 1.6 to 2.4. We model the impact that a variety of levels and combinations of influenza antiviral agents, vaccines, and modified social mobility (including school closure and travel restrictions) have on the timing and magnitude of this spread. Our simulations demonstrate that, in a highly mobile population, restricting travel after an outbreak is detected is likely to delay slightly the time course of the outbreak without impacting the eventual number ill. For R0 {\textless} 1.9, our model suggests that the rapid production and distribution of vaccines, even if poorly matched to circulating strains, could significantly slow disease spread and limit the number ill to {\textless}10\% of the population, particularly if children are preferentially vaccinated. Alternatively, the aggressive deployment of several million courses of influenza antiviral agents in a targeted prophylaxis strategy may contain a nascent outbreak with low R0, provided adequate contact tracing and distribution capacities exist. For higher R0, we predict that multiple strategies in combination (involving both social and medical interventions) will be required to achieve similar limits on illness rates.},
	number = {15},
	urldate = {2023-10-09},
	journal = {Proceedings of the National Academy of Sciences},
	author = {Germann, Timothy C. and Kadau, Kai and Longini, Ira M. and Macken, Catherine A.},
	month = apr,
	year = {2006},
	note = {Publisher: Proceedings of the National Academy of Sciences},
	pages = {5935--5940},
	file = {Full Text PDF:C\:\\Users\\bitqu\\Zotero\\storage\\9D6HSYSF\\Germann et al. - 2006 - Mitigation strategies for pandemic influenza in th.pdf:application/pdf},
}

@article{bhattacharya_data-driven_2023,
	title = {Data-driven scalable pipeline using national agent-based models for real-time pandemic response and decision support},
	volume = {37},
	issn = {1094-3420},
	url = {https://doi.org/10.1177/10943420221127034},
	doi = {10.1177/10943420221127034},
	abstract = {This paper describes an integrated, data-driven operational pipeline based on national agent-based models to support federal and state-level pandemic planning and response. The pipeline consists of (i) an automatic semantic-aware scheduling method that coordinates jobs across two separate high performance computing systems; (ii) a data pipeline to collect, integrate and organize national and county-level disaggregated data for initialization and post-simulation analysis; (iii) a digital twin of national social contact networks made up of 288 Million individuals and 12.6 Billion time-varying interactions covering the US states and DC; (iv) an extension of a parallel agent-based simulation model to study epidemic dynamics and associated interventions. This pipeline can run 400 replicates of national runs in less than 33 h, and reduces the need for human intervention, resulting in faster turnaround times and higher reliability and accuracy of the results. Scientifically, the work has led to significant advances in real-time epidemic sciences.},
	language = {en},
	number = {1},
	urldate = {2023-07-28},
	journal = {The International Journal of High Performance Computing Applications},
	author = {Bhattacharya, Parantapa and Chen, Jiangzhuo and Hoops, Stefan and Machi, Dustin and Lewis, Bryan and Venkatramanan, Srinivasan and Wilson, Mandy L. and Klahn, Brian and Adiga, Aniruddha and Hurt, Benjamin and Outten, Joseph and Adiga, Abhijin and Warren, Andrew and Baek, Young Yun and Porebski, Przemyslaw and Marathe, Achla and Xie, Dawen and Swarup, Samarth and Vullikanti, Anil and Mortveit, Henning and Eubank, Stephen and Barrett, Christopher L. and Marathe, Madhav},
	month = jan,
	year = {2023},
	note = {Publisher: SAGE Publications Ltd STM},
	pages = {4--27},
	file = {SAGE PDF Full Text:C\:\\Users\\bitqu\\Zotero\\storage\\FIY92QUB\\Bhattacharya et al. - 2023 - Data-driven scalable pipeline using national agent.pdf:application/pdf},
}

@inproceedings{thicket,
author = {Brink, Stephanie and McKinsey, Michael and Boehme, David and Scully-Allison, Connor and Lumsden, Ian and Hawkins, Daryl and Burgess, Treece and Lama, Vanessa and L\"{u}ttgau, Jakob and Isaacs, Katherine E. and Taufer, Michela and Pearce, Olga},
title = {Thicket: Seeing the Performance Experiment Forest for the Individual Run Trees},
year = {2023},
isbn = {9798400701559},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3588195.3592989},
doi = {10.1145/3588195.3592989},
abstract = {Thicket is an open-source Python toolkit for Exploratory Data Analysis (EDA) of multi-run performance experiments. It enables an understanding of optimal performance configuration for large-scale application codes. Most performance tools focus on a single execution (e.g., single platform, single measurement tool, single scale). Thicket bridges the gap to convenient analysis in multi-dimensional, multi-scale, multi-architecture, and multi-tool performance datasets by providing an interface for interacting with the performance data. Thicket has a modular structure composed of three components. The first component is a data structure for multi-dimensional performance data, which is composed automatically on the portable basis of call trees, and accommodates any subset of dimensions present in the dataset. The second is the metadata, enabling distinction and sub-selection of dimensions in performance data. The third is a dimensionality reduction mechanism, enabling analysis such as computing aggregated statistics on a given data dimension. Extensible mechanisms are available for applying analyses (e.g., top-down on Intel CPUs), data science techniques (e.g., K-means clustering from scikit-learn), modeling performance (e.g., Extra-P), and interactive visualization. We demonstrate the power and flexibility of Thicket through two case studies, first with the open-source RAJA Performance Suite on CPU and GPU clusters and another with a large physics simulation run on both a traditional HPC cluster and an AWS Parallel Cluster instance.},
booktitle = {Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing},
pages = {281–293},
numpages = {13},
keywords = {hpc, exploratory data analysis, parallel profile, multi-dimensional, performance analysis},
location = {Orlando, FL, USA},
series = {HPDC '23}
}

@misc{census_census_nodate,
  author = {United States Census Bureau},
	title = {Census {Bureau} {Data}},
	url = {https://data.census.gov/},
	urldate = {2024-01-16},
	file = {Census Bureau Data:C\:\\Users\\bitqu\\Zotero\\storage\\LTGRWDXD\\data.census.gov.html:text/html},
}

@misc{fha_nhts_nodate,
  author = {United States Department of Transportation Federal Highway Administration},
	title = {{NHTS} {NextGen} {OD} {Data}},
	url = {https://nhts.ornl.gov/od/},
	urldate = {2024-01-16},
	file = {NHTS NextGen OD Data:C\:\\Users\\bitqu\\Zotero\\storage\\E7LZ2P6M\\od.html:text/html},
}

@misc{nces_electronic_nodate,
  author = {National Center for Educational Statistics},
	title = {Electronic {Catalog} of {NCES} {Products} ({National} {Center} for {Education} {Statistics}). {Publications} and data products.},
	url = {https://nces.ed.gov/datatools/index.asp?DataToolSectionID=1},
	abstract = {Product information and search tools. Find information, locate, learn how to order, and browse the content of NCES publications or download data files.},
	language = {EN},
	urldate = {2024-01-16},
	note = {Publisher: National Center for Education Statistics},
	file = {Snapshot:C\:\\Users\\bitqu\\Zotero\\storage\\MNTI5ELH\\index.html:text/html},
}

@misc{census_north_nodate,
  author = {United States Census Bureau},
	title = {North {American} {Industry} {Classification} {System} ({NAICS}) {U}.{S}. {Census} {Bureau}},
	url = {https://www.census.gov/naics/},
	urldate = {2024-01-16},
	file = {North American Industry Classification System (NAICS) U.S. Census Bureau:C\:\\Users\\bitqu\\Zotero\\storage\\TCS87CCE\\naics.html:text/html},
}

@article{chen_epihiper_2022,
	title = {{EpiHiper} - {A} {High} {Performance} {Computational} {Modeling} {Framework} to {Support} {Epidemic} {Science}},
	url = {https://biocomplexity.virginia.edu/system/files/publications/2022-12/EpiHiper-NCS-Manuscript-Mortveit-et-al-30-Nov-2022_0.pdf},
	abstract = {This paper describes EpiHiper, a state-of-the art high performance computational modeling framework supporting epidemic science. The EpiHiper modeling framework permits custom disease models and can simulate epidemics over dynamic, large-scale networks, while supporting modulation of the epidemic evolution through a set of user-programmable interventions. The nodes and edges of the social-contact network have customizable sets of static and dynamic attributes which allow the user to specify intervention target sets at a very fine-grained level; these also permit the network to be updated in response to non-pharmaceutical interventions such as school closures. The execution of interventions is governed by trigger conditions, Boolean expressions formed using any of EpiHiper’s primitives and sets. Rich expressiveness, extensibility and high performance computing performance were central design goals to ensure that the framework could effectively target realistic scenarios at the scale and detail required to support state and federal public health policymakers in their efforts to plan and respond in the event of epidemics. The modeling framework is currently being used to support the CDC scenario modeling hub for COVID-19 response. EpiHiper was a part of a hybrid high-performance cloud system that was nominated as a finalist for the 2021 ACM Gordon Bell Special Prize for HPC-based COVID-19 Research.},
	language = {en},
	urldate = {2024-01-17},
	author = {Chen, Jiangzhuo and Hoops, Stefan and Mortveit, Henning S and Machi, Dustin and Bhattacharya, Parantapa and Venkatramanan, Srini and Barrett, Chris L and Marathe, Madhav V},
	month = nov,
	year = {2022},
}

@misc{microsoft_microsoftusbuildingfootprints_2024,
  author = {Microsoft},
	title = {microsoft/{USBuildingFootprints}},
	url = {https://github.com/microsoft/USBuildingFootprints},
	abstract = {Computer generated building footprints for the United States},
	urldate = {2024-01-17},
	publisher = {Microsoft},
	month = jan,
	year = {2024},
	note = {original-date: 2018-06-13T18:31:31Z},
}
@inproceedings{wang2023overlap,
author = {Wang, Shibo and Wei, Jinliang and Sabne, Amit and Davis, Andy and Ilbeyi, Berkin and Hechtman, Blake and Chen, Dehao and Murthy, Karthik Srinivasa and Maggioni, Marcello and Zhang, Qiao and Kumar, Sameer and Guo, Tongfei and Xu, Yuanzhong and Zhou, Zongwei},
title = {Overlap Communication with Dependent Computation via Decomposition in Large Deep Learning Models},
year = {2022},
isbn = {9781450399159},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3567955.3567959},
doi = {10.1145/3567955.3567959},
abstract = {Large deep learning models have shown great potential with state-of-the-art results in many tasks. However, running these large models is quite challenging on an accelerator (GPU or TPU) because the on-device memory is too limited for the size of these models. Intra-layer model parallelism is an approach to address the issues by partitioning individual layers or operators across multiple devices in a distributed accelerator cluster. But, the data communications generated by intra-layer model parallelism can contribute to a significant proportion of the overall execution time and severely hurt the computational efficiency. As intra-layer model parallelism is critical to enable large deep learning models, this paper proposes a novel technique to effectively reduce its data communication overheads by overlapping communication with computation. With the proposed technique, an identified original communication collective is decomposed along with the dependent computation operation into a sequence of finer-grained operations. By creating more overlapping opportunities and executing the newly created, finer-grained communication and computation operations in parallel, it effectively hides the data transfer latency and achieves a better system utilization. Evaluated on TPU v4 Pods using different types of large models that have 10 billion to 1 trillion parameters, the proposed technique improves system throughput by 1.14 - 1.38x. The achieved highest peak FLOPS utilization is 72\% on 1024 TPU chips with a large language model that has 500 billion parameters.},
booktitle = {Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1},
pages = {93–106},
numpages = {14},
keywords = {Collective communication hiding, Compiler optimization, Large scale machine learning},
location = {Vancouver, BC, Canada},
series = {ASPLOS 2023}
}

@article{merak,
  author={Lai, Zhiquan and Li, Shengwei and Tang, Xudong and Ge, Keshi and Liu, Weijie and Duan, Yabo and Qiao, Linbo and Li, Dongsheng},
  journal={IEEE Transactions on Parallel and Distributed Systems},
  title={Merak: An Efficient Distributed DNN Training Framework With Automated 3D Parallelism for Giant Foundation Models},
  year={2023},
  volume={34},
  number={5},
  pages={1466-1478},
  doi={10.1109/TPDS.2023.3247001}}

@article{li2023automated-oases,
  title={Automated Tensor Model Parallelism with Overlapped Communication for Efficient Foundation Model Training},
  author={Li, Shengwei and Lai, Zhiquan and Hao, Yanqi and Liu, Weijie and Ge, Keshi and Deng, Xiaoge and Li, Dongsheng and Lu, Kai},
  journal={arXiv preprint arXiv:2305.16121},
  year={2023}
}

@article{cheng2023atp,
  title={ATP: Adaptive Tensor Parallelism for Foundation Models},
  author={Cheng, Shenggan and Liu, Ziming and Du, Jiangsu and You, Yang},
  journal={arXiv preprint arXiv:2301.08658},
  year={2023}
}

@inproceedings{tripathy2020reducing,
  title={Reducing communication in graph neural network training},
  author={Tripathy, Alok and Yelick, Katherine and Bulu{\c{c}}, Ayd{\i}n},
  booktitle={SC20: International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages={1--14},
  year={2020},
  organization={IEEE}
}

@misc{winget,
  author = {Microsoft},
  title = {WinGet},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/microsoft/winget-cli}}
}

@article{mcintosh2015high,
  title={High performance in silico virtual drug screening on many-core processors},
  author={McIntosh-Smith, Simon and Price, James and Sessions, Richard B and Ibarra, Amaurys A},
  journal={The international journal of high performance computing applications},
  volume={29},
  number={2},
  pages={119--134},
  year={2015},
  publisher={SAGE Publications Sage UK: London, England}
}

@article{lin2024preliminary,
  title={Preliminary report: Initial evaluation of StdPar implementations on AMD GPUs for HPC},
  author={Lin, Wei-Chen and McIntosh-Smith, Simon and Deakin, Tom},
  journal={arXiv preprint arXiv:2401.02680},
  year={2024}
}

@inproceedings{rangel2023performance,
  title={A Performance-Portable SYCL Implementation of CRK-HACC for Exascale},
  author={Rangel, Esteban Miguel and Pennycook, Simon John and Pope, Adrian and Frontiere, Nicholas and Ma, Zhiqiang and Madananth, Varsha},
  booktitle={Proceedings of the SC'23 Workshops of The International Conference on High Performance Computing, Network, Storage, and Analysis},
  pages={1114--1125},
  year={2023}
}

@book{knuth2015art,
  author = {Knuth, Donald E.},
  title = {The Art of Computer Programming, Volume 4, Fascicle 6: Satisfiability},
  year = {2015},
  isbn = {0134397606},
  publisher = {Addison-Wesley Professional},
  edition = {1st},
}

@misc{zheng2023judging,
      title={Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena}, 
      author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
      year={2023},
      eprint={2306.05685},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@article{Gilardi_2023,
   title={ChatGPT outperforms crowd workers for text-annotation tasks},
   volume={120},
   ISSN={1091-6490},
   url={http://dx.doi.org/10.1073/pnas.2305016120},
   DOI={10.1073/pnas.2305016120},
   number={30},
   journal={Proceedings of the National Academy of Sciences},
   publisher={Proceedings of the National Academy of Sciences},
   author={Gilardi, Fabrizio and Alizadeh, Meysam and Kubli, Maël},
   year={2023},
   month=jul 
}

@misc{he2023annollm,
      title={AnnoLLM: Making Large Language Models to Be Better Crowdsourced Annotators}, 
      author={Xingwei He and Zhenghao Lin and Yeyun Gong and A-Long Jin and Hang Zhang and Chen Lin and Jian Jiao and Siu Ming Yiu and Nan Duan and Weizhu Chen},
      year={2023},
      eprint={2303.16854},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@software{benallal2024cosmopedia,
  author = {Ben Allal, Loubna and Lozhkov, Anton and Penedo, Guilherme and Wolf, Thomas and von Werra, Leandro},
  title = {Cosmopedia},
  month = February,
  year = 2024,
  url = {https://huggingface.co/datasets/HuggingFaceTB/cosmopedia}
}

@misc{geminiteam2023gemini,
      title={Gemini: A Family of Highly Capable Multimodal Models}, 
      author={Gemini Team and Rohan Anil and Sebastian Borgeaud and Yonghui Wu and Jean-Baptiste Alayrac and Jiahui Yu and Radu Soricut and Johan Schalkwyk and Andrew M. Dai and Anja Hauth and Katie Millican and David Silver and Slav Petrov and Melvin Johnson and Ioannis Antonoglou and Julian Schrittwieser and Amelia Glaese and Jilin Chen and Emily Pitler and Timothy Lillicrap and Angeliki Lazaridou and Orhan Firat and James Molloy and Michael Isard and Paul R. Barham and Tom Hennigan and Benjamin Lee and Fabio Viola and Malcolm Reynolds and Yuanzhong Xu and Ryan Doherty and Eli Collins and Clemens Meyer and Eliza Rutherford and Erica Moreira and Kareem Ayoub and Megha Goel and George Tucker and Enrique Piqueras and Maxim Krikun and Iain Barr and Nikolay Savinov and Ivo Danihelka and Becca Roelofs and Anaïs White and Anders Andreassen and Tamara von Glehn and Lakshman Yagati and Mehran Kazemi and Lucas Gonzalez and Misha Khalman and Jakub Sygnowski and Alexandre Frechette and Charlotte Smith and Laura Culp and Lev Proleev and Yi Luan and Xi Chen and James Lottes and Nathan Schucher and Federico Lebron and Alban Rrustemi and Natalie Clay and Phil Crone and Tomas Kocisky and Jeffrey Zhao and Bartek Perz and Dian Yu and Heidi Howard and Adam Bloniarz and Jack W. Rae and Han Lu and Laurent Sifre and Marcello Maggioni and Fred Alcober and Dan Garrette and Megan Barnes and Shantanu Thakoor and Jacob Austin and Gabriel Barth-Maron and William Wong and Rishabh Joshi and Rahma Chaabouni and Deeni Fatiha and Arun Ahuja and Ruibo Liu and Yunxuan Li and Sarah Cogan and Jeremy Chen and Chao Jia and Chenjie Gu and Qiao Zhang and Jordan Grimstad and Ale Jakse Hartman and Martin Chadwick and Gaurav Singh Tomar and Xavier Garcia and Evan Senter and Emanuel Taropa and Thanumalayan Sankaranarayana Pillai and Jacob Devlin and Michael Laskin and Diego de Las Casas and Dasha Valter and Connie Tao and Lorenzo Blanco and Adrià Puigdomènech Badia and David Reitter and Mianna Chen and Jenny Brennan and Clara Rivera and Sergey Brin and Shariq Iqbal and Gabriela Surita and Jane Labanowski and Abhi Rao and Stephanie Winkler and Emilio Parisotto and Yiming Gu and Kate Olszewska and Yujing Zhang and Ravi Addanki and Antoine Miech and Annie Louis and Laurent El Shafey and Denis Teplyashin and Geoff Brown and Elliot Catt and Nithya Attaluri and Jan Balaguer and Jackie Xiang and Pidong Wang and Zoe Ashwood and Anton Briukhov and Albert Webson and Sanjay Ganapathy and Smit Sanghavi and Ajay Kannan and Ming-Wei Chang and Axel Stjerngren and Josip Djolonga and Yuting Sun and Ankur Bapna and Matthew Aitchison and Pedram Pejman and Henryk Michalewski and Tianhe Yu and Cindy Wang and Juliette Love and Junwhan Ahn and Dawn Bloxwich and Kehang Han and Peter Humphreys and Thibault Sellam and James Bradbury and Varun Godbole and Sina Samangooei and Bogdan Damoc and Alex Kaskasoli and Sébastien M. R. Arnold and Vijay Vasudevan and Shubham Agrawal and Jason Riesa and Dmitry Lepikhin and Richard Tanburn and Srivatsan Srinivasan and Hyeontaek Lim and Sarah Hodkinson and Pranav Shyam and Johan Ferret and Steven Hand and Ankush Garg and Tom Le Paine and Jian Li and Yujia Li and Minh Giang and Alexander Neitz and Zaheer Abbas and Sarah York and Machel Reid and Elizabeth Cole and Aakanksha Chowdhery and Dipanjan Das and Dominika Rogozińska and Vitaly Nikolaev and Pablo Sprechmann and Zachary Nado and Lukas Zilka and Flavien Prost and Luheng He and Marianne Monteiro and Gaurav Mishra and Chris Welty and Josh Newlan and Dawei Jia and Miltiadis Allamanis and Clara Huiyi Hu and Raoul de Liedekerke and Justin Gilmer and Carl Saroufim and Shruti Rijhwani and Shaobo Hou and Disha Shrivastava and Anirudh Baddepudi and Alex Goldin and Adnan Ozturel and Albin Cassirer and Yunhan Xu and Daniel Sohn and Devendra Sachan and Reinald Kim Amplayo and Craig Swanson and Dessie Petrova and Shashi Narayan and Arthur Guez and Siddhartha Brahma and Jessica Landon and Miteyan Patel and Ruizhe Zhao and Kevin Villela and Luyu Wang and Wenhao Jia and Matthew Rahtz and Mai Giménez and Legg Yeung and Hanzhao Lin and James Keeling and Petko Georgiev and Diana Mincu and Boxi Wu and Salem Haykal and Rachel Saputro and Kiran Vodrahalli and James Qin and Zeynep Cankara and Abhanshu Sharma and Nick Fernando and Will Hawkins and Behnam Neyshabur and Solomon Kim and Adrian Hutter and Priyanka Agrawal and Alex Castro-Ros and George van den Driessche and Tao Wang and Fan Yang and Shuo-yiin Chang and Paul Komarek and Ross McIlroy and Mario Lučić and Guodong Zhang and Wael Farhan and Michael Sharman and Paul Natsev and Paul Michel and Yong Cheng and Yamini Bansal and Siyuan Qiao and Kris Cao and Siamak Shakeri and Christina Butterfield and Justin Chung and Paul Kishan Rubenstein and Shivani Agrawal and Arthur Mensch and Kedar Soparkar and Karel Lenc and Timothy Chung and Aedan Pope and Loren Maggiore and Jackie Kay and Priya Jhakra and Shibo Wang and Joshua Maynez and Mary Phuong and Taylor Tobin and Andrea Tacchetti and Maja Trebacz and Kevin Robinson and Yash Katariya and Sebastian Riedel and Paige Bailey and Kefan Xiao and Nimesh Ghelani and Lora Aroyo and Ambrose Slone and Neil Houlsby and Xuehan Xiong and Zhen Yang and Elena Gribovskaya and Jonas Adler and Mateo Wirth and Lisa Lee and Music Li and Thais Kagohara and Jay Pavagadhi and Sophie Bridgers and Anna Bortsova and Sanjay Ghemawat and Zafarali Ahmed and Tianqi Liu and Richard Powell and Vijay Bolina and Mariko Iinuma and Polina Zablotskaia and James Besley and Da-Woon Chung and Timothy Dozat and Ramona Comanescu and Xiance Si and Jeremy Greer and Guolong Su and Martin Polacek and Raphaël Lopez Kaufman and Simon Tokumine and Hexiang Hu and Elena Buchatskaya and Yingjie Miao and Mohamed Elhawaty and Aditya Siddhant and Nenad Tomasev and Jinwei Xing and Christina Greer and Helen Miller and Shereen Ashraf and Aurko Roy and Zizhao Zhang and Ada Ma and Angelos Filos and Milos Besta and Rory Blevins and Ted Klimenko and Chih-Kuan Yeh and Soravit Changpinyo and Jiaqi Mu and Oscar Chang and Mantas Pajarskas and Carrie Muir and Vered Cohen and Charline Le Lan and Krishna Haridasan and Amit Marathe and Steven Hansen and Sholto Douglas and Rajkumar Samuel and Mingqiu Wang and Sophia Austin and Chang Lan and Jiepu Jiang and Justin Chiu and Jaime Alonso Lorenzo and Lars Lowe Sjösund and Sébastien Cevey and Zach Gleicher and Thi Avrahami and Anudhyan Boral and Hansa Srinivasan and Vittorio Selo and Rhys May and Konstantinos Aisopos and Léonard Hussenot and Livio Baldini Soares and Kate Baumli and Michael B. Chang and Adrià Recasens and Ben Caine and Alexander Pritzel and Filip Pavetic and Fabio Pardo and Anita Gergely and Justin Frye and Vinay Ramasesh and Dan Horgan and Kartikeya Badola and Nora Kassner and Subhrajit Roy and Ethan Dyer and Víctor Campos and Alex Tomala and Yunhao Tang and Dalia El Badawy and Elspeth White and Basil Mustafa and Oran Lang and Abhishek Jindal and Sharad Vikram and Zhitao Gong and Sergi Caelles and Ross Hemsley and Gregory Thornton and Fangxiaoyu Feng and Wojciech Stokowiec and Ce Zheng and Phoebe Thacker and Çağlar Ünlü and Zhishuai Zhang and Mohammad Saleh and James Svensson and Max Bileschi and Piyush Patil and Ankesh Anand and Roman Ring and Katerina Tsihlas and Arpi Vezer and Marco Selvi and Toby Shevlane and Mikel Rodriguez and Tom Kwiatkowski and Samira Daruki and Keran Rong and Allan Dafoe and Nicholas FitzGerald and Keren Gu-Lemberg and Mina Khan and Lisa Anne Hendricks and Marie Pellat and Vladimir Feinberg and James Cobon-Kerr and Tara Sainath and Maribeth Rauh and Sayed Hadi Hashemi and Richard Ives and Yana Hasson and YaGuang Li and Eric Noland and Yuan Cao and Nathan Byrd and Le Hou and Qingze Wang and Thibault Sottiaux and Michela Paganini and Jean-Baptiste Lespiau and Alexandre Moufarek and Samer Hassan and Kaushik Shivakumar and Joost van Amersfoort and Amol Mandhane and Pratik Joshi and Anirudh Goyal and Matthew Tung and Andrew Brock and Hannah Sheahan and Vedant Misra and Cheng Li and Nemanja Rakićević and Mostafa Dehghani and Fangyu Liu and Sid Mittal and Junhyuk Oh and Seb Noury and Eren Sezener and Fantine Huot and Matthew Lamm and Nicola De Cao and Charlie Chen and Gamaleldin Elsayed and Ed Chi and Mahdis Mahdieh and Ian Tenney and Nan Hua and Ivan Petrychenko and Patrick Kane and Dylan Scandinaro and Rishub Jain and Jonathan Uesato and Romina Datta and Adam Sadovsky and Oskar Bunyan and Dominik Rabiej and Shimu Wu and John Zhang and Gautam Vasudevan and Edouard Leurent and Mahmoud Alnahlawi and Ionut Georgescu and Nan Wei and Ivy Zheng and Betty Chan and Pam G Rabinovitch and Piotr Stanczyk and Ye Zhang and David Steiner and Subhajit Naskar and Michael Azzam and Matthew Johnson and Adam Paszke and Chung-Cheng Chiu and Jaume Sanchez Elias and Afroz Mohiuddin and Faizan Muhammad and Jin Miao and Andrew Lee and Nino Vieillard and Sahitya Potluri and Jane Park and Elnaz Davoodi and Jiageng Zhang and Jeff Stanway and Drew Garmon and Abhijit Karmarkar and Zhe Dong and Jong Lee and Aviral Kumar and Luowei Zhou and Jonathan Evens and William Isaac and Zhe Chen and Johnson Jia and Anselm Levskaya and Zhenkai Zhu and Chris Gorgolewski and Peter Grabowski and Yu Mao and Alberto Magni and Kaisheng Yao and Javier Snaider and Norman Casagrande and Paul Suganthan and Evan Palmer and Geoffrey Irving and Edward Loper and Manaal Faruqui and Isha Arkatkar and Nanxin Chen and Izhak Shafran and Michael Fink and Alfonso Castaño and Irene Giannoumis and Wooyeol Kim and Mikołaj Rybiński and Ashwin Sreevatsa and Jennifer Prendki and David Soergel and Adrian Goedeckemeyer and Willi Gierke and Mohsen Jafari and Meenu Gaba and Jeremy Wiesner and Diana Gage Wright and Yawen Wei and Harsha Vashisht and Yana Kulizhskaya and Jay Hoover and Maigo Le and Lu Li and Chimezie Iwuanyanwu and Lu Liu and Kevin Ramirez and Andrey Khorlin and Albert Cui and Tian LIN and Marin Georgiev and Marcus Wu and Ricardo Aguilar and Keith Pallo and Abhishek Chakladar and Alena Repina and Xihui Wu and Tom van der Weide and Priya Ponnapalli and Caroline Kaplan and Jiri Simsa and Shuangfeng Li and Olivier Dousse and Fan Yang and Jeff Piper and Nathan Ie and Minnie Lui and Rama Pasumarthi and Nathan Lintz and Anitha Vijayakumar and Lam Nguyen Thiet and Daniel Andor and Pedro Valenzuela and Cosmin Paduraru and Daiyi Peng and Katherine Lee and Shuyuan Zhang and Somer Greene and Duc Dung Nguyen and Paula Kurylowicz and Sarmishta Velury and Sebastian Krause and Cassidy Hardin and Lucas Dixon and Lili Janzer and Kiam Choo and Ziqiang Feng and Biao Zhang and Achintya Singhal and Tejasi Latkar and Mingyang Zhang and Quoc Le and Elena Allica Abellan and Dayou Du and Dan McKinnon and Natasha Antropova and Tolga Bolukbasi and Orgad Keller and David Reid and Daniel Finchelstein and Maria Abi Raad and Remi Crocker and Peter Hawkins and Robert Dadashi and Colin Gaffney and Sid Lall and Ken Franko and Egor Filonov and Anna Bulanova and Rémi Leblond and Vikas Yadav and Shirley Chung and Harry Askham and Luis C. Cobo and Kelvin Xu and Felix Fischer and Jun Xu and Christina Sorokin and Chris Alberti and Chu-Cheng Lin and Colin Evans and Hao Zhou and Alek Dimitriev and Hannah Forbes and Dylan Banarse and Zora Tung and Jeremiah Liu and Mark Omernick and Colton Bishop and Chintu Kumar and Rachel Sterneck and Ryan Foley and Rohan Jain and Swaroop Mishra and Jiawei Xia and Taylor Bos and Geoffrey Cideron and Ehsan Amid and Francesco Piccinno and Xingyu Wang and Praseem Banzal and Petru Gurita and Hila Noga and Premal Shah and Daniel J. Mankowitz and Alex Polozov and Nate Kushman and Victoria Krakovna and Sasha Brown and MohammadHossein Bateni and Dennis Duan and Vlad Firoiu and Meghana Thotakuri and Tom Natan and Anhad Mohananey and Matthieu Geist and Sidharth Mudgal and Sertan Girgin and Hui Li and Jiayu Ye and Ofir Roval and Reiko Tojo and Michael Kwong and James Lee-Thorp and Christopher Yew and Quan Yuan and Sumit Bagri and Danila Sinopalnikov and Sabela Ramos and John Mellor and Abhishek Sharma and Aliaksei Severyn and Jonathan Lai and Kathy Wu and Heng-Tze Cheng and David Miller and Nicolas Sonnerat and Denis Vnukov and Rory Greig and Jennifer Beattie and Emily Caveness and Libin Bai and Julian Eisenschlos and Alex Korchemniy and Tomy Tsai and Mimi Jasarevic and Weize Kong and Phuong Dao and Zeyu Zheng and Frederick Liu and Fan Yang and Rui Zhu and Mark Geller and Tian Huey Teh and Jason Sanmiya and Evgeny Gladchenko and Nejc Trdin and Andrei Sozanschi and Daniel Toyama and Evan Rosen and Sasan Tavakkol and Linting Xue and Chen Elkind and Oliver Woodman and John Carpenter and George Papamakarios and Rupert Kemp and Sushant Kafle and Tanya Grunina and Rishika Sinha and Alice Talbert and Abhimanyu Goyal and Diane Wu and Denese Owusu-Afriyie and Cosmo Du and Chloe Thornton and Jordi Pont-Tuset and Pradyumna Narayana and Jing Li and Sabaer Fatehi and John Wieting and Omar Ajmeri and Benigno Uria and Tao Zhu and Yeongil Ko and Laura Knight and Amélie Héliou and Ning Niu and Shane Gu and Chenxi Pang and Dustin Tran and Yeqing Li and Nir Levine and Ariel Stolovich and Norbert Kalb and Rebeca Santamaria-Fernandez and Sonam Goenka and Wenny Yustalim and Robin Strudel and Ali Elqursh and Balaji Lakshminarayanan and Charlie Deck and Shyam Upadhyay and Hyo Lee and Mike Dusenberry and Zonglin Li and Xuezhi Wang and Kyle Levin and Raphael Hoffmann and Dan Holtmann-Rice and Olivier Bachem and Summer Yue and Sho Arora and Eric Malmi and Daniil Mirylenka and Qijun Tan and Christy Koh and Soheil Hassas Yeganeh and Siim Põder and Steven Zheng and Francesco Pongetti and Mukarram Tariq and Yanhua Sun and Lucian Ionita and Mojtaba Seyedhosseini and Pouya Tafti and Ragha Kotikalapudi and Zhiyu Liu and Anmol Gulati and Jasmine Liu and Xinyu Ye and Bart Chrzaszcz and Lily Wang and Nikhil Sethi and Tianrun Li and Ben Brown and Shreya Singh and Wei Fan and Aaron Parisi and Joe Stanton and Chenkai Kuang and Vinod Koverkathu and Christopher A. Choquette-Choo and Yunjie Li and TJ Lu and Abe Ittycheriah and Prakash Shroff and Pei Sun and Mani Varadarajan and Sanaz Bahargam and Rob Willoughby and David Gaddy and Ishita Dasgupta and Guillaume Desjardins and Marco Cornero and Brona Robenek and Bhavishya Mittal and Ben Albrecht and Ashish Shenoy and Fedor Moiseev and Henrik Jacobsson and Alireza Ghaffarkhah and Morgane Rivière and Alanna Walton and Clément Crepy and Alicia Parrish and Yuan Liu and Zongwei Zhou and Clement Farabet and Carey Radebaugh and Praveen Srinivasan and Claudia van der Salm and Andreas Fidjeland and Salvatore Scellato and Eri Latorre-Chimoto and Hanna Klimczak-Plucińska and David Bridson and Dario de Cesare and Tom Hudson and Piermaria Mendolicchio and Lexi Walker and Alex Morris and Ivo Penchev and Matthew Mauger and Alexey Guseynov and Alison Reid and Seth Odoom and Lucia Loher and Victor Cotruta and Madhavi Yenugula and Dominik Grewe and Anastasia Petrushkina and Tom Duerig and Antonio Sanchez and Steve Yadlowsky and Amy Shen and Amir Globerson and Adam Kurzrok and Lynette Webb and Sahil Dua and Dong Li and Preethi Lahoti and Surya Bhupatiraju and Dan Hurt and Haroon Qureshi and Ananth Agarwal and Tomer Shani and Matan Eyal and Anuj Khare and Shreyas Rammohan Belle and Lei Wang and Chetan Tekur and Mihir Sanjay Kale and Jinliang Wei and Ruoxin Sang and Brennan Saeta and Tyler Liechty and Yi Sun and Yao Zhao and Stephan Lee and Pandu Nayak and Doug Fritz and Manish Reddy Vuyyuru and John Aslanides and Nidhi Vyas and Martin Wicke and Xiao Ma and Taylan Bilal and Evgenii Eltyshev and Daniel Balle and Nina Martin and Hardie Cate and James Manyika and Keyvan Amiri and Yelin Kim and Xi Xiong and Kai Kang and Florian Luisier and Nilesh Tripuraneni and David Madras and Mandy Guo and Austin Waters and Oliver Wang and Joshua Ainslie and Jason Baldridge and Han Zhang and Garima Pruthi and Jakob Bauer and Feng Yang and Riham Mansour and Jason Gelman and Yang Xu and George Polovets and Ji Liu and Honglong Cai and Warren Chen and XiangHai Sheng and Emily Xue and Sherjil Ozair and Adams Yu and Christof Angermueller and Xiaowei Li and Weiren Wang and Julia Wiesinger and Emmanouil Koukoumidis and Yuan Tian and Anand Iyer and Madhu Gurumurthy and Mark Goldenson and Parashar Shah and MK Blake and Hongkun Yu and Anthony Urbanowicz and Jennimaria Palomaki and Chrisantha Fernando and Kevin Brooks and Ken Durden and Harsh Mehta and Nikola Momchev and Elahe Rahimtoroghi and Maria Georgaki and Amit Raul and Sebastian Ruder and Morgan Redshaw and Jinhyuk Lee and Komal Jalan and Dinghua Li and Ginger Perng and Blake Hechtman and Parker Schuh and Milad Nasr and Mia Chen and Kieran Milan and Vladimir Mikulik and Trevor Strohman and Juliana Franco and Tim Green and Demis Hassabis and Koray Kavukcuoglu and Jeffrey Dean and Oriol Vinyals},
      year={2023},
      eprint={2312.11805},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{geminishort2023gemini,
      title={Gemini: A Family of Highly Capable Multimodal Models}, 
      author={Gemini Team},
      year={2023},
      eprint={2312.11805},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{wang2024secrets,
      title={Secrets of RLHF in Large Language Models Part II: Reward Modeling}, 
      author={Binghai Wang and Rui Zheng and Lu Chen and Yan Liu and Shihan Dou and Caishuang Huang and Wei Shen and Senjie Jin and Enyu Zhou and Chenyu Shi and Songyang Gao and Nuo Xu and Yuhao Zhou and Xiaoran Fan and Zhiheng Xi and Jun Zhao and Xiao Wang and Tao Ji and Hang Yan and Lixing Shen and Zhan Chen and Tao Gui and Qi Zhang and Xipeng Qiu and Xuanjing Huang and Zuxuan Wu and Yu-Gang Jiang},
      year={2024},
      eprint={2401.06080},
      archivePrefix={arXiv},
      primaryClass={cs.AI}
}

@misc{jaques2019way,
      title={Way Off-Policy Batch Deep Reinforcement Learning of Implicit Human Preferences in Dialog}, 
      author={Natasha Jaques and Asma Ghandeharioun and Judy Hanwen Shen and Craig Ferguson and Agata Lapedriza and Noah Jones and Shixiang Gu and Rosalind Picard},
      year={2019},
      eprint={1907.00456},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@inproceedings{laidlaw2023preventing,
  title={Preventing Reward Hacking with Occupancy Measure Regularization},
  author={Cassidy Laidlaw and Shivam Singhal and Anca Dragan},
  booktitle={ICML Workshop on New Frontiers in Learning, Control, and Dynamical Systems},
  year={2023},
  url={https://openreview.net/forum?id=oiT8js6p3Z}
}

@misc{rafailov2023direct,
      title={Direct Preference Optimization: Your Language Model is Secretly a Reward Model}, 
      author={Rafael Rafailov and Archit Sharma and Eric Mitchell and Stefano Ermon and Christopher D. Manning and Chelsea Finn},
      year={2023},
      eprint={2305.18290},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{polybench,
  author = {John Cavazos Scott Grauer-Gray},
  howpublished = {\url{https://web.cs.ucla.edu/~pouchet/software/polybench/}},
  title = {Polybench},
  year = {2012}
}

@misc{guo2024deepseekcoder,
      title={DeepSeek-Coder: When the Large Language Model Meets Programming -- The Rise of Code Intelligence}, 
      author={Daya Guo and Qihao Zhu and Dejian Yang and Zhenda Xie and Kai Dong and Wentao Zhang and Guanting Chen and Xiao Bi and Y. Wu and Y. K. Li and Fuli Luo and Yingfei Xiong and Wenfeng Liang},
      year={2024},
      eprint={2401.14196},
      archivePrefix={arXiv},
      primaryClass={cs.SE}
}

@misc{vonwerra2022trl,
  author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang},
  title = {TRL: Transformer Reinforcement Learning},
  year = {2020},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/huggingface/trl}}
}

@inproceedings{herdman2012accelerating,
  title={Accelerating hydrocodes with OpenACC, OpenCL and CUDA},
  author={Herdman, JA and Gaudin, WP and McIntosh-Smith, Simon and Boulton, Michael and Beckingsale, David A and Mallinson, Andrew C and Jarvis, Stephen A},
  booktitle={2012 SC Companion: High Performance Computing, Networking Storage and Analysis},
  pages={465--471},
  year={2012},
  organization={IEEE}
}

@inproceedings{koskela2023principles,
  title={Principles for automated and reproducible benchmarking},
  author={Koskela, Tuomas and Christidi, Ilektra and Giordano, Mos{\`e} and Dubrovska, Emily and Quinn, Jamie and Maynard, Christopher and Case, Dave and Olgu, Kaan and Deakin, Tom},
  booktitle={Proceedings of the SC'23 Workshops of The International Conference on High Performance Computing, Network, Storage, and Analysis},
  pages={609--618},
  year={2023}
}

@misc{valerolara2023comparing,
      title={Comparing Llama-2 and GPT-3 LLMs for HPC kernels generation}, 
      author={Pedro Valero-Lara and Alexis Huante and Mustafa Al Lail and William F. Godoy and Keita Teranishi and Prasanna Balaprakash and Jeffrey S. Vetter},
      year={2023},
      eprint={2309.07103},
      archivePrefix={arXiv},
      primaryClass={cs.SE}
}

@misc{naveed2024comprehensive,
      title={A Comprehensive Overview of Large Language Models}, 
      author={Humza Naveed and Asad Ullah Khan and Shi Qiu and Muhammad Saqib and Saeed Anwar and Muhammad Usman and Naveed Akhtar and Nick Barnes and Ajmal Mian},
      year={2024},
      eprint={2307.06435},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{schulman2017proximal,
      title={Proximal Policy Optimization Algorithms}, 
      author={John Schulman and Filip Wolski and Prafulla Dhariwal and Alec Radford and Oleg Klimov},
      year={2017},
      eprint={1707.06347},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{wang2023helpsteer,
      title={HelpSteer: Multi-attribute Helpfulness Dataset for SteerLM}, 
      author={Zhilin Wang and Yi Dong and Jiaqi Zeng and Virginia Adams and Makesh Narsimhan Sreedhar and Daniel Egert and Olivier Delalleau and Jane Polak Scowcroft and Neel Kant and Aidan Swope and Oleksii Kuchaiev},
      year={2023},
      eprint={2311.09528},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

@misc{aizu,
  title = {Aizu},
  howpublished = {\url{https://judge.u-aizu.ac.jp/onlinejudge/}},
}

@misc{atcoder,
  title = {AtCoder},
  howpublished = {\url{https://atcoder.jp/}},
}

@misc{codechef,
  title = {CodeChef},
  howpublished = {\url{https://www.codechef.com/}},
}

@misc{codeforces,
  title = {CodeForces},
  howpublished = {\url{https://codeforces.com/}},
}

@misc{hackerearth,
  title = {HackerEarth},
  howpublished = {\url{https://www.hackerearth.com/}},
}