bibtex

RunzhouHan · RunzhouHan · commit 152dcc0ca921 · 2022-11-12T12:19:46.000-06:00
diff --git a/doc/acm_3502181.3531477.bib b/doc/acm_3502181.3531477.bib
@@ -0,0 +1,17 @@
+@inproceedings{10.1145/3502181.3531477,
+author = {Han, Runzhou and Byna, Suren and Tang, Houjun and Dong, Bin and Zheng, Mai},
+title = {{PROV-IO}: An {I/O}-Centric Provenance Framework for Scientific Data on {HPC} Systems},
+year = {2022},
+isbn = {9781450391993},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/3502181.3531477},
+doi = {10.1145/3502181.3531477},
+abstract = {Data provenance, or data lineage, describes the life cycle of data. In scientific workflows on HPC systems, scientists often seek diverse provenance (e.g., origins of data products, usage patterns of datasets). Unfortunately, existing provenance solutions cannot address the challenges due to their incompatible provenance models and/or system implementations. In this paper, we analyze three representative scientific workflows in collaboration with the domain scientists to identify concrete provenance needs. Based on the first-hand analysis, we propose a provenance framework called PROV-IO, which includes an I/O-centric provenance model for describing scientific data and the associated I/O operations and environments precisely. Moreover, we build a prototype of PROV-IO to enable end-to-end provenance support on real HPC systems with little manual effort. The PROV-IO framework provides flexibility in selecting various classes of provenance. Our experiments with realistic workflows show that PROV-IO can address the provenance needs of the domain scientists effectively with reasonable performance (e.g., less than 3.5\% tracking overhead for most experiments). Moreover, PROV-IO outperforms a state-of-the-art system (i.e., ProvLake) in our experiments.},
+booktitle = {Proceedings of the 31st International Symposium on High-Performance Parallel and Distributed Computing},
+pages = {213--226},
+numpages = {14},
+keywords = {workflows, high performance computing, scientific data, data provenance, trustworthiness, fair principles, lineage, explainability},
+location = {Minneapolis, MN, USA},
+series = {HPDC '22}
+}