Added more bibtex references
canimus committed Mar 10, 2024
1 parent 3b7c60f commit c473780
Showing 4 changed files with 228 additions and 35 deletions.
59 changes: 59 additions & 0 deletions paper/paper.bib
@@ -1,3 +1,62 @@
@article{10.1145/3603707,
author = {Fadlallah, Hadi and Kilany, Rima and Dhayne, Houssein and El Haddad, Rami and Haque, Rafiqul and Taher, Yehia and Jaber, Ali},
year = {2023},
month = {jun},
pages = {},
title = {Context-aware Big Data Quality Assessment: A Scoping Review},
volume = {15},
journal = {Journal of Data and Information Quality},
doi = {10.1145/3603707}
}

@article{10.1145/3603706,
author = {Fadlallah, Hadi and Kilany, Rima and Dhayne, Houssein and El Haddad, Rami and Haque, Rafiqul and Taher, Yehia and Jaber, Ali},
year = {2023},
month = {jun},
pages = {},
title = {BIGQA: Declarative Big Data Quality Assessment},
volume = {15},
journal = {Journal of Data and Information Quality},
doi = {10.1145/3603706}
}

@inproceedings{10.1145/3580305.3599776,
author = {Tu, Dezhan and He, Yeye and Cui, Weiwei and Ge, Song and Zhang, Haidong and Han, Shi and Zhang, Dongmei and Chaudhuri, Surajit},
title = {Auto-Validate by-History: Auto-Program Data Quality Constraints to Validate Recurring Data Pipelines},
year = {2023},
isbn = {9798400701030},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3580305.3599776},
doi = {10.1145/3580305.3599776},
abstract = {Data pipelines are widely employed in modern enterprises to power a variety of Machine-Learning (ML) and Business-Intelligence (BI) applications. Crucially, these pipelines are recurring (e.g., daily or hourly) in production settings to keep data updated so that ML models can be re-trained regularly, and BI dashboards refreshed frequently. However, data quality (DQ) issues can often creep into recurring pipelines because of upstream schema and data drift over time. As modern enterprises operate thousands of recurring pipelines, today data engineers have to spend substantial efforts to manually monitor and resolve DQ issues, as part of their DataOps and MLOps practices. Given the high human cost of managing large-scale pipeline operations, it is imperative that we can automate as much as possible. In this work, we propose Auto-Validate-by-History (AVH) that can automatically detect DQ issues in recurring pipelines, leveraging rich statistics from historical executions. We formalize this as an optimization problem, and develop constant-factor approximation algorithms with provable precision guarantees. Extensive evaluations using 2000 production data pipelines at Microsoft demonstrate the effectiveness and efficiency of AVH.},
booktitle = {Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
pages = {4991–5003},
numpages = {13},
keywords = {data drift, data histories, data pipelines, data quality, data validation, dataops, mlops, statistical constraints},
location = {Long Beach, CA, USA},
series = {KDD '23}
}


@article{10.14778/3229863.3229867,
author = {Schelter, Sebastian and Lange, Dustin and Schmidt, Philipp and Celikel, Meltem and Biessmann, Felix and Grafberger, Andreas},
title = {Automating large-scale data quality verification},
year = {2018},
issue_date = {August 2018},
publisher = {VLDB Endowment},
volume = {11},
number = {12},
issn = {2150-8097},
url = {https://doi.org/10.14778/3229863.3229867},
doi = {10.14778/3229863.3229867},
abstract = {Modern companies and institutions rely on data to guide every single business process and decision. Missing or incorrect information seriously compromises any decision process downstream. Therefore, a crucial, but tedious task for everyone involved in data processing is to verify the quality of their data. We present a system for automating the verification of data quality at scale, which meets the requirements of production use cases. Our system provides a declarative API, which combines common quality constraints with user-defined validation code, and thereby enables 'unit tests' for data. We efficiently execute the resulting constraint validation workload by translating it to aggregation queries on Apache Spark. Our platform supports the incremental validation of data quality on growing datasets, and leverages machine learning, e.g., for enhancing constraint suggestions, for estimating the 'predictability' of a column, and for detecting anomalies in historic data quality time series. We discuss our design decisions, describe the resulting system architecture, and present an experimental evaluation on various datasets.},
journal = {Proc. VLDB Endow.},
month = {aug},
pages = {1781–1794},
numpages = {14}
}

@article{Pearson:2017,
url = {http://adsabs.harvard.edu/abs/2017arXiv170304627P},
Archiveprefix = {arXiv},
176 changes: 148 additions & 28 deletions paper/paper.jats
@@ -18,7 +18,7 @@
<article-id pub-id-type="publisher-id">0</article-id>
<article-id pub-id-type="doi">N/A</article-id>
<title-group>
<article-title>Cuallee: A Python package for data quality across
<article-title>cuallee: A python package for data quality checks across
multiple DataFrame APIs</article-title>
</title-group>
<contrib-group>
@@ -74,6 +74,8 @@ a Creative Commons Attribution 4.0 International License (CC BY
<kwd>data quality</kwd>
<kwd>data checks</kwd>
<kwd>data unit test</kwd>
<kwd>data pipelines</kwd>
<kwd>data validation</kwd>
<kwd>data observability</kwd>
<kwd>data lake</kwd>
<kwd>pyspark</kwd>
@@ -109,38 +111,53 @@ a Creative Commons Attribution 4.0 International License (CC BY
cloud-based environments. Whilst working in local environments
typically allows them to fit data sets in memory, moving workloads to
cloud environments involves operating with full-scale data that
requires a different computing framework, i.e. distributed computing,
parallelization, and horizontal scaling.</p>
requires a different computing framework
(<xref alt="Schelter et al., 2018" rid="ref-10.14778U002F3229863.3229867" ref-type="bibr">Schelter
et al., 2018</xref>), i.e. distributed computing, parallelization, and
horizontal scaling.</p>
<p>This shift in computing frameworks requires the adoption of testing
strategies that can accommodate testing activities in both local and
remote environments, without the need to rewrite test scenarios or
employ different testing approaches for assessing various quality
dimensions of the data.</p>
dimensions of the data
(<xref alt="Fadlallah et al., 2023b" rid="ref-10.1145U002F3603707" ref-type="bibr">Fadlallah
et al., 2023b</xref>).</p>
<p>An additional argument is related to the rapid evolution of the
data ecosystem. Organizations and data teams are constantly seeking
ways to improve, whether through cost-effective solutions or by
integrating new capabilities into their data operations. However, this
pursuit presents new challenges when migrating workloads from one
data ecosystem
(<xref alt="Fadlallah et al., 2023a" rid="ref-10.1145U002F3603706" ref-type="bibr">Fadlallah
et al., 2023a</xref>). Organizations and data teams are constantly
seeking ways to improve, whether through cost-effective solutions or
by integrating new capabilities into their data operations. However,
this pursuit presents new challenges when migrating workloads from one
technology to another. As information technology and data strategies
become more resilient against vendor lock-ins, they turn to
technologies that enable seamless operation across platforms, avoiding
the chaos of fully re-implementing data products. In essence, no data
testing strategy needs to be rewritten or reformulated due to platform
changes.</p>
<p>A last argument is the need for such a quality tool, is the desire
of moving quality procedures to the earliest phases of the data
product development life-cycle. Whether in industry or academia, the
reduction of time allocated for quality activities is unfortunately
like a norm, due to the predominant focus on functional aspects.
Enabling a declarative, intuitive and flexible programming interface
to data quality, allows teams to embed quality into their development,
adopting a mindset of building quality in, as opposed to testing
quality out.</p>
<p>One last argument in favor of using a quality tool is the need to
integrate quality procedures into the early stages of data product
development. Whether in industry or academia, there’s often a tendency
to prioritize functional aspects over quality, leading to less time
being dedicated to quality activities. By providing a clear,
easy-to-use, and adaptable programming interface for data quality,
teams can incorporate quality into their development process,
promoting a proactive approach of building quality in rather than
relying solely on testing to ensure quality.</p>
</sec>
<sec id="methods">
<title>Methods</title>
<p><monospace>cuallee</monospace> employs a heuristic-based approach
to define quality rules for each dataset. This prevents the
inadvertent duplication of quality predicates, thus reducing the
likelihood of human error in defining rules with identical predicates.
Several studies have been conducted on the efficiency of these rules,
including auto-validation
(<xref alt="Tu et al., 2023" rid="ref-10.1145U002F3580305.3599776" ref-type="bibr">Tu
et al., 2023</xref>) and auto-definition using profilers.</p>
</sec>
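As a concrete illustration of the declarative style described above, here is a minimal sketch using the Check/CheckLevel interface from cuallee's documentation; the DataFrame and its column names are invented for this example:

    import pandas as pd
    from cuallee import Check, CheckLevel

    # Illustrative data; the column names are assumptions for this sketch.
    df = pd.DataFrame({"id": [1, 2, 3, 4], "score": [0.2, 0.5, 0.6, 0.9]})

    # A Check collects rule predicates declaratively; registering the same
    # predicate twice on the same column is de-duplicated by the heuristic
    # rule registry mentioned above.
    check = (
        Check(CheckLevel.WARNING, "demo_check")
        .is_complete("id")   # no null values in "id"
        .is_unique("id")     # no duplicate values in "id"
    )

    # The same check object validates pandas, pyspark, and other supported
    # DataFrame APIs without rewriting the rules.
    results = check.validate(df)
    print(results)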
<sec id="checks">
<title>Checks</title>
<p>(<xref alt="Binney &amp; Tremaine, 2008" rid="ref-BinneyU003A2008" ref-type="bibr">Binney
&amp; Tremaine, 2008</xref>) is the reference for the checks</p>
<table-wrap>
<table>
<colgroup>
@@ -319,7 +336,7 @@ a Creative Commons Attribution 4.0 International License (CC BY
<td><italic>numeric</italic></td>
</tr>
<tr>
<td><monospace>is_inside_interquartile_range</monospace></td>
<td><monospace>is_inside_iqr</monospace></td>
<td>Verifies column values reside inside limits of
interquartile range
<monospace>Q1 &lt;= col &lt;= Q3</monospace> used on
@@ -467,20 +484,123 @@ a Creative Commons Attribution 4.0 International License (CC BY
<sec id="controls">
<title>Controls</title>
<p>These are the controls</p>
<table-wrap>
<table>
<thead>
<tr>
<th>Check</th>
<th>Description</th>
<th>DataType</th>
</tr>
</thead>
<tbody>
<tr>
<td><monospace>completeness</monospace></td>
<td>Zero <monospace>nulls</monospace></td>
<td><italic>agnostic</italic></td>
</tr>
<tr>
<td><monospace>percentage_fill</monospace></td>
<td><monospace>% rows</monospace> not empty</td>
<td><italic>agnostic</italic></td>
</tr>
<tr>
<td><monospace>percentage_empty</monospace></td>
<td><monospace>% rows</monospace> empty</td>
<td><italic>agnostic</italic></td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
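For dataset-level controls like those in the table above, usage might look like the following sketch; the Control helper and its completeness method are assumptions inferred from the controls table, not confirmed API:

    import pandas as pd
    from cuallee import Control  # assumed helper, per the controls table

    df = pd.DataFrame({"id": [1, None, 3], "name": ["a", "b", None]})

    # completeness: zero nulls, datatype-agnostic (first row of the table).
    # percentage_fill / percentage_empty would report the share of rows
    # that are non-empty vs. empty in the same spirit.
    print(Control.completeness(df))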
</body>
<back>
<ref-list>
<ref id="ref-BinneyU003A2008">
<element-citation publication-type="book">
<ref id="ref-10.1145U002F3603707">
<element-citation publication-type="article-journal">
<person-group person-group-type="author">
<name><surname>Fadlallah</surname><given-names>Hadi</given-names></name>
<name><surname>Kilany</surname><given-names>Rima</given-names></name>
<name><surname>Dhayne</surname><given-names>Houssein</given-names></name>
<name><surname>El Haddad</surname><given-names>Rami</given-names></name>
<name><surname>Haque</surname><given-names>Rafiqul</given-names></name>
<name><surname>Taher</surname><given-names>Yehia</given-names></name>
<name><surname>Jaber</surname><given-names>Ali</given-names></name>
</person-group>
<article-title>Context-aware big data quality assessment: A scoping review</article-title>
<source>Journal of Data and Information Quality</source>
<year iso-8601-date="2023-06">2023</year><month>06</month>
<volume>15</volume>
<pub-id pub-id-type="doi">10.1145/3603707</pub-id>
<fpage></fpage>
<lpage></lpage>
</element-citation>
</ref>
<ref id="ref-10.1145U002F3603706">
<element-citation publication-type="article-journal">
<person-group person-group-type="author">
<name><surname>Fadlallah</surname><given-names>Hadi</given-names></name>
<name><surname>Kilany</surname><given-names>Rima</given-names></name>
<name><surname>Dhayne</surname><given-names>Houssein</given-names></name>
<name><surname>El Haddad</surname><given-names>Rami</given-names></name>
<name><surname>Haque</surname><given-names>Rafiqul</given-names></name>
<name><surname>Taher</surname><given-names>Yehia</given-names></name>
<name><surname>Jaber</surname><given-names>Ali</given-names></name>
</person-group>
<article-title>BIGQA: Declarative big data quality assessment</article-title>
<source>Journal of Data and Information Quality</source>
<year iso-8601-date="2023-06">2023</year><month>06</month>
<volume>15</volume>
<pub-id pub-id-type="doi">10.1145/3603706</pub-id>
<fpage></fpage>
<lpage></lpage>
</element-citation>
</ref>
<ref id="ref-10.1145U002F3580305.3599776">
<element-citation publication-type="paper-conference">
<person-group person-group-type="author">
<name><surname>Tu</surname><given-names>Dezhan</given-names></name>
<name><surname>He</surname><given-names>Yeye</given-names></name>
<name><surname>Cui</surname><given-names>Weiwei</given-names></name>
<name><surname>Ge</surname><given-names>Song</given-names></name>
<name><surname>Zhang</surname><given-names>Haidong</given-names></name>
<name><surname>Han</surname><given-names>Shi</given-names></name>
<name><surname>Zhang</surname><given-names>Dongmei</given-names></name>
<name><surname>Chaudhuri</surname><given-names>Surajit</given-names></name>
</person-group>
<article-title>Auto-validate by-history: Auto-program data quality constraints to validate recurring data pipelines</article-title>
<source>Proceedings of the 29th ACM SIGKDD conference on knowledge discovery and data mining</source>
<publisher-name>Association for Computing Machinery</publisher-name>
<publisher-loc>New York, NY, USA</publisher-loc>
<year iso-8601-date="2023">2023</year>
<isbn>9798400701030</isbn>
<uri>https://doi.org/10.1145/3580305.3599776</uri>
<pub-id pub-id-type="doi">10.1145/3580305.3599776</pub-id>
<fpage>4991</fpage>
<lpage>5003</lpage>
</element-citation>
</ref>
<ref id="ref-10.14778U002F3229863.3229867">
<element-citation publication-type="article-journal">
<person-group person-group-type="author">
<name><surname>Binney</surname><given-names>J.</given-names></name>
<name><surname>Tremaine</surname><given-names>S.</given-names></name>
<name><surname>Schelter</surname><given-names>Sebastian</given-names></name>
<name><surname>Lange</surname><given-names>Dustin</given-names></name>
<name><surname>Schmidt</surname><given-names>Philipp</given-names></name>
<name><surname>Celikel</surname><given-names>Meltem</given-names></name>
<name><surname>Biessmann</surname><given-names>Felix</given-names></name>
<name><surname>Grafberger</surname><given-names>Andreas</given-names></name>
</person-group>
<source>Galactic Dynamics: Second Edition</source>
<publisher-name>Princeton University Press</publisher-name>
<year iso-8601-date="2008">2008</year>
<uri>http://adsabs.harvard.edu/abs/2008gady.book.....B</uri>
<article-title>Automating large-scale data quality verification</article-title>
<source>Proc. VLDB Endow.</source>
<publisher-name>VLDB Endowment</publisher-name>
<year iso-8601-date="2018-08">2018</year><month>08</month>
<volume>11</volume>
<issue>12</issue>
<issn>2150-8097</issn>
<uri>https://doi.org/10.14778/3229863.3229867</uri>
<pub-id pub-id-type="doi">10.14778/3229863.3229867</pub-id>
<fpage>1781</fpage>
<lpage>1794</lpage>
</element-citation>
</ref>
</ref-list>