<!doctype html>
<html lang="en" data-bs-theme="light">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>CONDA 2024 | The 1st Workshop on Data Contamination</title>
<link rel="icon" type="image/x-icon" href="/assets/favicon.png">
<meta property="og:title" content="CONDA 2024 | The 1st Workshop on Data Contamination" />
<meta property="og:locale" content="en_US" />
<meta name="description"
content="Evaluation data has been compromised! A workshop on detecting, preventing, and addressing data contamination.">
<meta property="og:description"
content="Evaluation data has been compromised! A workshop on detecting, preventing, and addressing data contamination." />
<meta property="og:site_name" content="CONDA 2024 | The 1st Workshop on Data Contamination" />
<meta property="og:type" content="website" />
<meta name="twitter:card" content="summary" />
<meta property="twitter:title" content="CONDA 2024 | The 1st Workshop on Data Contamination" />
<meta property="og:image" content="https://conda-workshop.github.io/assets/favicon.png" />
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN" crossorigin="anonymous">
<style>
a {
text-decoration: none;
}
</style>
</head>
<body>
<header class="border-bottom">
<nav class="navbar navbar-expand-lg">
<div class="container">
<a class="navbar-brand" href="#">
<img src="/assets/favicon.png" alt="Logo" width="30" height="30" class="rounded">
<span class="align-middle">CONDA 2024</span>
</a>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse"
data-bs-target="#navbarNavAltMarkup" aria-controls="navbarNavAltMarkup" aria-expanded="false"
aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse justify-content-end" id="navbarNavAltMarkup">
<div class="navbar-nav">
<a class="nav-link" href="#program">Program</a>
<a class="nav-link" href="#invited-speakers">Invited Speakers</a>
<a class="nav-link" href="#important-dates">Important Dates</a>
<a class="nav-link" href="#call_for_papers">Call for papers</a>
<a class="nav-link" href="#shared_task">Shared Task</a>
<a class="nav-link" href="#organizers">Organizers</a>
<a class="nav-link" href="#sponsors">Sponsors</a>
<div class="vr d-none d-lg-block "></div>
<button type="button" class="btn" onclick="change_theme()"><img class="rounded-circle bg-light"
src="https://img.icons8.com/ios-filled/50/contrast.png" height="15px" /></button>
</div>
</div>
</div>
</nav>
</header>
<main class="container">
<!-- <div class="p-3 px-5 my-4 rounded-5 shadow-sm border bg-body-tertiary">-->
<div class="p-3 px-lg-5 my-4">
<div class="container-fluid py-5">
<h1 class="display-5 fw-bold mb-5">The 1st Workshop on Data Contamination (CONDA)</h1>
<p class="col-md-8 fs-4">Workshop@<a href="https://2024.aclweb.org/">ACL 2024</a></p>
<p class="fs-5 fw-light"><b>Evaluation data has been compromised!</b> <br>
A workshop on detecting, preventing, and addressing data contamination.
</p>
</div>
<!--<div class="py-3 text-center">
<h3>Page in progress</h3>
</div>-->
</div>
<div class="content px-lg-5 px-2">
<div class="py-3 fw-light">
<h4>
<a
class="btn btn-primary fw-normal"
href="https://us06web.zoom.us/rec/play/MYvEKessE4oBqc5s3L3mz5JrQLFSIs5vWvYfemZgZuCBcwn8uCxBRa7e8m_dK_unq34WZ7-DGAC07N0b.EiYyoJAhciDjmNaJ?autoplay=true&startTime=1723770535000">
Watch the recordings of the event here.
</a>
</h4>
</div>
<div class="py-3 fw-light" id="program">
<h2 class="border-bottom pb-1">Program schedule (Friday, August 16, 2024)</h2>
<p>The workshop will take place in room <b>LOTUS SUITE 4</b> at the ACL 2024 conference <a
href="https://2024.aclweb.org/participants/">venue</a>. The schedule is as follows:</p>
<div class="pt-2 row">
<div class="col-12">
<table class="table table-borderless">
<tbody>
<tr>
<th style="width: 125px;">08:55-09:00</th>
<td>Opening Remarks</td>
</tr>
<tr>
<th>09:00-09:45</th>
<td><b>Invited talk by Margaret Mitchell:</b> On the value of carefully measuring
data.</td>
</tr>
<tr>
<th>09:45-10:30</th>
<td><b>Invited talk by Dieuwke Hupkes:</b> Evaluation data contamination: how much is
there, and how much does it actually matter?</td>
</tr>
<tr>
<th>10:30-11:00</th>
<td><i>Break</i></td>
</tr>
<tr>
<th>11:00-11:45</th>
<td><b>Invited talk by Anna Rogers:</b> A Sanity Check on Emergent Properties</td>
</tr>
<tr>
<th>11:45-12:00</th>
<td><b>Best paper presentation:</b> Rethinking LLM Memorization through the Lens of
Adversarial Compression</td>
</tr>
<tr>
<th>12:00-13:30</th>
<td><i>Lunch Break</i></td>
</tr>
<tr>
<th>13:30-15:30</th>
<td>Poster Session:</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Evaluating Chinese Large Language Models on Discipline
Knowledge Acquisition via Assessing Memorization and
Robustness</span><br><small>Chuang Liu, Renren Jin, Mark Steedman, Deyi
Xiong</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Scaling Laws for Data Poisoning in
LLMs</span><br><small>Dillon Bowen, Brendan Murphy, Will Cai, David
Khachaturov, Adam Gleave, Kellin Pelrine</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">LLM Dataset Inference: Did you train on my
dataset?</span><br><small>Pratyush Maini, Hengrui Jia, Nicolas Papernot,
Adam Dziedzic</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Rethinking LLM Memorization through the Lens of
Adversarial Compression</span><br><small>Avi Schwarzschild, Zhili Feng,
Pratyush Maini, Zachary Chase Lipton, J Zico Kolter</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">TOFU: A Task of Fictitious Unlearning for
LLMs</span><br><small>Pratyush Maini, Zhili Feng, Avi Schwarzschild, Zachary
Chase Lipton, J Zico Kolter</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Train-to-Test Contamination in Code Generation
Evaluations</span><br><small>Alexandre Matton, Elena Tommasone, Dennis
Aumiller, Milad Alizadeh, Kylie He, Tom Sherborne, Raymond Ma, Maxime
Voisin, Ellen Gilsenan-Mcmahon, Matthias Gallé</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Benchmark Inflation: Revealing LLM Performance Gaps
Using Retro-Holdouts</span><br><small>Jacob Haimes, Cenny Wenner, Kunvar
Thaman, Vassil Tashev, Clement Neo, Esben Kran, Jason
Hoelscher-Obermaier</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Confounders in Instance Variation for the Analysis of
Data Contamination</span><br><small>Behzad Mehrbakhsh, Dario Garigliotti,
Fernando Martínez-Plumed, Jose Hernandez-Orallo</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Unveiling the Spectrum of Data Contamination in Language
Models: A Survey from Detection to Remediation</span><br><small>Chunyuan
Deng, Yilun Zhao, Yuzhao Heng, Yitong Li, Jiannan Cao, Xiangru Tang, Arman
Cohan</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Task Contamination: Language Models May Not Be Few-Shot
Anymore</span><br><small>Changmao Li, Jeffrey Flanigan</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">A Taxonomy for Data Contamination in Large Language
Models</span><br><small>Medha Palavalli, Amanda Bertsch, Matthew R.
Gormley</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Using Cochrane Systematic Literature Reviews to Reduce
Contamination in the Evaluation of Large Language
Models</span><br><small>Wojciech Kusa, Moritz Staudinger, Harrisen Scells,
Allan Hanbury</small>
</td>
</tr>
<tr>
<th></th>
<td><span class="fw-normal">Proving membership in LLM pretraining data via data
watermarks</span><br><small>Johnny Wei, Ryan Yixiang Wang, Robin Jia</small>
</td>
</tr>
<tr>
<th>15:30-16:00</th>
<td><i>Break</i></td>
</tr>
<tr>
<th>16:00-16:45</th>
<td><b>Invited talk by Jesse Dodge:</b> Contamination in Web-Scale Datasets and its
Impact on Large
Model Evaluations</td>
</tr>
<tr>
<th>17:00-17:15</th>
<td>Closing Remarks</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
<div class="py-3 fw-light" id="description">
<h2 class="border-bottom pb-1">Background & Scope</h2>
<p class="py-2" align="justify">Data contamination, where evaluation data is inadvertently included in
pre-training corpora of large
scale models, and language models (LMs) in particular, has become a concern in recent times (<a
href="https://aclanthology.org/2023.findings-emnlp.722/">Sainz et al. 2023</a>; <a
href="https://aclanthology.org/2023.emnlp-main.308/">Jacovi et al. 2023</a>). The growing scale
of both models and data, coupled with massive web crawling, has led to the inclusion
of segments from evaluation benchmarks in the pre-training data of LMs (<a
href="https://aclanthology.org/2021.emnlp-main.98/">Dodge et al., 2021</a>; <a
href="https://arxiv.org/abs/2303.08774">OpenAI, 2023</a>; <a
href="https://arxiv.org/abs/2305.10403">Google, 2023</a>; <a
href="https://arxiv.org/abs/2310.20707">Elazar et al., 2023</a>). The scale of internet data
makes it difficult to prevent this contamination from happening, or even detect when it has
happened (<a href="https://arxiv.org/abs/2108.07258">Bommasani et al., 2022</a>; <a
href="https://arxiv.org/abs/2212.05129">Mitchell et al., 2023</a>). Crucially, when evaluation
data becomes part of pre-training data, it introduces biases
and can artificially inflate the performance of LMs on specific tasks or benchmarks (<a
href="https://aclanthology.org/2022.acl-short.18/">Magar and
Schwartz, 2022</a>). This poses a
challenge for fair and unbiased evaluation of models, as their performance may not accurately
reflect their generalization capabilities.</p>
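<p>To make the detection problem concrete, the sketch below (illustrative only, not a method
endorsed by the workshop) flags a benchmark example when it shares a long n-gram with a
pre-training document, in the spirit of the 13-gram overlap filtering described by Brown et
al. (2020); production pipelines normalize text far more aggressively and run at corpus
scale:</p>
<pre class="bg-body-tertiary p-3 rounded"><code># Illustrative sketch: naive n-gram overlap contamination check (Python).
def ngrams(tokens, n=13):
    """Set of all n-grams in a token sequence."""
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def is_contaminated(pretraining_doc, benchmark_example, n=13):
    """Flag the example if any of its n-grams also occurs in the document."""
    doc_grams = ngrams(pretraining_doc.lower().split(), n)
    example_grams = ngrams(benchmark_example.lower().split(), n)
    return not example_grams.isdisjoint(doc_grams)
</code></pre>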
<p class="py-2" align="justify">
Although a growing number of papers and state-of-the-art models mention issues of data contamination
(<a
class="https://proceedings.neurips.cc/paper_files/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf">Brown
et al., 2020</a>; <a href="https://openreview.net/forum?id=gEZrGCozdqR">Wei et al., 2022</a>; <a
href="https://arxiv.org/abs/2204.02311">Chowdhery et al., 2022</a>; <a
href="https://arxiv.org/abs/2303.08774">OpenAI, 2023</a>; <a
href="https://arxiv.org/abs/2305.10403">Google, 2023</a>;
<a href="https://arxiv.org/abs/2302.13971">Touvron et al., 2023</a>), there is no agreed upon
definition or standard methodology to ensure that
a model does not report results on contaminated
benchmarks. Addressing data contamination is a shared responsibility among researchers, developers,
and the broader community. By adopting best practices, increasing transparency, documenting
vulnerabilities, and conducting thorough evaluations, we can work towards minimizing the impact of
data contamination and ensuring fair and reliable evaluations.
</p>
</div>
<div class="py-3 fw-light" id="invited-speakers">
<h2 class="border-bottom pb-1">Invited speakers</h2>
<div class="row py-4 px-lg-3 px-0">
<div class="col-12 col-md-4 col-lg-2 mx-lg-2 order-first">
<div style="display: flex; justify-content: center;">
<img src="https://annargrs.github.io/assets/images/aro.jpg"
class="d-block m-2 rounded card-img-top" loading="lazy" style="width:200px;">
</div>
</div>
<div class="col mx-5">
<h4 class="text-center text-md-start">Anna Rogers</h4>
<p class="text-center text-md-start"><small>Associate Professor at IT University of
Copenhagen</small></p>
<p><u class="fw-normal">A Sanity Check on Emergent Properties.</u></p>
<p><b>Abstract:</b> One of the frequent points in the mainstream narrative about large language
models is that they have "emergent properties", but there is a lot of disagreement about what
that even means. If they are understood as a kind of generalization beyond training data, as
something that a model does without being explicitly trained for it, then I argue that we have
not in fact established the existence of any such properties, and at the moment we do not
even have the methodology for doing so.
</p>
</div>
</div>
<div class="row py-4 px-lg-3 px-0">
<div class="col-12 col-md-4 col-lg-2 mx-lg-2 order-first">
<div style="display: flex; justify-content: center;">
<img src="https://jessedodge.github.io/headshots/jesse_headshot_for_ai2.jpg"
class="d-block m-2 rounded card-img-top" loading="lazy" style="width:200px;">
</div>
</div>
<div class="col mx-5">
<h4 class="text-center text-md-start">Jesse Dodge</h4>
<p class="text-center text-md-start"><small>Research Scientist at Allen Institute for AI</small>
</p>
<p><u class="fw-normal">Contamination in Web-Scale Datasets and its Impact on Large Model
Evaluations.</u></p>
<p><b>Abstract:</b> We are at a pivotal moment in the history of AI. The AI research community
has driven progress for decades, but over the past couple of years industry has started to
make significant advances in model capabilities while being purposely closed about how. In
this talk I’ll start by discussing different types of contamination and how they appear in
the wild. I’ll then discuss some of our work on building massive datasets by scraping the
web, including Dolma and C4. I’ll discuss What’s In My Big Data, a toolkit for documenting
the contents of web-scale datasets, and some of our results on measuring contamination in
different ways across a variety of popular pretraining corpora. I’ll conclude by discussing
evaluation of large models: current evaluations have low construct validity, and we lack
strong evaluations for the actual use cases that users care about.
</p>
</div>
</div>
<div class="row py-4 px-lg-3 px-0">
<div class="col-12 col-md-4 col-lg-2 mx-lg-2 order-first">
<div style="display: flex; justify-content: center;">
<img src="assets/Dieuwke_Hupkes.png" class="d-block m-2 rounded card-img-top" loading="lazy"
style="width:200px;">
</div>
</div>
<div class="col mx-5">
<h4 class="text-center text-md-start">Dieuwke Hupkes</h4>
<p class="text-center text-md-start"><small>Research Scientist at Meta</small></p>
<p><u class="fw-normal">Evaluation data contamination: how much is there, and how much does it
actually
matter?</u></p>
<p><b>Abstract:</b> With many of the current "SOTA" LLMs being closed source and their training
data inaccessible, more and more questions arise about potential contamination of
the evaluation datasets used to claim their results. Various claims can be found online that
range from suspicions of outright training on evaluation data to inflate results to
suggestions that the definitions of contamination used may be inadequate and underestimate
its impact. However, even with access to the training corpus, contamination and its impact
are far from trivial to assess. In this talk, I discuss common ways of measuring
contamination and provide empirical data on how much they impact results for a range of
LLMs.
</p>
</div>
</div>
<div class="row py-4 px-lg-3 px-0">
<div class="col-12 col-md-4 col-lg-2 mx-lg-2 order-first">
<div style="display: flex; justify-content: center;">
<img src="http://www.m-mitchell.com/images/meg_cropped_sidebar2.jpg"
class="d-block m-2 rounded card-img-top" loading="lazy" style="width:200px;">
</div>
</div>
<div class="col mx-5">
<h4 class="text-center text-md-start">Margaret Mitchell</h4>
<p class="text-center text-md-start"><small>Researcher and Chief Ethics Scientist at
HuggingFace</small></p>
<p><u class="fw-normal">On the value of carefully measuring data.</u></p>
<p><b>Abstract:</b> Just as we evaluate models, we should measure data. Measuring data involves
quantifying different aspects of its composition, such as counts of the top-represented
domains, or correlations between sensitive identity terms and other concepts. In this talk,
I will define the problem of measuring data and unpack how it can be applied to
automatically curating distinct training and evaluation datasets for ML models.
</p>
</div>
</div>
<!--<div class="row pt-2">
<p>TBA</p>
</div>-->
</div>
<div class="py-3 fw-light" id="important-dates">
<h2 class="border-bottom pb-1">Important Dates</h2>
<div class="pt-2 row">
<div class="col-lg-6 col-12">
<table class="table table-borderless">
<tbody>
<tr>
<th style="color: red;"><del style="color: black;">May 17</del> May 31, 2024</th>
<td>Paper submission deadline</td>
</tr>
<tr>
<th>June 14, 2024</th>
<td>ARR pre-reviewed commitment deadline</td>
</tr>
<tr>
<th>June 17, 2024</th>
<td>Notification of acceptance</td>
</tr>
<tr>
<th style="color: red;"><del style="color: black;">July 1</del> July 4, 2024</th>
<td>Camera ready deadline</td>
</tr>
<tr>
<th>August 16, 2024</th>
<td>Workshop day</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
<div class="py-3 fw-light" id="call_for_papers">
<h2 class="border-bottom pb-1">Call for papers</h2>
<p class="pt-2">We welcome paper submissions on all topics related to data contamination, including but
not limited to:
<ul>
<li>Definitions, taxonomies and gradings of contamination</li>
<li>Contamination detection (both manual and automatic)</li>
<li>Community efforts to discover, report and organize contamination events</li>
<li>Documentation frameworks for datasets or models</li>
<li>Methods to avoid data contamination</li>
<li>Methods to forget contaminated data</li>
<li>Scaling laws and contamination</li>
<li>Memorization and contamination</li>
<li>Policies to avoid impact of contamination in publication venues and open source communities</li>
<li>Reproducing and attributing results from previous work to data contamination</li>
<li>Survey work on data contamination research</li>
<li>Data contamination in other modalities</li>
</ul>
<h5>Paper Submission Information</h5>
<p>We welcome two types of papers: regular workshop papers and non-archival submissions. Regular
workshop papers will be included in the workshop proceedings. All submissions must be in PDF format
and made through <a
href="https://openreview.net/group?id=aclweb.org/ACL/2024/Workshop/CONDA">OpenReview</a>.</p>
<div class="pt-2">
<ul>
<li>
<b>Regular workshop papers:</b> Authors can submit papers up to 8 pages, with unlimited
pages for references. Authors may submit up to 100 MB of supplementary materials separately
and their code for reproducibility. All submissions undergo a double-blind, single-track
review. Best Paper Award(s) will be given based on nomination by the reviewers. Accepted
papers will be presented as posters with the possibility of oral presentations.
</li>
<li>
<b>Non-archival submissions:</b> Cross-submissions are welcome. Accepted papers will be
presented at the workshop, but will not be included in the workshop proceedings. Papers must
be in PDF format and will be reviewed in a double-blind fashion by workshop reviewers. We
also welcome extended abstracts (up to 2 pages) of papers that are work in progress, under
review or to be submitted to other venues. Papers in this category need to follow the ACL
format.
</li>
</ul>
</div>
<p>In addition to papers submitted directly to the workshop, which will be reviewed by our Programme
Committee, we also accept papers reviewed through ACL Rolling Review and committed to the workshop.
Please check the relevant dates for each type of submission.</p>
<p>Links to OpenReview submission pages:</p>
<ul>
<li><a href="https://openreview.net/group?id=aclweb.org/ACL/2024/Workshop/CONDA">Regular
submissions</a></li>
<li><a href="https://openreview.net/group?id=aclweb.org/ACL/2024/Workshop/CONDA_ARR_Commitment">ARR
pre-reviewed commitment</a></li>
</ul>
</div>
<div class="py-3 fw-light" id="shared_task">
<h2 class="border-bottom pb-1">Shared Task: Data Contamination Evidence Collection</h2>
<p class="pt-2">In addition to paper contributions, we are organizing a community effort on centralized
data contamination evidence collection. While the problem of data contamination is prevalent and
serious, the breadth and depth of this contamination are still largely unknown. Concrete
evidence of contamination is scattered across papers, blog posts, and social media, and it is
suspected that the true scope of data contamination in NLP is significantly larger than reported.
</p>
<p>With this shared task we aim to provide a structured, centralized platform for contamination evidence
collection to help the community understand the extent of the problem and to help researchers avoid
repeating the same mistakes. The shared task also gathers evidence of clean, non-contaminated
instances. The platform is already available for perusal at <a
href="https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Database">Data
Contamination
Database</a>.
</p>
<h5>Compilation Paper</h5>
<p>As a companion to the contamination evidence platform, we will produce a paper that will provide a
summary and overview of the evidence collected in the shared task. The participants who contribute
to the shared task will be listed as co-authors in the paper, to be published in the workshop
proceedings.
</p>
<h5>Instructions for Evidence Submission</h5>
<p>Each submission should report a case of contamination or the lack thereof. The
submission can be either about (1) contamination in the corpus used to pre-train language models,
where the pre-training corpus contains a specific evaluation dataset, or about (2) contamination in
a model that shows evidence of having seen a specific evaluation dataset while being trained. Each
submission needs to mention the corpus (or model) and the evaluation dataset, in addition to some
evidence of contamination. Alternatively, we also welcome evidence of a lack of contamination.
</p>
<p>Reports must be submitted through a Pull Request in the Data Contamination Database space at
HuggingFace. The reports must follow the Contribution Guidelines provided in the space and will be
reviewed by the organizers. If you have any questions, please contact us at <a
href="mailto:conda-workshop@googlegroups.com">conda-workshop@googlegroups.com</a> or open a
discussion in the space itself.
</p>
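<p>For contributors who prefer to script the submission, the following minimal sketch uses the
<code>huggingface_hub</code> Python client to open such a Pull Request; the report file name
and path here are hypothetical, and the actual report format is defined by the Contribution
Guidelines:</p>
<pre class="bg-body-tertiary p-3 rounded"><code># Minimal sketch: open a Pull Request against the evidence space.
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj="contamination_report.md",       # hypothetical local file
    path_in_repo="reports/contamination_report.md",  # hypothetical target path
    repo_id="CONDA-Workshop/Data-Contamination-Database",
    repo_type="space",
    create_pr=True,  # submit as a Pull Request for the organizers to review
    commit_message="Add contamination evidence",
)
</code></pre>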
<p>URL with contribution guidelines: <a
href="https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Database">Data
Contamination Database</a> (“Contribution Guidelines” tab).
</p>
</div>
<div class="py-3 fw-light" id="organizers">
<h2 class="border-bottom pb-1">Organizers</h2>
You can contact us by email: <a
href="mailto:conda-workshop@googlegroups.com">conda-workshop@googlegroups.com</a>
<div class="row pt-2">
<div class="col-lg-3 col-md-6 col-sm-12 p-1 text-center py-4">
<div style="display: flex; justify-content: center;">
<img src="https://osainz59.github.io/assets/img/oscar2_0_upscaled_square.jpg"
class="d-block m-2 rounded card-img-top" loading="lazy" style="width:150px;">
</div>
<a href="https://osainz59.github.io/"><b>Oscar Sainz</b></a><br>
<a><small>HiTZ Center - Ixa</small></a><br>
<a><small>University of the Basque Country</small></a><br>
</div>
<div class="col-lg-3 col-md-6 col-sm-12 p-1 text-center py-4">
<div style="display: flex; justify-content: center;">
<img src="https://ikergarcia1996.github.io/Iker-Garcia-Ferrero//images/Iker.jpeg"
class="d-block m-2 rounded card-img-top" loading="lazy" style="width:150px;">
</div>
<a href="https://ikergarcia1996.github.io/Iker-Garcia-Ferrero/"><b>Iker García
Ferrero</b></a><br>
<a><small>HiTZ Center - Ixa</small></a><br>
<a><small>University of the Basque Country</small></a><br>
</div>
<div class="col-lg-3 col-md-6 col-sm-12 p-1 text-center py-4">
<div style="display: flex; justify-content: center;">
<img src="https://upload.wikimedia.org/wikipedia/commons/c/c1/Eneko_elhuyar_Aitziber_Agirre_RuizeArkautekoa.png"
class="d-block m-2 rounded card-img-top" loading="lazy" style="width:150px;">
</div>
<a href="https://eagirre.github.io/"><b>Eneko Agirre</b></a><br>
<a><small>HiTZ Center - Ixa</small></a><br>
<a><small>University of the Basque Country</small></a><br>
</div>
<div class="col-lg-3 col-md-6 col-sm-12 p-1 text-center py-4">
<div style="display: flex; justify-content: center;">
<img src="assets/jon.ander.square.png" class="d-block m-2 rounded card-img-top"
loading="lazy" style="width:150px;">
</div>
<a href=""><b>Jon Ander Campos</b></a><br>
<a><small>Cohere</small></a><br>
</div>
<div class="col-lg-3 col-md-6 col-sm-12 p-1 text-center py-4">
<div style="display: flex; justify-content: center;">
<img src="https://alonjacovi.github.io/images/site.png"
class="d-block m-2 rounded card-img-top" loading="lazy" style="width:150px;">
</div>
<a href="https://alonjacovi.github.io"><b>Alon Jacovi</b></a><br>
<a><small>Bar Ilan University</small></a><br>
</div>
<div class="col-lg-3 col-md-6 col-sm-12 p-1 text-center py-4">
<div style="display: flex; justify-content: center;">
<img src="https://yanaiela.github.io/figs/yanai2.jpg"
class="d-block m-2 rounded card-img-top" loading="lazy" style="width:150px;">
</div>
<a href="https://yanaiela.github.io/"><b>Yanai Elazar</b></a><br>
<a><small>Allen Institute for Artificial Intelligence</small></a><br>
<a><small>University of Washington</small></a>
</div>
<div class="col-lg-3 col-md-6 col-sm-12 p-1 text-center py-4">
<div style="display: flex; justify-content: center;">
<img src="https://irl.spacy.io/static/60f32525d2dc3f6ae521190bb9f54178/bbca6/yoav-goldberg.jpg"
class="d-block m-2 rounded card-img-top" loading="lazy" style="width:150px;">
</div>
<a href="https://u.cs.biu.ac.il/~yogo/"><b>Yoav Goldberg</b></a><br>
<a><small>Bar Ilan University</small></a><br>
<a><small>Allen Institute for Artificial Intelligence</small></a><br>
</div>
</div>
</div>
<div class="py-3 fw-light" id="sponsors">
<h2 class="border-bottom pb-1">Sponsors</h2>
<div class="row pt-2">
<div class="col-12 col-md p-1 pt-4 text-center">
<div style="display: flex; justify-content: center;">
<img src="assets/aws_sponsor.svg" class="d-block m-2 rounded card-img-top" loading="lazy"
style="width:150px; height: 140px;">
</div>
<a href="https://aws.amazon.com/bedrock/">AWS AI and Amazon Bedrock</a>
</div>
<div class="col-12 col-md p-1 pt-4 text-center">
<div style="display: flex; justify-content: center;">
<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
class="d-block m-2 rounded card-img-top" loading="lazy"
style="width:150px; height: 140px;">
</div>
<a href="https://huggingface.co/">HuggingFace</a>
</div>
<div class="col-12 col-md p-1 pt-4 text-center">
<div style="display: flex; justify-content: center; height: 140px;" class="align-items-center">
<img src="assets/googlelogo_color_416x140dp.png" class="d-block m-2 rounded card-img-top "
loading="lazy" style="width:200px; height: 67.03px;">
</div>
<a href="https://google.com/">Google</a>
</div>
</div>
</div>
</main>
<footer class="py-3 my-4 text-body-secondary border-top">
<div class="container text-center pt-2">
<a>© 2023 CONDA Workshop</a>
</div>
</footer>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js"
integrity="sha384-C6RzsynM9kWDrMNeT87bh95OGNyZPhcTNXj1NW7RuBCsyN/o0jlpcV8Qyq46cDfL"
crossorigin="anonymous"></script>
<script>
// Toggle the Bootstrap color scheme by flipping the data-bs-theme
// attribute on the root element between 'light' and 'dark'.
function change_theme() {
    const root = document.documentElement;
    if (root.getAttribute('data-bs-theme') === 'dark') {
        root.setAttribute('data-bs-theme', 'light');
    } else {
        root.setAttribute('data-bs-theme', 'dark');
    }
}
</script>
</body>
</html>