% cc2020.bib
@InProceedings{cc:MackenzieBenhamPetriTrippasEtAl:2020:cc-news-en,
author = "Mackenzie, Joel and Benham, Rodger and Petri, Matthias and Trippas, Johanne R. and Culpepper, J. Shane
and Moffat, Alistair",
title = "{CC}-News-En: {A} large English news corpus",
year = "2020",
ISBN = "978-1-4503-6859-9",
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
URL = "https://doi.org/10.1145/3340531.3412762",
doi = "10.1145/3340531.3412762",
abstract = "We describe a static, open-access news corpus using data from the Common Crawl Foundation, who provide
free, publicly available web archives, including a continuous crawl of international news articles
published in multiple languages. Our derived corpus, CC-News-En, contains 44 million English documents
collected between September 2016 and March 2018. The collection is comparable in size with the number
of documents typically found in a single shard of a large-scale, distributed search engine, and is four
times larger than the news collections previously used in offline information retrieval experiments. To
complement the corpus, 173 topics were curated using titles from Reddit threads, forming a temporally
representative sampling of relevant news topics over the 583 day collection window. Information needs
were then generated using automatic summarization tools to produce textual and audio representations,
and used to elicit query variations from crowdworkers, with a total of 10,437 queries collected against
the 173 topics. Of these, 10,089 include key-stroke level instrumentation that captures the timings of
character insertions and deletions made by the workers while typing their queries. These new resources
support a wide variety of experiments, including large-scale efficiency exercises and query
auto-completion synthesis, with scope for future addition of relevance judgments to support offline
effectiveness experiments and hence batch evaluation campaigns.",
booktitle = "Proceedings of the 29th ACM International Conference on Information \& Knowledge Management",
pages = "3077--3084",
numpages = "8",
keywords = "corpus, user query variations, collection, news search, crowdsourcing",
location = "Virtual Event, Ireland",
series = "CIKM '20",
cc-snippet = "Our derived corpus, CC-News-En, contains 44 million English documents collected between September 2016
and March 2018. [...] One such example is the CommonCrawl Foundation,[¹ ] who generate large-scale
crawls of the web at regular intervals. A key philosophy behind the Common Crawlis to democratize data,
allowing open access with no fees. In late 2016, the Common Crawl Foundation announced a news-specific
crawl (CC-News), [² ] with documents being added on a daily basis, and covering sources from a wide
range of countries and languages. Here we derive a static, English segment of the CC-Newscrawl that we
refer to as CC-News-En. Due to the storage and computation costs involved in filtering out non-English
documents, we make the complete corpus available as a free resource, along with asuite of tools which
can be used to replicate corpus extraction from the original source CC-News data. We also provide a set
of 10,437 user query variations over 173 query topics, including keystroke-level data collected from a
novel crowdworking experiment. Our goal is to encourage reproducible and replicable experimentation,
with greatly reduced barriers to entry. [...] A total of 2,291 CC-News WARC files were processed to
build CC-News-En, covering the period 26 August 2016 to 31 March 2018, inclusive. The first and last
WARC files inthis collection are as follows: •CC-NEWS-20160826124520-00000.warc.gz
•CC-NEWS-20180331191315-00143.warc.gz The resulting subset of compressed WARC files occupies 2.14 TiB
of disk space, and contains a total of 102.5 million documents in over 100 languages. [...] Missing
Documents and Temporal Gaps. During the creation of the collection, the
CC-NEWS-20170812163812-00038.warc.gz file was not processed correctly by our pipeline, and was
subsequently dropped from the CC-News-En corpus. In addition, there are six days within the 583 day
period where no WARC files were added to the original CC-News crawl: 22/09/2016 – 25/09/2016
inclusive, 18/12/2017, and 22/12/2017. These gaps typically correspond to hardware and software
upgrades on the crawl servers.[¹⁸ Private correspondence with Common Crawl Engineers.] It is also
important to note that both CC-News and CC-News-En are not intended to be complete crawls of their
sources, but rather, to provide a reproducible sample of these sites.",
cc-author-affiliation = "The University of Melbourne, Melbourne, Australia; RMIT University, Melbourne, Australia;
Amazon Alexa, Manhattan Beach, CA, USA",
cc-class = "nlp/text-corpora, nlp/corpus-construction, ir/information-extraction",
cc-dataset-used = "CC-NEWS",
}
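% The entry above describes deriving CC-News-En by filtering English documents out of the CC-NEWS WARC
% files. The authors published their own extraction tools; purely as an illustration of the general idea,
% the following Python sketch streams one CC-NEWS WARC file with warcio and keeps responses that a
% language detector (langdetect, an assumption here, not the paper's actual toolchain) flags as English.
%
% # Hypothetical sketch, not the CC-News-En pipeline.
% from warcio.archiveiterator import ArchiveIterator
% from langdetect import detect
%
% def english_urls(warc_path):
%     urls = []
%     with open(warc_path, 'rb') as stream:
%         for record in ArchiveIterator(stream):           # handles .warc.gz transparently
%             if record.rec_type != 'response':
%                 continue
%             url = record.rec_headers.get_header('WARC-Target-URI')
%             html = record.content_stream().read().decode('utf-8', errors='replace')
%             try:
%                 if detect(html) == 'en':                  # crude: real pipelines extract text first
%                     urls.append(url)
%             except Exception:                             # empty or undetectable payloads
%                 pass
%     return urls
%
% # e.g. english_urls('CC-NEWS-20160826124520-00000.warc.gz')  # first WARC named in the snippet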
@InProceedings{cc:ElKishkyChaudharyGuzmanKoehn:2020:ccaligned,
title = "{CCA}ligned: {A} Massive collection of cross-lingual web-document pairs",
author = "El-Kishky, Ahmed and Chaudhary, Vishrav and Guzmán, Francisco and Koehn, Philipp",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
URL = "https://www.aclweb.org/anthology/2020.emnlp-main.480",
pages = "5960--5969",
abstract = "Cross-lingual document alignment aims to identify pairs of documents in two distinct languages that
are of comparable content or translations of each other. In this paper, we exploit the signals embedded
in URLs to label web documents at scale with an average precision of 94.5{\%} across different language
pairs. We mine sixty-eight snapshots of the Common Crawl corpus and identify web document pairs that
are translations of each other. We release a new web dataset consisting of over 392 million URL pairs
from Common Crawl covering documents in 8144 language pairs of which 137 pairs include English. In
addition to curating this massive dataset, we introduce baseline methods that leverage cross-lingual
representations to identify aligned documents based on their textual content. Finally, we demonstrate
the value of this parallel documents dataset through a downstream task of mining parallel sentences and
measuring the quality of machine translations from models trained on this mined data. Our objective in
releasing this dataset is to foster new research in cross-lingual NLP across a variety of low, medium,
and high-resource languages.",
cc-snippet = "[...] we exploit the signals embedded in URLs to label web documents at scale with an average
precision of 94.5{\%} across different language pairs. We mine sixty-eight snapshots of the Common
Crawl corpus and identify web document pairs that are translations of each other. We release a new web
dataset consisting of over 392 million URL pairs from Common Crawl covering documents in 8144 language
pairs of which 137 pairs include English. [...] Starting from 68 Common Crawl snapshots with a raw
document count of 169.4 billion documents, upon deduplication, the resultant corpus is approximately
29.6 billion web documents from 107.8 million distinct web domains – an 83{\%} reduction from the raw
corpus.",
cc-class = "nlp/machine-translation, nlp/text-corpora, nlp/parallel-corpus, nlp/cross-lingual-document-alignment",
cc-derived-dataset-about = "CCAligned-2020",
cc-author-affiliation = "Facebook AI; Johns Hopkins University",
}
@Misc{cc:BrownMannRyderSubbiahEtAl:2020:language-models,
title = "Language models are few-shot learners",
author = "Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla
Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini
Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh
and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric
Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and
Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei",
year = "2020",
eprint = "2005.14165",
archiveprefix = "arXiv",
primaryclass = "cs.CL",
URL = "https://arxiv.org/abs/2005.14165",
cc-author-affiliation = "Johns Hopkins University; OpenAI",
cc-class = "nlp/language-model, ai/deep-learning, nlp/autoregressive-transformer-language-model,
nlp/question-answering, nlp/machine-translation, nlp/text-generation",
cc-snippet = "Datasets for language models have rapidly expanded, culminating in the Common Crawl dataset [...]
constituting nearly a trillion words. [...] However, we have found that unfiltered or lightly filtered
versions of Common Crawl tend to have lower quality than more curated datasets. Therefore, we took 3
steps to improve the average quality of our datasets: (1) we downloaded and filtered a version of
CommonCrawl based on similarity to a range of high-quality reference corpora, (2) we performed fuzzy
deduplication at the document level, within and across datasets, to prevent redundancy and preserve the
integrity of our held-out validation set as an accurate measure of overfitting, and (3) we also added
known high-quality reference corpora to the training mix to augment CommonCrawl and increase its
diversity. Details of the first two points (processing of Common Crawl) are described in Appendix A.",
}
@Misc{cc:JazbecPásztorFaltingsAntulov-FantulinEtAl:2020:news-and-information-transfer,
title = "On the impact of publicly available news and information transfer to financial markets",
author = "Metod Jazbec and Barna Pásztor and Felix Faltings and Nino Antulov-Fantulin and Petter N. Kolm",
year = "2020",
URL = "https://arxiv.org/abs/2010.12002",
cc-author-affiliation = "ETH Zurich, Switzerland; New York University, New York, USA",
abstract = "We quantify the propagation and absorption of large-scale publicly available news articles from the
World Wide Web to financial markets. To extract publicly available information, we use the news
archives from the Common Crawl, a nonprofit organization that crawls a large part of the web. We
develop a processing pipeline to identify news articles associated with the constituent companies in
the S&P 500 index, an equity market index that measures the stock performance of U.S. companies. Using
machine learning techniques, we extract sentiment scores from the Common Crawl News data and employ
tools from information theory to quantify the information transfer from public news articles to the
U.S. stock market. Furthermore, we analyze and quantify the economic significance of the news-based
information with a simple sentiment-based portfolio trading strategy. Our findings provide support
that information in publicly available news on the World Wide Web has a statistically and economically
significant impact on events in financial markets.",
cc-class = "statistical-finance, ai/machine-learning, nlp/sentiment-analysis",
cc-snippet = "In this article, we use news articles from the Common Crawl News, a subset of the Common Crawl’s
petabytes of publicly available World Wide Web archives, to measure the impact of the arrival of new
information about the constituent stocks in the S&P 500 index at the time of publishing. To the best of
our knowledge, our study is the first one to use the Common Crawl in this way. We develop a cloud-based
processing pipeline that identifies news articles in the Common Crawl News data that are related to the
companies in the S&P 500. As the Common Crawl public data archives are getting bigger, they are opening
doors for many real-world “data-hungry” applications such as the transformer models GPT and BERT,
a recent class of deep learning language models. We believe that public sources of news data are
important not only for natural language processing (NLP) and finance communities but also for more
general studies in complex systems and computational social sciences that are aiming to characterize
(mis)information propagation and dynamics in techno-socio-economic systems. The abundance of
high-frequency data around the financial systems enables complex systems researchers to have
microscopic observables that allow verification of different models, theories, and hypotheses.",
cc-dataset-used = "CC-NEWS",
}
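% The snippet above describes a pipeline that picks out CC-News articles related to S&P 500 constituents.
% A toy sketch of that matching step follows; the two-company name table is a hypothetical stand-in for
% the real constituent list, and the paper's cloud-based pipeline is far more involved (entity matching,
% sentiment scoring, information-transfer estimation).
%
% # Hypothetical sketch: naive company-name matching against article text.
% COMPANY_NAMES = {"AAPL": ["Apple Inc", "Apple"], "MSFT": ["Microsoft"]}  # stand-in data
%
% def tickers_mentioned(article_text):
%     text = article_text.lower()
%     return sorted(ticker for ticker, names in COMPANY_NAMES.items()
%                   if any(name.lower() in text for name in names))
%
% print(tickers_mentioned("Microsoft and Apple announced quarterly earnings."))  # ['AAPL', 'MSFT']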
@Misc{cc:SquarcinaTempestaVeroneseCalzavaraEtAl:2020:related-domain-attacks,
title = "Can {I} take your subdomain? Exploring related-domain attacks in the modern web",
author = "Marco Squarcina and Mauro Tempesta and Lorenzo Veronese and Stefano Calzavara and Matteo Maffei",
year = "2020",
URL = "https://arxiv.org/abs/2012.01946",
pdf = "https://arxiv.org/pdf/2012.01946.pdf",
cc-author-affiliation = "TU Wien, Austria; Università Ca’ Foscari Venezia, Italy",
cc-class = "computer-security/internet-security, related-domain attacks",
cc-snippet = "Our web security analysis aims at quantifying the number of domains hosting web applications that can
be exploited by taking over the vulnerable domains discovered by RDScan. In particular, for every apex
domain with at least one vulnerable subdomain, we selected from the CommonCrawl dataset [¹⁹ Common
Crawl. Host- and domain-level webgraphs feb/mar/may 2020.
https://commoncrawl.org/2020/06/host-and-domain-level-web-graphs-febmarmay-2020/, 2020.] the list of
200 most popular related-domains according to the Pagerank score [11]. From the homepage of these
domains, we extracted the same-origin links that appear in the HTML code.",
cc-dataset-used = "hyperlinkgraph/cc-main-2020-feb-mar-may/hostgraph",
}
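% The snippet above selects the 200 most popular related domains by PageRank from the Common Crawl
% host-level webgraph. A hedged sketch of that selection step follows; it assumes a locally downloaded,
% tab-separated host-level ranks file and takes the PageRank and host column indices as parameters,
% since the exact column layout should be checked against the webgraph release notes.
%
% # Hypothetical sketch: top-k hosts under an apex domain, ranked by PageRank.
% import gzip
%
% def top_related_domains(ranks_path, apex, pr_col, host_col, k=200):
%     # Hosts in the webgraph are stored in reversed-domain notation, e.g. "org.example.www".
%     apex_rev = ".".join(reversed(apex.split(".")))
%     scored = []
%     with gzip.open(ranks_path, "rt") as f:
%         for line in f:
%             if line.startswith("#"):                      # header line
%                 continue
%             cols = line.rstrip("\n").split("\t")
%             host_rev = cols[host_col]
%             if host_rev == apex_rev or host_rev.startswith(apex_rev + "."):
%                 scored.append((float(cols[pr_col]), host_rev))
%     return [host for _, host in sorted(scored, reverse=True)[:k]]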
@Article{cc:RaffelShazeerRobertsLeeEtAl:2020:limits-of-transfer-learning,
title = "Exploring the limits of transfer learning with a unified text-to-text transformer",
author = "Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena,
Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.",
journal = "Journal of Machine Learning Research",
volume = "21",
number = "140",
pages = "1--67",
year = "2020",
URL = "http://jmlr.org/papers/v21/20-074.html",
pdf = "https://www.jmlr.org/papers/volume21/20-074/20-074.pdf",
abstract = "Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a
downstream task, has emerged as a powerful technique in natural language processing (NLP). The
effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and
practice. In this paper, we explore the landscape of transfer learning techniques for NLP by
introducing a unified framework that converts all text-based language problems into a text-to-text
format. Our systematic study compares pre-training objectives, architectures, unlabeled data sets,
transfer approaches, and other factors on dozens of language understanding tasks. By combining the
insights from our exploration with scale and our new “Colossal Clean Crawled Corpus”, we achieve
state-of-the-art results on many benchmarks covering summarization, question answering, text
classification, and more. To facilitate future work on transfer learning for NLP, we release our data
set, pre-trained models, and code.",
cc-author-affiliation = "Google, Mountain View, CA, USA",
cc-derived-dataset-about = "Tensorflow-C4",
cc-dataset-used = "CC-MAIN-2019-18 (WET)",
cc-snippet = "We also introduce our approach for treating every problem as a text-to-text task and describe our
“Colossal Clean Crawled Corpus” (C4), the Common Crawl-based data set we created as a source of
unlabeled text data. [...] Common Crawl is a publicly-available web archive that provides “web
extracted text” by removing markup and other non-text content from the scraped HTML files. This
process produces around 20TB of scraped text data each month. Unfortunately, the majority of the
resulting text is not natural language. Instead, it largely comprises gibberish or boiler-plate text
like menus, error messages, or duplicate text. Furthermore, a good deal of the scraped text contains
content that is unlikely to be helpful for any of the tasks we consider (offensive language,
placeholder text, source code, etc.). To address these issues, we used the following heuristics for
cleaning up Common Crawl’s web extracted text: [...] To assemble our base data set, we downloaded the
web extracted text from April 2019 and applied the aforementioned filtering. This produces a collection
of text that is not only orders of magnitude larger than most data sets used for pre-training (about
750 GB) but also comprises reasonably clean and natural English text. We dub this data set the
“Colossal Clean Crawled Corpus” (or C4 for short) and release it as part of TensorFlow Datasets.⁸
[⁸https://www.tensorflow.org/datasets/catalog/c4]",
cc-class = "nlp/corpus-construction, nlp/language-model",
}
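% The snippet above truncates the list of cleaning heuristics applied to Common Crawl's web-extracted
% text. Purely as an illustration of filters in that spirit (the thresholds and marker strings below are
% assumptions, not the exact C4 rules), a page-level cleaning sketch might look like this:
%
% # Hypothetical sketch of C4-style cleaning: keep lines ending in terminal punctuation,
% # drop very short lines, and drop pages containing obvious boilerplate markers.
% TERMINAL = ('.', '!', '?', '"')
% PAGE_BLOCKLIST = ('lorem ipsum', '{')         # placeholder text, raw source code (illustrative)
%
% def clean_page(text, min_words_per_line=3, min_lines=3):
%     if any(marker in text.lower() for marker in PAGE_BLOCKLIST):
%         return None
%     lines = [line.strip() for line in text.splitlines()
%              if line.strip().endswith(TERMINAL) and len(line.split()) >= min_words_per_line]
%     return "\n".join(lines) if len(lines) >= min_lines else None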
@Book{cc:Patel:2020:structured-data-from-internet,
title = "Getting structured data from the internet",
author = "Jay M. Patel",
year = "2020",
doi = "10.1007/978-1-4842-6576-5",
publisher = "Apress",
URL = "https://www.apress.com/gp/book/9781484265758",
cc-author-affiliation = "Specrom Analytics, Ahmedabad, India",
cc-snippet = "[Chapter 6: Introduction to Common Crawl Datasets + Chapter 7: Web Crawl Processing on Big Data
Scale]",
cc-class = "web-mining",
}
@Article{cc:Dunn:2020:corpus-of-global-language-use,
title = "Mapping languages: The Corpus of Global Language Use",
author = "Dunn, Jonathan",
journal = "Language Resources and Evaluation",
pages = "999--1018",
year = "2020",
volume = "54",
publisher = "Springer",
URL = "https://doi.org/10.1007/s10579-020-09489-2",
doi = "10.1007/s10579-020-09489-2",
abstract = "This paper describes a web-based corpus of global language use with a focus on how this corpus can be
used for data-driven language mapping. First, the corpus provides a representation of where national
varieties of major languages are used (e.g., English, Arabic, Russian) together with consistently
collected data for each variety. Second, the paper evaluates a language identification model that
supports more local languages with smaller sample sizes than alternative off-the-shelf models. Improved
language identification is essential for moving beyond majority languages. Given the focus on language
mapping, the paper analyzes how well this digital language data represents actual populations by (i)
systematically comparing the corpus with demographic ground-truth data and (ii) triangulating the
corpus with an alternate Twitter-based dataset. In total, the corpus contains 423 billion words
representing 148 languages (with over 1 million words from each language) and 158 countries (again with
over 1 million words from each country), all distilled from Common Crawl web data. The main
contribution of this paper, in addition to describing this publicly-available corpus, is to provide a
comprehensive analysis of the relationship between two sources of digital data (the web and Twitter) as
well as their connection to underlying populations.",
cc-author-affiliation = "University of Canterbury, Christchurch, New Zealand",
cc-derived-dataset-about = "earthlings.io/CGLU",
cc-dataset-used = "64 monthly crawls: March 2014 (CC-MAIN-2014-10) -- June 2019 (CC-MAIN-2019-29) (WET)",
cc-class = "nlp/corpus-construction, nlp/language-identification",
cc-snippet = "The raw portions of the Common Crawl dataset used to build the corpus are shown in Table 2. The corpus
uses every portion of the crawl from March 2014 to June 2019, totaling 147 billion web pages.
No temporal divisions are included in the corpus because these dates represent the time of collection
rather than the time of production: web data does not expire and there is a long-tail in which the same
samples are observed multiple times across different periods.",
}
@Article{cc:XuZhangDong:2020:CLUECorpus2020,
title = "{CLUEC}orpus2020: {A} large-scale Chinese corpus for pre-training language model",
author = "Liang Xu and Xuanwei Zhang and Qianqian Dong",
journal = "ArXiv",
year = "2020",
volume = "abs/2003.01355",
URL = "https://arxiv.org/abs/2003.01355",
pdf = "https://arxiv.org/pdf/2003.01355.pdf",
cc-author-affiliation = "CLUE Organization",
cc-class = "nlp/corpus-construction",
cc-dataset-used = "July to December 2019 (WARC)",
cc-snippet = "we introduce the Chinese corpusfrom CLUE organization, CLUECorpus2020, a large-scale corpus that can
be used directly for self-supervised learning such as pre-training of a language model, or language
gen-eration. It has 100G raw corpus with 35 billion Chinese characters, which is retrieved from Common
Crawl¹. [...] We download the corpus from July to December 2019 from Common Crawl. After the
aforementioned filtering method, we extract the corpus of 100GB.",
}
@Article{cc:GiannakoulopoulosPergantisKonstantinouLamprogeorgosEtAl:2020:english-language-dominance,
title = "Exploring the Dominance of the English Language on the Websites of {EU} Countries",
volume = "12",
ISSN = "1999-5903",
URL = "http://dx.doi.org/10.3390/fi12040076",
doi = "10.3390/fi12040076",
number = "4",
journal = "Future Internet",
publisher = "MDPI AG",
author = "Giannakoulopoulos, Andreas and Pergantis, Minas and Konstantinou, Nikos and Lamprogeorgos, Aristeidis
and Limniati, Laida and Varlamis, Iraklis",
year = "2020",
month = apr,
pages = "76",
cc-author-affiliation = "Ionian University, Corfu, Greece; Harokopio University of Athens, Athens, Greece",
cc-class = "nlp/corpus-construction, web-science, socio-linguistics",
abstract = "The English language is the most dominant language in the Western world and its influence can be
noticed in every aspect of human communication. Its increasing diffusion, especially since the turn
of the century, is hard to measure with conventional means. The present research studies the use of
language in websites of European Union (EU) member states, in order to collect data about the
prevalence of the English language in the different countries and regions of the European Union. To
achieve a realistic representation of today’s landscape of the European Web, this study uses a vast
population of websites and a representative sampling size and methodology. By analyzing and processing
the findings from over 100,000 websites from every country in the EU, a solid foundation is set that is
used to explore the dominance of the English language in the European World Wide Web in general. This
is the first study that examines the presence of English content in the websites of all EU member
countries and provides statistical evidence regarding the ratio of English content availability for
each country. Conclusively, the results of the research demonstrate that the English language is
available on more than one quarter of all websites of non-English-speaking EU member states. Moreover,
it is available in the vast majority of multilingual and bilingual websites, while at the same time
being the only language that is available in a number of monolingual websites. In addition, it is given
preference over the national language in a significant number of cases. A moderate negative correlation
is found between a member state’s population and the availability of English in these countries’
websites and the same holds true for a member state’s Gross Domestic Product (GDP). Both these
correlations indicate that smaller countries tend to provide more content in English in order to
establish a stronger presence in the international environment. Taking into account the role of
language in the expression of national identity, this study provides data and insights which may
contribute to the discussion about the changes underway in the national identity of EU member states.",
cc-snippet = "The nature of the present research required as many websites as possible, so that both our total
population and our sampling pool were as close a representation of reality as possible. For this
purpose, we used information obtained from Common Crawl, a “repository of web crawl data that is
universally accessible and analyzable” [34]. Among the data Common Crawl offers is an index of every
available webpage for all member states of the EU amongst other countries. A process was developed in
PHP: Hypertext Preprocessor (PHP) that used the CompounD indeX (CDX) server Application Program
Interface (API) [35] to access Common Crawl’s Uniform Resource Locator (URL) index [36] and created a
MariaDB database with information about websites from every member state of the EU. Although Common
Crawl’s index provides all available crawled pages, our process of data collecting only focused on
recording the landing page of one website per domain.",
}
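% The snippet above describes a PHP process that queries Common Crawl's URL index through the CDX server
% API. A minimal Python sketch of the same lookup against the public index follows; the crawl label and
% the example domain are placeholders, not the ones used in the study.
%
% # Hypothetical sketch: list captures of one landing page from the Common Crawl CDX index.
% import json
% import requests
%
% CDX_ENDPOINT = "https://index.commoncrawl.org/CC-MAIN-2019-22-index"   # example crawl label
%
% def lookup(url):
%     resp = requests.get(CDX_ENDPOINT, params={"url": url, "output": "json"}, timeout=60)
%     resp.raise_for_status()                       # a 404 here means no captures for this URL
%     return [json.loads(line) for line in resp.text.splitlines()]
%
% for capture in lookup("europa.eu"):
%     print(capture["timestamp"], capture["status"], capture["url"])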
@Article{cc:SrinathWilsonGiles:2020:PrivaSeer-corpus,
title = "Privacy at scale: Introducing the PrivaSeer corpus of web privacy policies",
author = "Srinath, Mukund and Wilson, Shomir and Giles, C Lee",
journal = "arXiv preprint arXiv:2004.11131",
year = "2020",
URL = "https://arxiv.org/abs/2004.11131",
pdf = "https://arxiv.org/pdf/2004.11131.pdf",
cc-author-affiliation = "Pennsylvania State University, PA, USA",
cc-class = "nlp/corpus-construction, web-science, internet-security/privacy-policies",
cc-snippet = "We used Common Crawl² to gather seed URLs to crawl for privacy policies from the web, as we describe
in detail below. We filtered the Common Crawl URLs to get a set of possible links to web site privacy
policies. We then crawled the filtered set to obtain candidate privacy policy documents. The complete
pipeline from the Common Crawl URL dump to the gold standard privacy policy corpus is shown in Figure
1. [...] The Common Crawl Foundation is a non-profit which has been releasing large monthly internet
web crawls since 2008. Monthly crawl archives provide a “snapshot of the web” by including
re-crawls of popular domains (re-crawls from previous archives) and crawls of new domains. Common Crawl
has also been releasing a domain-level webgraph from which the harmonic centrality of the crawled
domains are calculated. This webgraph is used to sample popular domains that need to be re-crawled and
to obtain new uncrawled domains. We downloaded the URL dump of the May, 2019 archive. Common Crawl
reports that the archive contains 2.65 billion web pages or 220 TB of uncompressed content which were
crawled between 19th and 27th of May, 2019. They also report that this archive contains 825 million
URLs which were not contained in any previously released crawl archives. We applied a selection
criteria on the downloaded URL dump to filter the URLs of likely privacy policy pages.",
}
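% The snippet above filters the Common Crawl URL dump for likely privacy-policy pages before crawling the
% candidates. A toy sketch of such a URL filter is given below; the pattern is an assumption made for
% illustration, not the selection criteria used for PrivaSeer.
%
% # Hypothetical sketch: keep URLs whose path looks like a privacy policy page.
% import re
% from urllib.parse import urlparse
%
% PRIVACY_PATTERN = re.compile(r"privacy([-_]policy)?", re.IGNORECASE)   # assumed heuristic
%
% def looks_like_privacy_policy(url):
%     return bool(PRIVACY_PATTERN.search(urlparse(url).path))
%
% urls = ["https://example.com/privacy-policy", "https://example.com/blog/post-1"]
% print([u for u in urls if looks_like_privacy_policy(u)])   # only the first URL survives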
@Article{cc:DongTriche:2020:analysis-of-job-skill,
title = "A longitudinal analysis of job skills for entry-level data analysts",
author = "Dong, Tianxi and Triche, Jason",
journal = "Journal of Information Systems Education",
volume = "31",
number = "4",
pages = "312",
year = "2020",
URL = "https://jise.org/Volume31/n4/JISEv31n4p312.pdf",
cc-snippet = "Our first challenge was how to collect job postings over past years because job websites do not keep
historical data for more than one year. Therefore, we used the Common Crawl dataset to address this
problem (http://commoncrawl.org/). Common Crawl is a non-profit organization that builds and maintains
an open repository of web crawl data that is, in essence, a copy of the Internet. Common Crawl data
contains over 25 billion web pages (Batikas, Claussen, and Peukert, 2018) and is widely used in
hundreds of research projects (Batikas, Claussen, and Peukert, 2018; Cafarella et al., 2018). Since we
were only interested in the content from Indeed.com, we only examined a very small fraction of the
Common Crawl corpus.",
cc-author-affiliation = "Trinity University, San Antonio, TX, USA; University of Montana, MT, USA",
cc-class = "business-intelligence, nlp/corpus-construction",
}
@Misc{cc:CarliniTramerWallaceJagielskiEtAl:2020:extracting-training-data-from-language-models,
title = "Extracting training data from large language models",
author = "Nicholas Carlini and Florian Tramer and Eric Wallace and Matthew Jagielski and Ariel Herbert-Voss and
Katherine Lee and Adam Roberts and Tom Brown and Dawn Song and Ulfar Erlingsson and Alina Oprea and
Colin Raffel",
year = "2020",
URL = "https://arxiv.org/abs/2012.07805",
cc-author-affiliation = "Google; Stanford University; UC Berkeley; Northeastern University; OpenAI; Harvard
University; Apple",
cc-class = "ai/ethical-concerns, nlp/language-models",
cc-snippet = "We follow a different data collection process as used in GPT-2 (which follows Reddit links) in order
to reduce the likelihood that our dataset has any intersection with the model’s training data. In
particular, we select samples from a subset of Common Crawl⁶ [⁶http://commoncrawl.org/] to feed as
context to the model.⁷ [⁷It is possible there is some intersection between these two datasets,
effectively allowing this strategy to “cheat”. We believe this does not considerably affect
results. First, any overlap between the two datasets is rare on average. Second, because we only use
the first 5 or 10 tokens of each sample, any possible overlap will be small in absolute terms.]",
}
@Article{cc:SammarKhalilia:2020:archived-palestinian-web,
author = "Thaer Sammar and Hadi Khalilia",
title = "Going Back in Time to Find What Existed on the Web and How much has been Preserved: How much of
Palestinian Web has been Archived?",
journal = "مؤتمرات الآداب والعلوم الانسانية والطبيعية",
year = "2020",
abstract = "The web is an important resource for publishing and sharing content. The main characteristic of the
web is its volatility. Content is added, updated, and deleted all the time. Therefore, many national
and international institutes started crawling and archiving the content of the web. The main focus of
national institutes is to archive the web related to their country heritage, for example, the National
Library of the Netherlands is focusing on archiving website that are of value to the Dutch heritage.
However, there are still countries that haven’t taken action to archive their web, which will
result in losing content and leaves a gap in the knowledge. In this research, we focus on shedding light
on the Palestinian web, precisely, on how much of the Palestinian web has been archived. First, we create
a list of Palestinian hosts that were on the web. For that we queried the Google index, exploiting the time
range filter in order to get hosts over time. We collected 98 hosts on average per 5-year granularity
from the year 1990 to 2019. We also obtained Palestinian hosts from the DMOZ directory. We collected
188 hosts. Second, we investigate the coverage of the collected hosts in the Internet Archive and the
Common-Crawl. We found that the coverage of Google hosts in the Internet Archive ranges from 0\% to 89\%
from oldest to newest time granularity. The coverage of DMOZ hosts was 96\%. The coverage of Google
hosts in the Common-Crawl ranges from 57.1\% to 74.3\%, while the coverage of DMOZ hosts in the Common-Crawl was on
average 25\% in all crawls. We found that even when a host is covered in the Internet Archive and
Common-Crawl, its lifespan and the number of archived versions are low.
URL = "http://proceedings.sriweb.org/akn/index.php/art/article/view/410",
pdf = "http://proceedings.sriweb.org/akn/index.php/art/article/viewFile/410/466",
cc-author-affiliation = "Palestine Technical University, Tulkarm, West Bank",
cc-class = "web-archiving/regional-coverage",
cc-dataset-used = "CDX index",
}
@Misc{cc:GehmanGururanganSapChoiEtAl:2020:neural-toxic-degeneration-in-language-models,
title = "RealToxicityPrompts: Evaluating Neural Toxic Degeneration in Language Models",
author = "Samuel Gehman and Suchin Gururangan and Maarten Sap and Yejin Choi and Noah A. Smith",
year = "2020",
eprint = "2009.11462",
archiveprefix = "arXiv",
primaryclass = "cs.CL",
pdf = "https://arxiv.org/pdf/2009.11462.pdf",
URL = "https://arxiv.org/abs/2009.11462",
cc-author-affiliation = "Paul G. Allen School of Computer Science & Engineering, University of Washington, USA; Allen
Institute for Artificial Intelligence, Seattle, USA",
cc-class = "no-citation-misclassified, ai/ethics-of-machine-learning, ai/machine-learning, nlp/language-model",
cc-comment = "based on OpenWebText",
}
@InProceedings{cc:WangXie:2020:alternative-web-archival-formats,
author = "Wang, Xinyue and Xie, Zhiwu",
title = "The Case For Alternative Web Archival Formats To Expedite The Data-To-Insight Cycle",
year = "2020",
ISBN = "978-1-4503-7585-6",
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
URL = "https://doi.org/10.1145/3383583.3398542",
doi = "10.1145/3383583.3398542",
abstract = "The WARC file format is widely used by web archives to preserve collected web content for future use.
With the rapid growth of web archives and the increasing interest to reuse these archives as big data
sources for statistical and analytical research, the speed to turn these data into insights becomes
critical. In this paper we show that the WARC format carries significant performance penalties for
batch processing workload. We trace the root cause of these penalties to its data structure, encoding,
and addressing method. We then run controlled experiments to illustrate how severe these problems can
be. Indeed, performance gain of one to two orders of magnitude can be achieved simply by reformatting
WARC files into Parquet or Avro formats. While these results do not necessarily constitute an
endorsement for Avro or Parquet, the time has come for the web archiving community to consider
replacing WARC with more efficient web archival formats.",
booktitle = "Proceedings of the ACM/IEEE Joint Conference on Digital Libraries in 2020",
pages = "177–186",
numpages = "10",
keywords = "storage management, big data analysis, web archiving, file format",
location = "Virtual Event, China",
series = "JCDL '20",
pdf = "https://vtechworks.lib.vt.edu/bitstream/handle/10919/98565/fp210.pdf",
cc-author-affiliation = "Virginia Polytechnic Institute and State University, Blacksburg, VA, USA",
cc-class = "web-archiving, data formats, big data, data processing, WARC, Parquet",
cc-abstract = "We chose to use Common Crawl’s web archiving data crawled from May 20 to 23, 2018. The data set
consists of 1219 gzip compressed WARC files totaling 0.98 TB, and contains 53324440 records. The WARC
files are organized by crawling time, each containing records crawled from a mutually exclusive time
span. We then reformat the WARC files to yield the following five datasets for comparison: 1) the
original WARC files; 2) case 1 plus CDX index files built against all the original WARC files; 3)
Parquet files containing the same information as case 1, with most columns in String type; 4) the same
as case 3 but the Timestamp column is in INT64 Timestamp type; 5) Avro, with the same column data types
as in case 4 but the Timestamp column is in INT96. We use a modified ArchiveUnleashedToolkit for
reformating WARC to Parquet and Avro. Settings for Parquet/Avro file writing are included in the
example code repository on Github² [²https://github.com/xw0078/WebArchiveWithParquetAvro]. The
reformatted Parquet and Avro data are gzip compressed, the same as WARC files. [...] More recently,
Common Crawl even directly used Parquet to store metadata extracted from WARC [44]. The weakness of
this approach, however, is its dependency on an inefficient storage format.",
}
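% The entry above reports large batch-processing gains from reformatting WARC into Parquet or Avro, using
% a modified Archives Unleashed Toolkit. As a rough illustration only (not the paper's code), the sketch
% below flattens WARC response records into a gzip-compressed Parquet file with warcio and pyarrow.
%
% # Hypothetical sketch: WARC responses -> columnar Parquet.
% import pyarrow as pa
% import pyarrow.parquet as pq
% from warcio.archiveiterator import ArchiveIterator
%
% def warc_to_parquet(warc_path, parquet_path):
%     rows = {"url": [], "timestamp": [], "content_type": [], "payload": []}
%     with open(warc_path, "rb") as stream:
%         for record in ArchiveIterator(stream):
%             if record.rec_type != "response":
%                 continue
%             rows["url"].append(record.rec_headers.get_header("WARC-Target-URI"))
%             rows["timestamp"].append(record.rec_headers.get_header("WARC-Date"))
%             rows["content_type"].append(record.http_headers.get_header("Content-Type"))
%             rows["payload"].append(record.content_stream().read())
%     pq.write_table(pa.table(rows), parquet_path, compression="gzip")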
@Article{cc:MaticIordanouSmaragdakisLaoutaris:2020:identifying-sensitive-URLs,
title = "Identifying Sensitive {URL}s at Web-Scale",
author = "Srdjan Matic and Costas Iordanou and Georgios Smaragdakis and Nikolaos Laoutaris",
journal = "Proceedings of the ACM Internet Measurement Conference",
year = "2020",
pages = "619–633",
URL = "https://do.tu-berlin.de/handle/11303/13215",
pdf = "https://depositonce.tu-berlin.de/bitstream/11303/13215/4/matic_etal_2020.pdf",
abstract = "Several data protection laws include special provisions for protecting personal data relating to
religion, health, sexual orientation, and other sensitive categories. Having a well-defined list of
sensitive categories is sufficient for filing complaints manually, conducting investigations, and
prosecuting cases in courts of law. Data protection laws, however, do not define explicitly what type
of content falls under each sensitive category. Therefore, it is unclear how to implement proactive
measures such as informing users, blocking trackers, and filing complaints automatically when users
visit sensitive domains. To empower such use cases we turn to the Curlie.org crowdsourced taxonomy
project for drawing training data to build a text classifier for sensitive URLs. We demonstrate that
our classifier can identify sensitive URLs with accuracy above 88\%, and even recognize specific
sensitive categories with accuracy above 90\%. We then use our classifier to search for sensitive URLs
in a corpus of 1 billion URLs collected by the Common Crawl project. We identify more than 155 million
sensitive URLs in more than 4 million domains. Despite their sensitive nature, more than 30\% of these
URLs belong to domains that fail to use HTTPS. Also, in sensitive web pages with third-party cookies,
87\% of the third-parties set at least one persistent cookie.",
cc-author-affiliation = "TU Berlin, Germany; Cyprus University of Technology, Cyprus; IMDEA Networks Institute",
cc-class = "computer-security/internet-security, privacy, GDPR, general data protection regulation",
cc-snippet = "When it comes to detecting specific sensitive categories, such as those defined by GDPR: Health,
Politics, Religion, Sexual Orientation, Ethnicity, our classifier achieves a high classification
accuracy as well. For specific categories, such as Health (98\%), Politics (92\%), Religion (97\%), our
classifier achieves an accuracy that exceeds the basic classification accuracy between sensitive and
non-sensitive URLs (88\%).¶ • Applying our classifier on a Common Crawl snapshot of the English-speaking
Web (around 1 billion URLs), we identify 155 million sensitive URLs in more than 4 million
domains. Health, Religion, and Political Beliefs are the most popular categories with around 70
million, 35 million, and 32 million URLs respectively.¶ • Looking among the identified sensitive
URLs we reach the conclusion that sensitive URLs are handled as any other URL, without any special
provision for the privacy of users. For example, we show that 30\% of sensitive URLs are hosted in
domains that fail to use HTTPS. Also, in sensitive web pages with third-party cookies, 87\% of the
third-parties set at least one persistent cookie.",
}
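% The entry above trains a text classifier for sensitive URL categories on Curlie data. The paper's model
% is not reproduced here; purely as a generic illustration, the sketch below fits a bag-of-words
% classifier with scikit-learn on a tiny hypothetical sample, where real training data would come from
% the Curlie taxonomy.
%
% # Hypothetical sketch: generic text classifier as a stand-in for the paper's model.
% from sklearn.feature_extraction.text import TfidfVectorizer
% from sklearn.linear_model import LogisticRegression
% from sklearn.pipeline import make_pipeline
%
% texts = ["symptoms and treatment of diabetes",           # illustrative training texts
%          "election campaign and party platform",
%          "church services and prayer times",
%          "buy cheap flights and hotels"]
% labels = ["health", "politics", "religion", "non-sensitive"]
%
% clf = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
% clf.fit(texts, labels)
% print(clf.predict(["weekly prayer group meeting"]))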
@InProceedings{cc:Nagel:2020:web-crawler-process-web-archives,
title = "{Experiments using a Distributed Web Crawler to Process and Index Web Archives}",
author = "Sebastian Nagel",
year = "2020",
publisher = "Zenodo",
booktitle = "Open Search Symposium 2020, 12-14 October 2020, CERN, Geneva, Switzerland",
doi = "10.5281/zenodo.4609371",
URL = "https://doi.org/10.5281/zenodo.4609371",
slides = "https://indico.cern.ch/event/883268/contributions/3995764/attachments/2204558/3729718/2020-10-13--1150--Presentation--Nagel-S--osssym2020-sn-web-archives.pdf",
cc-author-affiliation = "Common Crawl",
cc-class = "web crawling, web archiving",
}
@InProceedings{cc:RothBarronCalzavaraNikiforakisEtAl:2020:Complex-security-policy,
title = "Complex security policy? a longitudinal analysis of deployed content security policies",
author = "Roth, Sebastian and Barron, Timothy and Calzavara, Stefano and Nikiforakis, Nick and Stock, Ben",
booktitle = "Proceedings of the 27th Network and Distributed System Security Symposium (NDSS)",
year = "2020",
doi = "10.14722/ndss.2020.23046",
URL = "https://par.nsf.gov/biblio/10173479",
pdf = "https://www.ndss-symposium.org/wp-content/uploads/2020/02/23046.pdf",
abstract = "The Content Security Policy (CSP) mechanism was developed as a mitigation against script injection
attacks in 2010. In this paper, we leverage the unique vantage point of the Internet Archive to conduct
a historical and longitudinal analysis of how CSP deployment has evolved for a set of 10,000 highly
ranked domains. In doing so, we document the long-term struggle site operators face when trying to
roll out CSP for content restriction and highlight that even seemingly secure whitelists can be
bypassed through expired or typo domains. Next to these new insights, we also shed light on the usage
of CSP for other use cases, in particular, TLS enforcement and framing control. Here, we find that CSP
can be easily deployed to fit those security scenarios, but both lack widespread adoption.
Specifically, while the underspecified and thus inconsistently implemented X-Frame-Options header is
increasingly used on the Web, CSP’s well-specified and secure alternative cannot keep up. To
understand the reasons behind this, we run a notification campaign and subsequent survey, concluding
that operators have often experienced the complexity of CSP (and given up), utterly unaware of the
easy-to-deploy components of CSP. Hence, we find the complexity of secure, yet functional content
restriction gives CSP a bad reputation, resulting in operators not leveraging its potential to secure a
site against the non-original attack vectors.",
cc-snippet = "To determine this IA-specific influence, we chose a second archive service to corroborate the IA’s
data. In particular, Common Crawl (CC) [10] has been collecting snapshots of popular sites since 2013.
For each date on which we found a CSP in the IA, we queried the CC API for a matching snapshot.
Overall, we found 38,129 overlapping snapshots for 940 sites. Out of these, 729 (1.9\%) on 127 sites
were inconsistent between the two archives. For 96 cases the difference was the lack of
block-all-mixed-content or upgrade-insecure-requests in the CC data. Further investigation showed that
in the IA, these directives were separated from the remaining CSP with a comma instead of a semicolon.
This likely relates to the IA joining headers with the same name with a comma. For those pages, we
could always only find a single CSP header in the CC response. Moreover, starting from August 2018,
these sites still used the aforementioned directives in the IA data, but CC returned two CSP headers
(one including only those directives). Hence, we speculate this relates to a bug in CC, which was fixed
around August 2018.",
cc-author-affiliation = "CISPA Helmholtz Center for Information Security, Germany; Stony Brook University, USA;
Università Ca’ Foscari, Venezia, Italy",
cc-class = "computer-security/internet-security, web-science",
}
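% The snippet above cross-checks CSP headers between Internet Archive captures and matching Common Crawl
% snapshots. As a rough sketch of one half of that comparison, the code below looks up a URL in the CDX
% index, fetches the corresponding WARC record by byte range from the public bucket, and reads its
% Content-Security-Policy response header; the crawl label is a placeholder, not one used in the study.
%
% # Hypothetical sketch: read the CSP header of one Common Crawl capture.
% import io
% import json
% import requests
% from warcio.archiveiterator import ArchiveIterator
%
% INDEX = "https://index.commoncrawl.org/CC-MAIN-2020-05-index"   # example crawl label
%
% def csp_header(url):
%     resp = requests.get(INDEX, params={"url": url, "output": "json"}, timeout=60)
%     resp.raise_for_status()
%     capture = json.loads(resp.text.splitlines()[0])               # first capture only
%     start = int(capture["offset"])
%     end = start + int(capture["length"]) - 1
%     warc = requests.get("https://data.commoncrawl.org/" + capture["filename"],
%                         headers={"Range": "bytes=%d-%d" % (start, end)}, timeout=60)
%     for record in ArchiveIterator(io.BytesIO(warc.content)):
%         return record.http_headers.get_header("Content-Security-Policy")
%
% print(csp_header("github.com"))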