bib/cc2018.bib

@Article{cc:AbdouKulmizevRavishankarAbzianidzeEtAl:2018:semantic-tagging,
  title        = {What can we learn from Semantic Tagging?},
  author       = {Abdou, Mostafa and Kulmizev, Artur and Ravishankar, Vinit and Abzianidze, Lasha and Bos, Johan},
  journal      = {arXiv preprint arXiv:1808.09716},
  year         = {2018},
  URL          = {https://arxiv.org/abs/1808.09716},
  cc-author-affiliation = {University of Groningen, The Netherlands; University of Copenhagen, Denmark; University of
                 Oslo, Norway},
  cc-derived-dataset-used = {GloVe-word-embeddings},
  cc-class     = {nlp/semantics, nlp/word-embeddings, nlp/semantic-tagging},
}

@InProceedings{cc:AgrawalAnPapagelis:2018:emotion-enriched-word-representations,
  author       = {Agrawal, Ameeta and An, Aijun and Papagelis, Manos},
  title        = {Learning emotion-enriched word representations},
  booktitle    = {Proceedings of the 27th International Conference on Computational Linguistics},
  pages        = {950--961},
  year         = {2018},
  publisher    = {Association for Computational Linguistics},
  URL          = {https://www.aclweb.org/anthology/C18-1081},
  abstract     = {Most word representation learning methods are based on the distributional hypothesis in linguistics,
                 according to which words that are used and occur in the same contexts tend to possess similar meanings.
                 As a consequence, emotionally dissimilar words, such as “happy” and “sad” occurring in similar
                 contexts would purport more similar meaning than emotionally similar words, such as “happy” and
                 “joy”. This complication leads to rather undesirable outcome in predictive tasks that relate to
                 affect (emotional state), such as emotion classification and emotion similarity. In order to address
                 this limitation, we propose a novel method of obtaining emotion-enriched word representations, which
                 projects emotionally similar words into neighboring spaces and emotionally dissimilar ones far apart.
                 The proposed approach leverages distant supervision to automatically obtain a large training dataset of
                 text documents and two recurrent neural network architectures for learning the emotion-enriched
                 representations. Through extensive evaluation on two tasks, including emotion classification and
                 emotion similarity, we demonstrate that the proposed representations outperform several competitive
                 general-purpose and affective word representations.},
  cc-author-affiliation = {York University, Toronto, Canada},
  cc-derived-dataset-used = {GloVe-word-embeddings},
  cc-class     = {nlp/word-embeddings, nlp/emotion-detection, nlp/sentiment-analysis},
}

@InProceedings{cc:AlohalyTakabiBlanco:2018:learning-ABAC-policies,
  author       = {Alohaly, Manar and Takabi, Hassan and Blanco, Eduardo},
  title        = {A Deep Learning Approach for Extracting Attributes of {ABAC} Policies},
  booktitle    = {Proceedings of the 23rd {ACM} Symposium on Access Control Models and Technologies},
  series       = {SACMAT '18},
  year         = {2018},
  ISBN         = {978-1-4503-5666-4},
  location     = {Indianapolis, Indiana, USA},
  pages        = {137--148},
  numpages     = {12},
  URL          = {https://doi.org/10.1145/3205977.3205984},
  doi          = {10.1145/3205977.3205984},
  acmid        = {3205984},
  publisher    = {ACM},
  address      = {New York, NY, USA},
  keywords     = {access control policy, attribute-based access control, deep learning, natural language processing,
                 policy authoring, relation extraction},
  cc-author-affiliation = {University of North Texas, USA},
  cc-class     = {nlp/machine-translation, computer-security/access-restrictions},
}

@Article{cc:AlshomaryVolskeLichtWachsmuthEtAl:2018:Wikipedia-text-reuse,
  title        = {Wikipedia text reuse: within and without},
  author       = {Alshomary, Milad and Völske, Michael and Licht, Tristan and Wachsmuth, Henning and Stein, Benno and
                 Hagen, Matthias and Potthast, Martin},
  journal      = {arXiv preprint arXiv:1812.09221},
  year         = {2018},
  URL          = {https://link.springer.com/chapter/10.1007/978-3-030-15712-8_49},
  pdf          = {https://webis.de/downloads/publications/papers/stein_2019c.pdf},
  cc-class     = {web-mining, ir/duplicate-detection},
  cc-author-affiliation = {Paderborn University, Germany; Bauhaus-Universität Weimar, Germany;
                 Martin-Luther-Universität Halle-Wittenberg, Germany; Leipzig University, Germany},
  abstract     = {We study text reuse related to Wikipedia at scale by compiling the first corpus of text reuse cases
                 within Wikipedia as well as without (i.e., reuse of Wikipedia text in a sample of the Common Crawl). To
                 discover reuse beyond verbatim copy and paste, we employ state-of-the-art text reuse detection
                 technology, scaling it for the first time to process the entire Wikipedia as part of a distributed
                 retrieval pipeline. We further report on a pilot analysis of the 100 million reuse cases inside, and
                 the 1.6 million reuse cases outside Wikipedia that we discovered. Text reuse inside Wikipedia gives
                 rise to new tasks such as article template induction, fixing quality flaws, or complementing
                 Wikipedia’s ontology. Text reuse outside Wikipedia yields a tangible metric for the emerging field of
                 quantifying Wikipedia’s influence on the web. To foster future research into these tasks, and for
                 reproducibility’s sake, the Wikipedia text reuse corpus and the retrieval pipeline are made freely
                 available.},
  cc-snippet   = {To foster research into Wikipedia text reuse, we compiled the first Wikipedia text reuse corpus,
                 obtained from comparing the entire Wikipedia to itself as well as to a 10\%-sample of the Common
                 Crawl.},
}

@Article{cc:AmatuniHeBergelson:2018:vector-space-representations,
  author       = {Amatuni, Andrei and He, Estelle and Bergelson, Elika},
  title        = {Preserved Structure Across Vector Space Representations},
  journal      = {arXiv preprint arXiv:1802.00840},
  year         = {2018},
  URL          = {https://arxiv.org/abs/1802.00840},
  cc-author-affiliation = {Duke University},
  cc-derived-dataset-used = {GloVe-word-embeddings},
  cc-class     = {nlp/semantics, nlp/word-embeddings},
}

@Misc{cc:AmmarMcSherrySalihogluJoglekar:2018:subgraph-queries,
  author       = {Ammar, Khaled and McSherry, Frank and Salihoglu, Semih and Joglekar, Manas},
  title        = {Distributed Evaluation of Subgraph Queries Using Worst-case Optimal Low-Memory Dataflows},
  year         = {2018},
  eprint       = {1802.03760},
  archivePrefix = {arXiv},
  URL          = {https://arxiv.org/pdf/1802.03760.pdf},
  cc-derived-dataset-used = {WDC-hyperlinkgraph},
  cc-class     = {graph-processing},
  cc-author-affiliation = {University of Waterloo, Canada; ETH Zürich, Switzerland; Google, Inc.},
}

@Article{cc:AmmarMcSherrySalihogluJoglekar:2018:subgraph-queries-2,
  author       = {Ammar, Khaled and McSherry, Frank and Salihoglu, Semih and Joglekar, Manas},
  title        = {Distributed evaluation of subgraph queries using worst-case optimal low-memory dataflows},
  journal      = {Proceedings of the VLDB Endowment},
  volume       = {11},
  number       = {6},
  pages        = {691--704},
  year         = {2018},
  publisher    = {VLDB Endowment},
  URL          = {https://dl.acm.org/citation.cfm?id=3199520},
  cc-author-affiliation = {University of Waterloo, Canada; ETH Zürich, Switzerland; Google, Inc.},
  cc-derived-dataset-used = {WDC-hyperlinkgraph},
  cc-class     = {graph-processing},
}

@Article{cc:AnilPereyraPassosOrmandiEtAl:2018:large-scale-distributed-neural,
  title        = {Large scale distributed neural network training through online distillation},
  author       = {Anil, Rohan and Pereyra, Gabriel and Passos, Alexandre and Ormandi, Robert and Dahl, George E. and
                 Hinton, Geoffrey E.},
  journal      = {arXiv preprint arXiv:1804.03235},
  year         = {2018},
  cc-dataset-used = {CC-MAIN-2017-26},
  eprint       = {1804.03235},
  archivePrefix = {arXiv},
  URL          = {https://arxiv.org/abs/1804.03235},
  cc-class     = {nlp/neural-networks},
  cc-author-affiliation = {Google; Google Brain; Google DeepMind},
}

@InProceedings{cc:ArshadMirheidariLauingerCrispoEtAl:2018:large-scale-analysis-of-style,
  author       = {Arshad, Sajjad and Mirheidari, Seyed Ali and Lauinger, Tobias and Crispo, Bruno and Kirda, Engin and
                 Robertson, William},
  title        = {Large-Scale Analysis of Style Injection by Relative Path Overwrite},
  booktitle    = {Proceedings of the 2018 World Wide Web Conference},
  series       = {WWW '18},
  location     = {Lyon, France},
  pages        = {237--246},
  numpages     = {10},
  year         = {2018},
  publisher    = {International World Wide Web Conferences Steering Committee},
  ISBN         = {978-1-4503-5639-8},
  doi          = {10.1145/3178876.3186090},
  URL          = {https://doi.org/10.1145/3178876.3186090},
  keywords     = {relative path overwrite, scriptless attack, style injection},
  cc-author-affiliation = {Northeastern University, Boston, MA, USA; University of Trento, Trento, Italy},
  cc-dataset-used = {CC-MAIN-2016-36},
  cc-snippet   = {We extract pages using relative-path stylesheets from the Common Crawl dataset [9], automatically test
                 if style directives can be injected using RPO, and determine whether they are interpreted by the
                 browser. [...] For finding the initial seed set of candidate pages with relative-path stylesheets, we
                 leverage the Common Crawl from August 2016, which contains more than 1.6 billion pages. By using an
                 existing dataset, we can quickly identify candidate pages without creating any web crawl traffic. We
                 use a Java HTML parser to filter any pages containing only inline CSS or stylesheets referenced by
                 absolute URLs, leaving us with over 203 million pages on nearly 6 million sites.},
  cc-class     = {web-science, computer-security/web-application-security},
}

@InProceedings{cc:ArtetxeLabakaAgirre:2018:bilingual-word-embedding-mappings,
  author       = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
  title        = {Generalizing and improving bilingual word embedding mappings with a multi-step framework of linear
                 transformations},
  booktitle    = {Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence (AAAI-18)},
  year         = {2018},
  URL          = {https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16935/16781},
  cc-author-affiliation = {University of the Basque Country, Spain},
  cc-class     = {nlp/semantics, nlp/word-embeddings, nlp/bilingual-word-embeddings},
}

@Article{cc:ArtetxeLabakaAgirre:2018:robust-self-learning-method,
  author       = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
  title        = {A robust self-learning method for fully unsupervised cross-lingual mappings of word embeddings},
  journal      = {arXiv preprint arXiv:1805.06297},
  year         = {2018},
  URL          = {https://arxiv.org/abs/1805.06297},
  cc-author-affiliation = {University of the Basque Country, Spain},
  cc-derived-dataset-used = {WMT-16-translation-task-common-crawl-corpus},
  cc-class     = {nlp/semantics, nlp/word-embeddings, nlp/bilingual-word-embeddings},
}

@Article{cc:ArtetxeLabakaLopez-GazpioAgirre:2018:linguistic-information-in-word-embeddings,
  author       = {Artetxe, Mikel and Labaka, Gorka and Lopez-Gazpio, Iñigo and Agirre, Eneko},
  title        = {Uncovering divergent linguistic information in word embeddings with lessons for intrinsic and
                 extrinsic evaluation},
  journal      = {arXiv preprint arXiv:1809.02094},
  year         = {2018},
  URL          = {https://arxiv.org/abs/1809.02094},
  cc-author-affiliation = {University of the Basque Country, Spain},
  cc-derived-dataset-used = {GloVe-word-embeddings, fastText-word-embeddings},
  cc-class     = {nlp/semantics, nlp/word-embeddings},
}

@Article{cc:ArtetxeSchwenk:2018:parallel-corpus-mining,
  author       = {Artetxe, Mikel and Schwenk, Holger},
  title        = {Margin-based parallel corpus mining with multilingual sentence embeddings},
  journal      = {arXiv preprint arXiv:1811.01136},
  year         = {2018},
  URL          = {https://arxiv.org/abs/1811.01136},
  cc-author-affiliation = {University of the Basque Country, Spain; Facebook AI Research},
  cc-class     = {cc-cited-not-used, nlp/word-embeddings, nlp/sentence-embeddings, nlp/parallel-corpus},
}

@Article{cc:BaharBrixNey:2018:two-dimensional-sequence-to-sequence-model,
  author       = {Bahar, Parnia and Brix, Christopher and Ney, Hermann},
  title        = {Towards two-dimensional sequence to sequence model in neural machine translation},
  journal      = {arXiv preprint arXiv:1810.03975},
  year         = {2018},
  URL          = {https://arxiv.org/abs/1810.03975},
  cc-author-affiliation = {RWTH Aachen University, Germany},
  cc-derived-dataset-used = {WMT-16-translation-task-common-crawl-corpus},
  cc-class     = {nlp/machine-translation},
}

@Book{cc:Balog:2018:entity-oriented-search,
  title        = {Entity-oriented search},
  author       = {Balog, Krisztian},
  year         = {2018},
  publisher    = {Springer},
  doi          = {10.1007/978-3-319-93935-3},
  URL          = {https://link.springer.com/content/pdf/10.1007/978-3-319-93935-3.pdf},
  cc-author-affiliation = {University of Stavanger, Norway},
  cc-dataset-used = {CC-MAIN-2017-22},
  cc-snippet   = {Common Crawl – Common Crawl⁵ is a nonprofit organization that regularly crawls the Web and makes the
                 data publicly available. The datasets are hosted on Amazon S3 as part of the Amazon Public Datasets
                 program.⁶ As of May 2017, the crawl contains 2.96 billion web pages and over 250 TB of uncompressed
                 content (in WARC format). The Web Data Commons project⁷ extracts structured data from the Common
                 Crawl and makes those publicly available (e.g., the Hyperlink Graph Dataset and the Web Table
                 Corpus).},
  cc-class     = {information-retrieval, nlp/named-entity-recognition, linked data},
}

@Article{cc:BarbosaCrescenziDongMerialdoEtAl:2018:big-data-integration,
  title        = {Big Data Integration for Product Specifications},
  author       = {Barbosa, Luciano and Crescenzi, Valter and Dong, Xin Luna and Merialdo, Paolo and Piai, Federico and
                 Qiu, Disheng and Shen, Yanyan and Srivastava, Divesh},
  journal      = {IEEE Data Engineering Bulletin},
  volume       = {41},
  number       = {2},
  pages        = {71--81},
  year         = {2018},
  URL          = {http://sites.computer.org/debull/A18june/A18JUN-CD.pdf#page=73},
  cc-author-affiliation = {Universidade Federal de Pernambuco, Brazil; Roma Tre University, Italy; Amazon; Wanderio;
                 Shanghai Jiao Tong University; AT&T Labs – Research},
  cc-snippet   = {About 68\% of the sources discovered by our approach were not present in Common Crawl. Only 20\% of
                 our sources contained fewer pages than the same sources in Common Crawl, and a very small fraction of
                 the pages in these sources were product pages: on a sample set of 12 websites where Common Crawl
                 presented more pages than in our dataset, we evaluated that only 0.8\% of the pages were product
                 pages.},
  cc-class     = {ir/information-extraction, ir/data-integration},
}

@InProceedings{cc:BarbosaCrescenziDongMerialdoEtAl:2018:product-dataset,
  title        = {Lessons Learned and Research Agenda for Big Data Integration of Product Specifications (Discussion
                 Paper)},
  author       = {Barbosa, Luciano and Crescenzi, Valter and Dong, Xin Luna and Merialdo, Paolo and Piai, Federico and
                 Qiu, Disheng and Shen, Yanyan and Srivastava, Divesh},
  booktitle    = {Proceedings of the 26th Italian Symposium on Advanced Database Systems (SEBD 2018)},
  series       = {CEUR Workshop Proceedings},
  volume       = {2161},
  year         = {2018},
  cc-snippet   = {Building a Benchmark Product Dataset – We compared the contents of our dataset with pages in Common
                 Crawl, an open repository of web crawl data. About 68\% of the sources discovered by our approach were
                 not present in Common Crawl. Only 20\% of our sources contained fewer pages than the same sources in
                 Common Crawl, and a very small fraction of the pages in these sources were product pages: on a sample
                 set of 12 websites where Common Crawl presented more pages than in our dataset, we evaluated that only
                 0.8\% of the pages were product pages.},
  URL          = {http://ceur-ws.org/Vol-2161/paper29.pdf},
  cc-author-affiliation = {Universidade Federal de Pernambuco, Brazil; Roma Tre University, Italy; Amazon; Wanderio;
                 Shanghai Jiao Tong University; AT&T Labs – Research},
  cc-class     = {ir/information-extraction, ir/data-integration},
}

@TechReport{cc:BatikasClaussenPeukert:2018:online-piracy,
  author       = {Batikas, Michail and Claussen, Jörg and Peukert, Christian},
  title        = {Follow The Money: Online Piracy and Self-Regulation in the Advertising Industry},
  year         = {2018},
  pdf          = {http://www.cesifo-group.de/DocDL/cesifo1_wp6852.pdf},
  institution  = {CESifo},
  type         = {Working Paper},
  number       = {6852},
  cc-snippet   = {We obtain archived versions of the HTML source code of all URLs for each domain in our gross sample
                 from Common Crawl, a project that has crawled billions of webpages periodically since summer 2013.},
  cc-class     = {web-science},
  cc-author-affiliation = {LMU Munich, Germany; UCP – Católica Lisbon School of Business and Economics, Lisboa,
                 Portugal},
}

@InProceedings{cc:BattleDuanMirandaMukushevaEtAl:2018:automated-extraction-of-visualizations,
  title        = {Beagle: Automated Extraction and Interpretation of Visualizations from the Web},
  author       = {Battle, Leilani and Duan, Peitong and Miranda, Zachery and Mukusheva, Dana and Chang, Remco and
                 Stonebraker, Michael},
  booktitle    = {Proceedings of the 2018 CHI Conference on Human Factors in Computing Systems},
  pages        = {594},
  year         = {2018},
  publisher    = {ACM},
  abstract     = {``How common is interactive visualization on the web?'' ``What is the most popular visualization
                 design?'' ``How prevalent are pie charts really?'' These questions intimate the role of interactive
                 visualization in the real (online) world. In this paper, we present our approach (and findings) to
                 answering these questions. First, we introduce Beagle, which mines the web for SVG-based visualizations
                 and automatically classifies them by type (i.e., bar, pie, etc.). With Beagle, we extract over 41,000
                 visualizations across five different tools and repositories, and classify them with 85\% accuracy,
                 across 24 visualization types. Given this visualization collection, we study usage across tools. We
                 find that most visualizations fall under four types: bar charts, line charts, scatter charts, and
                 geographic maps. Though controversial, pie charts are relatively rare for the visualization tools that
                 were studied. Our findings also suggest that the total visualization types supported by a given tool
                 could factor into its ease of use. However this effect appears to be mitigated by providing a variety
                 of diverse expert visualization examples to users.},
  URL          = {https://dl.acm.org/citation.cfm?id=3174168},
  cc-author-affiliation = {University of Washington, Seattle, WA, USA; Massachusetts Institute of Technology, Cambridge,
                 MA, USA; Tufts University, Medford, MA, USA},
  cc-snippet   = {As found with other web crawling projects, such as the Common Crawl¹, our web crawls represent a
                 specific point in time for the websites [...]},
  cc-class     = {web-science, web-crawling},
}

@Article{cc:BellomariniFayzrakhmanovGottlobKravchenkoEtAl:2018:data-science-Vadalog,
  title        = {Data Science with Vadalog: Bridging Machine Learning and Reasoning},
  author       = {Bellomarini, Luigi and Fayzrakhmanov, Ruslan R. and Gottlob, Georg and Kravchenko, Andrey and
                 Laurenza, Eleonora and Nenov, Yavor and Reissfelder, Stephane and Sallinger, Emanuel and Sherkhonov,
                 Evgeny and Wu, Lianlong},
  journal      = {arXiv preprint arXiv:1807.08712},
  year         = {2018},
  URL          = {https://arxiv.org/abs/1807.08712},
  cc-author-affiliation = {University of Oxford, United Kingdom; Banca d’Italia, Italy; TU Wien, Austria},
  cc-snippet   = {Enterprises increasingly depend on intelligent information systems that operationalise corporate
                 knowledge as a unified source across system boundaries. [...] To maintain their competitive edge,
                 companies need to incorporate multiple heterogeneous sources of information, including [...] external
                 streams of unstructured data (e.g., news and social media feeds, and Common Crawl¹), [...]},
  cc-class     = {ai/semantic-reasoning, ai/machine-learning},
}

@InProceedings{cc:BentivogliCettoloFedericoChristian:2018:machine-translation-human-evaluation,
  title        = {Machine Translation Human Evaluation: an investigation of evaluation based on Post-Editing and its
                 relation with Direct Assessment},
  author       = {Bentivogli, Luisa and Cettolo, Mauro and Federico, Marcello and Federmann, Christian},
  booktitle    = {International Workshop on Spoken Language Translation},
  year         = {2018},
  URL          = {https://workshop2018.iwslt.org/downloads/Proceedings_IWSLT_2018.pdf#page=77},
  cc-author-affiliation = {FBK, Trento, Italy; Amazon AI, East Palo Alto, CA, USA; Microsoft Cloud+AI, Redmond, WA,
                 USA},
  cc-derived-dataset-used = {WMT-16-translation-task-common-crawl-corpus},
  cc-class     = {nlp/machine-translation},
}

@InProceedings{cc:BevendorffSteinHagenPotthast:2018:Elastic-ChatNoir,
  author       = {Bevendorff, Janek and Stein, Benno and Hagen, Matthias and Potthast, Martin},
  title        = {Elastic ChatNoir: Search Engine for the ClueWeb and the Common Crawl},
  booktitle    = {Advances in Information Retrieval - 40th European Conference on {IR} Research, {ECIR} 2018, Grenoble,
                 France, March 26-29, 2018, Proceedings},
  pages        = {820--824},
  year         = {2018},
  URL          = {https://doi.org/10.1007/978-3-319-76941-7_83},
  doi          = {10.1007/978-3-319-76941-7_83},
  cc-dataset-used = {CC-MAIN-2015-11},
  cc-class     = {information-retrieval/search-engine},
  cc-author-affiliation = {Bauhaus-Universität Weimar, Germany; Leipzig University, Germany},
}

@Article{cc:BoldiMarinoSantiniVigna:2018:BUbiNG-Massive-crawling-for,
  title        = {{BUbiNG}: Massive crawling for the masses},
  author       = {Boldi, Paolo and Marino, Andrea and Santini, Massimo and Vigna, Sebastiano},
  journal      = {ACM Transactions on the Web (TWEB)},
  volume       = {12},
  number       = {2},
  year         = {2018},
  publisher    = {ACM},
  URL          = {https://dl.acm.org/citation.cfm?id=3160017},
  cc-author-affiliation = {Università degli Studi di Milano, Italy},
  cc-derived-dataset-cited = {WDC-hyperlinkgraph},
  cc-class     = {web-crawling, web-science/hyperlinkgraph},
}

@TechReport{cc:BrauneFraserHaddow:2018:improving-translation,
  title        = {{D1.2}: Report on Improving Translation with Monolingual Data},
  author       = {Braune, Fabienne and Fraser, Alex and Haddow, Barry},
  year         = {2018},
  institution  = {{HimL} Project},
  type         = {Project Deliverable},
  URL          = {http://www.himl.eu/files/D1.2_Using_Non_Parallel.pdf},
  cc-author-affiliation = {University of Edinburgh},
  cc-class     = {nlp/machine-translation},
}

@InProceedings{cc:BrychcinHercigSteinbergerKonkol:2018:UWB-at-SemEval-2018,
  author       = {Brychcín, Tomáš and Hercig, Tomáš and Steinberger, Josef and Konkol, Michal},
  title        = {{UWB} at SemEval-2018 Task 10: Capturing Discriminative Attributes from Word Distributions},
  booktitle    = {Proceedings of The 12th International Workshop on Semantic Evaluation},
  pages        = {935--939},
  year         = {2018},
  URL          = {http://www.aclweb.org/anthology/S18-1153},
  cc-author-affiliation = {University of West Bohemia, Czech Republic},
  cc-derived-dataset-used = {GloVe-word-embeddings},
  cc-class     = {nlp/semantics, nlp/word-embeddings},
}

@Article{cc:CafarellaHalevyLeeMadhavanEtAl:2018:ten-years-of-webtables,
  author       = {Cafarella, Michael and Halevy, Alon and Lee, Hongrae and Madhavan, Jayant and Yu, Cong and Wang, Daisy
                 Zhe and Wu, Eugene},
  title        = {Ten years of webtables},
  journal      = {Proceedings of the VLDB Endowment},
  volume       = {11},
  number       = {12},
  pages        = {2140--2149},
  year         = {2018},
  publisher    = {VLDB Endowment},
  URL          = {https://dl.acm.org/citation.cfm?id=3275614},
  pdf          = {http://web.eecs.umich.edu/~michjc/papers/p2140-cafarella.pdf},
  cc-author-affiliation = {Google Inc.; University of Michigan, USA; Megagon Labs; University of Florida, USA; Columbia
                 University, USA},
  cc-derived-dataset-cited = {WDCWebTables, DresdenWebTableCorpus},
  cc-snippet   = {Several researchers produced web tables from the public Common Crawl [1, 24, 15], thereby making them
                 available to a broad audience outside the large Web companies.},
  cc-class     = {semantic web, web tables, web-mining},
}

@Article{cc:CasalnuovoSagaeDevanbu:2018:difference-between-natural-and-programming-language-corpora,
  title        = {Studying the Difference Between Natural and Programming Language Corpora},
  author       = {Casalnuovo, Casey and Sagae, Kenji and Devanbu, Prem},
  journal      = {Empirical Software Engineering},
  pages        = {1--46},
  publisher    = {Springer},
  year         = {2018},
  URL          = {https://link.springer.com/article/10.1007/s10664-018-9669-7},
  pdf          = {https://arxiv.org/pdf/1806.02437.pdf},
  cc-author-affiliation = {University of California, Davis, USA},
  cc-class     = {nlp/corpus-construction, nlp/text-corpora, programming-languages, nlp/syntax},
  cc-derived-dataset-used = {conll-2017-shared-task},
  cc-snippet   = {The German and Spanish corpora were selected from a sample of files from the unlabeled datasets from
                 the ConLL 2017 Shared Task (Ginter et al, 2017), which consist of web text obtained from
                 CommonCrawl.⁸ Like the 1 billion token English corpus, we selected a random subsample to make these
                 corpora size comparable with our other corpora. In this sample, we excluded files from the Wikipedia
                 translations, as we observed Wikipedia formatting mixed in with some of the files.},
}

@Article{cc:ChenZhangWangZuoEtAl:2018:image-captioning,
  author       = {Chen, Xinghan and Zhang, Mingxing and Wang, Zheng and Zuo, Lin and Li, Bo and Yang, Yang},
  title        = {Leveraging Unpaired Out-of-Domain Data for Image Captioning},
  journal      = {Pattern Recognition Letters},
  year         = {2018},
  publisher    = {Elsevier},
  URL          = {https://www.sciencedirect.com/science/article/abs/pii/S0167865518309358},
  cc-author-affiliation = {University of Electronic Science and Technology of China (UESTC), Chengdu, PR China},
  cc-class     = {nlp/text-generation, ai/image-classification, nlp/image-captioning, ai/deep-learning},
}

@InProceedings{cc:ChiHuangChenWuEtAl:2018:Zewen-at-SemEval-2018,
  author       = {Chi, Zewen and Huang, Heyan and Chen, Jiangui and Wu, Hao and Wei, Ran},
  title        = {Zewen at SemEval-2018 Task 1: An Ensemble Model for Affect Prediction in Tweets},
  booktitle    = {Proceedings of The 12th International Workshop on Semantic Evaluation},
  pages        = {313--318},
  year         = {2018},
  URL          = {http://www.aclweb.org/anthology/S18-1046},
  cc-author-affiliation = {Beijing Institute of Technology, China},
  cc-derived-dataset-used = {GloVe-word-embeddings},
  cc-class     = {nlp, nlp/sentiment-analysis, nlp/emotion-detection, nlp/word-embeddings},
}

@InProceedings{cc:Chinea-RiosPerisCasacuberta:2018:automatic-metrics,
  title        = {Are Automatic Metrics Robust and Reliable in Specific Machine Translation Tasks?},
  author       = {Chinea-Rios, Mara and Peris, Alvaro and Casacuberta, Francisco},
  booktitle    = {Proceedings of the 21st Annual Conference of the European Association for Machine Translation},
  year         = {2018},
  pages        = {89--98},
  publisher    = {European Association for Machine Translation},
  URL          = {http://rua.ua.es/dspace/handle/10045/76022},
  pdf          = {http://eamt2018.dlsi.ua.es/proceedings-eamt2018.pdf},
  cc-author-affiliation = {Universitat d'Alacant, Spain},
  cc-class     = {nlp/machine-translation},
  cc-snippet   = {In our setup, we trained a PB-SMT and a NMT system on the same data, from a general corpus extracted
                 from websites (Common Crawl).},
}

@Article{cc:ChollampattNg:2018:neural-network-grammatical-error-correction,
  author       = {Chollampatt, Shamil and Ng, Hwee Tou},
  title        = {A multilayer convolutional encoder-decoder neural network for grammatical error correction},
  journal      = {arXiv preprint arXiv:1801.08831},
  year         = {2018},
  URL          = {https://arxiv.org/abs/1801.08831},
  cc-author-affiliation = {NUS Graduate School for Integrative Sciences and Engineering; Department of Computer Science,
                 National University of Singapore},
  cc-snippet   = {We also make use of the larger English corpora from Wikipedia (1.78B words) for pre-training the word
                 embeddings, and a subset of the Common Crawl corpus (94B words) for training the language model for
                 rescoring.},
  cc-class     = {nlp/grammatical-error-correction, nlp/word-embeddings, nlp/language-model},
}

@InProceedings{cc:ClarksonGentileGruhlRistoskiEtAl:2018:user-centric-ontology-population,
  title        = {User-Centric Ontology Population},
  author       = {Clarkson, Kenneth and Gentile, Anna Lisa and Gruhl, Daniel and Ristoski, Petar and Terdiman, Joseph
                 and Welch, Steve},
  booktitle    = {European Semantic Web Conference},
  pages        = {112--127},
  year         = {2018},
  publisher    = {Springer},
  URL          = {https://link.springer.com/chapter/10.1007/978-3-319-93417-4_8},
  doi          = {10.1007/978-3-319-93417-4_8},
  cc-author-affiliation = {IBM Research Almaden, San Jose, USA},
  cc-class     = {semantic web, cc-cited-not-used, ontology extraction},
}

@InProceedings{cc:CohenWiddows:2018:order-neural-word-embeddings,
  author       = {Cohen, Trevor and Widdows, Dominic},
  title        = {Bringing Order to Neural Word Embeddings with Embeddings Augmented by Random Permutations ({EARP})},
  booktitle    = {Proceedings of the 22nd Conference on Computational Natural Language Learning},
  pages        = {465--475},
  year         = {2018},
  URL          = {http://www.aclweb.org/anthology/K18-1045},
  cc-author-affiliation = {University of Washington, Seattle, USA; Grab, Inc., Seattle, WA, USA},
  cc-class     = {nlp/word-embeddings, cc-cited-not-used},
}

@Article{cc:ConneauKiela:2018:SentEval-evaluation-toolkit,
  author       = {Conneau, Alexis and Kiela, Douwe},
  title        = {SentEval: An evaluation toolkit for universal sentence representations},
  journal      = {arXiv preprint arXiv:1803.05449},
  year         = {2018},
  URL          = {https://arxiv.org/abs/1803.05449},
  cc-author-affiliation = {Facebook Artificial Intelligence Research},
  cc-derived-dataset-used = {GloVe-word-embeddings, fastText-word-embeddings},
  cc-class     = {nlp/word-embeddings, nlp/sentence-embeddings, nlp/evaluation},
}

@Article{cc:ConneauLampleRinottWilliamsEtAl:2018:cross-lingual-sentence-representations,
  title        = "{XNLI}: Evaluating Cross-lingual Sentence Representations",
  author       = "Conneau, Alexis and Lample, Guillaume and Rinott, Ruty and Williams, Adina and Bowman, Samuel R. and
                 Schwenk, Holger and Stoyanov, Veselin",
  journal      = "arXiv preprint arXiv:1809.05053",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1809.05053",
  cc-author-affiliation = "Facebook AI Research, USA; New York University, USA",
  cc-derived-dataset-used = "fastText-word-embeddings",
  cc-class     = "nlp/word-embeddings, nlp/sentence-embeddings",
}

@InProceedings{cc:ConoverHayesBlackburnSkomorochEtAl:2018:Pangloss-fast-entity-linking,
  title        = "Pangloss: Fast Entity Linking in Noisy Text Environments",
  author       = "Conover, Michael and Hayes, Matthew and Blackburn, Scott and Skomoroch, Pete and Shah, Sam",
  booktitle    = "Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining",
  pages        = "168--176",
  year         = "2018",
  URL          = "https://dl.acm.org/citation.cfm?id=3219899",
  organization = "ACM",
  cc-snippet   = "The Common Crawl datasets represents a sample of web crawl data containing raw web page data, metadata
                 and text extracts overseen by a 501(c)(3) nonprofit of the same name. Facilitating ease of access for
                 industrial practitioners, the dataset is hosted for free on Amazon Web Services’ Public Data Set
                 repository in addition to academic hosts the world over. As part of a batch Hadoop job run on a monthly
                 basis we filter the Common Crawl data (∼70TB) down to records which contain at least one hyperlink
                 that points to English Wikipedia. This corpus has proven particularly valuable as a source of signal
                 for associating tokens with knowledge base entries in the context of domain-specific, messy natural
                 language.",
  cc-author-affiliation = "Workday, Inc., San Francisco, CA, USA",
  cc-class     = "ir/information-extraction",
}

@InProceedings{cc:CorreaZanderSilva:2018:open-data-portals,
  title        = "Investigating open data portals automatically: a methodology and some illustrations",
  author       = "Correa, Andreiwid Sheffer and Zander, Pär-Ola and da Silva, Flavio Soares Correa",
  booktitle    = "Proceedings of the 19th Annual International Conference on Digital Government Research: Governance in
                 the Data Age",
  pages        = "82",
  year         = "2018",
  organization = "ACM",
  URL          = "https://dl.acm.org/citation.cfm?id=3209292",
  cc-author-affiliation = "University of Sao Paulo, Sao Paulo, Brazil; Aalborg University, Aalborg, Denmark",
  cc-class     = "open data, information retrieval",
}

@Article{cc:CulpepperDiazSmucker:2018:workshop-IR-Lorne,
  title        = "Research Frontiers in Information Retrieval: Report from the Third Strategic Workshop on Information
                 Retrieval in Lorne ({SWIRL} 2018)",
  author       = "Culpepper, J. Shane and Diaz, Fernando and Smucker, Mark D.",
  journal      = "ACM SIGIR Forum",
  volume       = "52",
  number       = "1",
  pages        = "34--90",
  year         = "2018",
  publisher    = "ACM",
  URL          = "http://doi.acm.org/10.1145/3274784.3274788",
  doi          = "10.1145/3274784.3274788",
  pdf          = "http://www.sigir.org/wp-content/uploads/2018/07/p034.pdf",
  cc-author-affiliation = "ACM",
  cc-class     = "cc-cited-not-used, information-retrieval",
}

@InProceedings{cc:Czech:2018:geotag-web-sized-corpus,
  title        = "An Approach to Geotag a Web Sized Corpus of Documents with Addresses in Randstad, Netherlands",
  author       = "Czech, Alexander",
  booktitle    = "Adjunct Proceedings of the 14th International Conference on Location Based Services",
  pages        = "184--188",
  year         = "2018",
  organization = "ETH Zurich",
  URL          = "https://doi.org/10.3929/ethz-b-000225615",
  cc-author-affiliation = "TU Wien, Austria",
  cc-snippet   = "Common Crawl is a non-profit organization that provides raw web crawling data on a monthly basis.
                 Their archives contain over 3.16 billion URLs with over 260 TiB of uncompressed content.",
  cc-class     = "ir/geotagging",
}

@Article{cc:DemirelCinbisIkizler-Cinbis:2018:zero-shot-object-detection,
  title        = "Zero-Shot Object Detection by Hybrid Region Embedding",
  author       = "Demirel, Berkan and Cinbis, Ramazan Gokberk and Ikizler-Cinbis, Nazli",
  journal      = "arXiv preprint arXiv:1805.06157",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1805.06157",
  cc-author-affiliation = "HAVELSAN Inc. Ankara, Turkey; Middle East Technical University Ankara, Turkey; Hacettepe
                 University Ankara, Turkey",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "ai/computer-vision, ai/pattern-recognition, nlp/word-embeddings",
}

@Article{cc:DenisovVuFont:2018:unsupervised-domain-adaptation-speech-recognition,
  title        = "Unsupervised Domain Adaptation by Adversarial Learning for Robust Speech Recognition",
  author       = "Denisov, Pavel and Vu, Ngoc Thang and Font, Marc Ferras",
  journal      = "arXiv preprint arXiv:1807.11284",
  URL          = "https://arxiv.org/abs/1807.11284",
  cc-author-affiliation = "University of Stuttgart, Germany",
  year         = "2018",
  cc-class     = "nlp, speech-recognition",
  cc-snippet   = "..., 197 millions words of Italian Deduplicated CommonCrawl Text are used to build Italian language
                 model.",
}

@Article{cc:DevHassanPhillips:2018:word-embedding-alignement,
  title        = "Absolute Orientation for Word Embedding Alignment",
  author       = "Dev, Sunipa and Hassan, Safia and Phillips, Jeff M",
  journal      = "arXiv preprint arXiv:1806.01330",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1806.01330",
  cc-author-affiliation = "University of Utah",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/semantics, nlp/word-embeddings",
}

@Article{cc:EdunovOttAuliGrangier:2018:understanding-back-translation,
  title        = "Understanding Back-Translation at Scale",
  author       = "Edunov, Sergey and Ott, Myle and Auli, Michael and Grangier, David",
  journal      = "arXiv preprint arXiv:1808.09381",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1808.09381",
  cc-author-affiliation = "Facebook AI Research, USA; Google Brain, Mountain View, CA, USA",
  cc-class     = "nlp/machine-translation",
}

@InProceedings{cc:EfremovaEndresVidasMelnik:2018:geo-tagging-address-extraction,
  title        = "A Geo-Tagging Framework for Address Extraction from Web Pages",
  author       = "Efremova, Julia and Endres, Ian and Vidas, Isaac and Melnik, Ofer",
  booktitle    = "Industrial Conference on Data Mining",
  pages        = "288--295",
  year         = "2018",
  publisher    = "Springer",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-319-95786-9_22",
  cc-snippet   = "Common Crawl is a public corpus, mostly stored on Amazon Web Services³. A subset of the CommonCrawl
                 dataset has schema information in the microdata format",
  cc-author-affiliation = "HERE Technologies, Amsterdam, The Netherlands",
  cc-class     = "semantic-web/microformats",
}

@Article{cc:El-ZantJaffres-RunserFrahmShepelyansky:2018:painters-Wikipedia-networks,
  title        = "Interactions and influence of world painters from the reduced Google matrix of Wikipedia networks",
  author       = "El Zant, Samer and Jaffrès-Runser, Katia and Frahm, Klaus M. and Shepelyansky, Dima L.",
  journal      = "IEEE Access",
  year         = "2018",
  publisher    = "IEEE",
  URL          = "https://ieeexplore.ieee.org/abstract/document/8449078",
  cc-author-affiliation = "Université de Toulouse, France",
  cc-class     = "web-science/hyperlinkgraph, graph-processing, cc-cited-not-used",
  abstract     = "This paper concentrates on extracting painting art history knowledge from the network structure of
                 Wikipedia. Therefore, we construct theoretical networks of webpages representing the hyper-linked
                 structure of articles of seven Wikipedia language editions. These seven networks are analyzed to
                 extract the most influential painters in each edition using Google matrix theory. Importance of
                 webpages of over 3000 painters is measured using the PageRank algorithm. The most influential painters
                 are enlisted and their ties are studied with the reduced Google matrix analysis. The reduced Google
                 matrix is a powerful method that captures both direct and hidden interactions between a subset of
                 selected nodes taking into account the indirect links between these nodes via the remaining part of
                 large global network. This method originates from the scattering theory of nuclear and mesoscopic
                 physics and field of quantum chaos. In this paper, we show that it is possible to extract from the
                 components of the reduced Google matrix meaningful information on the ties between these painters. For
                 instance, our analysis groups together painters that belong to the same painting movement and shows
                 meaningful ties between painters of different movements. We also determine the influence of painters on
                 world countries using link sensitivity between Wikipedia articles of painters and countries. The
                 reduced Google matrix approach allows to obtain a balanced view of various cultural opinions of
                 Wikipedia language editions. The world countries with the largest number of top painters of selected
                 seven Wikipedia editions are found to be Italy, France, and Russia. We argue that this approach gives
                 meaningful information about art and that it could be a part of extensive network analysis on human
                 knowledge and cultures.",
}

@Article{cc:Espana-BonetStillerHenning:2018:corpora-for-machine-translation,
  title        = "{M1.2} -- Corpora for the Machine Translation Engines",
  author       = "España-Bonet, Cristina and Stiller, Juliane and Henning, Sophie",
  year         = "2018",
  URL          = "https://www.clubs-project.eu/assets/publications/project/M1.2_MTcorpora_v4.0.pdf",
  cc-author-affiliation = "Universität des Saarlandes, Germany; Humboldt-Universität zu Berlin, Germany",
  cc-class     = "nlp/machine-translation, nlp/corpora",
  cc-derived-dataset-cited = "WMT-13-translation-task-common-crawl-corpus",
}

@Article{cc:EstevesReddyChawlaLehmann:2018:obfuscate-fake-news,
  title        = "Belittling the Source: Trustworthiness Indicators to Obfuscate Fake News on the Web",
  author       = "Esteves, Diego and Reddy, Aniketh Janardhan and Chawla, Piyush and Lehmann, Jens",
  journal      = "arXiv preprint arXiv:1809.00494",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1809.00494",
  cc-author-affiliation = "University of Bonn, Germany; University of Ohio, USA; Carnegie Mellon University, Pittsburgh,
                 USA",
  cc-class     = "nlp, text classification, content credibility, information retrieval",
  cc-snippet   = "PageRankCC: PageRank information computed through the CommonCrawl Corpus",
}

@InProceedings{cc:FaralliLefeverPaolo-Ponzetto:2018:MIsA-multilingual-IsA-extraction,
  title        = "{MIsA}: Multilingual {IsA} Extraction from Corpora",
  author       = "Faralli, Stefano and Lefever, Els and Ponzetto, Simone Paolo",
  booktitle    = "The Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
  pages        = "2040--2044",
  year         = "2018",
  organization = "European Language Resources Association (ELRA)",
  URL          = "https://biblio.ugent.be/publication/8562721",
  cc-author-affiliation = "University of Mannheim, Germany; Ghent University, Belgium",
  cc-class     = "nlp/semantics, data-mining, hypernymy",
  cc-derived-dataset-cited = "WDC-WebIsADb",
}

@InProceedings{cc:FayzrakhmanovSallingerSpencerFurcheEtAl:2018:browserless-web-data-extraction,
  title        = "Browserless web data extraction: challenges and opportunities",
  author       = "Fayzrakhmanov, Ruslan R. and Sallinger, Emanuel and Spencer, Ben and Furche, Tim and Gottlob, Georg",
  booktitle    = "Proceedings of the 2018 World Wide Web Conference on World Wide Web",
  pages        = "1095--1104",
  year         = "2018",
  organization = "International World Wide Web Conferences Steering Committee",
  URL          = "https://dl.acm.org/citation.cfm?id=3186008",
  cc-author-affiliation = "University of Oxford, Oxford, United Kingdom",
  cc-class     = "information retrieval, web-crawling, web-scraping, web-mining",
  cc-snippet   = "The random sites were chosen by randomly sampling URLs from the Common Crawl [10] search index
                 dataset, which includes around 3 billion web pages.",
}

@Article{cc:Funel:2018:analysis-web-graph,
  author       = "Funel, Agostino",
  title        = "Analysis of the Web Graph Aggregated by Host and Pay-Level Domain",
  journal      = "arXiv preprint arXiv:1802.05435",
  year         = "2018",
  eprint       = "1802.05435",
  archivePrefix = "arXiv",
  URL          = "https://arxiv.org/abs/1802.05435",
  cc-dataset-used = "hyperlinkgraph/cc-main-2017-aug-sep-oct/hostgraph,
                 hyperlinkgraph/cc-main-2017-aug-sep-oct/domaingraph",
  cc-class     = "web-science/hyperlinkgraph",
  cc-author-affiliation = "ENEA, Italy",
}

@Article{cc:GarciaGomez-Perez:2018:word-representations-scientific-publications,
  title        = "Not just about size -- {A} Study on the Role of Distributed Word Representations in the Analysis of
                 Scientific Publications",
  author       = "Garcia, Andres and Gomez-Perez, Jose Manuel",
  journal      = "arXiv preprint arXiv:1804.01772",
  year         = "2018",
  cc-derived-dataset-used = "fastText-word-embeddings, GloVe-word-embeddings",
  URL          = "https://arxiv.org/abs/1804.01772",
  cc-author-affiliation = "expertsystem.com, Madrid, Spain",
  cc-class     = "nlp/word-embeddings",
}

@InProceedings{cc:GarciaGomez-Perez:2018:word-representations-scientific-publications-2,
  title        = "Not just about size -- {A} Study on the Role of Distributed Word Representations in the Analysis of
                 Scientific Publications",
  author       = "Garcia, Andres and Gomez-Perez, Jose Manuel",
  booktitle    = "Proceedings of the First Workshop on Deep Learning for Knowledge Graphs and Semantic Technologies
                 (DL4KGS) co-located with the 15th Extended Semantic Web Conference (ESWC 2018) Heraklion, Crete, Greece,
                 June 4, 2018",
  year         = "2018",
  cc-derived-dataset-used = "fastText-word-embeddings, GloVe-word-embeddings",
  pdf          = "http://ceur-ws.org/Vol-2106/paper3.pdf",
  cc-same-as   = "cc:GarciaGomez-Perez:2018:word-representations-scientific-publications",
  cc-author-affiliation = "expertsystem.com, Madrid, Spain",
  cc-class     = "nlp/word-embeddings",
}

@Article{cc:GargSchiebingerJurafskyZou:2018:word-embeddings-gender-and-ethnic-stereotypes,
  title        = "Word embeddings quantify 100 years of gender and ethnic stereotypes",
  author       = "Garg, Nikhil and Schiebinger, Londa and Jurafsky, Dan and Zou, James",
  journal      = "Proceedings of the National Academy of Sciences",
  volume       = "115",
  number       = "16",
  pages        = "E3635--E3644",
  year         = "2018",
  publisher    = "National Acad Sciences",
  URL          = "https://www.pnas.org/content/115/16/E3635.short",
  doi          = "10.1073/pnas.1720347115",
  cc-author-affiliation = "Stanford University, USA; Chan Zuckerberg Biohub, San Francisco, CA, USA",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/semantics, nlp/word-embeddings, ai/ethics-of-machine-learning, ai/machine-learning",
}

@Article{cc:Ghasemi-GolSzekely:2018:TabVec-table-vectors-web-tables,
  title        = "TabVec: Table Vectors for Classification of Web Tables",
  author       = "Ghasemi-Gol, Majid and Szekely, Pedro",
  journal      = "arXiv preprint arXiv:1802.06290",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1802.06290",
  cc-author-affiliation = "University of Southern California; Information Science Institute",
  cc-class     = "web-tables, information-extraction",
  cc-dataset-used = "CC-MAIN-2015-32",
  cc-snippet   = "[...] we use a random sample of July 2015 Common Crawl (WCC) as a generic domain to compare our system
                 with the state of the art systems",
  cc-derived-dataset-cited = "WDCWebTables, DresdenWebTableCorpus",
}

@InProceedings{cc:GlassGliozzo:2018:discovering-implicit-knowledge,
  title        = "Discovering Implicit Knowledge with Unary Relations",
  author       = "Glass, Michael and Gliozzo, Alfio",
  booktitle    = "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1:
                 Long Papers)",
  volume       = "1",
  pages        = "1585--1594",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/P18-1147",
  cc-author-affiliation = "IBM Research AI",
  cc-class     = "ai/knowledge-base",
}

@InProceedings{cc:GlassGliozzo:2018:web-scale-knowledge-base-population-dataset,
  title        = "A Dataset for Web-Scale Knowledge Base Population",
  author       = "Glass, Michael and Gliozzo, Alfio",
  booktitle    = "European Semantic Web Conference",
  pages        = "256--271",
  year         = "2018",
  publisher    = "Springer",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-319-93417-4_17",
  pdf          = "https://2018.eswc-conferences.org/wp-content/uploads/2018/02/ESWC2018_paper_173.pdf",
  cc-author-affiliation = "Knowledge Induction and Reasoning Group, IBM Research AI, New York, USA",
  cc-class     = "ai/semantic-reasoning, ai/knowledge-base",
  cc-snippet   = "We introduce and release CC-DBP, a web-scale dataset for training and benchmarking KBP systems. The
                 dataset is based on Common Crawl as the corpus and DBpedia as the target knowledge base [...]",
  cc-derived-dataset-about = "CC-DBP",
  cc-dataset-used = "CC-MAIN-2017-26",
}

@InProceedings{cc:GlassGliozzoHassanzadehMihindukulasooriyaEtAl:2018:implicit-relations-from-text,
  title        = "Inducing implicit relations from text using distantly supervised deep nets",
  author       = "Glass, Michael and Gliozzo, Alfio and Hassanzadeh, Oktie and Mihindukulasooriya, Nandana and
                 Rossiello, Gaetano",
  booktitle    = "International Semantic Web Conference",
  pages        = "38--55",
  year         = "2018",
  organization = "Springer",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-030-00671-6_3",
  cc-author-affiliation = "IBM Research AI, New York, USA; Universidad Politécnica de Madrid, Spain; University of Bari,
                 Italy",
  cc-class     = "ai/knowledge-base, ai/deep-learning, semantic web",
  cc-derived-dataset-used = "CC-DBP",
}

@Article{cc:GoelMatsuyamaMadaioCassell:2018:detecting-indirectness,
  title        = "“{I} think it might help if we multiply, and not add”: Detecting Indirectness in Conversation",
  author       = "Goel, Pranav and Matsuyama, Yoichi and Madaio, Michael and Cassell, Justine",
  year         = "2018",
  URL          = "http://articulab.hcii.cs.cmu.edu/wordpress/wp-content/uploads/2018/04/Goel-IWSDS2018_camera-ready_13Mar.pdf",
  cc-author-affiliation = "Indian Institute of Technology (BHU), India; Carnegie Mellon University",
  cc-class     = "nlp/dialogue-systems, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:GolemKaranSnajder:2018:aggressive-text-detection,
  title        = "Combining Shallow and Deep Learning for Aggressive Text Detection",
  author       = "Golem, Viktor and Karan, Mladen and Šnajder, Jan",
  booktitle    = "Proceedings of the First Workshop on Trolling, Aggression and Cyberbullying (TRAC-2018)",
  pages        = "188--198",
  year         = "2018",
  pdf          = "https://www.aclweb.org/anthology/W18-4422",
  cc-author-affiliation = "University of Zagreb, Croatia",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/text-classification, nlp/word-embeddings",
}

@Article{cc:GoodingTerrasBerube:2018:legal-deposit-web-archives,
  title        = "Legal Deposit Web Archives and the Digital Humanities: {A} Universe of Lost Opportunity?",
  author       = "Gooding, Paul and Terras, Melissa and Berube, Linda",
  year         = "2018",
  URL          = "http://eprints.gla.ac.uk/168229/",
  cc-author-affiliation = "University of East Anglia, United Kingdom; University of Edinburgh, United Kingdom",
  cc-snippet   = "Restricted deposit library access requires researchers to look elsewhere for portable web data: by
                 undertaking their own web crawls, or by utilising datasets from Common Crawl (http://commoncrawl.org/)
                 and the Internet Archive (https://archive.org). Both organisations provide vital services to
                 researchers, and both innovate in areas that would traditionally fall under the deposit libraries’
                 purview. They support their mission by exploring the boundaries of copyright, including exceptions for
                 non-commercial text and data mining (Intellectual Property Office, 2014). This contrast between
                 risk-enabled independent organisations and deposit libraries, described by interviewees as risk averse,
                 challenges library/DH collaboration models such as BL Labs (http://labs.bl.uk) and Library of Congress
                 Labs (https://labs.loc.gov).",
  cc-class     = "web-archiving/legal-aspects",
}

@InProceedings{cc:GraesserRus:2018:pooling-word-vector-representations,
  title        = "Pooling Word Vector Representations Across Models",
  author       = "Banjade, Rajendra and Maharjan, Nabin and Gautam, Dipesh and Adrasik, Frank and Graesser, Arthur C.
                 and Rus, Vasile",
  booktitle    = "Computational Linguistics and Intelligent Text Processing: 18th International Conference, CICLing
                 2017, Budapest, Hungary, April 17-23, 2017, Revised Selected Papers",
  volume       = "10761",
  pages        = "17--29",
  year         = "2018",
  organization = "Springer",
  URL          = "https://www.springer.com/de/book/9783319771151",
  cc-author-affiliation = "University of Memphis, USA",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/word-embeddings, nlp/semantics",
}

@Article{cc:GrandBlankPereiraFedorenko:2018:semantic-projection,
  title        = "Semantic projection: recovering human knowledge of multiple, distinct object features from word
                 embeddings",
  author       = "Grand, Gabriel and Blank, Idan Asher and Pereira, Francisco and Fedorenko, Evelina",
  journal      = "arXiv preprint arXiv:1802.01241",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1802.01241",
  cc-author-affiliation = "Harvard University; Massachusetts Institute of Technology; Siemens Healthineers;
                 Massachusetts General Hospital; Harvard Medical School",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/semantics, nlp/word-embeddings",
}

@InProceedings{cc:GraveBojanowskiGuptaJoulinEtAl:2018:learning-word-vectors,
  title        = "Learning word vectors for 157 languages",
  author       = "Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas",
  booktitle    = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC}
                 2018)",
  year         = "2018",
  address      = "Miyazaki, Japan",
  publisher    = "European Language Resources Association (ELRA)",
  URL          = "https://www.aclweb.org/anthology/L18-1550",
  pdf          = "https://www.aclweb.org/anthology/L18-1550.pdf",
  abstract     = "Distributed word representations, or word vectors, have recently been applied to many tasks in natural
                 language processing, leading to state-of-the-art performance. A key ingredient to the successful
                 application of these representations is to train them on very large corpora, and use these pre-trained
                 models in downstream tasks. In this paper, we describe how we trained such high quality word
                 representations for 157 languages. We used two sources of data to train these models: the free online
                 encyclopedia Wikipedia and data from the common crawl project. We also introduce three new word
                 analogy datasets to evaluate these word vectors, for French, Hindi and Polish. Finally, we evaluate
                 our pre-trained word vectors on 10 languages for which evaluation datasets exists, showing very strong
                 performance compared to previous models.",
  cc-author-affiliation = "Facebook AI Research; École polytechnique fédérale de Lausanne EPFL, Switzerland",
  cc-class     = "nlp/word-embeddings",
  cc-dataset-used = "CC-MAIN-2017-22 (WET)",
  cc-derived-dataset-about = "fastText-word-embeddings",
  cc-snippet   = "The common crawl is a non profit organization which crawls the web and makes the resulting data
                 publicly available. This large scale corpus was previously used to estimate n-gram language models
                 (Buck et al., 2014) or to learn English word vectors (Pennington et al., 2014). To the best of our
                 knowledge, it was not used yet to learn word vectors for a large set of languages. The data is
                 distributed either as raw HTML pages, or as WET files which contain the extracted text data, converted
                 to UTF-8. We decided to use the extracted text data, as it is much smaller in size, and easier to
                 process (no need to remove HTML). We downloaded the May 2017 crawl, corresponding to roughly 24
                 terabytes of raw text data.",
}

@Article{cc:GrundkiewiczJunczys-Dowmunt:2018:grammatical-error-correction-mt,
  title        = "Near Human-Level Performance in Grammatical Error Correction with Hybrid Machine Translation",
  author       = "Grundkiewicz, Roman and Junczys-Dowmunt, Marcin",
  journal      = "arXiv preprint arXiv:1804.05945",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1804.05945",
  cc-author-affiliation = "University of Edinburgh, United Kingdom; Microsoft",
  cc-class     = "nlp/machine-translation, nlp/grammatical-error-correction",
  cc-derived-dataset-used = "Ngrams-LMs-2013",
}

@InProceedings{cc:HazemMorin:2018:meta-embeddings-for-bilingual-lexicon-extraction,
  title        = "Leveraging Meta-Embeddings for Bilingual Lexicon Extraction from Specialized Comparable Corpora",
  author       = "Hazem, Amir and Morin, Emmanuel",
  booktitle    = "Proceedings of the 27th International Conference on Computational Linguistics",
  pages        = "937--949",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/C18-1080",
  cc-author-affiliation = "Université de Nantes, France",
  cc-class     = "nlp/machine-translation, nlp/lexikon, nlp/dictionary-creation",
}

@Article{cc:HedderichKlakow:2018:low-resource-training-neural-network,
  title        = "Training a Neural Network in a Low-Resource Setting on Automatically Annotated Noisy Data",
  author       = "Hedderich, Michael A. and Klakow, Dietrich",
  journal      = "arXiv preprint arXiv:1807.00745",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1807.00745",
  cc-author-affiliation = "Saarland University, Saarbrücken, Germany",
  cc-class     = "nlp/word-embeddings, ai/neural-networks",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:HettingerDallmannZeheNieblerEtAl:2018:ClaiRE-at-SemEval-2018,
  title        = "Clai{RE} at SemEval-2018 Task 7: Classification of Relations using Embeddings",
  author       = "Hettinger, Lena and Dallmann, Alexander and Zehe, Albin and Niebler, Thomas and Hotho, Andreas",
  booktitle    = "Proceedings of The 12th International Workshop on Semantic Evaluation",
  pages        = "836--841",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/S18-1134",
  cc-author-affiliation = "University of Würzburg, Germany",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/semantics, nlp/word-embeddings",
}

@Article{cc:HettingerDallmannZeheNieblerEtAl:2018:ClaiRE-at-SemEval-2018-extended-version,
  title        = "Clai{RE} at SemEval-2018 Task 7-Extended Version",
  author       = "Hettinger, Lena and Dallmann, Alexander and Zehe, Albin and Niebler, Thomas and Hotho, Andreas",
  journal      = "arXiv preprint arXiv:1804.05825",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1804.05825",
  cc-author-affiliation = "University of Würzburg, Germany",
  cc-class     = "nlp/semantics, nlp/word-embeddings",
  cc-snippet   = "we employ a publicly available set of 300-dimensional word embeddings trained with GloVe (Pennington
                 et al., 2014) on the Common Crawl data",
}

@Article{cc:HuangLiPingHuang:2018:neural-language-model,
  title        = "Large Margin Neural Language Model",
  author       = "Huang, Jiaji and Li, Yi and Ping, Wei and Huang, Liang",
  journal      = "arXiv preprint arXiv:1808.08987",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1808.08987",
  cc-author-affiliation = "Baidu Research, Sunnyvale, CA, USA; School of EECS, Oregon State University, Corvallis, OR,
                 USA",
  cc-class     = "nlp/language-model, nlp/machine-translation",
  cc-derived-dataset-used = "WMT-16-translation-task-common-crawl-corpus",
}

@Article{cc:Indig:2018:crawlnak-korpusz,
  title        = "Közös crawlnak is egy korpusz a vége-Korpuszépítés a CommonCrawl .hu domainjából",
  author       = "Indig, Balázs",
  year         = "2018",
  publisher    = "Szegedi Tudományegyetem Informatikai Tanszékcsoport",
  URL          = "http://real.mtak.hu/73329/1/crawl.pdf",
  cc-author-affiliation = "MTA-PPKE Magyar Nyelvtechnológiai Kutatócsoport, Hungaria",
  cc-dataset-used = "CC-MAIN-2017-47",
  cc-description = "analysis of .hu sites in November 2017 crawl",
  cc-class     = "web-science",
}

@Article{cc:IyyerWietingGimpelZettlemoyer:2018:adversarial-example-generation,
  title        = "Adversarial example generation with syntactically controlled paraphrase networks",
  author       = "Iyyer, Mohit and Wieting, John and Gimpel, Kevin and Zettlemoyer, Luke",
  journal      = "arXiv preprint arXiv:1804.06059",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1804.06059",
  cc-author-affiliation = "Allen Institute of Artificial Intelligence, Seattle, United States; UMass Amherst, United
                 States; Carnegie Mellon University, Pittsburgh, PA, USA; Toyota Technological Institute at Chicago, IL,
                 USA; University of Washington, Seattle, WA, USA",
  cc-derived-dataset-about = "WMT-16-translation-task-common-crawl-corpus, patent",
  cc-class     = "nlp/machine-translation, nlp/sentence-paraphrase, nlp/sentence-embeddings",
}

@InProceedings{cc:JoulinBojanowskiMikolovJegouEtAl:2018:learning.bilingual-word-mapping,
  title        = "Loss in translation: Learning bilingual word mapping with a retrieval criterion",
  author       = "Joulin, Armand and Bojanowski, Piotr and Mikolov, Tomas and Jégou, Hervé and Grave, Edouard",
  booktitle    = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
  pages        = "2979--2984",
  year         = "2018",
  URL          = "https://www.aclweb.org/anthology/papers/D/D18/D18-1330/",
  cc-author-affiliation = "Facebook AI Research",
  cc-class     = "nlp/word-embeddings, nlp/bilingual-word-embeddings",
  cc-derived-dataset-used = "fastText-word-embeddings",
}

@Article{cc:JunczysDowmuntGrundkiewiczGuhaHeafield:2018:neural-grammatical-error,
  author       = "Junczys-Dowmunt, Marcin and Grundkiewicz, Roman and Guha, Shubha and Heafield, Kenneth",
  title        = "Approaching Neural Grammatical Error Correction as a Low-Resource Machine Translation Task",
  journal      = "arXiv preprint arXiv:1804.05940",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1804.05940",
  cc-class     = "nlp/machine-translation, nlp/grammatical-error-correction",
  cc-author-affiliation = "University of Edinburgh, United Kingdom; Microsoft",
}

@Article{cc:JurgensKumarHooverMcFarlandEtAl:2018:measuring-evolution-of-scientific-field,
  author       = "Jurgens, David and Kumar, Srijan and Hoover, Raine and McFarland, Dan and Jurafsky, Dan",
  title        = "Measuring the evolution of a scientific field through citation frames",
  journal      = "Transactions of the Association for Computational Linguistics",
  year         = "2018",
  volume       = "6",
  pages        = "391--406",
  URL          = "https://doi.org/10.1162/tacl_a_00028",
  cc-author-affiliation = "University of Michigan, USA; Stanford University, USA",
  cc-class     = "nlp/word-embeddings, nlp/text-analysis, nlp/citation-analysis",
  cc-derived-dataset-cited = "GloVe-word-embeddings",
}

@Article{cc:KaftanBalazinskaCheungGehrke:2018:Cuttlefish,
  author       = "Kaftan, Tomer and Balazinska, Magdalena and Cheung, Alvin and Gehrke, Johannes",
  title        = "Cuttlefish: {A} Lightweight Primitive for Adaptive Query Processing",
  journal      = "arXiv preprint arXiv:1802.09180",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1802.09180",
  cc-author-affiliation = "University of Washington; Microsoft",
  cc-snippet   = "... to search through a contiguously-stored sample of approximately 256 thousand internet web pages
                 collected by the Common Crawl project.",
  cc-class     = "information retrieval, regular expression matching, query planning, SQL processing",
}

@Misc{cc:KagoshimaLondenbergXu:2018:content-score,
  author       = "Kagoshima, Alexander and Londenberg, Kai and Xu, Fang",
  title        = "Determination of content score",
  year         = "2018",
  note         = "US Patent App. 15/337,268",
  publisher    = "Google Patents",
  URL          = "https://patents.google.com/patent/US20180121430A1/en",
  cc-author-affiliation = "Searchmetrics GmbH",
  cc-class     = "patent, cc-cited-not-used",
  cc-snippet   = "The crawler module [310] may automatically crawl a network and acquire contents from one or more
                 resources in the network, acquire the contents from an open repository of web crawl data such as
                 CommonCrawl.org.",
}

@Misc{cc:KaleTaulaSrivastavaHewavitharana:2018:query-segmentation,
  author       = "Kale, Ajinkya Gorakhnath and Taula, Thrivikrama and Srivastava, Amit and Hewavitharana, Sanjika",
  title        = "Methods and systems for query segmentation",
  year         = "2018",
  note         = "US Patent App. 15/681,663",
  publisher    = "Google Patents",
  URL          = "https://patents.google.com/patent/US20180329999A1/en",
  cc-author-affiliation = "eBay Inc.",
  cc-class     = "ir/query-segmentation, nlp/word-embeddings, patent",
  cc-derived-dataset-cited = "GloVe-word-embeddings",
}

@Article{cc:KarolyLaszlo:2018:webarchivalas-es-torteneti-kutatasok,
  title        = "Webarchiválás és a történeti kutatások / Web Archiving and Historical Research",
  author       = "Kokas, Károly and Drótos, László",
  journal      = "Digitális Bölcsészet",
  volume       = "1",
  pages        = "35--54",
  year         = "2018",
  URL          = "http://ojs.elte.hu/index.php/digitalisbolcseszet/article/view/129",
  doi          = "10.31400/dh-hun.2018.1.129",
  cc-author-affiliation = "Országos Széchényi Könyvtár, Hungary; SZTE Klebelsberg Könyvtár, Hungary",
  cc-class     = "web-archiving, cc-cited-not-used",
}

@InProceedings{cc:KhalilGuanNabeelYu:2018:detecting-stealthy-malicious-domains,
  author       = "Khalil, Issa M. and Guan, Bei and Nabeel, Mohamed and Yu, Ting",
  title        = "A domain is only as good as its buddies: detecting stealthy malicious domains via graph inference",
  booktitle    = "Proceedings of the Eighth ACM Conference on Data and Application Security and Privacy",
  year         = "2018",
  pages        = "330--341",
  organization = "ACM",
  URL          = "https://dl.acm.org/citation.cfm?id=3176329",
  cc-class     = "computer-security/malicious-domain-detection, computer-security/internet-security, graph-processing",
  cc-author-affiliation = "Qatar Computing Research Institute, Doha, Qatar",
}

@InProceedings{cc:KhayrallahThompsonDuhKoehn:2018:domain-adaption-neural-MT,
  author       = "Khayrallah, Huda and Thompson, Brian and Duh, Kevin and Koehn, Philipp",
  title        = "Regularized Training Objective for Continued Training for Domain Adaptation in Neural Machine
                 Translation",
  booktitle    = "Proceedings of the 2nd Workshop on Neural Machine Translation and Generation",
  year         = "2018",
  pages        = "36--44",
  URL          = "https://www.aclweb.org/anthology/papers/W/W18/W18-2705/",
  cc-author-affiliation = "Johns Hopkins University, USA",
  cc-class     = "nlp/machine-translation",
  cc-derived-dataset-used = "WMT-16-translation-task-common-crawl-corpus",
}

@InProceedings{cc:KielaWangCho:2018:meta-embeddings-for-sentence-representations,
  author       = "Kiela, Douwe and Wang, Changhan and Cho, Kyunghyun",
  title        = "Dynamic meta-embeddings for improved sentence representations",
  booktitle    = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
  year         = "2018",
  pages        = "1466--1477",
  publisher    = "Association for Computational Linguistics",
  URL          = "https://www.aclweb.org/anthology/D18-1176",
  cc-author-affiliation = "Facebook AI Research, USA; New York University, USA; CIFAR Global Scholar, Canada",
  cc-class     = "nlp/sentence-embeddings, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings, fastText-word-embeddings",
  abstract     = "While one of the first steps in many NLP systems is selecting what pre-trained word embeddings to use,
                 we argue that such a step is better left for neural networks to figure out by themselves. To that end,
                 we introduce dynamic meta-embeddings, a simple yet effective method for the supervised learning of
                 embedding ensembles, which leads to state-of-the-art performance within the same model class on a
                 variety of tasks. We subsequently show how the technique can be used to shed new light on the usage of
                 word embeddings in NLP systems.",
}

@Article{cc:KieselKneistAlshomarySteinEtAl:2018:Reproducible-Web-Corpora-Interactive,
  title        = "Reproducible Web Corpora: Interactive Archiving with Automatic Quality Assessment",
  author       = "Kiesel, Johannes and Kneist, Florian and Alshomary, Milad and Stein, Benno and Hagen, Matthias and
                 Potthast, Martin",
  journal      = "Journal of Data and Information Quality (JDIQ)",
  volume       = "10",
  number       = "4",
  pages        = "17",
  year         = "2018",
  publisher    = "ACM",
  URL          = "https://dl.acm.org/citation.cfm?id=3239574",
  pdf          = "https://webis.de/downloads/publications/papers/stein_2018v.pdf",
  cc-author-affiliation = "Paderborn University, Germany; Bauhaus-Universität Weimar, Germany;
                 Martin-Luther-Universität Halle-Wittenberg, Germany; Leipzig University, Germany; Ulm University,
                 Germany",
  cc-class     = "web-mining, nlp/web-as-corpus",
  cc-dataset-used = "CC-MAIN-2017-04",
  cc-snippet   = "To build a solid benchmark dataset for web reproduction quality assessment, we carefully sampled web
                 pages with the goal of representing a wide cross-section of the different types and genres of webpages
                 found on the web. As a population of web pages to draw a sample from, we resort to the
                 recent billion-page Common Crawl 2017-04 [36]. From there, we primarily sampled pages from most of the
                 well-known sites—as defined by the website’s Alexa traffic rank [1]⁶—to ensure that our
                 sample encompasses pages using the most recent web technologies and design standards. Moreover,
                 pages from a number of less well-known sites have been included. Altogether, the Webis Web Archive 17
                 comprises 10,000 web pages.",
}

@Article{cc:KimKimKwak:2018:textbook-question-answering,
  title        = "Textbook Question Answering with Knowledge Graph Understanding and Unsupervised Open-set Text
                 Comprehension",
  author       = "Kim, Daesik and Kim, Seonhoon and Kwak, Nojun",
  journal      = "arXiv preprint arXiv:1811.00232",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1811.00232",
  cc-author-affiliation = "Seoul National University, South Korea; V.DO Inc., South Korea; Naver Corporation, South
                 Korea",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/question-answering, nlp/word-embeddings, nlp/knowledge-graph, nlp/text-comprehension",
}

@Article{cc:KiyonoSuzukiInui:2018:expert-imitator-networks,
  author       = "Kiyono, Shun and Suzuki, Jun and Inui, Kentaro",
  title        = "Mixture of Expert/Imitator Networks: Scalable Semi-supervised Learning Framework",
  journal      = "arXiv preprint arXiv:1810.05788",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1810.05788",
  cc-class     = "cc-cited-not-used, nlp/text-classification, ai/deep-learning, ai/neural-networks",
  cc-author-affiliation = "Tohoku University, Japan; Center for Advanced Intelligence Project, Japan",
}

@InProceedings{cc:KnowlesKoehn:2018:context-and-copying-in-neural-mt,
  author       = "Knowles, Rebecca and Koehn, Philipp",
  title        = "Context and Copying in Neural Machine Translation",
  booktitle    = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
  year         = "2018",
  pages        = "3034--3041",
  URL          = "http://www.aclweb.org/anthology/D18-1339",
  cc-author-affiliation = "Johns Hopkins University, USA",
  cc-derived-dataset-used = "WMT-16-translation-task-common-crawl-corpus",
  cc-class     = "nlp/machine-translation",
}

@Article{cc:KrantzKalita:2018:abstractive-summarization,
  author       = "Krantz, Jacob and Kalita, Jugal",
  title        = "Abstractive Summarization Using Attentive Neural Techniques",
  journal      = "arXiv preprint arXiv:1810.08838",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1810.08838",
  cc-author-affiliation = "Gonzaga University, USA; University of Colorado, USA",
  cc-class     = "nlp/text-summarization, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:KravchenkoPivovarova:2018:DL-Team-at-SemEval-2018,
  author       = "Kravchenko, Dmitry and Pivovarova, Lidia",
  title        = "{DL} Team at SemEval-2018 Task 1: Tweet Affect Detection using Sentiment Lexicons and Embeddings",
  booktitle    = "Proceedings of The 12th International Workshop on Semantic Evaluation",
  year         = "2018",
  pages        = "172--176",
  URL          = "http://www.aclweb.org/anthology/S18-1025",
  cc-author-affiliation = "Ben-Gurion University of the Negev, Israel; University of Helsinki, Finland",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/sentiment-analysis",
}

@Article{cc:Kulmizev:2018:multilingual-word-embeddings,
  author       = "Kulmizev, Artur",
  title        = "Multilingual word embeddings and their utility in cross-lingual learning",
  year         = "2018",
  URL          = "http://hdl.handle.net/10810/29083",
  cc-class     = "nlp/semantics, nlp/word-embeddings, cc-cited-not-used",
  cc-author-affiliation = "University of Groningen, The Netherlands",
}

@InProceedings{cc:KulmizevAbdouRavishankarNissim:2018:discriminator-SemEval-2018-task,
  author       = "Kulmizev, Artur and Abdou, Mostafa and Ravishankar, Vinit and Nissim, Malvina",
  title        = "Discriminator at SemEval-2018 Task 10: Minimally Supervised Discrimination",
  booktitle    = "Proceedings of The 12th International Workshop on Semantic Evaluation",
  year         = "2018",
  pages        = "1008--1012",
  URL          = "http://www.aclweb.org/anthology/S18-1167",
  cc-author-affiliation = "University of Groningen, The Netherlands; Institute of Formal and Applied Linguistics Charles
                 University in Prague, Czech Republic",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/semantics, nlp/word-embeddings",
}

@Article{cc:LagesShepelyanskyZinovyev:2018:inferring-hidden-causal-relations-directed-biological-networks,
  title        = "Inferring hidden causal relations between pathway members using reduced Google matrix of directed
                 biological networks",
  author       = "Lages, José and Shepelyansky, Dima L. and Zinovyev, Andrei",
  journal      = "PLOS ONE",
  volume       = "13",
  number       = "1",
  pages        = "e0190812",
  year         = "2018",
  publisher    = "Public Library of Science",
  URL          = "https://doi.org/10.1371/journal.pone.0190812",
  cc-author-affiliation = "Université de Franche-Comté, Besançon, France",
  cc-class     = "cc-cited-not-used, graph-processing, web-science/hyperlinkgraph, network analysis, biochemistry,
                 protein structure",
  cc-snippet   = "At present directed networks of real systems can be very large (about 4.2 millions for the English
                 Wikipedia edition in 2013 [18] or 3.5 billion web pages for a publicly accessible web crawl that was
                 gathered by the Common Crawl Foundation in 2012 [53: Meusel R, Vigna S, Lehmberg O, Bizer C. The graph
                 structure in the web—analyzed on different aggregation levels. J. Web Sci. 2015;1:33.]).",
}

@Article{cc:LehmberHassanzadeh:2018:ontology-augmentation-with-web-tables,
  author       = "Lehmberg, Oliver and Hassanzadeh, Oktie",
  title        = "Ontology Augmentation Through Matching with Web Tables",
  year         = "2018",
  URL          = "http://disi.unitn.it/~pavel/om2018/papers/om2018_LTpaper4.pdf",
  cc-author-affiliation = "University of Mannheim, Germany; IBM Research, Yorktown Heights, New York, USA",
  cc-derived-dataset-used = "WDCWebTables",
  cc-class     = "semantic web, ontology extraction, web tables",
  cc-snippet   = "We perform an empirical study of the performance of this approach in using Web Tables extracted from
                 the Common Crawl to augment the properties in DBpedia ontology.",
}

@Article{cc:LiLinChoiFuEtAl:2018:Youtube-annotated-corpus,
  author       = "Li, Tao and Lin, Lei and Choi, Minsoo and Fu, Kaiming and Gong, Siyuan and Wang, Jian",
  title        = "Youtube av 50k: an annotated corpus for comments in autonomous vehicles",
  journal      = "arXiv preprint arXiv:1807.11227",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1807.11227",
  cc-class     = "cc-cited-not-used, nlp/corpus-construction, nlp/opinion-mining, nlp/sentiment-analysis",
  cc-author-affiliation = "Purdue University, Indiana, USA",
}

@Article{cc:LiangLiuZadehMorency:2018:multimodal-language-analysis,
  title        = "Multimodal Language Analysis with Recurrent Multistage Fusion: Supplementary Material",
  author       = "Liang, Paul Pu and Liu, Ziyin and Zadeh, Amir and Morency, Louis-Philippe",
  journal      = "arXiv preprint arXiv:1808.03920",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1808.03920",
  pdf          = "https://arxiv.org/pdf/1808.03920.pdf",
  cc-snippet   = "We used 300 dimensional Glove word embeddings trained on 840 billion tokens from the common crawl
                 dataset (Pennington et al., 2014).",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-author-affiliation = "Carnegie Mellon University",
  cc-class     = "nlp/multi-modality, nlp/language-model",
}

@Article{cc:LiaoAlrwaisYuanXingEtAl:2018:cloud-repository-as-malicious-service,
  author       = "Liao, Xiaojing and Alrwais, Sumayah and Yuan, Kan and Xing, Luyi and Wang, XiaoFeng and Hao, Shuang
                 and Beyah, Raheem",
  title        = "Cloud repository as a malicious service: challenge, identification and implication",
  journal      = "Cybersecurity",
  year         = "2018",
  volume       = "1",
  number       = "1",
  pages        = "14",
  publisher    = "Springer",
  URL          = "https://cybersecurity.springeropen.com/articles/10.1186/s42400-018-0015-6",
  pdf          = "https://cybersecurity.springeropen.com/track/pdf/10.1186/s42400-018-0015-6",
  cc-author-affiliation = "Indiana University Bloomington, USA; King Saud University, Saudi Arabia; University of Texas
                 at Dallas, USA; Georgia Institute of Technology, USA",
  cc-dataset-used = "CC-MAIN-2015-11",
  cc-class     = "computer-security/malicious-hosting-service, computer-security/internet-security",
  cc-snippet   = "[...], we developed BarFinder, a scanner that automatically detects Bars through inspecting the
                 topological relations between websites and the cloud bucket they use, in an attempt to capture Bars
                 based on the external features of the websites they serve. [...] Running the scanner over all the data
                 collected by the Common Crawl (Crawl 2015), which indexed five billion web pages, for those associated
                 with all major cloud storage providers (including Amazon S3, Cloudfront, Google Drive, etc.), we found
                 around 1 million sites utilizing 6885 repositories hosted on these clouds. [...] We built the site list
                 with the help of Common Crawl (Crawl 2015), a public big data project that crawls about 5 billion
                 webpages each month through a large-scale Hadoop-based crawler and maintains lists of the crawled
                 websites and their embedded links. Searching the Common Crawl (Crawl 2015) dataset, collected in
                 February 2015, for the websites loading content from the 400 clean and malicious buckets identified
                 above, we found 141,149 websites, were used by our crawler. [...] We further developed a tool in Python
                 to recover cloud URLs from the web content gathered by Common Crawl.",
}

@Article{cc:LiuLiuGuoXiongEtAl:2018:USTC-NEL-speech-translation-system,
  author       = "Liu, Dan and Liu, Junhua and Guo, Wu and Xiong, Shifu and Ma, Zhiqiang and Song, Rui and Wu,
                 Chongliang and Liu, Quan",
  title        = "The {USTC}-{NEL} Speech Translation system at {IWSLT} 2018",
  journal      = "arXiv preprint arXiv:1812.02455",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1812.02455",
  cc-author-affiliation = "University of Science and Technology of China, China; IFLYTEK Co. LTD.",
  cc-class     = "nlp/machine-translation",
  cc-derived-dataset-used = "WMT-16-translation-task-common-crawl-corpus",
}

@InProceedings{cc:LiuYeungChouHuangEtAl:2018:temporal-modular-networks-for-video-retrieval,
  author       = "Liu, Bingbin and Yeung, Serena and Chou, Edward and Huang, De-An and Fei-Fei, Li and Niebles, Juan
                 Carlos",
  title        = "Temporal Modular Networks for Retrieving Complex Compositional Activities in Videos",
  booktitle    = "Proceedings of the European Conference on Computer Vision (ECCV)",
  year         = "2018",
  pages        = "552--568",
  URL          = "http://openaccess.thecvf.com/content_ECCV_2018/html/Bingbin_Liu_Temporal_Modular_Networks_ECCV_2018_paper.html",
  cc-class     = "ai/computer-vision, ir/video-retrieval, ai/action-recognition, nlp/word-embeddings",
  cc-author-affiliation = "Stanford University, USA; Google Cloud AI, Mountain View, USA",
}

@InProceedings{cc:LoSimardStewartLarkinEtAl:2018:cleaning-noisy-parallel-corpora,
  author       = "Lo, Chi-kiu and Simard, Michel and Stewart, Darlene and Larkin, Samuel and Goutte, Cyril and Littell,
                 Patrick",
  title        = "Accurate semantic textual similarity for cleaning noisy parallel corpora using semantic machine
                 translation evaluation metric: The {NRC} supervised submissions to the Parallel Corpus Filtering task",
  booktitle    = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",
  year         = "2018",
  pages        = "908--916",
  URL          = "http://www.aclweb.org/anthology/W18-6481",
  cc-class     = "cc-cited-not-used, nlp/machine-translation, nlp/corpus-construction",
  cc-author-affiliation = "National Research Council, Canada",
}

@Article{cc:LockardDongEinolghozatiShiralkar:2018:supervised-relation-extraction,
  title        = "{CERES}: Distantly Supervised Relation Extraction from the Semi-Structured Web",
  author       = "Lockard, Colin and Dong, Xin Luna and Einolghozati, Arash and Shiralkar, Prashant",
  journal      = "Proceedings of the VLDB Endowment",
  year         = "2018",
  volume       = "11",
  pages        = "1084--1096",
  doi          = "10.14778/3231751.3231758",
  URL          = "https://arxiv.org/abs/1804.04635",
  cc-author-affiliation = "amazon.com",
  cc-dataset-used = "CC-MAIN-201[3-7]-*",
  cc-class     = "ir/information-extraction, ir/relation-extraction",
  cc-snippet   = "The CommonCrawl corpus consists of monthly snapshots of pages from millions of websites [1] on the
                 Web. We started with a few well-known sites, including rottentomatoes.com, boxofficemojo.com, and
                 themoviedb.org. Based on a Wikipedia list of the largest global film industries by admissions, box
                 office, and number of productions⁸, we then issued Google searches for terms corresponding to these
                 countries, such as “Nigerian film database” and recorded resulting sites that had detail pages
                 related to movies. We also issued a few additional searches related to specific genres we thought may
                 not be well-represented in mainstream sites, including “animated film database” and “documentary
                 film database”. After compiling our list of sites, we then checked CommonCrawl⁹ and kept all sites
                 with more than one hundred pages available. Our final list contains a broad mix of movie sites,
                 including sites based around national film industries, genres, film music, and screen size. Most are in
                 English, but the set also includes sites in Czech, Danish, Icelandic, Italian, Indonesian, and Slovak.
                 ⁸https://en.wikipedia.org/wiki/Film_industry ⁹For each site, we scanned the CommonCrawl indices for
                 all monthly scrapes prior to January 2018 and downloaded all pages for the site from the scrape with
                 the largest number of unique webpages. Note that these scrapes do not necessarily obtain all pages
                 present on a site, so the retrieved pages represent only a subset of the full site.",
}

@Article{cc:MaheshwariTrivediLukovnikovChakrabortyEtAl:2018:learning-to-rank-query-graphs,
  author       = "Maheshwari, Gaurav and Trivedi, Priyansh and Lukovnikov, Denis and Chakraborty, Nilesh and Fischer,
                 Asja and Lehmann, Jens",
  title        = "Learning to Rank Query Graphs for Complex Question Answering over Knowledge Graphs",
  journal      = "arXiv preprint arXiv:1811.01118",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1811.01118",
  cc-author-affiliation = "University of Bonn, Germany; Ruhr University, Bochum, Germany",
  cc-class     = "information retrieval, nlp/question-answering, nlp/knowledge-graph, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@Article{cc:Martinez-RodriguezHoganLopez-Arevalo:2018:information-extraction-meets-Semantic-Web,
  author       = "Martinez-Rodriguez, Jose L. and Hogan, Aidan and Lopez-Arevalo, Ivan",
  title        = "Information extraction meets the Semantic Web: {A} survey",
  journal      = "Semantic Web",
  year         = "2018",
  number       = "Preprint",
  pages        = "1--81",
  publisher    = "IOS Press",
  URL          = "https://content.iospress.com/articles/semantic-web/sw180333",
  pdf          = "http://www.semantic-web-journal.net/system/files/swj1909.pdf",
  cc-class     = "cc-cited-not-used, semantic web, linked data, information extraction",
  cc-author-affiliation = "Cinvestav Tamaulipas, Ciudad Victoria, Mexico; University of Chile, Chile",
}

@Article{cc:McCannKeskarXiongSocher:2018:natural-language-decathlon-Multitask,
  author       = "McCann, Bryan and Keskar, Nitish Shirish and Xiong, Caiming and Socher, Richard",
  title        = "The natural language decathlon: Multitask learning as question answering",
  journal      = "arXiv preprint arXiv:1806.08730",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1806.08730",
  cc-author-affiliation = "Salesforce Research",
  cc-class     = "nlp/question-answering, nlp/machine-translation, nlp/text-summarization, nlp/sentiment-analysis,
                 nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@Misc{cc:McCannXiongSocher:2018:nlp-using-context-specific-word-vectors,
  author       = "McCann, Bryan and Xiong, Caiming and Socher, Richard",
  title        = "Natural language processing using context-specific word vectors",
  year         = "2018",
  note         = "US Patent App. 15/982,841",
  publisher    = "Google Patents",
  URL          = "https://patents.google.com/patent/US20180373682A1/en",
  cc-author-affiliation = "Salesforce.com, Inc.",
  cc-class     = "nlp/word-embeddings, patent",
  cc-derived-dataset-cited = "GloVe-word-embeddings",
}

@Misc{cc:McCannXiongSocher:2018:nlp-using-neural-network,
  author       = "McCann, Bryan and Xiong, Caiming and Socher, Richard",
  title        = "Natural language processing using a neural network",
  year         = "2018",
  note         = "US Patent App. 16/000,638",
  publisher    = "Google Patents",
  URL          = "https://patents.google.com/patent/US20180349359A1/en",
  cc-author-affiliation = "Salesforce.com, Inc.",
  cc-class     = "nlp/word-embeddings, patent",
  cc-derived-dataset-cited = "GloVe-word-embeddings",
}

@Article{cc:MeijersPeris:2018:toponym-co-occurrences,
  title        = "Using toponym co-occurrences to measure relationships between places: review, application and
                 evaluation",
  author       = "Meijers, Evert and Peris, Antoine",
  journal      = "International Journal of Urban Sciences",
  pages        = "1--23",
  year         = "2018",
  publisher    = "Taylor \& Francis",
  URL          = "https://www.tandfonline.com/doi/abs/10.1080/12265934.2018.1497526",
  cc-author-affiliation = "Delft University of Technology, The Netherlands",
  cc-class     = "nlp, co-occurrences, toponymy, urban system, place name disambiguation, semantic relatedness",
  cc-snippet   = "We innovate by exploiting a so far unparalleled amount of data, namely the billions of web pages
                 contained in the commoncrawl web archive, and by applying the method also to small places that tend to
                 be ignored by other methods. [...] we use the March 2017 data. The Common Crawl data comes in three
                 formats, of which the WET format is most useful for the co-occurrence method as it only contains
                 extracted plain text.",
}

@InProceedings{cc:MeisheriDey:2018:TCS-Research-at-SemEval-2018,
  author       = "Meisheri, Hardik and Dey, Lipika",
  title        = "{TCS} Research at SemEval-2018 Task 1: Learning Robust Representations using Multi-Attention
                 Architecture",
  booktitle    = "Proceedings of The 12th International Workshop on Semantic Evaluation",
  year         = "2018",
  pages        = "291--299",
  URL          = "http://www.aclweb.org/anthology/S18-1043",
  cc-author-affiliation = "TCS Research, New Delhi, India",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/sentiment-analysis",
}

@InProceedings{cc:MihaylovClarkKhotSabharwal:2018:book-question-answering,
  title        = "Can a suit of armor conduct electricity? a new dataset for open book question answering",
  author       = "Mihaylov, Todor and Clark, Peter and Khot, Tushar and Sabharwal, Ashish",
  booktitle    = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
  year         = "2018",
  address      = "Brussels, Belgium",
  publisher    = "Association for Computational Linguistics",
  URL          = "https://www.aclweb.org/anthology/D18-1260",
  pages        = "2381--2391",
  abstract     = "We present a new kind of question answering dataset, OpenBookQA, modeled after open book exams for
                 assessing human understanding of a subject. The open book that comes with our questions is a set of
                 1326 elementary level science facts. Roughly 6000 questions probe an understanding of these facts and
                 their application to novel situations. This requires combining an open book fact (e.g., metals conduct
                 electricity) with broad common knowledge (e.g., a suit of armor is made of metal) obtained from other
                 sources. While existing QA datasets over documents or knowledge bases, being generally self-contained,
                 focus on linguistic understanding, OpenBookQA probes a deeper understanding of both the topic{---}in
                 the context of common knowledge{---}and the language it is expressed in. Human performance on
                 OpenBookQA is close to 92{\%}, but many state-of-the-art pre-trained QA methods perform surprisingly
                 poorly, worse than several simple neural baselines we develop. Our oracle experiments designed to
                 circumvent the knowledge retrieval bottleneck demonstrate the value of both the open book and
                 additional facts. We leave it as a challenge to solve the retrieval problem in this multi-hop setting
                 and to close the large gap to human performance.",
  cc-author-affiliation = "Allen Institute for Artificial Intelligence, Seattle, USA; Heidelberg University, Germany",
  cc-snippet   = "For all experiments we used d = 300 GloVe (Pennington et al., 2014) embeddings pre-trained on 840B
                 tokens from Common Crawl (https://nlp.stanford.edu/projects/glove/).",
  cc-class     = "nlp/question-answering, nlp/word-embeddings, nlp/corpus-construction",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@Article{cc:MinZhongSocherXiong:2018:question-answering-from-minimal-context,
  author       = "Min, Sewon and Zhong, Victor and Socher, Richard and Xiong, Caiming",
  title        = "Efficient and Robust Question Answering from Minimal Context over Documents",
  journal      = "arXiv preprint arXiv:1805.08092",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1805.08092",
  cc-author-affiliation = "Seoul National University, South Korea; Salesforce Research",
  cc-class     = "nlp/question-answering, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:MirheidariBlackburnWalkerVenneriEtAl:2018:detecting-signs-of-dementia,
  title        = "Detecting signs of dementia using word vector representations",
  author       = "Mirheidari, Bahman and Blackburn, Daniel and Walker, Traci and Venneri, Annalena and Reuber, Markus
                 and Christensen, Heidi",
  booktitle    = "Proc. Interspeech 2018",
  pages        = "1893--1897",
  year         = "2018",
  URL          = "https://www.isca-speech.org/archive/Interspeech_2018/pdfs/1764.pdf",
  cc-author-affiliation = "University of Sheffield, United Kingdom; Royal Hallamshire Hospital, United Kingdom",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/word-embeddings, nlp/speech-recognition, nlp/clinical-application, dementia detection",
}

@InProceedings{cc:MoffatPetri:2018:index-compression-ANS-coding,
  author       = "Moffat, Alistair and Petri, Matthias",
  title        = "Index compression using byte-aligned {ANS} coding and two-dimensional contexts",
  booktitle    = "Proceedings of the Eleventh ACM International Conference on Web Search and Data Mining",
  pages        = "405--413",
  year         = "2018",
  organization = "ACM",
  URL          = "https://dl.acm.org/citation.cfm?id=3159663",
  abstract     = "We examine approaches used for block-based inverted index compression, such as the OptPFOR mechanism,
                 in which fixed-length blocks of postings data are compressed independently of each other. Building on
                 previous work in which asymmetric numeral systems (ANS) entropy coding is used to represent each block,
                 we explore a number of enhancements: (i) the use of two-dimensional conditioning contexts, with two
                 aggregate parameters used in each block to categorize the distribution of symbol values that underlies
                 the ANS approach, rather than just one; (ii) the use of a byte-friendly strategic mapping from symbols
                 to ANS codeword buckets; and (iii) the use of a context merging process to combine similar probability
                 distributions. Collectively, these improvements yield superior compression for index data,
                 outperforming the reference point set by the Interp mechanism, and hence representing a significant
                 step forward. We describe experiments using the 426 GiB gov2 collection and a new large collection of
                 publicly-available news articles to demonstrate that claim, and provide query evaluation throughput
                 rates compared to other block-based mechanisms.",
  cc-author-affiliation = "University of Melbourne, Australia",
  cc-class     = "information-retrieval/search-engine, information-retrieval/inverted-index",
  cc-dataset-used = "CC-NEWS",
  cc-dataset-used-subset = "2016/09/01 - 2017/02/28",
  cc-snippet   = "The second pair of test files are derived from publicly available web-sourced news articles²
                 [²http://commoncrawl.org/2016/10/news-dataset-available/], taking English language news sources (as
                 identified by Apache Tika) from 01/09/2016 up until and including 28/02/2017, that is, a six month
                 crawl period that contains 7,508,082 documents.",
}

@InProceedings{cc:MotlogelwaThumaLeburu-Dingalo:2018:merging-search-results,
  author       = "Motlogelwa, Nkwebi and Thuma, Edwin and Leburu-Dingalo, Tebo",
  title        = "Merging search results generated by multiple query variants using data fusion",
  booktitle    = "CEUR Workshop Proceedings: Working Notes of CLEF 2018: Conference and Labs of the Evaluation Forum",
  year         = "2018",
  URL          = "http://ceur-ws.org/Vol-2125/paper_194.pdf",
  cc-author-affiliation = "University of Botswana, Botswana",
  cc-class     = "ir/multilingual-information-retrieval, ir/biomedical-information-extraction, ir/query-expansion",
  cc-derived-dataset-used = "CLEF-eHealth-2018-IR-task",
}

@Article{cc:NassifTreudeRobillard:2018:categorizing-software-technologies,
  title        = "Automatically Categorizing Software Technologies",
  author       = "Nassif, Mathieu and Treude, Christoph and Robillard, Martin",
  journal      = "IEEE Transactions on Software Engineering",
  year         = "2018",
  publisher    = "IEEE",
  URL          = "https://ieeexplore.ieee.org/abstract/document/8359344",
  cc-author-affiliation = "McGill University School of Computer Science, Montreal, Quebec, Canada",
  cc-class     = "nlp/semantics, ontology extraction, ir/information-extraction",
  cc-derived-dataset-cited = "WDC-WebIsADb",
  abstract     = "Informal language and the absence of a standard taxonomy for software technologies make it difficult
                 to reliably analyze technology trends on discussion forums and other on-line venues. We propose an
                 automated approach called Witt for the categorization of software technology (an expanded version of
                 the hypernym discovery problem). Witt takes as input a phrase describing a software technology or
                 concept and returns a general category that describes it (e.g., integrated development environment),
                 along with attributes that further qualify it (commercial, php, etc.). By extension, the approach
                 enables the dynamic creation of lists of all technologies of a given type (e.g., web application
                 frameworks). Our approach relies on Stack Overflow and Wikipedia, and involves numerous original domain
                 adaptations and a new solution to the problem of normalizing automatically-detected hypernyms. We
                 compared Witt with six independent taxonomy tools and found that, when applied to software terms, Witt
                 demonstrated better coverage than all evaluated alternate solutions, without a corresponding
                 degradation in false positive rate.",
  cc-snippet   = "All these approaches work by mining large text corpora. Among the latest such techniques is the WebIsA
                 Database [32] from the Web Data Commons project, which extracts hypernyms from CommonCrawl,¹ a
                 corpus of over 2.1 billion web pages. In contrast to these previous works, our method only requires
                 Stack Overflow tag information data and targeted Wikipedia searches. It creates a structure that links
                 a single term to an attributed category that describes the term.",
}

@Article{cc:NavarreteLujan-MoraEtAl:2018:quantitative-analysis-of-microdata-use,
  title        = "A Quantitative Analysis of the Use of Microdata for Semantic Annotations on Educational Resources",
  author       = "Navarrete, Rosa and Luján-Mora, Sergio",
  journal      = "Journal of Web Engineering",
  year         = "2018",
  publisher    = "Rinton Press",
  URL          = "http://rua.ua.es/dspace/handle/10045/73711",
  cc-author-affiliation = "Universidad de Alicante, Spain",
  cc-class     = "semantic web, structured data, microdata",
  cc-derived-dataset-cited = "WebDataCommons",
  cc-snippet   = "This quantitative analysis was conducted on datasets extracted from the Common Crawl Corpus [17], as
                 it is the largest corpus of web crawl. The datasets containing structured data were extracted by the
                 Web Data Commons (WDC) project [18] and are available for public use. Two datasets were considered: the
                 first, from December 2014, with 2.01 billion pages, of which 620 million pages correspond to structured
                 data; and the second, from November 2015, with 1.77 billion pages, of which 541 million pages
                 correspond to structured data.",
}

@Article{cc:NegriTurchiChatterjeeBertoldi:2018:eSCAPE-synthetic-corpus,
  author       = "Negri, Matteo and Turchi, Marco and Chatterjee, Rajen and Bertoldi, Nicola",
  title        = "e{SCAPE}: a Large-scale Synthetic Corpus for Automatic Post-Editing",
  journal      = "arXiv preprint arXiv:1803.07274",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1803.07274",
  cc-author-affiliation = "Fondazione Bruno Kessler, Trento, Italy; University of Trento, Italy",
  cc-class     = "nlp/machine-translation",
  cc-derived-dataset-used = "WMT-13-translation-task-common-crawl-corpus",
  cc-snippet   = "A widely used resource, described in (Junczys-Dowmunt and Grundkiewicz, 2016), was included in the
                 training set of the winning (and almost all) submissions to the last two English–German rounds of the
                 APE task at WMT (IT domain). It consists of 4.3 million instances created by first filtering a subset
                 of IT-related sentences from the German Common Crawl corpus⁶, and then by using two English–German
                 and German–English PBMT systems trained on in-domain IT corpora for a round-trip translation of the
                 selected sentences (De → En → De).",
}

@Article{cc:NemeskeyKornai:2018:emergency-vocabulary,
  author       = "Nemeskey, Dávid Márk and Kornai, András",
  title        = "Emergency vocabulary",
  journal      = "Information Systems Frontiers",
  volume       = "20",
  number       = "5",
  pages        = "909--923",
  year         = "2018",
  publisher    = "Springer",
  URL          = "https://link.springer.com/article/10.1007%2Fs10796-018-9843-x",
  cc-author-affiliation = "HAS Institute of Computer Science, Budapest, Hungary",
  cc-class     = "nlp/vocabulary-extraction, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@Article{cc:NguyenNguyenIchiseTakeda:2018:EmbNum-semantic-labeling,
  author       = "Nguyen, Phuc and Nguyen, Khai and Ichise, Ryutaro and Takeda, Hideaki",
  title        = "EmbNum: Semantic labeling for numerical values with deep metric learning",
  journal      = "arXiv preprint arXiv:1807.01367",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1807.01367",
  cc-author-affiliation = "SOKENDAI (The Graduate University for Advanced Studies) Shonan Village, Hayama, Kanagawa,
                 Japan; National Institute of Informatics, Tokyo, Japan",
  cc-derived-dataset-cited = "WDCWebTables",
  cc-snippet   = "In a study of Lehmberg et al., 233 million tables were extracted from the July 2015 version of the
                 Common Crawl [...]",
}

@Article{cc:NiuDenkowskiCarpuat:2018:bi-directional-neural-machine-translation,
  author       = "Niu, Xing and Denkowski, Michael and Carpuat, Marine",
  title        = "Bi-Directional Neural Machine Translation with Synthetic Parallel Data",
  journal      = "arXiv preprint arXiv:1805.11213",
  year         = "2018",
  URL          = "https://arxiv.org/pdf/1805.11213.pdf",
  cc-author-affiliation = "University of Maryland; Amazon.com, Inc.",
  cc-class     = "nlp/machine-translation",
}

@InProceedings{cc:OhshimaToyama:2018:structured-data-collection,
  author       = "Ohshima, Takuya and Toyama, Motomichi",
  title        = "{SDC}: structured data collection by yourself",
  booktitle    = "Proceedings of the 8th International Conference on Information Systems and Technologies",
  pages        = "3",
  year         = "2018",
  organization = "ACM",
  URL          = "https://dl.acm.org/citation.cfm?id=3200849",
  cc-author-affiliation = "Keio University, Yokohama, Kanagawa, Japan",
  cc-class     = "web-crawling, semantic web, structured data",
  cc-derived-dataset-cited = "WebDataCommons",
}

@Article{cc:OttAuliGrangerRanzato:2018:uncertainty-in-neural-machine-translation,
  author       = "Ott, Myle and Auli, Michael and Granger, David and Ranzato, Marc'Aurelio",
  title        = "Analyzing uncertainty in neural machine translation",
  journal      = "arXiv preprint arXiv:1803.00047",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1803.00047",
  cc-author-affiliation = "Facebook AI Research, USA",
  cc-class     = "cc-cited-not-used, nlp/machine-translation",
}

@Article{cc:PeirsonAbelTolunay:2018:generating-memes,
  title        = "Dank Learning: Generating Memes Using Deep Neural Networks",
  author       = "Peirson, V, Abel L. and Tolunay, E. Meltem",
  journal      = "arXiv preprint arXiv:1806.04510",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1806.04510",
  cc-author-affiliation = "Stanford University, USA",
  cc-class     = "nlp/text-generation, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@Article{cc:PeroneSilveiraPaula:2018:evaluation-of-sentence-embeddings,
  title        = "Evaluation of sentence embeddings in downstream and linguistic probing tasks",
  author       = "Perone, Christian S. and Silveira, Roberto and Paula, Thomas S.",
  journal      = "arXiv preprint arXiv:1806.06259",
  year         = "2018",
  cc-derived-dataset-used = "fastText-word-embeddings, GloVe-word-embeddings",
  URL          = "https://arxiv.org/abs/1806.06259",
  cc-author-affiliation = "Universitat Politècnica de Catalunya, Barcelona, Spain",
  cc-class     = "nlp/word-embeddings, nlp/sentence-embeddings",
}

@Article{cc:PetriMoffat:2018:inverted-index,
  author       = "Petri, Matthias and Moffat, Alistair",
  title        = "Compact inverted index storage using general-purpose compression libraries",
  journal      = "Software: Practice and Experience",
  year         = "2018",
  publisher    = "Wiley Online Library",
  ISSN         = "1097-024X",
  doi          = "10.1002/spe.2556",
  URL          = "http://dx.doi.org/10.1002/spe.2556",
  keywords     = "index compression, inverted index, web search",
  abstract     = "Efficient storage of large inverted indexes is one of the key technologies that support current web
                 search services. Here we re-examine mechanisms for representing document-level inverted indexes and
                 within-document term frequencies, including comparing specialized methods developed for this task
                 against recent fast implementations of general-purpose adaptive compression techniques. Experiments
                 with the Gov2-URL collection and a large collection of crawled news stories show that standard
                 compression libraries can provide compression effectiveness as good as or better than previous methods,
                 with decoding rates only moderately slower than reference implementations of those tailored approaches.
                 This surprising outcome means that high-performance index compression can be achieved without requiring
                 the use of specialized implementations.",
  cc-author-affiliation = "University of Melbourne, Australia",
  cc-class     = "information-retrieval/search-engine, information-retrieval/inverted-index",
  cc-dataset-used = "CC-NEWS",
  cc-dataset-used-subset = "2016/09/01 - 2017/02/28",
  cc-snippet   = "We also develop (and make freely available) a new IR test collection based on the News sub-collection
                 of the Common Crawl∗∗. The News sub-collection provides daily crawls of news websites in many
                 languages. We refer to this collection as CC-NEWS-URL. We provide all scripts to download the freely
                 available source WARC files from Amazon AWS and process them using Apache Tika and Apache Lucene in a
                 consistent manner. The resulting consistency enables researchers to perform experiments on exactly the
                 collection in their experiments, and improves comparability of results between different rounds of
                 experimentation. For example, the number of terms reported for the GOV2-URL collection ranges from 18
                 million up to 48 million, preventing fair and direct comparison between results reported in different
                 papers. The number of WARC files in CC-NEWS-URL increases each day, and hence we specify the collection
                 using: (1) a date range; and (2) a language filter. For example, in this work, we utilize the
                 CC-NEWS-20160901-2017028-EN collection which uses all English language news sources (as identified by
                 Apache Tika) from 01/09/2016 up until and including 28/02/2017, that is, a six month crawl period that
                 contains 7,508,082 documents, 26,240,031 unique terms and 4,457,492,131 postings. Currently the
                 CC-NEWS-URL collection grows by roughly 50,000 English documents per day. This exact parsing can be
                 reproduced by the scripts provided at https://github.com/mpetri/rlz-invidx and
                 https://github.com/mpetri/TikaLuceneWarc, with raw postings lists stored in the popular “ds2i”
                 format††. Document identifiers are again reassigned in URL order. We also explored a date-ordered
                 collection based on the same source data, and obtained – method-for-method – uniformly weaker
                 compression outcomes than for URL-sorted, in part because many of the URLs contain dates encoded in
                 them anyway.",
}

@Article{cc:PilehvarKartsaklisProkhorovCollier:2018:Card-660-Cambridge-rare-word-dataset,
  author       = "Pilehvar, Mohammad Taher and Kartsaklis, Dimitri and Prokhorov, Victor and Collier, Nigel",
  title        = "Card-660: Cambridge Rare Word Dataset-a Reliable Benchmark for Infrequent Word Representation Models",
  journal      = "arXiv preprint arXiv:1808.09308",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1808.09308",
  cc-author-affiliation = "University of Cambridge, United Kingdom",
  cc-class     = "linguistics, nlp/semantics, nlp/word-embeddings, lexicography",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@Article{cc:PrabhumoyeTsvetkovSalakhutdinovBlack:2018:style-transfer-through-back-translation,
  author       = "Prabhumoye, Shrimai and Tsvetkov, Yulia and Salakhutdinov, Ruslan and Black, Alan W",
  title        = "Style Transfer Through Back-Translation",
  journal      = "arXiv preprint arXiv:1804.09000",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1804.09000",
  cc-author-affiliation = "Carnegie Mellon University, Pittsburgh, PA, USA",
  cc-class     = "nlp/machine-translation",
  cc-derived-dataset-used = "WMT-13-translation-task-common-crawl-corpus",
}

@Misc{cc:RaananiLevyBeakstoneFacher:2018:automatically-identify-product-feature-requests,
  title        = "Analyzing conversations to automatically identify product feature requests",
  author       = "Raanani, Roy and Levy, Russell and Breakstone, Micha Yochanan and Facher, Dominik",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 15/902,751",
  URL          = "https://patents.google.com/patent/US20180183930A1/en",
  cc-author-affiliation = "Affectlayer Inc",
  cc-class     = "nlp/text-corpora, cc-cited-not-used, patent",
  cc-snippet   = "At the same time, natural language processing (NLP) approaches to both topic modeling and
                 world-knowledge modeling, have become much more efficient due to the availability of large, freely
                 accessible natural language corpora (e.g., CommonCrawl), ...",
}

@Misc{cc:RaananiLevyBreadstone:2018:automatic-generation-of-playlists,
  title        = "Automatic generation of playlists from conversations",
  author       = "Raanani, Roy and Levy, Russell and Breakstone, Micha Yochanan",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 15/793,691",
  URL          = "https://patents.google.com/patent/US20180046710A1/en",
  cc-author-affiliation = "Affectlayer Inc",
  cc-class     = "nlp/text-corpora, cc-cited-not-used, patent",
  cc-snippet   = "At the same time, natural language processing (NLP) approaches to both topic modeling and
                 world-knowledge modeling, have become much more efficient due to the availability of large, freely
                 accessible natural language corpora (e.g., CommonCrawl), ...",
}

@Misc{cc:RaananiLevyBreakstone:2018:coordinating-voice-calls,
  author       = "Raanani, Roy and Levy, Russell and Breakstone, Micha Yochanan",
  title        = "Coordinating voice calls between representatives and customers to influence an outcome of the call",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent 9,900,436",
  URL          = "https://patents.google.com/patent/US9900436B2/en",
  cc-author-affiliation = "Affectlayer Inc",
  cc-class     = "nlp/text-corpora, cc-cited-not-used, patent",
  cc-snippet   = "At the same time, natural language processing (NLP) approaches to both topic modeling and
                 world-knowledge modeling, have become much more efficient due to the availability of large, freely
                 accessible natural language corpora (e.g., CommonCrawl), ...",
}

@Misc{cc:RaananiLevyBreakstone:2018:modeling-voice-calls,
  title        = "Modeling voice calls to improve an outcome of a call between a representative and a customer",
  author       = "Raanani, Roy and Levy, Russell and Breakstone, Micha Yochanan",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 16/017,646",
  URL          = "https://patents.google.com/patent/US20180309873A1/en",
  cc-author-affiliation = "Affectlayer Inc",
  cc-class     = "nlp/text-corpora, cc-cited-not-used, patent",
  cc-snippet   = "At the same time, natural language processing (NLP) approaches to both topic modeling and
                 world-knowledge modeling, have become much more efficient due to the availability of large, freely
                 accessible natural language corpora (e.g., CommonCrawl), ...",
}

@Misc{cc:RaananiLevyBreakstoneFacher:2018:automatically-identify-action-items,
  author       = "Raanani, Roy and Levy, Russell and Breakstone, Micha Yochanan and Facher, Dominik",
  title        = "Analyzing conversations to automatically identify action items",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 15/854,642",
  URL          = "https://patents.google.com/patent/US20180122383A1/en",
  cc-author-affiliation = "Affectlayer Inc",
  cc-class     = "nlp/text-corpora, cc-cited-not-used, patent",
  cc-snippet   = "At the same time, natural language processing (NLP) approaches to both topic modeling and
                 world-knowledge modeling, have become much more efficient due to the availability of large, freely
                 accessible natural language corpora (e.g., CommonCrawl), ...",
}

@Misc{cc:RaananiLevyBreakstoneFacher:2018:automatically-identify-customer-pain-points,
  author       = "Raanani, Roy and Levy, Russell and Breakstone, Micha Yochanan and Facher, Dominik",
  title        = "Analyzing conversations to automatically identify customer pain points",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 15/902,808",
  URL          = "https://patents.google.com/patent/US20180181561A1/en",
  cc-author-affiliation = "Affectlayer Inc",
  cc-class     = "nlp/text-corpora, cc-cited-not-used, patent",
  cc-snippet   = "At the same time, natural language processing (NLP) approaches to both topic modeling and
                 world-knowledge modeling, have become much more efficient due to the availability of large, freely
                 accessible natural language corpora (e.g., CommonCrawl), ...",
}

@Misc{cc:RaananiLevyBreakstoneFacher:2018:identify-product-features,
  author       = "Raanani, Roy and Levy, Russell and Breakstone, Micha Yochanan and Facher, Dominik",
  title        = "Analyzing conversations to automatically identify product features that resonate with customers",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 15/937,494",
  URL          = "https://patents.google.com/patent/US20180183930A1/en",
  cc-author-affiliation = "Affectlayer Inc",
  cc-class     = "nlp/text-corpora, cc-cited-not-used, patent",
  cc-snippet   = "At the same time, natural language processing (NLP) approaches to both topic modeling and
                 world-knowledge modeling, have become much more efficient due to the availability of large, freely
                 accessible natural language corpora (e.g., CommonCrawl), ...",
}

@Misc{cc:RaananiLevyFacherBreakstone:2018:automatic-pattern-recognition,
  author       = "Raanani, Roy and Levy, Russell and Facher, Dominik and Breakstone, Micha Yochanan",
  title        = "Automatic pattern recognition in conversations",
  year         = "2018",
  note         = "US Patent App. 15/817,490",
  URL          = "http://www.freepatentsonline.com/10110743.html",
  cc-author-affiliation = "Affectlayer Inc",
  cc-class     = "nlp/text-corpora, cc-cited-not-used, patent",
  cc-snippet   = "At the same time, natural language processing (NLP) approaches to both topic modeling and
                 world-knowledge modeling, have become much more efficient due to the availability of large, freely
                 accessible natural language corpora (e.g., CommonCrawl), ...",
}

@Misc{cc:RaananiLevyFacherBreakstone:2018:automatically-identify-deals-at-risk,
  author       = "Raanani, Roy and Levy, Russell and Facher, Dominik and Breakstone, Micha Yochanan",
  title        = "Analyzing conversations to automatically identify deals at risk",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 15/835,807",
  URL          = "https://patents.google.com/patent/US10133999B2/en",
  cc-author-affiliation = "Affectlayer Inc",
  cc-class     = "nlp/text-corpora, cc-cited-not-used, patent",
  cc-snippet   = "At the same time, natural language processing (NLP) approaches to both topic modeling and
                 world-knowledge modeling, have become much more efficient due to the availability of large, freely
                 accessible natural language corpora (e.g., CommonCrawl), ...",
}

@Misc{cc:RaimanMiller:2018:reader-systems,
  author       = "Raiman, Jonathan and Miller, John",
  title        = "Global normalized reader systems and methods",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 15/706,486",
  URL          = "https://patents.google.com/patent/US20180300312A1/en",
  cc-author-affiliation = "Baidu USA LLC",
  cc-class     = "nlp/question-answering, nlp/word-embeddings, patent",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-snippet   = "In embodiments, the 300 dimensional 8.4B token Common Crawl GloVe vectors were used. Words missing
                 from the Common Crawl vocabulary were set to zero.",
}

@Article{cc:RaisonMazareDasBordes:2018:Weaver-deep-co-encoding,
  author       = "Raison, Martin and Mazaré, Pierre-Emmanuel and Das, Rajarshi and Bordes, Antoine",
  title        = "Weaver: Deep Co-Encoding of Questions and Documents for Machine Reading",
  journal      = "arXiv preprint arXiv:1804.10490",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1804.10490",
  cc-author-affiliation = "Facebook AI Research, Paris, France; University of Massachusetts, Amherst, USA",
  cc-class     = "nlp/question-answering, nlp/word-embeddings, information retrieval",
  cc-derived-dataset-used = "fastText-word-embeddings",
}

@Article{cc:RistoskiPetrovskiMikaPaulheim:2018:product-matching,
  title        = "A machine learning approach for product matching and categorization",
  author       = "Ristoski, Petar and Petrovski, Petar and Mika, Peter and Paulheim, Heiko",
  journal      = "Semantic Web",
  number       = "Preprint",
  pages        = "1--22",
  year         = "2018",
  publisher    = "IOS Press",
  URL          = "https://content.iospress.com/articles/semantic-web/sw300",
  pdf          = "http://www.semantic-web-journal.net/system/files/swj1664.pdf",
  cc-author-affiliation = "University of Mannheim, Germany; Yahoo Labs, London, United Kingdom",
  cc-class     = "semantic web, information extraction, microdata, linked data, data integration",
  cc-derived-dataset-used = "WDC-triples",
}

@Article{cc:RomanovShivade:2018:natural-language-inference-clinical-domain,
  author       = "Romanov, Alexey and Shivade, Chaitanya",
  title        = "Lessons from Natural Language Inference in the Clinical Domain",
  journal      = "arXiv preprint arXiv:1808.06752",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1808.06752",
  cc-author-affiliation = "University of Massachusetts Lowell, USA; IBM Almaden Research Center, San Jose, CA, USA",
  cc-class     = "nlp, natural language inference",
  cc-derived-dataset-used = "GloVe-word-embeddings, fastText-word-embeddings",
}

@InProceedings{cc:RosenfeldUllman:2018:action-classification,
  author       = "Rosenfeld, Amir and Ullman, Shimon",
  title        = "Action Classification via Concepts and Attributes",
  booktitle    = "2018 24th International Conference on Pattern Recognition (ICPR)",
  pages        = "1499--1505",
  year         = "2018",
  organization = "IEEE",
  URL          = "https://ieeexplore.ieee.org/abstract/document/8546184",
  pdf          = "https://www.researchgate.net/profile/Amir_Rosenfeld/publication/303521516_Action_Classification_via_Concepts_and_Attributes/links/57c6ca5408ae28c01d4eee0e/Action-Classification-via-Concepts-and-Attributes.pdf",
  cc-author-affiliation = "Weizmann Institute of Science, Rehovot, Israel",
  cc-class     = "nlp/word-embeddings, ai/computer-vision, image-classification",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:RossenbachRosendahlKimGracaEtAl:2018:RWTH-Aachen-University-filtering-system,
  author       = "Rossenbach, Nick and Rosendahl, Jan and Kim, Yunsu and Graça, Miguel and Gokrani, Aman and Ney,
                 Hermann",
  title        = "The {RWTH} Aachen University filtering system for the {WMT} 2018 parallel corpus filtering task",
  booktitle    = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",
  pages        = "946--954",
  year         = "2018",
  URL          = "https://www.aclweb.org/anthology/W18-6487",
  cc-author-affiliation = "RWTH Aachen University, Germany",
  cc-class     = "nlp/machine-translation, nlp/corpus-construction",
  cc-derived-dataset-used = "WMT-16-translation-task-common-crawl-corpus",
}

@InProceedings{cc:RoyGangulyBhatiaBedathurEtAl:2018:word-embeddings-for-information-retrieval,
  title        = "Using Word Embeddings for Information Retrieval: How Collection and Term Normalization Choices Affect
                 Performance",
  author       = "Roy, Dwaipayan and Ganguly, Debasis and Bhatia, Sumit and Bedathur, Srikanta and Mitra, Mandar",
  booktitle    = "Proceedings of the 27th ACM International Conference on Information and Knowledge Management",
  pages        = "1835--1838",
  year         = "2018",
  organization = "ACM",
  cc-snippet   = "In future, we plan to solidify these observations [...] as well as experiment using large datasets
                 (e.g. Common Crawl).",
  URL          = "https://dl.acm.org/citation.cfm?id=3269277",
  pdf          = "http://www.cse.iitd.ac.in/~srikanta/publication/cikm-18/cikm-18.pdf",
  cc-author-affiliation = "Indian Statistical Institute, Kolkata, India; IBM Research, Dublin, Ireland; IBM Research,
                 Delhi, India; Indian Institute of Technology, Delhi, India",
  cc-class     = "cc-cited-not-used, nlp/word-embeddings, information-retrieval/term-normalization",
}

@Article{cc:RuddHarangSaxe:2018:malicious-email-attachment-detection,
  title        = "{MEADE}: Towards a Malicious Email Attachment Detection Engine",
  author       = "Rudd, Ethan M. and Harang, Richard and Saxe, Joshua",
  journal      = "arXiv preprint arXiv:1804.08162",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1804.08162",
  abstract     = "Malicious email attachments are a growing delivery vector for malware. While machine learning has been
                 successfully applied to portable executable (PE) malware detection, we ask, can we extend similar
                 approaches to detect malware across heterogeneous file types commonly found in email attachments? In
                 this paper, we explore the feasibility of applying machine learning as a static countermeasure to detect
                 several types of malicious email attachments including Microsoft Office documents and Zip archives. To
                 this end, we collected a dataset of over 5 million malicious/benign Microsoft Office documents from
                 VirusTotal for evaluation as well as a dataset of benign Microsoft Office documents from the Common
                 Crawl corpus, which we use to provide more realistic estimates of thresholds for false positive rates
                 on in-the-wild data. We also collected a dataset of approximately 500k malicious/benign Zip archives,
                 which we scraped using the VirusTotal service, on which we performed a separate evaluation. We analyze
                 predictive performance of several classifiers on each of the VirusTotal datasets using a 70/30
                 train/test split on first seen time, evaluating feature and classifier types that have been applied
                 successfully in commercial antimalware products and R&D contexts. Using deep neural networks and
                 gradient boosted decision trees, we are able to obtain ROC curves with >0.99 AUC on both Microsoft
                 Office document and Zip archive datasets. Discussion of deployment viability in various antimalware
                 contexts is provided.",
  cc-author-affiliation = "Sophos Group PLC, VA, USA",
  cc-class     = "web-science, computer-security/email-security",
}

@InProceedings{cc:RybinskiMillerDelSerBilbaoEtAl:2018:language-toxicity-classification,
  title        = "On the Design and Tuning of Machine Learning Models for Language Toxicity Classification in Online
                 Platforms",
  author       = "Rybinski, Maciej and Miller, William and Del Ser, Javier and Bilbao, Miren Nekane and Aldana-Montes,
                 José F.",
  booktitle    = "International Symposium on Intelligent and Distributed Computing",
  pages        = "329--343",
  year         = "2018",
  publisher    = "Springer",
  doi          = "10.1007/978-3-319-99626-4_29",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-319-99626-4_29",
  cc-author-affiliation = "University of Málaga, Spain; Anami Precision, San Sebastián, Spain; TECNALIA, Bizkaia,
                 Spain; Basque Center for Applied Mathematics (BCAM), Bizkaia, Spain; University of the Basque Country
                 (UPV/EHU), Bilbao, Spain",
  cc-class     = "nlp/text-classification, nlp/sentiment-analysis, nlp/word-embeddings, ai/deep-learning",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:SalehPecina:2018:CUNI-team-CLEF-eHealth,
  author       = "Saleh, Shadi and Pecina, Pavel",
  title        = "{CUNI} team: {CLEF} eHealth Consumer Health Search Task 2018",
  booktitle    = "CEUR Workshop Proceedings: Working Notes of CLEF",
  year         = "2018",
  URL          = "http://ceur-ws.org/Vol-2125/paper_201.pdf",
  cc-author-affiliation = "Charles University, Czech Republic",
  cc-class     = "ir/multilingual-information-retrieval, ir/biomedical-information-extraction, nlp/machine-translation",
  cc-derived-dataset-used = "CLEF-eHealth-2018-IR-task",
  cc-snippet   = "Document collection in the CLEF 2018 consumer health search task is created using CommonCrawl
                 platform¹. First, the query set (described in Section 2.2) is submitted to Microsoft Bing APIs, and a
                 list of domains is extracted from the top retrieved results. This list is extended by adding reliable
                 health websites, at the end clefehealth2018_B (which we use in this work) contained 1,653 sites, after
                 excluding non-medical websites such as news websites. After preparing the domain list, these domains
                 are crawled and provided as an indexed collection to the participants.",
}

@Article{cc:SantusBiemannChersoni:2018:BomJi-at-SemEval-2018,
  title        = "BomJi at SemEval-2018 Task 10: Combining Vector-, Pattern- and Graph-based Information to Identify
                 Discriminative Attributes",
  author       = "Santus, Enrico and Biemann, Chris and Chersoni, Emmanuele",
  journal      = "arXiv preprint arXiv:1804.11251",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1804.11251",
  cc-author-affiliation = "Massachusetts Institute of Technology, USA; Universität Hamburg, Germany; Aix-Marseille
                 University, France",
  cc-class     = "nlp/semantics",
  cc-dataset-used = "??",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-snippet   = "Thirteen features related to word and word-feature frequency were calculated on the basis of the
                 information extracted from a corpus of 3.2B words, corresponding to about 20\% of the Common Crawl.",
}

@InProceedings{cc:Sarma:2018:learning-word-embeddings,
  author       = "Sarma, Prathusha Kameswara",
  title        = "Learning Word Embeddings for Data Sparse and Sentiment Rich Data Sets",
  booktitle    = "Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational
                 Linguistics: Student Research Workshop",
  pages        = "46--53",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/N18-4007",
  cc-author-affiliation = "University of Wisconsin-Madison",
  cc-class     = "nlp/semantics, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@Article{cc:SarmaLiangSethares:2018:domain-adapted-word-embeddings,
  title        = "Domain Adapted Word Embeddings for Improved Sentiment Classification",
  author       = "Sarma, Prathusha K and Liang, Yingyu and Sethares, William A",
  journal      = "arXiv preprint arXiv:1805.04576",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1805.04576",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-author-affiliation = "University of Wisconsin-Madison",
  cc-class     = "nlp/sentiment-analysis, nlp/word-embeddings",
}

@InProceedings{cc:SarmaSethares:2018:sentiment-analysis-data-poor-domains,
  title        = "Simple Algorithms For Sentiment Analysis On Sentiment Rich, Data Poor Domains",
  author       = "Sarma, Prathusha K and Sethares, William",
  booktitle    = "Proceedings of the 27th International Conference on Computational Linguistics",
  pages        = "3424--3435",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/C18-1290",
  cc-author-affiliation = "University of Wisconsin-Madison",
  cc-derived-dataset-cited = "GloVe-word-embeddings",
  cc-class     = "nlp/sentiment-analysis",
}

@InProceedings{cc:SchamoniHitschlerRiezler:2018:dataset-multimodal-MT-image-captions,
  title        = "A dataset and reranking method for multimodal {MT} of user-generated image captions",
  author       = "Schamoni, Shigehiko and Hitschler, Julian and Riezler, Stefan",
  booktitle    = "Proceedings of the 13th Conference of the Association for Machine Translation in the Americas (Vol.
                 1: MT Researchers’ Track)",
  pages        = "140",
  year         = "2018",
  URL          = "https://amtaweb.org/wp-content/uploads/2018/03/AMTA_2018_Proceedings_Research_Track.pdf#page=146",
  cc-author-affiliation = "Heidelberg University, Germany",
  cc-derived-dataset-used = "WMT-13-translation-task-common-crawl-corpus",
  cc-class     = "nlp/machine-translation",
}

@InProceedings{cc:SchamperRosendahlBaharKimEtAl:2018:RWTH-Aachen-University-WMT-2018,
  author       = "Schamper, Julian and Rosendahl, Jan and Bahar, Parnia and Kim, Yunsu and Nix, Arne and Ney, Hermann",
  title        = "The {RWTH} Aachen University supervised machine translation systems for {WMT} 2018",
  booktitle    = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",
  pages        = "496--503",
  year         = "2018",
  URL          = "https://www.aclweb.org/anthology/W18-6426",
  cc-author-affiliation = "RWTH Aachen University, Germany",
  cc-class     = "nlp/machine-translation",
  cc-derived-dataset-used = "WMT-16-translation-task-common-crawl-corpus",
}

@Article{cc:SchelterKunegis:2018:ubiquity-web-tracking,
  title        = "On the Ubiquity of Web Tracking: Insights from a Billion-Page Web Crawl",
  author       = "Schelter, Sebastian and Kunegis, Jérôme",
  journal      = "The Journal of Web Science",
  volume       = "4",
  number       = "4",
  pages        = "53--66",
  year         = "2018",
  doi          = "10.1561/106.00000014",
  URL          = "http://dx.doi.org/10.1561/106.00000014",
  cc-author-affiliation = "Technical University Berlin, Germany; University of Namur, Belgium",
  cc-class     = "web-science/tracking",
  cc-derived-dataset-about = "tracking-the-trackers",
}

@Article{cc:Schwenk:2018:filtering-mining-parallel-data,
  author       = "Schwenk, Holger",
  title        = "Filtering and Mining Parallel Data in a Joint Multilingual Space",
  journal      = "CoRR",
  volume       = "abs/1805.09822",
  year         = "2018",
  URL          = "http://arxiv.org/abs/1805.09822",
  eprint       = "1805.09822",
  archiveprefix = "arXiv",
  cc-derived-dataset-used = "WMT-13-translation-task-common-crawl-corpus",
  cc-class     = "nlp/machine-translation",
  cc-author-affiliation = "Facebook AI Research",
}

@InProceedings{cc:SevaSangerLeser:2018:WBI-at-CLEF-eHealth-2018-task,
  title        = "{WBI} at {CLEF} eHealth 2018 Task 1: Language-independent {ICD}-10 coding using multi-lingual
                 embeddings and recurrent neural networks",
  author       = "Ševa, Jurica and Sänger, Mario and Leser, Ulf",
  booktitle    = "CEUR Workshop Proceedings: Working Notes of CLEF",
  year         = "2018",
  organization = "CLEF",
  URL          = "http://ceur-ws.org/Vol-2125/paper_118.pdf",
  cc-author-affiliation = "Humboldt-Universität zu Berlin, Germany",
  cc-derived-dataset-used = "CLEF-eHealth-2018-IR-task",
  cc-class     = "ir/multilingual-information-retrieval, ir/biomedical-information-extraction, nlp/machine-translation,
                 nlp/word-embeddings",
}

@Article{cc:ShainFutrellVanSchijndel:2018:semantic-processing-difficulty,
  author       = "Shain, Cory and Futrell, Richard and van Schijndel, Marten and Gibson, Edward and Schuler, William",
  title        = "Evidence of semantic processing difficulty in naturalistic reading",
  year         = "2018",
  URL          = "https://vansky.github.io/assets/pdf/shain_etal-2018-cuny.pdf",
  cc-author-affiliation = "Ohio State University; MIT; Johns Hopkins University",
  cc-class     = "nlp, psycholinguistics",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-snippet   = "[...] using GloVe vectors [20] pretrained on the 840B word Common Crawl dataset [...]",
}

@InProceedings{cc:ShalevAdiKeshet:2018:out-of-distribution-detection,
  author       = "Shalev, Gabi and Adi, Yossi and Keshet, Joseph",
  title        = "Out-of-distribution detection using multiple semantic label representations",
  booktitle    = "Advances in Neural Information Processing Systems",
  pages        = "7386--7396",
  year         = "2018",
  URL          = "http://papers.nips.cc/paper/7967-out-of-distribution-detection-using-multiple-semantic-label-representations",
  pdf          = "http://papers.nips.cc/paper/7967-out-of-distribution-detection-using-multiple-semantic-label-representations.pdf",
  cc-author-affiliation = "Bar-Ilan University, Israel",
  cc-class     = "nlp/semantics, nlp/word-embeddings, ai/neural-networks, ai/computer-vision, nlp/speech-recognition",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:ShravaniJhaGuha:2018:correlate-emotional-intelligence-and-happiness,
  title        = "A Machine Learning Approach to Correlate Emotional Intelligence and Happiness Based on Twitter Data",
  author       = "Shravani, Sistla Sai and Jha, Niraj Kumar and Guha, Rajlaksmi",
  booktitle    = "Proceedings of the 32nd International BCS Human Computer Interaction Conference (HCI 2018)",
  year         = "2018",
  URL          = "http://hci2018.bcs.org/prelim_proceedings/papers/Work-in-Progress%20Track/BHCI-2018_paper_115.pdf",
  cc-author-affiliation = "IIT Kharagpur, India",
  cc-derived-dataset-used = "fastText-word-embeddings",
  cc-class     = "nlp/sentiment-analysis, nlp/word-embeddings",
}

@Article{cc:SimsekFensel:2018:intent-generation-for-dialogue-systems,
  author       = "Şimşek, Umutcan and Fensel, Dieter",
  title        = "Intent Generation for Goal-Oriented Dialogue Systems based on Schema.org Annotations",
  journal      = "arXiv preprint arXiv:1807.01292",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1807.01292",
  cc-author-affiliation = "University of Innsbruck, Austria",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/dialogue-systems, semantic web, microformats",
}

@Misc{cc:SinghJiaoLevinaSaini:2018:using-open-data-to-predict-market-movements,
  title        = "Using open data to predict market movements",
  author       = "Singh, Ravinder and Levina, Marina and Jiao, Nelson and Saini, Asha",
  year         = "2018",
  URL          = "https://education.emc.com/content/dam/dell-emc/documents/en-us/2017KS_Ravinder-Using_Open_Data_to_Predict_Market_Movements.pdf",
  cc-author-affiliation = "DELL EMC",
  cc-class     = "market research, nlp, information retrieval",
  cc-snippet   = "We found that The Register articles for specific vendors extracted from the common crawl data set are
                 highly correlated with our reading of General Purpose Magic Quadrant position movements in time. [...]
                 The Figure 11 : Common Crawl Data Processing Flow Diagram shows a broad overview of the steps involved
                 in the analysis of common crawl data. Going from the bottom up it shows how the data is extracted,
                 processed and visualized. The amount of data in each phase becomes more streamlined and, hence, the
                 reduction in size of the data being worked on. We start with the crawl data, extract the pages of
                 interest into a private storage bucket, and then process it to remove unwanted words/tags. At the end,
                 visualization tools are used to graphically display the results. These can be used to publish standard
                 reports or customized by users to support their own analysis.",
}

@Article{cc:SmithLeeman-MunkSheltonMottEtAl:2018:student-writing-and-drawing,
  author       = "Smith, Peter Andrew Miller and Leeman-Munk, Samuel and Shelton, Angi and Mott, Bradford W and Wiebe,
                 Eric and Lester, James",
  title        = "A multimodal assessment framework for integrating student writing and drawing in elementary science
                 learning",
  journal      = "IEEE Transactions on Learning Technologies",
  year         = "2018",
  publisher    = "IEEE",
  URL          = "https://ieeexplore.ieee.org/abstract/document/8274912/",
  pdf          = "https://www.intellimedia.ncsu.edu/wp-content/uploads/Smith-TLT-2018.pdf",
  cc-class     = "nlp/word-embeddings, nlp/semantics, education, tutoring systems, student writing",
  cc-author-affiliation = "North Carolina State University, Raleigh, NC, USA; SAS Institute Inc., Cary, NC, USA",
}

@PhdThesis{cc:Soldaini:2018:knowledge-and-language-gap-in-medical-information-seeking,
  author       = "Soldaini, Luca",
  title        = "The Knowledge and Language Gap in Medical Information Seeking",
  school       = "Georgetown University",
  year         = "2018",
  URL          = "https://search.proquest.com/openview/e669cd1478b33d52fa4cc71e8393c639/1",
  cc-author-affiliation = "Georgetown University, USA",
  cc-class     = "ir/multilingual-information-retrieval, ir/biomedical-information-retrieval",
  cc-derived-dataset-cited = "CLEF-eHealth-2018-IR-task",
}

@Article{cc:SongWangYuZhangEtAl:2018:graph-structured-passage-representation,
  author       = "Song, Linfeng and Wang, Zhiguo and Yu, Mo and Zhang, Yue and Florian, Radu and Gildea, Daniel",
  title        = "Exploring graph-structured passage representation for multi-hop reading comprehension with graph
                 neural networks",
  journal      = "arXiv preprint arXiv:1809.02040",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1809.02040",
  cc-author-affiliation = "University of Rochester, Rochester, NY, USA; IBM T.J. Watson Research Center, Yorktown
                 Heights, NY, USA; School of Engineering, Westlake University, China",
  cc-class     = "nlp/word-embeddings, nlp/machine-reading, nlp/coreference-resolution, nlp/question-answering",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:SpauldingChenAliKulinskiEtAl:2018:social-robot-system,
  title        = "A social robot system for modeling children's word pronunciation: socially interactive agents track",
  author       = "Spaulding, Samuel and Chen, Huili and Ali, Safinah and Kulinski, Michael and Breazeal, Cynthia",
  booktitle    = "Proceedings of the 17th International Conference on Autonomous Agents and MultiAgent Systems",
  pages        = "1658--1666",
  year         = "2018",
  publisher    = "International Foundation for Autonomous Agents and Multiagent Systems",
  URL          = "https://dl.acm.org/citation.cfm?id=3237946",
  cc-author-affiliation = "Massachusetts Institute of Technology, Cambridge, MA, USA",
  cc-derived-dataset-cited = "GloVe-word-embeddings",
  cc-class     = "computer-vision, nlp/word-embeddings",
}

@InProceedings{cc:StabDaxenbergerStahlhutMillerEtAl:2018:ArgumenText-searching-for-arguments,
  title        = "ArgumenText: Searching for Arguments in Heterogeneous Sources",
  author       = "Stab, Christian and Daxenberger, Johannes and Stahlhut, Chris and Miller, Tristan and Schiller,
                 Benjamin and Tauchmann, Christopher and Eger, Steffen and Gurevych, Iryna",
  booktitle    = "Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational
                 Linguistics: Demonstrations",
  pages        = "21--25",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/N18-5005",
  cc-snippet   = "we build upon the English part of CommonCrawl, [...] we followed Habernal et al. (2016) for
                 de-duplication, boiler-plate removal using jusText (Pomikálek, 2011), and language detection.² This
                 left us with 400 million heterogeneous plain-text documents in English, with an overall size of 683
                 GiB.",
  cc-author-affiliation = "Ubiquitous Knowledge Processing Lab, Department of Computer Science, Technische Universität
                 Darmstadt, Germany",
  cc-class     = "nlp/argument-mining",
}

@Article{cc:StahlbergGispertByrne:2018:University-of-Cambridges-WMT18,
  author       = "Stahlberg, Felix and de Gispert, Adria and Byrne, Bill",
  title        = "The University of Cambridge's Machine Translation Systems for {WMT18}",
  journal      = "arXiv preprint arXiv:1808.09465",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1808.09465",
  cc-author-affiliation = "University of Cambridge, United Kingdom; SDL Research, Cambridge, United Kingdom",
  cc-class     = "nlp/machine-translation",
  cc-derived-dataset-used = "WMT-13-translation-task-common-crawl-corpus",
}

@InProceedings{cc:Stahlhut:2018:ArgumentText-German,
  title        = "Searching Arguments in German with ArgumenText",
  author       = "Stahlhut, Chris",
  booktitle    = "CEUR Workshop Proceedings",
  volume       = "2167",
  year         = "2018",
  URL          = "http://ceur-ws.org/Vol-2167/short7.pdf",
  cc-author-affiliation = "Ubiquitous Knowledge Processing Lab TU Darmstadt, Germany",
  cc-class     = "nlp/argument-mining",
}

@InProceedings{cc:StergiouRughwaniTsioutsiouliklis:2018:shortcutting-label-propagation,
  title        = "Shortcutting Label Propagation for Distributed Connected Components",
  author       = "Stergiou, Stergios and Rughwani, Dipen and Tsioutsiouliklis, Kostas",
  booktitle    = "Proceedings of the Eleventh ACM International Conference on Web Search and Data Mining",
  pages        = "540--546",
  year         = "2018",
  publisher    = "ACM",
  URL          = "https://dl.acm.org/citation.cfm?id=3159696",
  cc-author-affiliation = "Yahoo Research, Sunnyvale, CA, USA; Google & Yahoo Research, Mountain View, CA, USA",
  cc-class     = "graph processing",
}

@InProceedings{cc:SuominenKellyGoeuriotNeveolEtAl:2018:overview-of-CLEF-ehealth-evaluation-lab,
  title        = "Overview of the {CLEF} ehealth evaluation lab 2018",
  author       = "Suominen, Hanna and Kelly, Liadh and Goeuriot, Lorraine and Névéol, Aurélie and Ramadier, Lionel
                 and Robert, Aude and Kanoulas, Evangelos and Spijker, Rene and Azzopardi, Leif and Li, Dan and others",
  booktitle    = "International Conference of the Cross-Language Evaluation Forum for European Languages",
  pages        = "286--301",
  year         = "2018",
  publisher    = "Springer",
  address      = "Cham",
  doi          = "10.1007/978-3-319-98932-7_26",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-319-98932-7_26",
  pdf          = "https://strathprints.strath.ac.uk/66428/1/Suominen_etal_CLEF2018_Overview_CLEF_ehealth_evaluation_lab_2018.pdf",
  cc-author-affiliation = "University of Turku, Turku, Finland; The Australian National University (ANU), Australia;
                 Commonwealth Scientific and Industrial Research Organisation (CSIRO), University of Canberra, Canberra,
                 Australia; Maynooth University, Maynooth, Ireland; Univ. Grenoble Alpes, CNRS, Grenoble, France;
                 Université Paris-Saclay, Orsay, France; INSERM, France; University of Amsterdam, Amsterdam,
                 Netherlands; Cochrane Netherlands and UMC Utrecht; Julius Center for Health Sciences and Primary Care,
                 Utrecht, Netherlands; University of Strathclyde, Glasgow, UK; Queensland University of Technology,
                 Brisbane, Australia; Vienna University of Technology, Vienna, Austria; Qatar Computing Research
                 Institute, Doha, Qatar",
  cc-class     = "ir/search-engine-evaluation, nlp/corpus-construction",
  cc-dataset-used = "CC-MAIN-2018-09",
  cc-derived-dataset-about = "CLEF-eHealth-2018-IR-task",
  cc-snippet   = "This year we introduced clefehealth2018 corpus. This was crated by compiling Web pages of selected
                 domains acquired from the CommonCrawl¹¹. An initial list of Websites was identified for acquisition.
                 The list was built by submitting the CLEF 2018 base queries to the Microsoft Bing APIs (through the
                 Azure Cognitive Services) repeatedly over a period of few weeks¹², and acquiring the URLs of the
                 retrieved results. The domains of the URLs were then included in the list, except some domains that
                 were excluded for decency reasons (e.g. pornhub.com). The list was further augmented by including a
                 number of known reliable health Websites and other known unreliable health Websites, from lists
                 previously compiled by health institutions and agencies. The corpus was divided into folders, by domain
                 name. Each folder contained a file for each Webpage from the domain available in the CommonCrawl dump.
                 In total, 2,021 domains were requested from the CommonCrawl dump of 2018-09¹³. Of the 2,021 domains
                 in total, 1,903 were successfully acquired. The remaining domains were discarded due to errors,
                 corrupted or incomplete data returned by the CommonCrawl API (a total of ten retries were attempted for
                 each domain before giving up on a domain). Of the 1,903 crawled domains, 84 were not available in the
                 CommonCrawl dump, and for these, a folder in the corpus exists and represents the domain that was
                 requested; however, the folder is empty, meaning that it was not available in the dump. Note that .pdf
                 documents were excluded from the data acquired from CommonCrawl. A complete list of domains and size of
                 the crawl data for each domain is available at
                 https://github.com/CLEFeHealth/CLEFeHealth2018IRtask/blob/master/clef2018collection_listofdomains.txt.
                 The full collection, clefehealth2018¹⁴, it contains 5,535,120 Web pages and its uncompressed size is
                 about 480GB. In addition to the full collection, an alternative corpus named clefehealth2018_B¹⁵ was
                 created by manually removing a number of domains that were not strictly health-related (e.g., news
                 Websites). This subset contains 1,653 domains and its size is about 294GB, uncompressed.",
}

@InProceedings{cc:TafreshiDiab:2018:emotion-detection-and-classification,
  author       = "Tafreshi, Shabnam and Diab, Mona",
  title        = "Emotion Detection and Classification in a Multigenre Corpus with Joint Multi-Task Deep Learning",
  booktitle    = "Proceedings of the 27th International Conference on Computational Linguistics",
  pages        = "2905--2913",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/C18-1246",
  cc-author-affiliation = "George Washington University",
  cc-derived-dataset-used = "GloVe-word-embeddings, fastText-word-embeddings",
  cc-class     = "nlp/emotion-detection, nlp/word-embeddings",
  cc-snippet   = "Our results indicate that common crawl corpus with 2 million words, trained using fastText model has
                 the most word coverage among these genres.",
}

@InProceedings{cc:TempelmeierDemidovaDietze:2018:inferring-missing-categorical-information,
  title        = "Inferring missing categorical information in noisy and sparse web markup",
  author       = "Tempelmeier, Nicolas and Demidova, Elena and Dietze, Stefan",
  booktitle    = "Proceedings of The Web Conference 2018, 27th edition of the former WWW conference",
  year         = "2018",
  doi          = "10.1145/3178876.3186028",
  URL          = "https://arxiv.org/abs/1803.00446",
  cc-author-affiliation = "Leibniz Universität Hannover, Germany",
  cc-derived-dataset-used = "WDC-triples",
  cc-class     = "semantic web, linked data",
}

@Article{cc:ThompsonKhayrallahAnastasopoulosMcCarthyEtAl:2018:freezing-subnetworks,
  title        = "Freezing Subnetworks to Analyze Domain Adaptation in Neural Machine Translation",
  author       = "Thompson, Brian and Khayrallah, Huda and Anastasopoulos, Antonios and McCarthy, Arya and Duh, Kevin
                 and Marvin, Rebecca and McNamee, Paul and Gwinnup, Jeremy and Anderson, Tim and Koehn, Philipp",
  journal      = "arXiv preprint arXiv:1809.05218",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1809.05218",
  cc-author-affiliation = "Johns Hopkins University, USA; University of Notre Dame, USA; Air Force Research
                 Laboratory, USA",
  cc-derived-dataset-used = "WMT-16-translation-task-common-crawl-corpus",
  cc-class     = "nlp/machine-translation",
}

@Article{cc:ThompsonTong:2018:track-persistent-identifier,
  author       = "Thompson, Henry S. and Tong, Jian",
  title        = "Can Common Crawl reliably track persistent identifier ({PID}) use over time?",
  journal      = "arXiv preprint arXiv:1802.01424",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1802.01424",
  cc-class     = "web-science",
  cc-author-affiliation = "University of Edinburgh, United Kingdom",
}

@Misc{cc:TirumalaJagmohanKhabiriLiEtAl:2018:control-policies,
  author       = "Tirumala, Swapna Buccapatnam and Jagmohan, Ashish and Khabiri, Elham and Li, Ta-Hsin and Riemer,
                 Matthew Daniel and Sheinin, Vadim and Vempaty, Aditya",
  title        = "Facilitating mapping of control policies to regulatory documents",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 15/349,766",
  URL          = "https://patents.google.com/patent/US20180137107A1/en",
  cc-author-affiliation = "International Business Machines Corp.",
  cc-class     = "patent, cc-cited-not-used",
  cc-snippet   = "The global corpora [203] can comprise a general internet-based collection of texts derived from
                 various sources (e.g., GUTENBERG®, REUTERS®, COMMON CRAWL®, and/or GOOGLE NEWS®).",
}

@InProceedings{cc:TkachenkoChiaLauw:2018:exploring-corpus-subjectivity,
  author       = "Tkachenko, Maksim and Chia, Chong Cher and Lauw, Hady",
  title        = "Searching for the {X}-Factor: Exploring Corpus Subjectivity for Word Embeddings",
  booktitle    = "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1:
                 Long Papers)",
  volume       = "1",
  pages        = "1212--1221",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/P18-1112",
  cc-class     = "nlp/sentiment-analysis, nlp/word-embeddings, cc-cited-not-used",
  cc-author-affiliation = "Singapore Management University, Singapore",
}

@Misc{cc:ToberNeumann:2018:resource-contents,
  author       = "Tober, Marcus and Neumann, Daniela",
  title        = "Creation and optimization of resource contents",
  year         = "2018",
  publisher    = "Google Patents",
  note         = "US Patent App. 15/284,739",
  URL          = "https://patents.google.com/patent/US20180096067A1/en",
  cc-author-affiliation = "Searchmetrics GmbH",
  cc-class     = "patent, cc-cited-not-used",
  cc-snippet   = "The crawler module [310] may automatically crawl a network and acquire contents from one or more
                 resources in the network, acquire the contents from an open repository of web crawl data such as
                 CommonCrawl.org.",
}

@Article{cc:TosikMalliaGangopadhyay:2018:debunking-fake-news,
  author       = "Tosik, Melanie and Mallia, Antonio and Gangopadhyay, Kedar",
  title        = "Debunking Fake News One Feature at a Time",
  journal      = "arXiv preprint arXiv:1808.02831",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1808.02831",
  cc-author-affiliation = "New York University",
  cc-derived-dataset-used = "?? GloVe-word-embeddings",
  cc-class     = "nlp, text classification",
  cc-snippet   = "Cosine similarity between averaged headline/body Common Crawl vectors",
}

@Article{cc:TranBisk:2018:inducing-grammars,
  title        = "Inducing Grammars with and for Neural Machine Translation",
  author       = "Tran, Ke and Bisk, Yonatan",
  journal      = "arXiv preprint arXiv:1805.10850",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1805.10850",
  cc-author-affiliation = "University of Amsterdam; University of Washington",
  cc-class     = "nlp/machine-translation, nlp/syntax, nlp/grammar-learning, nlp/dependency-grammar",
}

@Article{cc:TrinhLe:2018:commonsense-reasoning,
  title        = "A Simple Method for Commonsense Reasoning",
  author       = "Trinh, Trieu H and Le, Quoc V",
  journal      = "arXiv preprint arXiv:1806.02847",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1806.02847",
  cc-author-affiliation = "Google Brain",
  abstract     = "Commonsense reasoning is a long-standing challenge for deep learning. For example, it is difficult to
                 use neural networks to tackle the Winograd Schema dataset [1]. In this paper, we present a simple
                 method for commonsense reasoning with neural networks, using unsupervised learning. Key to our method
                 is the use of language models, trained on a massive amount of unlabeled data, to score multiple choice
                 questions posed by commonsense reasoning tests. On both Pronoun Disambiguation and Winograd Schema
                 challenges, our models outperform previous state-of-the-art methods by a large margin, without using
                 expensive annotated knowledge bases or hand-engineered features. We train an array of large RNN
                 language models that operate at word or character level on LM-1-Billion, CommonCrawl, SQuAD, Gutenberg
                 Books, and a customized corpus for this task and show that diversity of training data plays an
                 important role in test performance. Further analysis also shows that our system successfully discovers
                 important features of the context that decide the correct answer, indicating a good grasp of
                 commonsense knowledge.",
  cc-snippet   = "In particular, we aggregate documents from the CommonCrawl dataset that has the most overlapping
                 n-grams with the questions. [...] We name this dataset STORIES since most of the constituent documents
                 take the form of a story with long chain of coherent events.",
  cc-derived-dataset-about = "CC-Stories",
  cc-class     = "ai/deep-learning, nlp/language-model",
}

@Article{cc:UstalovPanchenkoBiemannPonzetto:2018:graph-clustering-sense-and-frame-induction,
  author       = "Ustalov, Dmitry and Panchenko, Alexander and Biemann, Chris and Ponzetto, Simone Paolo",
  title        = "Watset: local-global graph clustering with applications in sense and frame induction",
  journal      = "arXiv preprint arXiv:1808.06696",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1808.06696",
  cc-author-affiliation = "University of Mannheim, Germany; University of Hamburg, Germany; Skolkovo Institute of
                 Science and Technology, Moskva, Russia",
  cc-derived-dataset-used = "depcc",
  cc-class     = "nlp/dependency-parsing, nlp/semantics, nlp/synonymy, nlp/frames-semantics, graph-clustering,
                 web-mining",
  cc-snippet   = "For the evaluation purposes, we operate on the intersection of triples from DepCC and FrameNet.",
}

@Article{cc:UstalovPanchenkoBiemannPonzetto:2018:sense-aware-hypernymy-extraction,
  author       = "Ustalov, Dmitry and Panchenko, Alexander and Biemann, Chris and Ponzetto, Simone Paolo",
  title        = "Unsupervised sense-aware hypernymy extraction",
  year         = "2018",
  journal      = "arXiv preprint arXiv:1809.06223",
  URL          = "https://arxiv.org/abs/1809.06223",
  cc-author-affiliation = "University of Mannheim, Germany; University of Hamburg, Germany",
  cc-derived-dataset-cited = "WDC-WebIsADb",
  cc-class     = "nlp/semantics, nlp/hypernymy, web-mining",
}

@Article{cc:UstalovPanchenkoKutuzovBiemannEtAl:2018:semantic-frame-induction,
  title        = "Unsupervised semantic frame induction using triclustering",
  author       = "Ustalov, Dmitry and Panchenko, Alexander and Kutuzov, Andrei and Biemann, Chris and Ponzetto, Simone
                 Paolo",
  journal      = "arXiv preprint arXiv:1805.04715",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1805.04715",
  cc-author-affiliation = "University of Mannheim, Germany; University of Hamburg, Germany; University of Oslo, Norway",
  cc-class     = "nlp/dependency-parsing, nlp/semantics, nlp/synonymy, nlp/frames-semantics, graph-clustering,
                 web-mining",
  cc-derived-dataset-used = "depcc",
  cc-snippet   = "In our evaluation, we use triple frequencies from the DepCC dataset (Panchenko et al., 2018), which is
                 a dependency-parsed version of the Common Crawl corpus, and the standard 300-dimensional word
                 embeddings model trained on the Google News corpus (Mikolov et al., 2013). [...] For the evaluation
                 purposes, we operate on the intersection of triples from DepCC and FrameNet.",
}

@InCollection{cc:Varian:2018:AI-economics-industrial-organization,
  author       = "Varian, Hal",
  title        = "Artificial intelligence, economics, and industrial organization",
  booktitle    = "The Economics of Artificial Intelligence: An Agenda",
  publisher    = "University of Chicago Press",
  year         = "2018",
  URL          = "https://www.nber.org/papers/w24839",
  cc-author-affiliation = "National Bureau of Economic Research, Cambridge, MA, USA",
  cc-class     = "economy",
  abstract     = "Machine learning (ML) and artificial intelligence (AI) have been around for many years. However, in
                 the last 5 years, remarkable progress has been made using multilayered neural networks in diverse areas
                 such as image recognition, speech recognition, and machine translation. AI is a general purpose
                 technology that is likely to impact many industries. In this chapter I consider how machine learning
                 availability might affect the industrial organization of both firms that provide AI services and
                 industries that adopt AI technology. My intent is not to provide an extensive overview of this
                 rapidly-evolving area, but instead to provide a short summary of some of the forces at work and to
                 describe some possible areas for future research.",
}

@InProceedings{cc:VinayanSomanEtAl:2018:AmritaNLP-at-SemEval-2018,
  title        = "{AmritaNLP} at {SemEval-2018} Task 10: Capturing discriminative attributes using convolution neural
                 network over global vector representation",
  author       = "Vinayan, Vivek and Anand, Kumar M and Soman, K P",
  booktitle    = "Proceedings of The 12th International Workshop on Semantic Evaluation",
  pages        = "1003--1007",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/S18-1166",
  cc-author-affiliation = "Amrita School of Engineering, India",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/semantics, nlp/word-embeddings",
}

@Article{cc:VyasNiuCarpuat:2018:semantic-divergences-in-parallel-text,
  author       = "Vyas, Yogarshi and Niu, Xing and Carpuat, Marine",
  title        = "Identifying Semantic Divergences in Parallel Text without Annotations",
  year         = "2018",
  journal      = "arXiv preprint arXiv:1803.11112",
  URL          = "https://arxiv.org/abs/1803.11112",
  cc-author-affiliation = "Department of Computer Science, University of Maryland",
  cc-derived-dataset-used = "{?Ngrams-LMs-2013}",
  cc-class     = "nlp/machine-translation",
}

@InProceedings{cc:WangChoKiela:2018:code-switched-Named-Entity-Recognition,
  author       = "Wang, Changhan and Cho, Kyunghyun and Kiela, Douwe",
  title        = "Code-Switched Named Entity Recognition with Embedding Attention",
  booktitle    = "Proceedings of the Third Workshop on Computational Approaches to Linguistic Code-Switching",
  year         = "2018",
  pages        = "154--158",
  URL          = "http://www.aclweb.org/anthology/W18-3221",
  cc-author-affiliation = "Facebook AI Research; New York University",
  cc-derived-dataset-used = "fastText-word-embeddings",
  cc-class     = "nlp/named-entity-recognition, nlp/word-embeddings",
}

@Article{cc:WangIwaihara:2018:mergeable-Wikipedia-articles,
  title        = "Detection of mergeable Wikipedia articles based on overlapping topics",
  author       = "Wang, Renzhi and Iwaihara, Mizuho",
  year         = "2018",
  URL          = "http://db-event.jpn.org/deim2018/data/papers/157.pdf",
  cc-author-affiliation = "Graduate School of Information, Production and Systems, Waseda University Japan",
  cc-derived-dataset-used = "GloVe-word-embeddings",
  cc-class     = "nlp/word-embeddings, ir/duplicate-detection",
}

@InProceedings{cc:WangXieTanSuEtAl:2018:neural-machine-translation-with-decoding-history,
  author       = "Wang, Mingxuan and Xie, Jun and Tan, Zhixing and Su, Jinsong and Xiong, Deyi and Bian, Chao",
  title        = "Neural Machine Translation with Decoding History Enhanced Attention",
  booktitle    = "Proceedings of the 27th International Conference on Computational Linguistics",
  year         = "2018",
  pages        = "1464--1473",
  URL          = "https://www.aclweb.org/anthology/C18-1124",
  cc-class     = "nlp/machine-translation, cc-cited-not-used",
  cc-author-affiliation = "Mobile Internet Group, Tencent Technology Co., Ltd; Xiamen University, China; Soochow
                 University, China",
}

@Misc{cc:WeiNguyenChanLiouEtAl:2018:Systems-and-methods-for,
  author       = "Wei, Zhuxiaona and Nguyen, Thuan and Chan, Iat and Liou, Kenny M and Wang, Helin and Lu, Houchang",
  title        = "Systems and methods for improved user interface",
  note         = "US Patent App. 15/621,647",
  publisher    = "Google Patents",
  year         = "2018",
  URL          = "https://patents.google.com/patent/US20180011688A1/en",
  cc-author-affiliation = "Baidu USA LLC",
  cc-snippet   = "For English, in embodiments, the language model is a Kneser-Ney smoothed 5-gram model with pruning
                 that is trained using the KenLM toolkit on cleaned text from the Common Crawl Repository. The
                 vocabulary is the most frequently used 400,000 words from 250 million lines of text, which produces a
                 language model with about 850 million n-grams.",
  cc-class     = "patent, ir/user-interface",
}

@InProceedings{cc:WietingGimpel:2018:paraphrastic-sentence-embeddings,
  title        = "{ParaNMT-50M}: Pushing the limits of paraphrastic sentence embeddings with millions of machine
                 translations",
  author       = "Wieting, John and Gimpel, Kevin",
  booktitle    = "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1:
                 Long Papers)",
  volume       = "1",
  pages        = "451--462",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/P18-1042",
  cc-author-affiliation = "Carnegie Mellon University, Pittsburgh, PA, USA; Toyota Technological Institute at Chicago,
                 IL, USA",
  cc-derived-dataset-about = "WMT-16-translation-task-common-crawl-corpus",
  cc-class     = "nlp/machine-translation, nlp/sentence-paraphrase, nlp/sentence-embeddings",
}

@Article{cc:WinataWuMadottoFung:2018:bilingual-character-representation-for-named-entity-recognition,
  author       = "Winata, Genta Indra and Wu, Chien-Sheng and Madotto, Andrea and Fung, Pascale",
  title        = "Bilingual Character Representation for Efficiently Addressing Out-of-Vocabulary Words in
                 Code-Switching Named Entity Recognition",
  year         = "2018",
  journal      = "arXiv preprint arXiv:1805.12061",
  URL          = "https://arxiv.org/abs/1805.12061",
  cc-author-affiliation = "Hong Kong University of Science and Technology, Hong Kong",
  cc-class     = "nlp/named-entity-recognition, nlp/word-embeddings",
  cc-derived-dataset-used = "fastText-word-embeddings",
}

@InProceedings{cc:XieGenthialXieNgEtAl:2018:backtranslation-for-grammar-correction,
  author       = "Xie, Ziang and Genthial, Guillaume and Xie, Stanley and Ng, Andrew and Jurafsky, Dan",
  title        = "Noising and Denoising Natural Language: Diverse Backtranslation for Grammar Correction",
  booktitle    = "Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational
                 Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
  year         = "2018",
  volume       = "1",
  pages        = "619--628",
  URL          = "http://www.aclweb.org/anthology/N18-1057",
  cc-author-affiliation = "Stanford University, USA",
  cc-class     = "nlp/machine-translation, nlp/grammatical-error-correction",
  cc-derived-dataset-used = "Ngrams-LMs-2013",
}

@InProceedings{cc:XiongHeHuWu:2018:multi-channel-encoder-for-neural-machine-translation,
  author       = "Xiong, Hao and He, Zhongjun and Hu, Xiaoguang and Wu, Hua",
  title        = "Multi-channel encoder for neural machine translation",
  year         = "2018",
  booktitle    = "Thirty-Second AAAI Conference on Artificial Intelligence",
  URL          = "https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/viewPaper/16788",
  cc-author-affiliation = "Baidu Inc., China",
  cc-class     = "nlp/machine-translation",
  cc-derived-dataset-used = "WMT-16-translation-task-common-crawl-corpus",
}

@InProceedings{cc:XuBennettHoogeveenLauEtAl:2018:answer-selection-in-Stack-Overflow,
  author       = "Xu, Steven and Bennett, Andrew and Hoogeveen, Doris and Lau, Jey Han and Baldwin, Timothy",
  title        = "Preferred Answer Selection in Stack Overflow: Better Text Representations... and Metadata, Metadata,
                 Metadata",
  booktitle    = "Proceedings of the 2018 EMNLP Workshop W-NUT: The 4th Workshop on Noisy User-generated Text",
  year         = "2018",
  pages        = "137--147",
  URL          = "https://www.aclweb.org/anthology/W18-6119",
  cc-author-affiliation = "University of Melbourne, Australia",
  cc-class     = "information retrieval, nlp/question-answering, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:YangGoncalves:2018:improving-personalized-consumer-health-search,
  title        = "Improving personalized consumer health search: notebook for {eHealth} at {CLEF} 2018",
  author       = "Yang, Hua and Gonçalves, Teresa",
  booktitle    = "CEUR Workshop Proceedings: Working Notes of CLEF 2018: Conference and Labs of the Evaluation Forum",
  year         = "2018",
  URL          = "http://ceur-ws.org/Vol-2125/paper_195.pdf",
  cc-author-affiliation = "University of Évora, Portugal; ZhongYuan University of Technology, Zhengzhou, China",
  cc-class     = "ir/multilingual-information-retrieval, ir/biomedical-information-extraction, ir/query-expansion,
                 ir/learning-to-rank, nlp/word-embeddings",
  cc-derived-dataset-used = "CLEF-eHealth-2018-IR-task",
}

@Article{cc:YannakisFafaliosTzitzikas:2018:query-reordering-SparQL,
  author       = "Yannakis, Thanos and Fafalios, Pavlos and Tzitzikas, Yannis",
  title        = "Heuristics-based Query Reordering for Federated Queries in {SPARQL} 1.1 and {SPARQL}-{LD}",
  URL          = "http://ceur-ws.org/Vol-2110/paper7.pdf",
  year         = "2018",
  cc-author-affiliation = "University of Crete, Greece; Leibniz University of Hannover, Germany",
  cc-derived-dataset-used = "WebDataCommons",
  cc-class     = "semantic web, linked data, SparQL",
}

@Article{cc:YuliantiChenScholerCroftEtAl:2018:ranking-documents,
  author       = "Yulianti, Evi and Chen, Ruey-Cheng and Scholer, Falk and Croft, W Bruce and Sanderson, Mark",
  title        = "Ranking Documents by Answer-Passage Quality",
  URL          = "http://marksanderson.org/publications/my_papers/SIGIR2018a.pdf",
  year         = "2018",
  cc-class     = "information retrieval, nlp/question-answering, cc-cited-not-used",
  cc-author-affiliation = "RMIT University, Melbourne, Australia; SEEK Ltd., Melbourne, Australia",
}

@InProceedings{cc:ZayaniKsentiniTmarGargouri:2018:Miracl-at-clef-2018,
  title        = "{MIRACL} at {CLEF} 2018: Consumer health search task",
  author       = "Zayani, Siwar and Ksentini, Nesrine and Tmar, Mohamed and Gargouri, Faiez",
  booktitle    = "CEUR Workshop Proceedings: Working Notes of CLEF 2018: Conference and Labs of the Evaluation Forum",
  year         = "2018",
  URL          = "http://ceur-ws.org/Vol-2125/paper_141.pdf",
  cc-author-affiliation = "University of Sfax, Tunisia",
  cc-derived-dataset-used = "CLEF-eHealth-2018-IR-task",
  cc-class     = "ir/multilingual-information-retrieval, ir/biomedical-information-extraction, ir/query-expansion",
}

@Article{cc:ZeghidourXuLiptchinskyUsunierEtAl:2018:fully-convolutional-speech-recognition,
  author       = "Zeghidour, Neil and Xu, Qiantong and Liptchinsky, Vitaliy and Usunier, Nicolas and Synnaeve, Gabriel
                 and Collobert, Ronan",
  title        = "Fully convolutional speech recognition",
  year         = "2018",
  journal      = "arXiv preprint arXiv:1812.06864",
  URL          = "https://arxiv.org/abs/1812.06864",
  cc-author-affiliation = "Facebook A.I. Research, Paris, France; Facebook A.I. Research, New York & Menlo Park, USA;
                 CoML, ENS/CNRS/EHESS/INRIA/PSL Research University, Paris, France",
  cc-derived-dataset-used = "??",
  cc-snippet   = "(12k training hours AM, common crawl LM)",
  cc-class     = "nlp/speech-recognition",
}

@Article{cc:ZellersBiskSchwartzChoi:2018:Swag-adversarial-dataset,
  title        = "{SWAG}: {A} large-scale adversarial dataset for grounded commonsense inference",
  author       = "Zellers, Rowan and Bisk, Yonatan and Schwartz, Roy and Choi, Yejin",
  journal      = "arXiv preprint arXiv:1808.05326",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1808.05326",
  cc-author-affiliation = "University of Washington, USA",
  cc-class     = "ai/reasoning, nlp/text-generation, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:ZhanLevy:2018:speaker-choice-Mandarin-Chinese,
  title        = "Comparing Theories of Speaker Choice Using a Model of Classifier Production in Mandarin Chinese",
  author       = "Zhan, Meilin and Levy, Roger",
  booktitle    = "Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational
                 Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
  volume       = "1",
  pages        = "1997--2005",
  year         = "2018",
  URL          = "http://www.aclweb.org/anthology/N18-1181",
  cc-author-affiliation = "Massachusetts Institute of Technology, USA",
  cc-derived-dataset-cited = "WMT-13-translation-task-common-crawl-corpus",
  cc-class     = "nlp/syntax, nlp/corpus-linguistics, nlp/paraphrasing",
}

@Misc{cc:ZhangEtAl:2018:GraphIt,
  author       = "Zhang, Yunming and Yang, Mengjiao and Baghdadi, Riyadh and Kamil, Shoaib and Shun, Julian and
                 Amarasinghe, Saman P.",
  title        = "{GraphIt} - {A} High-Performance {DSL} for Graph Analytics",
  journal      = "CoRR",
  volume       = "abs/1805.00923",
  year         = "2018",
  URL          = "http://arxiv.org/abs/1805.00923",
  eprint       = "1805.00923",
  archivePrefix = "arXiv",
  cc-derived-dataset-used = "WDC-hyperlinkgraph",
  cc-author-affiliation = "MIT CSAIL; Adobe Research",
  cc-class     = "graph-processing",
}

@InProceedings{cc:ZhangHouSuSu:2018:neural-network-for-answer-selection,
  title        = "Two-Step Multi-factor Attention Neural Network for Answer Selection",
  author       = "Zhang, Pengqing and Hou, Yuexian and Su, Zhan and Su, Yi",
  booktitle    = "Pacific Rim International Conference on Artificial Intelligence",
  pages        = "658--670",
  year         = "2018",
  publisher    = "Springer",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-319-97304-3_50",
  cc-author-affiliation = "Tianjin University, China",
  cc-class     = "nlp/answer-selection, ai/neural-networks, nlp/word-embeddings",
  cc-derived-dataset-used = "GloVe-word-embeddings",
}

@InProceedings{cc:ZhangTanTaoZhengEtAl:2018:identifying-stable-links,
  author       = "Zhang, Ji and Tan, Leonard and Tao, Xiaohui and Zheng, Xiaoyao and Luo, Yonglong and Lin, Jerry
                 Chun-Wei",
  title        = "{SLIND}: Identifying Stable Links in Online Social Networks",
  booktitle    = "International Conference on Database Systems for Advanced Applications",
  publisher    = "Springer",
  year         = "2018",
  pages        = "813--816",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-319-91458-9_54",
  cc-author-affiliation = "University of Southern Queensland, Australia; Anhui Normal University, Wuhu, China; Harbin
                 Institute of Technology Shenzhen Graduate School, Shenzhen, China",
  cc-class     = "web-science/hyperlinkgraph, web-science/social-networks",
  cc-dataset-used = "CC-MAIN-2016-36",
  cc-snippet   = "The dataset chosen for this study, as well as for the demo, was crawled from Facebook and obtained
                 from the repositories of the Common Crawl (August 2016).",
}

@InProceedings{cc:ZhangTaoTanLinEtAl:2018:link-stability-detection,
  title        = "On Link Stability Detection for Online Social Networks",
  author       = "Zhang, Ji and Tao, Xiaohui and Tan, Leonard and Lin, Jerry Chun-Wei and Li, Hongzhou and Chang,
                 Liang",
  booktitle    = "International Conference on Database and Expert Systems Applications",
  pages        = "320--335",
  year         = "2018",
  publisher    = "Springer",
  URL          = "https://link.springer.com/chapter/10.1007/978-3-319-98809-2_20",
  cc-author-affiliation = "University of Southern Queensland, Toowoomba, Australia; Harbin Institute of Technology
                 Shenzhen Graduate School, Shenzhen, China; Guilin University of Electronic Technology, Guilin, China",
  cc-class     = "graph-processing, social networks",
  keywords     = "link stability, graph theory, online social networks",
  cc-snippet   = "Since the social network we obtain from the repositories of common crawl contains missing links and
                 partial information, stochastic estimations are …",
}

@Article{cc:ZhangXiongSu:2018:neural-machine-translation-deep-attention,
  title        = "Neural Machine Translation with Deep Attention",
  author       = "Zhang, Biao and Xiong, Deyi and Su, Jinsong",
  journal      = "IEEE Transactions on Pattern Analysis and Machine Intelligence",
  year         = "2018",
  publisher    = "IEEE",
  URL          = "https://ieeexplore.ieee.org/abstract/document/8493282",
  cc-author-affiliation = "Xiamen University, China; Soochow University, China",
  cc-class     = "nlp/machine-translation",
}

@Article{cc:ZhangXiongSuLinEtAl:2018:simplifying-neural-machine-translation,
  title        = "Simplifying Neural Machine Translation with Addition-Subtraction Twin-Gated Recurrent Networks",
  author       = "Zhang, Biao and Xiong, Deyi and Su, Jinsong and Lin, Qian and Zhang, Huiji",
  journal      = "arXiv preprint arXiv:1810.12546",
  year         = "2018",
  URL          = "https://arxiv.org/abs/1810.12546",
  cc-author-affiliation = "Xiamen University, China; Soochow University, China; Xiamen Meiya Pico Information Co., Ltd.,
                 Xiamen, China",
  cc-class     = "nlp/machine-translation, cc-cited-not-used",
}