-
Notifications
You must be signed in to change notification settings - Fork 2
/
cc2018.bib
2938 lines (2735 loc) · 178 KB
/
cc2018.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@Article{cc:AbdouKulmizevRavishankarAbzianidzeEtAl:2018:semantic-tagging,
  author = {Abdou, Mostafa and Kulmizev, Artur and Ravishankar, Vinit and Abzianidze, Lasha and Bos, Johan},
  title = {What can we learn from Semantic Tagging?},
  journal = {arXiv preprint arXiv:1808.09716},
  year = {2018},
  URL = {https://arxiv.org/abs/1808.09716},
  cc-author-affiliation = {University of Groningen, The Netherlands; University of Copenhagen, Denmark;
    University of Oslo, Norway;},
  cc-class = {nlp/semantics, nlp/word-embeddings, nlp/semantic-tagging},
  cc-derived-dataset-used = {GloVe-word-embeddings},
}
@InProceedings{cc:AgrawalAnPapagelis:2018:emotion-enriched-word-representations,
  author = {Agrawal, Ameeta and An, Aijun and Papagelis, Manos},
  title = {Learning emotion-enriched word representations},
  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
  pages = {950--961},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  URL = {https://www.aclweb.org/anthology/C18-1081},
  abstract = {Most word representation learning methods are based on the distributional hypothesis in
    linguistics, according to which words that are used and occur in the same contexts tend to possess
    similar meanings. As a consequence, emotionally dissimilar words, such as “happy” and “sad” occurring
    in similar contexts would purport more similar meaning than emotionally similar words, such as “happy”
    and “joy”. This complication leads to rather undesirable outcome in predictive tasks that relate to
    affect (emotional state), such as emotion classification and emotion similarity. In order to address
    this limitation, we propose a novel method of obtaining emotion-enriched word representations, which
    projects emotionally similar words into neighboring spaces and emotionally dissimilar ones far apart.
    The proposed approach leverages distant supervision to automatically obtain a large training dataset
    of text documents and two recurrent neural network architectures for learning the emotion-enriched
    representations. Through extensive evaluation on two tasks, including emotion classification and
    emotion similarity, we demonstrate that the proposed representations outperform several competitive
    general-purpose and affective word representations.},
  cc-author-affiliation = {York University, Toronto, Canada},
  cc-class = {nlp/word-embeddings, nlp/emotion-detection, nlp/sentiment-analysis},
  cc-derived-dataset-used = {GloVe-word-embeddings},
}
@InProceedings{cc:AlohalyTakabiBlanco:2018:learning-ABAC-policies,
  author = {Alohaly, Manar and Takabi, Hassan and Blanco, Eduardo},
  title = {A Deep Learning Approach for Extracting Attributes of {ABAC} Policies},
  booktitle = {Proceedings of the 23rd {ACM} Symposium on Access Control Models and Technologies},
  series = {SACMAT '18},
  year = {2018},
  ISBN = {978-1-4503-5666-4},
  location = {Indianapolis, Indiana, USA},
  pages = {137--148},
  numpages = {12},
  URL = {http://doi.acm.org/10.1145/3205977.3205984},
  doi = {10.1145/3205977.3205984},
  acmid = {3205984},
  publisher = {ACM},
  address = {New York, NY, USA},
  keywords = {access control policy, attribute-based access control, deep learning, natural language
    processing, policy authoring, relation extraction},
  cc-author-affiliation = {University of North Texas, USA},
  cc-class = {nlp/machine-translation, computer-security/access-restrictions},
}
@Article{cc:AlshomaryVolskeLichtWachsmuthEtAl:2018:Wikipedia-text-reuse,
  author = {Alshomary, Milad and Völske, Michael and Licht, Tristan and Wachsmuth, Henning and Stein, Benno
    and Hagen, Matthias and Potthast, Martin},
  title = {Wikipedia text reuse: within and without},
  journal = {arXiv preprint arXiv:1812.09221},
  year = {2018},
  URL = {https://link.springer.com/chapter/10.1007/978-3-030-15712-8_49},
  pdf = {https://webis.de/downloads/publications/papers/stein_2019c.pdf},
  abstract = {We study text reuse related to Wikipedia at scale by compiling the first corpus of text reuse
    cases within Wikipedia as well as without (i.e., reuse of Wikipedia text in a sample of the Common
    Crawl). To discover reuse beyond verbatim copy and paste, we employ state-of-the-art text reuse
    detection technology, scaling it for the first time to process the entire Wikipedia as part of a
    distributed retrieval pipeline. We further report on a pilot analysis of the 100 million reuse cases
    inside, and the 1.6 million reuse cases outside Wikipedia that we discovered. Text reuse inside
    Wikipedia gives rise to new tasks such as article template induction, fixing quality flaws, or
    complementing Wikipedia’s ontology. Text reuse outside Wikipedia yields a tangible metric for the
    emerging field of quantifying Wikipedia’s influence on the web. To foster future research into these
    tasks, and for reproducibility’s sake, the Wikipedia text reuse corpus and the retrieval pipeline are
    made freely available.},
  cc-author-affiliation = {Paderborn University, Germany; Bauhaus-Universität Weimar, Germany;
    Martin-Luther-Universität Halle-Wittenberg, Germany; Leipzig University, Germany},
  cc-class = {web-mining, ir/duplicate-detection},
  cc-snippet = {To foster research into Wikipedia text reuse, we compiled the first Wikipedia text reuse
    corpus, obtained from comparing the entire Wikipedia to itself as well as to a 10\%-sample of the
    Common Crawl.},
}
@Article{cc:AmatuniHeBergelson:2018:vector-space-representations,
  author = {Amatuni, Andrei and He, Estelle and Bergelson, Elika},
  title = {Preserved Structure Across Vector Space Representations},
  journal = {arXiv preprint arXiv:1802.00840},
  year = {2018},
  URL = {https://arxiv.org/abs/1802.00840},
  cc-author-affiliation = {Duke University},
  cc-class = {nlp/semantics, nlp/word-embeddings},
  cc-derived-dataset-used = {GloVe-word-embeddings},
}
@Misc{cc:AmmarMcSherrySalihogluJoglekar:2018:subgraph-queries,
  author = {Ammar, Khaled and McSherry, Frank and Salihoglu, Semih and Joglekar, Manas},
  title = {Distributed Evaluation of Subgraph Queries Using Worst-case Optimal Low-Memory Dataflows},
  year = {2018},
  eprint = {1802.03760},
  archiveprefix = {arXiv},
  URL = {https://arxiv.org/pdf/1802.03760.pdf},
  cc-author-affiliation = {University of Waterloo, Canada; ETH Zürich, Switzerland; Google, Inc.},
  cc-class = {graph-processing},
  cc-derived-dataset-used = {WDC-hyperlinkgraph},
}
@Article{cc:AmmarMcSherrySalihogluJoglekar:2018:subgraph-queries-2,
  author = {Ammar, Khaled and McSherry, Frank and Salihoglu, Semih and Joglekar, Manas},
  title = {Distributed evaluation of subgraph queries using worst-case optimal low-memory dataflows},
  journal = {Proceedings of the VLDB Endowment},
  volume = {11},
  number = {6},
  pages = {691--704},
  year = {2018},
  publisher = {VLDB Endowment},
  URL = {https://dl.acm.org/citation.cfm?id=3199520},
  cc-author-affiliation = {University of Waterloo, Canada; ETH Zürich, Switzerland; Google, Inc.},
  cc-class = {graph-processing},
  cc-derived-dataset-used = {WDC-hyperlinkgraph},
}
@Article{cc:AnilPereyraPassosOrmandiEtAl:2018:large-scale-distributed-neural,
  author = {Anil, Rohan and Pereyra, Gabriel and Passos, Alexandre and Ormandi, Robert and Dahl, George E.
    and Hinton, Geoffrey E.},
  title = {Large scale distributed neural network training through online distillation},
  journal = {arXiv preprint arXiv:1804.03235},
  year = {2018},
  eprint = {1804.03235},
  archiveprefix = {arXiv},
  URL = {https://arxiv.org/abs/1804.03235},
  cc-author-affiliation = {Google; Google Brain; Google DeepMind},
  cc-class = {nlp/neural-networks},
  cc-dataset-used = {CC-MAIN-2017-26},
}
@InProceedings{cc:ArshadMirheidariLauingerCrispoEtAl:2018:large-scale-analysis-of-style,
  author = {Arshad, Sajjad and Mirheidari, Seyed Ali and Lauinger, Tobias and Crispo, Bruno and Kirda,
    Engin and Robertson, William},
  title = {Large-Scale Analysis of Style Injection by Relative Path Overwrite},
  booktitle = {Proceedings of the 2018 World Wide Web Conference},
  series = {WWW '18},
  year = {2018},
  ISBN = {978-1-4503-5639-8},
  location = {Lyon, France},
  pages = {237--246},
  numpages = {10},
  URL = {https://doi.org/10.1145/3178876.3186090},
  doi = {10.1145/3178876.3186090},
  publisher = {International World Wide Web Conferences Steering Committee},
  keywords = {relative path overwrite, scriptless attack, style injection},
  cc-author-affiliation = {Northeastern University, Boston, MA, USA; University of Trento, Trento, Italy},
  cc-class = {web-science, computer-security/web-application-security},
  cc-dataset-used = {CC-MAIN-2016-36},
  cc-snippet = {We extract pages using relative-path stylesheets from the Common Crawl dataset [9],
    automatically test if style directives can be injected using RPO, and determine whether they are
    interpreted by the browser. [...] For finding the initial seed set of candidate pages with
    relative-path stylesheets, we leverage the Common Crawl from August 2016, which contains more than 1.6
    billion pages. By using an existing dataset, we can quickly identify candidate pages without creating
    any web crawl traffic. We use a Java HTML parser to filter any pages containing only inline CSS or
    stylesheets referenced by absolute URLs, leaving us with over 203 million pages on nearly 6 million
    sites.},
}
@InProceedings{cc:ArtetxeLabakaAgirre:2018:bilingual-word-embedding-mappings,
  author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
  title = {Generalizing and improving bilingual word embedding mappings with a multi-step framework of
    linear transformations},
  booktitle = {Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence (AAAI-18)},
  year = {2018},
  URL = {https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16935/16781},
  cc-author-affiliation = {University of the Basque Country, Spain},
  cc-class = {nlp/semantics, nlp/word-embeddings, nlp/bilingual-word-embeddings},
}
@Article{cc:ArtetxeLabakaAgirre:2018:robust-self-learning-method,
  author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
  title = {A robust self-learning method for fully unsupervised cross-lingual mappings of word embeddings},
  journal = {arXiv preprint arXiv:1805.06297},
  year = {2018},
  URL = {https://arxiv.org/abs/1805.06297},
  cc-author-affiliation = {University of the Basque Country, Spain},
  cc-class = {nlp/semantics, nlp/word-embeddings, nlp/bilingual-word-embeddings},
  cc-derived-dataset-used = {WMT-16-translation-task-common-crawl-corpus},
}
@Article{cc:ArtetxeLabakaLopez-GazpioAgirre:2018:linguistic-information-in-word-embeddings,
  author = {Artetxe, Mikel and Labaka, Gorka and Lopez-Gazpio, Iñigo and Agirre, Eneko},
  title = {Uncovering divergent linguistic information in word embeddings with lessons for intrinsic and
    extrinsic evaluation},
  journal = {arXiv preprint arXiv:1809.02094},
  year = {2018},
  URL = {https://arxiv.org/abs/1809.02094},
  cc-author-affiliation = {University of the Basque Country, Spain},
  cc-class = {nlp/semantics, nlp/word-embeddings},
  cc-derived-dataset-used = {GloVe-word-embeddings, fastText-word-embeddings},
}
@Article{cc:ArtetxeSchwenk:2018:parallel-corpus-mining,
  author = {Artetxe, Mikel and Schwenk, Holger},
  title = {Margin-based parallel corpus mining with multilingual sentence embeddings},
  journal = {arXiv preprint arXiv:1811.01136},
  year = {2018},
  URL = {https://arxiv.org/abs/1811.01136},
  cc-author-affiliation = {University of the Basque Country, Spain; Facebook AI Research},
  cc-class = {cc-cited-not-used, nlp/word-embeddings, nlp/sentence-embeddings, nlp/parallel-corpus},
}
@Article{cc:BaharBrixNey:2018:two-dimensional-sequence-to-sequence-model,
  author = {Bahar, Parnia and Brix, Christopher and Ney, Hermann},
  title = {Towards two-dimensional sequence to sequence model in neural machine translation},
  journal = {arXiv preprint arXiv:1810.03975},
  year = {2018},
  URL = {https://arxiv.org/abs/1810.03975},
  cc-author-affiliation = {RWTH Aachen University, Germany},
  cc-class = {nlp/machine-translation},
  cc-derived-dataset-used = {WMT-16-translation-task-common-crawl-corpus},
}
@Book{cc:Balog:2018:entity-oriented-search,
  author = {Balog, Krisztian},
  title = {Entity-oriented search},
  year = {2018},
  publisher = {Springer},
  URL = {https://link.springer.com/content/pdf/10.1007/978-3-319-93935-3.pdf},
  cc-author-affiliation = {University of Stavanger, Norway},
  cc-class = {information-retrieval, nlp/named-entity-recognition, linked data},
  cc-dataset-used = {CC-MAIN-2017-22},
  cc-snippet = {Common Crawl – Common Crawl⁵ is a nonprofit organization that regularly crawls the Web and
    makes the data publicly available. The datasets are hosted on Amazon S3 as part of the Amazon Public
    Datasets program.⁶ As of May 2017, the crawl contains 2.96 billion web pages and over 250 TB of
    uncompressed content (in WARC format). The Web Data Commons project⁷ extracts structured data from the
    Common Crawl and makes those publicly available (e.g., the Hyperlink Graph Dataset and the Web Table
    Corpus).},
}
@Article{cc:BarbosaCrescenziDongMerialdoEtAl:2018:big-data-integration,
  author = {Barbosa, Luciano and Crescenzi, Valter and Dong, Xin Luna and Merialdo, Paolo and Piai,
    Federico and Qiu, Disheng and Shen, Yanyan and Srivastava, Divesh},
  title = {Big Data Integration for Product Specifications},
  journal = {IEEE Data Engineering Bulletin},
  volume = {41},
  number = {2},
  pages = {71--81},
  year = {2018},
  URL = {http://sites.computer.org/debull/A18june/A18JUN-CD.pdf#page=73},
  cc-author-affiliation = {Universidade Federal de Pernambuco, Brazil; Roma Tre University, Italy; Amazon;
    Wanderio; Shanghai Jiao Tong University; AT&T Labs – Research},
  cc-class = {ir/information-extraction, ir/data-integration},
  cc-snippet = {About 68\% of the sources discovered by our approach were not present in Common Crawl. Only
    20\% of our sources contained fewer pages than the same sources in Common Crawl, and a very small
    fraction of the pages in these sources were product pages: on a sample set of 12 websites where Common
    Crawl presented more pages than in our dataset, we evaluated that only 0.8\% of the pages were product
    pages.},
}
@InProceedings{cc:BarbosaCrescenziDongMerialdoEtAl:2018:product-dataset,
  author = {Barbosa, Luciano and Crescenzi, Valter and Dong, Xin Luna and Merialdo, Paolo and Piai,
    Federico and Qiu, Disheng and Shen, Yanyan and Srivastava, Divesh},
  title = {Lessons Learned and Research Agenda for Big Data Integration of Product Specifications
    (Discussion Paper)},
  booktitle = {Proceedings of the 26th Italian Symposium on Advanced Database Systems (SEBD 2018)},
  series = {CEUR Workshop Proceedings},
  volume = {2161},
  year = {2018},
  URL = {http://ceur-ws.org/Vol-2161/paper29.pdf},
  cc-author-affiliation = {Universidade Federal de Pernambuco, Brazil; Roma Tre University, Italy; Amazon;
    Wanderio; Shanghai Jiao Tong University; AT&T Labs – Research},
  cc-class = {ir/information-extraction, ir/data-integration},
  cc-snippet = {Building a Benchmark Product Dataset – We compared the contents of our dataset with pages
    in Common Crawl, an open repository of web crawl data. About 68\% of the sources discovered by our
    approach were not present in Common Crawl. Only 20\% of our sources contained fewer pages than the
    same sources in Common Crawl, and a very small fraction of the pages in these sources were product
    pages: on a sample set of 12 websites where Common Crawl presented more pages than in our dataset, we
    evaluated that only 0.8\% of the pages were product pages.},
}
@TechReport{cc:BatikasClaussenPeukert:2018:online-piracy,
  author = {Batikas, Michail and Claussen, Jörg and Peukert, Christian},
  title = {Follow The Money: Online Piracy and Self-Regulation in the Advertising Industry},
  type = {CESifo Working Paper},
  number = {6852},
  institution = {CESifo},
  year = {2018},
  pdf = {http://www.cesifo-group.de/DocDL/cesifo1_wp6852.pdf},
  cc-author-affiliation = {LMU Munich, Germany; UCP – Católica Lisbon School of Business and Economics,
    Lisboa, Portugal},
  cc-class = {web-science},
  cc-snippet = {We obtain archived versions of the HTML source code of all URLs for each domain in our
    gross sample from Common Crawl, a project that has crawled billions of webpages periodically since
    summer 2013.},
}
@InProceedings{cc:BattleDuanMirandaMukushevaEtAl:2018:automated-extraction-of-visualizations,
  author = {Battle, Leilani and Duan, Peitong and Miranda, Zachery and Mukusheva, Dana and Chang, Remco
    and Stonebraker, Michael},
  title = {Beagle: Automated Extraction and Interpretation of Visualizations from the Web},
  booktitle = {Proceedings of the 2018 CHI Conference on Human Factors in Computing Systems},
  pages = {594},
  year = {2018},
  organization = {ACM},
  URL = {https://dl.acm.org/citation.cfm?id=3174168},
  abstract = {``How common is interactive visualization on the web?'' ``What is the most popular
    visualization design?'' ``How prevalent are pie charts really?'' These questions intimate the role of
    interactive visualization in the real (online) world. In this paper, we present our approach (and
    findings) to answering these questions. First, we introduce Beagle, which mines the web for SVG-based
    visualizations and automatically classifies them by type (i.e., bar, pie, etc.). With Beagle, we
    extract over 41,000 visualizations across five different tools and repositories, and classify them
    with 85\% accuracy, across 24 visualization types. Given this visualization collection, we study usage
    across tools. We find that most visualizations fall under four types: bar charts, line charts, scatter
    charts, and geographic maps. Though controversial, pie charts are relatively rare for the
    visualization tools that were studied. Our findings also suggest that the total visualization types
    supported by a given tool could factor into its ease of use. However this effect appears to be
    mitigated by providing a variety of diverse expert visualization examples to users.},
  cc-author-affiliation = {University of Washington, Seattle, WA, USA; Massachusetts Institute of Technology,
    Cambridge, MA, USA; Tufts University, Medford, MA, USA},
  cc-class = {web-science, web-crawling},
  cc-snippet = {As found with other web crawling projects, such as the Common Crawl¹, our web crawls
    represent a specific point in time for the websites [...]},
}
@Article{cc:BellomariniFayzrakhmanovGottlobKravchenkoEtAl:2018:data-science-Vadalog,
  author = {Bellomarini, Luigi and Fayzrakhmanov, Ruslan R and Gottlob, Georg and Kravchenko, Andrey and
    Laurenza, Eleonora and Nenov, Yavor and Reissfelder, Stephane and Sallinger, Emanuel and Sherkhonov,
    Evgeny and Wu, Lianlong},
  title = {Data Science with Vadalog: Bridging Machine Learning and Reasoning},
  journal = {arXiv preprint arXiv:1807.08712},
  year = {2018},
  URL = {https://arxiv.org/abs/1807.08712},
  cc-author-affiliation = {University of Oxford, United Kingdom; Banca d’Italia, Italy; TU Wien, Austria},
  cc-class = {ai/semantic-reasoning, ai/machine-learning},
  cc-snippet = {Enterprises increasingly depend on intelligent information systems that operationalise
    corporate knowledge as a unified source across system boundaries. [...] To maintain their competitive
    edge, companies need to incorporate multiple heterogeneous sources of information, including [...]
    external streams of unstructured data (e.g., news and social media feeds, and Common Crawl¹), [...]},
}
@InProceedings{cc:BentivogliCettoloFedericoChristian:2018:machine-translation-human-evaluation,
  author = {Bentivogli, Luisa and Cettolo, Mauro and Federico, Marcello and Federmann, Christian},
  title = {Machine Translation Human Evaluation: an investigation of evaluation based on Post-Editing and
    its relation with Direct Assessment},
  booktitle = {International Workshop on Spoken Language Translation},
  year = {2018},
  URL = {https://workshop2018.iwslt.org/downloads/Proceedings_IWSLT_2018.pdf#page=77},
  cc-author-affiliation = {FBK, Trento, Italy; Amazon AI, East Palo Alto, CA, USA; Microsoft Cloud+AI,
    Redmond, WA, USA},
  cc-class = {nlp/machine-translation},
  cc-derived-dataset-used = {WMT-16-translation-task-common-crawl-corpus},
}
@InProceedings{cc:BevendorffSteinHagenPotthast:2018:Elastic-ChatNoir,
  author = {Bevendorff, Janek and Stein, Benno and Hagen, Matthias and Potthast, Martin},
  title = {Elastic ChatNoir: Search Engine for the ClueWeb and the Common Crawl},
  booktitle = {Advances in Information Retrieval - 40th European Conference on {IR} Research, {ECIR} 2018,
    Grenoble, France, March 26-29, 2018, Proceedings},
  pages = {820--824},
  year = {2018},
  URL = {https://doi.org/10.1007/978-3-319-76941-7_83},
  doi = {10.1007/978-3-319-76941-7_83},
  cc-author-affiliation = {Bauhaus-Universität Weimar, Germany; Leipzig University, Germany},
  cc-class = {information-retrieval/search-engine},
  cc-dataset-used = {CC-MAIN-2015-11},
}
@Article{cc:BoldiMarinoSantiniVigna:2018:BUbiNG-Massive-crawling-for,
  author = {Boldi, Paolo and Marino, Andrea and Santini, Massimo and Vigna, Sebastiano},
  title = {{BU}bi{NG}: Massive crawling for the masses},
  journal = {ACM Transactions on the Web (TWEB)},
  volume = {12},
  number = {2},
  year = {2018},
  publisher = {ACM},
  URL = {https://dl.acm.org/citation.cfm?id=3160017},
  cc-author-affiliation = {Università degli Studi di Milano, Italy},
  cc-class = {web-crawling, web-science/hyperlinkgraph},
  cc-derived-dataset-cited = {WDC-hyperlinkgraph},
}
@Article{cc:BrauneFraserHaddow:2018:improving-translation,
  author = {Braune, Fabienne and Fraser, Alex and Haddow, Barry},
  title = {{D1.2}: Report on Improving Translation with Monolingual Data},
  year = {2018},
  URL = {http://www.himl.eu/files/D1.2_Using_Non_Parallel.pdf},
  cc-author-affiliation = {University of Edinburgh},
  cc-class = {nlp/machine-translation},
}
@InProceedings{cc:BrychcinHercigSteinbergerKonkol:2018:UWB-at-SemEval-2018,
  author = {Brychcín, Tomáš and Hercig, Tomáš and Steinberger, Josef and Konkol, Michal},
  title = {{UWB} at SemEval-2018 Task 10: Capturing Discriminative Attributes from Word Distributions},
  booktitle = {Proceedings of The 12th International Workshop on Semantic Evaluation},
  pages = {935--939},
  year = {2018},
  URL = {http://www.aclweb.org/anthology/S18-1153},
  cc-author-affiliation = {University of West Bohemia, Czech Republic},
  cc-class = {nlp/semantics, nlp/word-embeddings},
  cc-derived-dataset-used = {GloVe-word-embeddings},
}
@Article{cc:CafarellaHalevyLeeMadhavanEtAl:2018:ten-years-of-webtables,
  author = {Cafarella, Michael and Halevy, Alon and Lee, Hongrae and Madhavan, Jayant and Yu, Cong and
    Wang, Daisy Zhe and Wu, Eugene},
  title = {Ten years of webtables},
  journal = {Proceedings of the VLDB Endowment},
  volume = {11},
  number = {12},
  pages = {2140--2149},
  year = {2018},
  publisher = {VLDB Endowment},
  URL = {https://dl.acm.org/citation.cfm?id=3275614},
  pdf = {http://web.eecs.umich.edu/~michjc/papers/p2140-cafarella.pdf},
  cc-author-affiliation = {Google Inc.; University of Michigan, USA; Megagon Labs; University of Florida,
    USA; Columbia University, USA},
  cc-class = {semantic web, web tables, web-mining},
  cc-derived-dataset-cited = {WDCWebTables, DresdenWebTableCorpus},
  cc-snippet = {Several researchers produced web tables from the public Common Crawl [1, 24, 15], thereby
    making them available to a broad audience outside the large Web companies.},
}
@Article{cc:CasalnuovoSagaeDevanbu:2018:difference-between-natural-and-programming-language-corpora,
  author = {Casalnuovo, Casey and Sagae, Kenji and Devanbu, Prem},
  title = {Studying the Difference Between Natural and Programming Language Corpora},
  journal = {Empirical Software Engineering},
  pages = {1--46},
  publisher = {Springer},
  year = {2018},
  URL = {https://link.springer.com/article/10.1007/s10664-018-9669-7},
  pdf = {https://arxiv.org/pdf/1806.02437.pdf},
  cc-author-affiliation = {University of California, Davis, USA},
  cc-class = {nlp/corpus-construction, nlp/text-corpora, programming-languages, nlp/syntax},
  cc-derived-dataset-used = {conll-2017-shared-task},
  cc-snippet = {The German and Spanish corpora were selected from a sample of files from the unlabeled
    datasets from the ConLL 2017 Shared Task (Ginter et al, 2017), which consist of web text obtained from
    CommonCrawl.⁸ Like the 1 billion token English corpus, we selected a random subsample to make these
    corpora size comparable with our other corpora. In this sample, we excluded files from the Wikipedia
    translations, as we observed Wikipedia formatting mixed in with some of the files.},
}
@Article{cc:ChenZhangWangZuoEtAl:2018:image-captioning,
  author = {Chen, Xinghan and Zhang, Mingxing and Wang, Zheng and Zuo, Lin and Li, Bo and Yang, Yang},
  title = {Leveraging Unpaired Out-of-Domain Data for Image Captioning},
  journal = {Pattern Recognition Letters},
  year = {2018},
  publisher = {Elsevier},
  URL = {https://www.sciencedirect.com/science/article/abs/pii/S0167865518309358},
  cc-author-affiliation = {University of Electronic Science and Technology of China (UESTC), Chengdu, PR
    China},
  cc-class = {nlp/text-generation, ai/image-classification, nlp/image-captioning, ai/deep-learning},
}
@InProceedings{cc:ChiHuangChenWuEtAl:2018:Zewen-at-SemEval-2018,
  author = {Chi, Zewen and Huang, Heyan and Chen, Jiangui and Wu, Hao and Wei, Ran},
  title = {Zewen at SemEval-2018 Task 1: An Ensemble Model for Affect Prediction in Tweets},
  booktitle = {Proceedings of The 12th International Workshop on Semantic Evaluation},
  pages = {313--318},
  year = {2018},
  URL = {http://www.aclweb.org/anthology/S18-1046},
  cc-author-affiliation = {Beijing Institute of Technology, China},
  cc-class = {nlp, nlp/sentiment-analysis, nlp/emotion-detection, nlp/word-embeddings},
  cc-derived-dataset-used = {GloVe-word-embeddings},
}
@InProceedings{cc:Chinea-RiosPerisCasacuberta:2018:automatic-metrics,
  author = {Chinea-Rios, Mara and Peris, Alvaro and Casacuberta, Francisco},
  title = {Are Automatic Metrics Robust and Reliable in Specific Machine Translation Tasks?},
  booktitle = {Proceedings of the 21st Annual Conference of the European Association for Machine
    Translation},
  pages = {89--98},
  year = {2018},
  publisher = {European Association for Machine Translation},
  URL = {http://rua.ua.es/dspace/handle/10045/76022},
  pdf = {http://eamt2018.dlsi.ua.es/proceedings-eamt2018.pdf},
  cc-author-affiliation = {Universitat d'Alacant, Spain},
  cc-class = {nlp/machine-translation},
  cc-snippet = {In our setup, we trained a PB-SMT and a NMT system on the same data, from a general corpus
    extracted from websites (Common Crawl).},
}
@Article{cc:ChollampattNg:2018:neural-network-grammatical-error-correction,
  author = {Chollampatt, Shamil and Ng, Hwee Tou},
  title = {A multilayer convolutional encoder-decoder neural network for grammatical error correction},
  journal = {arXiv preprint arXiv:1801.08831},
  year = {2018},
  URL = {https://arxiv.org/abs/1801.08831},
  cc-author-affiliation = {NUS Graduate School for Integrative Sciences and Engineering; Department of
    Computer Science, National University of Singapore},
  cc-class = {nlp/grammatical-error-correction, nlp/word-embeddings, nlp/language-model},
  cc-snippet = {We also make use of the larger English corpora from Wikipedia (1.78B words) for
    pre-training the word embeddings, and a subset of the Common Crawl corpus (94B words) for training the
    language model for rescoring.},
}
@InProceedings{cc:ClarksonGentileGruhlRistoskiEtAl:2018:user-centric-ontology-population,
  author = {Clarkson, Kenneth and Gentile, Anna Lisa and Gruhl, Daniel and Ristoski, Petar and Terdiman,
    Joseph and Welch, Steve},
  title = {User-Centric Ontology Population},
  booktitle = {European Semantic Web Conference},
  pages = {112--127},
  year = {2018},
  organization = {Springer},
  URL = {https://link.springer.com/chapter/10.1007/978-3-319-93417-4_8},
  doi = {10.1007/978-3-319-93417-4_8},
  cc-author-affiliation = {IBM Research Almaden, San Jose, USA},
  cc-class = {semantic web, cc-cited-not-used, ontology extraction},
}
@InProceedings{cc:CohenWiddows:2018:order-neural-word-embeddings,
  author = {Cohen, Trevor and Widdows, Dominic},
  title = {Bringing Order to Neural Word Embeddings with Embeddings Augmented by Random Permutations
    ({EARP})},
  booktitle = {Proceedings of the 22nd Conference on Computational Natural Language Learning},
  pages = {465--475},
  year = {2018},
  URL = {http://www.aclweb.org/anthology/K18-1045},
  cc-author-affiliation = {University of Washington, Seattle, USA; Grab, Inc., Seattle, WA, USA},
  cc-class = {nlp/word-embeddings, cc-cited-not-used},
}
@Article{cc:ConneauKiela:2018:SentEval-evaluation-toolkit,
title = "{SentEval}: An evaluation toolkit for universal sentence representations",
author = "Conneau, Alexis and Kiela, Douwe",
journal = "arXiv preprint arXiv:1803.05449",
year = "2018",
URL = "https://arxiv.org/abs/1803.05449",
cc-author-affiliation = "Facebook Artificial Intelligence Research",
cc-derived-dataset-used = "GloVe-word-embeddings, fastText-word-embeddings",
cc-class = "nlp/word-embeddings, nlp/sentence-embeddings, nlp/evaluation",
}
@Article{cc:ConneauLampleRinottWilliamsEtAl:2018:cross-lingual-sentence-representations,
title = "{XNLI}: Evaluating Cross-lingual Sentence Representations",
author = "Conneau, Alexis and Lample, Guillaume and Rinott, Ruty and Williams, Adina and Bowman, Samuel R. and
Schwenk, Holger and Stoyanov, Veselin",
journal = "arXiv preprint arXiv:1809.05053",
year = "2018",
URL = "https://arxiv.org/abs/1809.05053",
cc-author-affiliation = "Facebook AI Research, USA; New York University, USA",
cc-derived-dataset-used = "fastText-word-embeddings",
cc-class = "nlp/word-embeddings, nlp/sentence-embeddings",
}
@InProceedings{cc:ConoverHayesBlackburnSkomorochEtAl:2018:Pangloss-fast-entity-linking,
  author = {Conover, Michael and Hayes, Matthew and Blackburn, Scott and Skomoroch, Pete and Shah, Sam},
  title = {Pangloss: Fast Entity Linking in Noisy Text Environments},
  booktitle = {Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  year = {2018},
  pages = {168--176},
  organization = {ACM},
  URL = {https://dl.acm.org/citation.cfm?id=3219899},
  cc-author-affiliation = {Workday, Inc., San Francisco, CA, USA},
  cc-class = {ir/information-extraction},
  cc-snippet = {The Common Crawl datasets represents a sample of web crawl data containing raw web page data, metadata
    and text extracts overseen by a 501(c)(3) nonprofit of the same name. Facilitating ease of access for
    industrial practitioners, the dataset is hosted for free on Amazon Web Services’ Public Data Set
    repository in addition to academic hosts the world over. As part of a batch Hadoop job run on a monthly
    basis we filter the Common Crawl data (∼70TB) down to records which contain at least one hyperlink
    that points to English Wikipedia. This corpus has proven particularly valuable as a source of signal
    for associating tokens with knowledge base entries in the context of domain-specific, messy natural
    language.},
}
@InProceedings{cc:CorreaZanderSilva:2018:open-data-portals,
  author = {Correa, Andreiwid Sheffer and Zander, Pär-Ola and da Silva, Flavio Soares Correa},
  title = {Investigating open data portals automatically: a methodology and some illustrations},
  booktitle = {Proceedings of the 19th Annual International Conference on Digital Government Research: Governance in
    the Data Age},
  year = {2018},
  pages = {82},
  organization = {ACM},
  URL = {https://dl.acm.org/citation.cfm?id=3209292},
  cc-author-affiliation = {University of Sao Paulo, Sao Paulo, Brazil; Aalborg University, Aalborg, Denmark},
  cc-class = {open data, information retrieval},
}
@Article{cc:CulpepperDiazSmucker:2018:workshop-IR-Lorne,
title = "Research Frontiers in Information Retrieval: Report from the Third Strategic Workshop on Information
Retrieval in Lorne ({SWIRL} 2018)",
author = "Culpepper, J. Shane and Diaz, Fernando and Smucker, Mark D.",
journal = "ACM SIGIR Forum",
volume = "52",
number = "1",
pages = "34--90",
year = "2018",
publisher = "ACM",
URL = "http://doi.acm.org/10.1145/3274784.3274788",
doi = "10.1145/3274784.3274788",
pdf = "http://www.sigir.org/wp-content/uploads/2018/07/p034.pdf",
cc-author-affiliation = "ACM",
cc-class = "cc-cited-not-used, information-retrieval",
}
@InProceedings{cc:Czech:2018:geotag-web-sized-corpus,
  author = {Czech, Alexander},
  title = {An Approach to Geotag a Web Sized Corpus of Documents with Addresses in Randstad, Netherlands},
  booktitle = {Adjunct Proceedings of the 14th International Conference on Location Based Services},
  year = {2018},
  pages = {184--188},
  organization = {ETH Zurich},
  URL = {https://doi.org/10.3929/ethz-b-000225615},
  cc-author-affiliation = {TU Wien, Austria},
  cc-class = {ir/geotagging},
  cc-snippet = {Common Crawl is a non-profit organization that provides raw web crawling data on a monthly basis.
    Their archives contain over 3.16 billion URLs with over 260 TiB of uncompressed content.},
}
@Article{cc:DemirelCinbisIkizler-Cinbis:2018:zero-shot-object-detection,
title = "Zero-Shot Object Detection by Hybrid Region Embedding",
author = "Demirel, Berkan and Cinbis, Ramazan Gokberk and Ikizler-Cinbis, Nazli",
journal = "arXiv preprint arXiv:1805.06157",
year = "2018",
URL = "https://arxiv.org/abs/1805.06157",
cc-author-affiliation = "HAVELSAN Inc. Ankara, Turkey; Middle East Technical University Ankara, Turkey; Hacettepe
University Ankara, Turkey",
cc-derived-dataset-used = "GloVe-word-embeddings",
cc-class = "ai/computer-vision, ai/pattern-recognition, nlp/word-embeddings",
}
@Article{cc:DenisovVuFont:2018:unsupervised-domain-adaptation-speech-recognition,
  author = {Denisov, Pavel and Vu, Ngoc Thang and Font, Marc Ferras},
  title = {Unsupervised Domain Adaptation by Adversarial Learning for Robust Speech Recognition},
  journal = {arXiv preprint arXiv:1807.11284},
  year = {2018},
  URL = {https://arxiv.org/abs/1807.11284},
  cc-author-affiliation = {University of Stuttgart, Germany},
  cc-class = {nlp, speech-recognition},
  cc-snippet = {..., 197 millions words of Italian Deduplicated CommonCrawl Text are used to build Italian language
    model.},
}
@Article{cc:DevHassanPhillips:2018:word-embedding-alignement,
  author = {Dev, Sunipa and Hassan, Safia and Phillips, Jeff M},
  title = {Absolute Orientation for Word Embedding Alignment},
  journal = {arXiv preprint arXiv:1806.01330},
  year = {2018},
  URL = {https://arxiv.org/abs/1806.01330},
  cc-author-affiliation = {University of Utah},
  cc-class = {nlp/semantics, nlp/word-embeddings},
  cc-derived-dataset-used = {GloVe-word-embeddings},
}
@Article{cc:EdunovOttAuliGrangier:2018:understanding-back-translation,
  author = {Edunov, Sergey and Ott, Myle and Auli, Michael and Grangier, David},
  title = {Understanding Back-Translation at Scale},
  journal = {arXiv preprint arXiv:1808.09381},
  year = {2018},
  URL = {https://arxiv.org/abs/1808.09381},
  cc-author-affiliation = {Facebook AI Research, USA; Google Brain, Mountain View, CA, USA},
  cc-class = {nlp/machine-translation},
}
@InProceedings{cc:EfremovaEndresVidasMelnik:2018:geo-tagging-address-extraction,
  author = {Efremova, Julia and Endres, Ian and Vidas, Isaac and Melnik, Ofer},
  title = {A Geo-Tagging Framework for Address Extraction from Web Pages},
  booktitle = {Industrial Conference on Data Mining},
  year = {2018},
  pages = {288--295},
  publisher = {Springer},
  URL = {https://link.springer.com/chapter/10.1007/978-3-319-95786-9_22},
  cc-author-affiliation = {HERE Technologies, Amsterdam, The Netherlands},
  cc-class = {semantic-web/microformats},
  cc-snippet = {Common Crawl is a public corpus, mostly stored on Amazon Web Services³. A subset of the CommonCrawl
    dataset has schema information in the microdata format},
}
@Article{cc:El-ZantJaffres-RunserFrahmShepelyansky:2018:painters-Wikipedia-networks,
  author = {El Zant, Samer and Jaffrès-Runser, Katia and Frahm, Klaus M. and Shepelyansky, Dima L.},
  title = {Interactions and influence of world painters from the reduced Google matrix of Wikipedia networks},
  journal = {IEEE Access},
  year = {2018},
  publisher = {IEEE},
  URL = {https://ieeexplore.ieee.org/abstract/document/8449078},
  abstract = {This paper concentrates on extracting painting art history knowledge from the network structure of
    Wikipedia. Therefore, we construct theoretical networks of webpages representing the hyper-linked
    structure of articles of seven Wikipedia language editions. These seven networks are analyzed to
    extract the most influential painters in each edition using Google matrix theory. Importance of
    webpages of over 3000 painters is measured using the PageRank algorithm. The most influential painters
    are enlisted and their ties are studied with the reduced Google matrix analysis. The reduced Google
    matrix is a powerful method that captures both direct and hidden interactions between a subset of
    selected nodes taking into account the indirect links between these nodes via the remaining part of
    large global network. This method originates from the scattering theory of nuclear and mesoscopic
    physics and field of quantum chaos. In this paper, we show that it is possible to extract from the
    components of the reduced Google matrix meaningful information on the ties between these painters. For
    instance, our analysis groups together painters that belong to the same painting movement and shows
    meaningful ties between painters of different movements. We also determine the influence of painters on
    world countries using link sensitivity between Wikipedia articles of painters and countries. The
    reduced Google matrix approach allows to obtain a balanced view of various cultural opinions of
    Wikipedia language editions. The world countries with the largest number of top painters of selected
    seven Wikipedia editions are found to be Italy, France, and Russia. We argue that this approach gives
    meaningful information about art and that it could be a part of extensive network analysis on human
    knowledge and cultures.},
  cc-author-affiliation = {Université de Toulouse, France},
  cc-class = {web-science/hyperlinkgraph, graph-processing, cc-cited-not-used},
}
@Article{cc:Espana-BonetStillerHenning:2018:corpora-for-machine-translation,
title = "{M1.2} -- Corpora for the Machine Translation Engines",
author = "Espana-Bonet, Cristina and Stiller, Juliane and Henning, Sophie",
year = "2018",
URL = "https://www.clubs-project.eu/assets/publications/project/M1.2_MTcorpora_v4.0.pdf",
cc-author-affiliation = "Universität des Saarlandes, Germany; Humboldt-Universität zu Berlin, Germany",
cc-class = "nlp/machine-translation, nlp/corpora",
cc-derived-dataset-cited = "WMT-13-translation-task-common-crawl-corpus",
}
@Article{cc:EstevesReddyChawlaLehmann:2018:obfuscate-fake-news,
title = "Belittling the Source: Trustworthiness Indicators to Obfuscate Fake News on the Web",
author = "Esteves, Diego and Reddy, Aniketh Janardhan and Chawla, Piyush and Lehmann, Jens",
journal = "arXiv preprint arXiv:1809.00494",
year = "2018",
URL = "https://arxiv.org/abs/1809.00494",
cc-author-affiliation = "University of Bonn, Germany; University of Ohio, USA; Carnegie Mellon University, Pittsburgh,
USA",
cc-class = "nlp, text classification, content credibility, information retrieval",
cc-snippet = "PageRankCC: PageRank information computed through the CommonCrawl Corpus",
}
@InProceedings{cc:FaralliLefeverPaolo-Ponzetto:2018:MIsA-multilingual-IsA-extraction,
title = "{MIsA}: Multilingual {IsA} Extraction from Corpora",
author = "Faralli, Stefano and Lefever, Els and Ponzetto, Simone Paolo",
booktitle = "The Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
pages = "2040--2044",
year = "2018",
publisher = "European Language Resources Association (ELRA)",
URL = "https://biblio.ugent.be/publication/8562721",
cc-author-affiliation = "University of Mannheim, Germany; Ghent University, Belgium",
cc-class = "nlp/semantics, data-mining, hypernymy",
cc-derived-dataset-cited = "WDC-WebIsADb",
}
@InProceedings{cc:FayzrakhmanovSallingerSpencerFurcheEtAl:2018:browserless-web-data-extraction,
  author = {Fayzrakhmanov, Ruslan R. and Sallinger, Emanuel and Spencer, Ben and Furche, Tim and Gottlob, Georg},
  title = {Browserless web data extraction: challenges and opportunities},
  booktitle = {Proceedings of the 2018 World Wide Web Conference on World Wide Web},
  year = {2018},
  pages = {1095--1104},
  organization = {International World Wide Web Conferences Steering Committee},
  URL = {https://dl.acm.org/citation.cfm?id=3186008},
  cc-author-affiliation = {University of Oxford, Oxford, United Kingdom},
  cc-class = {information retrieval, web-crawling, web-scraping, web-mining},
  cc-snippet = {The random sites were chosen by randomly sampling URLs from the Common Crawl [10] search index
    dataset, which includes around 3 billion web pages.},
}
@Article{cc:Funel:2018:analysis-web-graph,
author = "Funel, Agostino",
title = "Analysis of the Web Graph Aggregated by Host and Pay-Level Domain",
journal = "arXiv preprint arXiv:1802.05435",
year = "2018",
eprint = "1802.05435",
archiveprefix = "arXiv",
URL = "https://arxiv.org/abs/1802.05435",
cc-dataset-used = "hyperlinkgraph/cc-main-2017-aug-sep-oct/hostgraph,
hyperlinkgraph/cc-main-2017-aug-sep-oct/domaingraph",
cc-class = "web-science/hyperlinkgraph",
cc-author-affiliation = "ENEA, Italy",
}
@Article{cc:GarciaGomez-Perez:2018:word-representations-scientific-publications,
title = "Not just about size -- {A} Study on the Role of Distributed Word Representations in the Analysis of
Scientific Publications",
author = "Garcia, Andres and Gomez-Perez, Jose Manuel",
journal = "arXiv preprint arXiv:1804.01772",
year = "2018",
cc-derived-dataset-used = "fastText-word-embeddings, GloVe-word-embeddings",
URL = "https://arxiv.org/abs/1804.01772",
cc-author-affiliation = "expertsystem.com, Madrid, Spain",
cc-class = "nlp/word-embeddings",
}
@InProceedings{cc:GarciaGomez-Perez:2018:word-representations-scientific-publications-2,
title = "Not just about size -- {A} Study on the Role of Distributed Word Representations in the Analysis of
Scientific Publications",
author = "Garcia, Andres and Gomez-Perez, Jose Manuel",
booktitle = "Proceedings of the First Workshop on Deep Learning for Knowledge Graphs and Semantic Technologies
(DL4KGS) co-located with the 15th Extended Semantic Web Conference (ESWC 2018) Heraklion, Crete, Greece,
June 4, 2018",
year = "2018",
cc-derived-dataset-used = "fastText-word-embeddings, GloVe-word-embeddings",
pdf = "http://ceur-ws.org/Vol-2106/paper3.pdf",
cc-same-as = "cc:GarciaGomez-Perez:2018:word-representations-scientific-publications",
cc-author-affiliation = "expertsystem.com, Madrid, Spain",
cc-class = "nlp/word-embeddings",
}
@Article{cc:GargSchiebingerJurafskyZou:2018:word-embeddings-gender-and-ethnic-stereotypes,
title = "Word embeddings quantify 100 years of gender and ethnic stereotypes",
author = "Garg, Nikhil and Schiebinger, Londa and Jurafsky, Dan and Zou, James",
journal = "Proceedings of the National Academy of Sciences",
volume = "115",
number = "16",
pages = "E3635--E3644",
year = "2018",
publisher = "National Acad Sciences",
URL = "https://www.pnas.org/content/115/16/E3635.short",
doi = "10.1073/pnas.1720347115",
cc-author-affiliation = "Stanford University, USA; Chan Zuckerberg Biohub, San Francisco, CA, USA",
cc-derived-dataset-used = "GloVe-word-embeddings",
cc-class = "nlp/semantics, nlp/word-embeddings, ai/ethics-of-machine-learning, ai/machine-learning",
}
@Article{cc:Ghasemi-GolSzekely:2018:TabVec-table-vectors-web-tables,
title = "{TabVec}: Table Vectors for Classification of Web Tables",
author = "Ghasemi-Gol, Majid and Szekely, Pedro",
journal = "arXiv preprint arXiv:1802.06290",
year = "2018",
URL = "https://arxiv.org/abs/1802.06290",
cc-author-affiliation = "University of Southern California; Information Science Institute",
cc-class = "web-tables, information-extraction",
cc-dataset-used = "CC-MAIN-2015-32",
cc-snippet = "[...] we use a random sample of July 2015 Common Crawl (WCC) as a generic domain to compare our system
with the state of the art systems",
cc-derived-dataset-cited = "WDCWebTables, DresdenWebTableCorpus",
}
@InProceedings{cc:GlassGliozzo:2018:discovering-implicit-knowledge,
  author = {Glass, Michael and Gliozzo, Alfio},
  title = {Discovering Implicit Knowledge with Unary Relations},
  booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1:
    Long Papers)},
  volume = {1},
  year = {2018},
  pages = {1585--1594},
  URL = {http://www.aclweb.org/anthology/P18-1147},
  cc-author-affiliation = {IBM Research AI},
  cc-class = {ai/knowledge-base},
}
@InProceedings{cc:GlassGliozzo:2018:web-scale-knowledge-base-population-dataset,
title = "A Dataset for Web-Scale Knowledge Base Population",
author = "Glass, Michael and Gliozzo, Alfio",
booktitle = "European Semantic Web Conference",
pages = "256--271",
year = "2018",
publisher = "Springer",
URL = "https://link.springer.com/chapter/10.1007/978-3-319-93417-4_17",
pdf = "https://2018.eswc-conferences.org/wp-content/uploads/2018/02/ESWC2018_paper_173.pdf",
cc-author-affiliation = "Knowledge Induction and Reasoning Group, IBM Research AI, New York, USA",
cc-class = "ai/semantic-reasoning, ai/knowledge-base",
cc-snippet = "We introduce and release CC-DBP, a web-scale dataset for training and benchmarking KBP systems. The
dataset is based on Common Crawl as the corpus and DBpedia as the target knowledge base [...]",
cc-derived-dataset-about = "CC-DBP",
cc-dataset-used = "CC-MAIN-2017-26",
}
@InProceedings{cc:GlassGliozzoHassanzadehMihindukulasooriyaEtAl:2018:implicit-relations-from-text,
title = "Inducing implicit relations from text using distantly supervised deep nets",
author = "Glass, Michael and Gliozzo, Alfio and Hassanzadeh, Oktie and Mihindukulasooriya, Nandana and
Rossiello, Gaetano",
booktitle = "International Semantic Web Conference",
pages = "38--55",
year = "2018",
publisher = "Springer",
URL = "https://link.springer.com/chapter/10.1007/978-3-030-00671-6_3",
cc-author-affiliation = "IBM Research AI, New York, USA; Universidad Politécnica de Madrid, Spain; University of Bari,
Italy",
cc-class = "ai/knowledge-base, ai/deep-learning, semantic web",
cc-derived-dataset-used = "CC-DBP",
}
@Article{cc:GoelMatsuyamaMadaioCassell:2018:detecting-indirectness,
  author = {Goel, Pranav and Matsuyama, Yoichi and Madaio, Michael and Cassell, Justine},
  title = {“{I} think it might help if we multiply, and not add”: Detecting Indirectness in Conversation},
  year = {2018},
  URL = {http://articulab.hcii.cs.cmu.edu/wordpress/wp-content/uploads/2018/04/Goel-IWSDS2018_camera-ready_13Mar.pdf},
  cc-author-affiliation = {Indian Institute of Technology (BHU), India; Carnegie Mellon University},
  cc-derived-dataset-used = {GloVe-word-embeddings},
  cc-class = {nlp/dialogue-systems, nlp/word-embeddings},
}
@InProceedings{cc:GolemKaranSnajder:2018:aggressive-text-detection,
title = "Combining Shallow and Deep Learning for Aggressive Text Detection",
author = "Golem, Viktor and Karan, Mladen and Šnajder, Jan",
booktitle = "Proceedings of the First Workshop on Trolling, Aggression and Cyberbullying (TRAC-2018)",
pages = "188--198",
year = "2018",
URL = "https://www.aclweb.org/anthology/W18-4422",
pdf = "https://www.aclweb.org/anthology/W18-4422",
cc-author-affiliation = "University of Zagreb, Croatia",
cc-derived-dataset-used = "GloVe-word-embeddings",
cc-class = "nlp/text-classification, nlp/word-embeddings",
}
@Article{cc:GoodingTerrasBerube:2018:legal-deposit-web-archives,
  author = {Gooding, Paul and Terras, Melissa and Berube, Linda},
  title = {Legal Deposit Web Archives and the Digital Humanities: {A} Universe of Lost Opportunity?},
  year = {2018},
  URL = {http://eprints.gla.ac.uk/168229/},
  cc-author-affiliation = {University of East Anglia, United Kingdom; University of Edinburgh, United Kingdom},
  cc-class = {web-archiving/legal-aspects},
  cc-snippet = {Restricted deposit library access requires researchers to look elsewhere for portable web data: by
    undertaking their own web crawls, or by utilising datasets from Common Crawl (http://commoncrawl.org/)
    and the Internet Archive (https://archive.org). Both organisations provide vital services to
    researchers, and both innovate in areas that would traditionally fall under the deposit libraries’
    purview. They support their mission by exploring the boundaries of copyright, including exceptions for
    non-commercial text and data mining (Intellectual Property Office, 2014). This contrast between
    risk-enabled independent organisations and deposit libraries, described by interviewees as risk averse,
    challenges library/DH collaboration models such as BL Labs (http://labs.bl.uk) and Library of Congress
    Labs (https://labs.loc.gov).},
}
@InProceedings{cc:GraesserRus:2018:pooling-word-vector-representations,
title = "Pooling Word Vector Representations Across Models",
author = "Banjade, Rajendra and Maharjan, Nabin and Gautam, Dipesh and Adrasik, Frank and Graesser, Arthur C.
and Rus, Vasile",
booktitle = "Computational Linguistics and Intelligent Text Processing: 18th International Conference, CICLing
2017, Budapest, Hungary, April 17--23, 2017, Revised Selected Papers",
series = "Lecture Notes in Computer Science",
volume = "10761",
pages = "17--29",
year = "2018",
publisher = "Springer",
URL = "https://www.springer.com/de/book/9783319771151",
cc-author-affiliation = "University of Memphis, USA",
cc-derived-dataset-used = "GloVe-word-embeddings",
cc-class = "nlp/word-embeddings, nlp/semantics",
}
@Article{cc:GrandBlankPereiraFedorenko:2018:semantic-projection,
  author = {Grand, Gabriel and Blank, Idan Asher and Pereira, Francisco and Fedorenko, Evelina},
  title = {Semantic projection: recovering human knowledge of multiple, distinct object features from word
    embeddings},
  journal = {arXiv preprint arXiv:1802.01241},
  year = {2018},
  URL = {https://arxiv.org/abs/1802.01241},
  cc-author-affiliation = {Harvard University; Massachusetts Institute of Technology; Siemens Healthineers;
    Massachusetts General Hospital; Harvard Medical School},
  cc-class = {nlp/semantics, nlp/word-embeddings},
  cc-derived-dataset-used = {GloVe-word-embeddings},
}
@InProceedings{cc:GraveBojanowskiGuptaJoulinEtAl:2018:learning-word-vectors,
title = "Learning word vectors for 157 languages",
author = "Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas",
booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC}
2018)",
year = "2018",
address = "Miyazaki, Japan",
publisher = "European Language Resources Association (ELRA)",
URL = "https://www.aclweb.org/anthology/L18-1550",
pdf = "https://www.aclweb.org/anthology/L18-1550.pdf",
abstract = "Distributed word representations, or word vectors, have recently been applied to many tasks in natural
language processing, leading to state-of-the-art performance. A key ingredient to the successful
application of these representations is to train them on very large corpora, and use these pre-trained
models in downstream tasks. In this paper, we describe how we trained such high quality word
representations for 157 languages. We used two sources of data to train these models: the free online
encyclopedia Wikipedia and data from the common crawl project. We also introduce three new word
analogy datasets to evaluate these word vectors, for French, Hindi and Polish. Finally, we evaluate
our pre-trained word vectors on 10 languages for which evaluation datasets exists, showing very strong
performance compared to previous models.",
cc-author-affiliation = "Facebook AI Research; École polytechnique fédérale de Lausanne EPFL, Switzerland",
cc-class = "nlp/word-embeddings",
cc-dataset-used = "CC-MAIN-2017-22 (WET)",
cc-derived-dataset-about = "fastText-word-embeddings",
cc-snippet = "The common crawl is a non profit organization which crawls the web and makes the resulting data
publicly available. This large scale corpus was previously used to estimate n-gram language models
(Buck et al., 2014) or to learn English word vectors (Pennington et al., 2014). To the best of our
knowledge, it was not used yet to learn word vectors for a large set of languages. The data is
distributed either as raw HTML pages, or as WET files which contain the extracted text data, converted
to UTF-8. We decided to use the extracted text data, as it is much smaller in size, and easier to
process (no need to remove HTML). We downloaded the May 2017 crawl, corresponding to roughly 24
terabytes of raw text data.",
}
@Article{cc:GrundkiewiczJunczys-Dowmunt:2018:grammatical-error-correction-mt,
  author = {Grundkiewicz, Roman and Junczys-Dowmunt, Marcin},
  title = {Near Human-Level Performance in Grammatical Error Correction with Hybrid Machine Translation},
  journal = {arXiv preprint arXiv:1804.05945},
  year = {2018},
  URL = {https://arxiv.org/abs/1804.05945},
  cc-author-affiliation = {University of Edinburgh, United Kingdom; Microsoft},
  cc-derived-dataset-used = {Ngrams-LMs-2013},
  cc-class = {nlp/machine-translation, nlp/grammatical-error-correction},
}
@InProceedings{cc:HazemMorin:2018:meta-embeddings-for-bilingual-lexicon-extraction,
  author = {Hazem, Amir and Morin, Emmanuel},
  title = {Leveraging Meta-Embeddings for Bilingual Lexicon Extraction from Specialized Comparable Corpora},
  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
  year = {2018},
  pages = {937--949},
  URL = {http://www.aclweb.org/anthology/C18-1080},
  cc-author-affiliation = {Université de Nantes, France},
  cc-class = {nlp/machine-translation, nlp/lexikon, nlp/dictionary-creation},
}
@Article{cc:HedderichKlakow:2018:low-resource-training-neural-network,
  author = {Hedderich, Michael A. and Klakow, Dietrich},
  title = {Training a Neural Network in a Low-Resource Setting on Automatically Annotated Noisy Data},
  journal = {arXiv preprint arXiv:1807.00745},
  year = {2018},
  URL = {https://arxiv.org/abs/1807.00745},
  cc-author-affiliation = {Saarland University, Saarbrücken, Germany},
  cc-derived-dataset-used = {GloVe-word-embeddings},
  cc-class = {nlp/word-embeddings, ai/neural-networks},
}
@InProceedings{cc:HettingerDallmannZeheNieblerEtAl:2018:ClaiRE-at-SemEval-2018,
title = "{ClaiRE} at {SemEval-2018 Task 7}: Classification of Relations using Embeddings",
author = "Hettinger, Lena and Dallmann, Alexander and Zehe, Albin and Niebler, Thomas and Hotho, Andreas",
booktitle = "Proceedings of The 12th International Workshop on Semantic Evaluation",
pages = "836--841",
year = "2018",
URL = "http://www.aclweb.org/anthology/S18-1134",
cc-author-affiliation = "University of Würzburg, Germany",
cc-derived-dataset-used = "GloVe-word-embeddings",
cc-class = "nlp/semantics, nlp/word-embeddings",
}
@Article{cc:HettingerDallmannZeheNieblerEtAl:2018:ClaiRE-at-SemEval-2018-extended-version,
title = "Clai{RE} at SemEval-2018 Task 7-Extended Version",
author = "Hettinger, Lena and Dallmann, Alexander and Zehe, Albin and Niebler, Thomas and Hotho, Andreas",
journal = "arXiv preprint arXiv:1804.05825",
year = "2018",
URL = "https://arxiv.org/abs/1804.05825",
cc-author-affiliation = "University of Würzburg, Germany",
cc-class = "nlp/semantics, nlp/word-embeddings",