-
Notifications
You must be signed in to change notification settings - Fork 97
/
book.bib
executable file
·1569 lines (1428 loc) · 58.2 KB
/
book.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@book{Silge2017,
  author    = {Silge, Julia and Robinson, David},
  title     = {Text Mining with {R}: A Tidy Approach},
  publisher = {O'Reilly Media, Inc.},
  address   = {Sebastopol},
  year      = {2017},
  isbn      = {1491981652, 9781491981658},
}
@book{xie2015,
  author    = {Yihui Xie},
  title     = {Dynamic Documents with {R} and knitr},
  edition   = {2nd},
  publisher = {Chapman and Hall/CRC},
  address   = {Boca Raton, Florida},
  year      = {2015},
  note      = {ISBN 978-1498716963},
  url       = {http://yihui.name/knitr/},
}
@book{Feldman2007,
  author    = {Feldman, R. and Sanger, J.},
  title     = {The text mining handbook},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  year      = {2007},
  isbn      = {9780511546914},
}
@article{Schofield16,
  author  = {Schofield, Alexandra and Mimno, David},
  title   = {Comparing Apples to Apple: The Effects of Stemmers on Topic Models},
  journal = {Transactions of the Association for Computational Linguistics},
  volume  = {4},
  pages   = {287--300},
  year    = {2016},
  doi     = {10.1162/tacl_a_00099},
  url     = {https://doi.org/10.1162/tacl_a_00099},
}
@article{Porter80,
  author  = {Porter, Martin F.},
  title   = {An algorithm for suffix stripping},
  journal = {Program},
  volume  = {14},
  number  = {3},
  pages   = {130--137},
  year    = {1980},
  doi     = {10.1108/eb046814},
  url     = {https://doi.org/10.1108/eb046814},
}
@article{Lovins68,
  author  = {Lovins, Julie B.},
  title   = {Development of a stemming algorithm},
  journal = {Mechanical Translation and Computational Linguistics},
  volume  = {11},
  pages   = {22--31},
  year    = {1968},
}
@article{Miller95,
  author     = {Miller, George A.},
  title      = {WordNet: A Lexical Database for {E}nglish},
  journal    = {Communications of the ACM},
  volume     = {38},
  number     = {11},
  pages      = {39--41},
  numpages   = {3},
  month      = nov,
  year       = {1995},
  issue_date = {Nov. 1995},
  publisher  = {ACM},
  address    = {New York, NY},
  issn       = {0001-0782},
  doi        = {10.1145/219717.219748},
  url        = {http://doi.acm.org/10.1145/219717.219748},
  acmid      = {219748},
}
@article{Arnold17,
  author  = {Arnold, Taylor},
  title   = {A Tidy Data Model for Natural Language Processing using {cleanNLP}},
  journal = {The R Journal},
  volume  = {9},
  number  = {2},
  pages   = {248--267},
  year    = {2017},
  doi     = {10.32614/RJ-2017-035},
  url     = {https://doi.org/10.32614/RJ-2017-035},
}
@manual{Benoit19,
  author = {Kenneth Benoit and Akitaka Matsuo},
  title  = {{spacyr}: Wrapper to the `spaCy' `NLP' Library},
  note   = {R package version 1.2.1},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=spacyr},
}
@misc{boost_c_libraries,
  author = {{Boost C Libraries}},
  title  = {{POSIX} Extended Regular Expression Syntax},
  year   = {2007},
  url    = {https://www.boost.org/doc/libs/1_44_0/libs/regex/doc/html/boost_regex/syntax/basic_extended.html},
}
@book{levithan2012regular,
  author    = {Goyvaerts, Jan and Levithan, Steven},
  title     = {Regular Expressions Cookbook},
  publisher = {O'Reilly Media, Inc.},
  address   = {Sebastopol},
  year      = {2012},
  isbn      = {9781449327453},
}
@article{Willett06,
  author    = {Willett, P.},
  title     = {The {Porter} stemming algorithm: then and now},
  journal   = {Program: Electronic Library and Information Systems},
  volume    = {40},
  number    = {3},
  pages     = {219--223},
  year      = {2006},
  publisher = {Emerald},
  doi       = {10.1108/00330330610681295},
  url       = {http://eprints.whiterose.ac.uk/1434/},
  keywords  = {conflation, information retrieval, Porter stemming algorithm, stemming algorithm, suffix, word variant},
}
@misc{Briscoe13,
  author = {Briscoe, Ted},
  title  = {Introduction to Linguistics for Natural Language Processing},
  year   = {2013},
  url    = {https://www.cl.cam.ac.uk/teaching/1314/L100/introling.pdf},
}
@article{Bender11,
  author  = {Bender, Emily M.},
  title   = {On achieving and evaluating language-independence in {NLP}},
  journal = {Linguistic Issues in Language Technology},
  volume  = {6},
  number  = {3},
  pages   = {1--26},
  year    = {2011},
}
@article{Bender13,
  author    = {Bender, Emily M.},
  title     = {Linguistic fundamentals for natural language processing: 100 essentials from morphology and syntax},
  journal   = {Synthesis Lectures on Human Language Technologies},
  volume    = {6},
  number    = {3},
  pages     = {1--184},
  year      = {2013},
  publisher = {Morgan \& Claypool Publishers},
}
@inproceedings{Sap19,
  author    = {Sap, Maarten and Card, Dallas and Gabriel, Saadia and Choi, Yejin and Smith, Noah A.},
  title     = {The Risk of Racial Bias in Hate Speech Detection},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  pages     = {1668--1678},
  month     = jul,
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  address   = {Florence, Italy},
  doi       = {10.18653/v1/P19-1163},
  url       = {https://www.aclweb.org/anthology/P19-1163},
}
@misc{McCulloch15,
  author    = {McCulloch, Gretchen},
  title     = {Move over {Shakespeare}, teen girls are the real language disruptors},
  journal   = {Quartz},
  publisher = {Quartz},
  month     = aug,
  year      = {2015},
  url       = {https://qz.com/474671/move-over-shakespeare-teen-girls-are-the-real-language-disruptors/},
}
@article{Luhn1960,
  author   = {Luhn, H. P.},
  title    = {Key word-in-context index for technical literature ({kwic} index)},
  journal  = {American Documentation},
  volume   = {11},
  number   = {4},
  pages    = {288--295},
  year     = {1960},
  doi      = {10.1002/asi.5090110403},
  url      = {https://onlinelibrary.wiley.com/doi/abs/10.1002/asi.5090110403},
  abstract = {A distinction is made between bibliographical indexes for new and past literature based on the willingness of the user to trade perfection for currency. Indexes giving keywords in their context are proposed as suitable for disseminating new information. These can be entirely machine-generated and hence kept up-to-date with the current literature. A compatible coding scheme to identify the indexed documents is also proposed. In it elements are automatically extracted from the usual identifiers of the document so that the coded identifier yields a maximum of information while remaining susceptible to normal methods of ordering.},
}
@inproceedings{nothman-etal-2018-stop,
  author    = {Nothman, Joel and Qin, Hanmin and Yurchak, Roman},
  title     = {Stop Word Lists in Free Open-source Software Packages},
  booktitle = {Proceedings of Workshop for {NLP} Open Source Software ({NLP}-{OSS})},
  pages     = {7--12},
  month     = jul,
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Melbourne, Australia},
  doi       = {10.18653/v1/W18-2502},
  url       = {https://www.aclweb.org/anthology/W18-2502},
  abstract  = {Open-source software packages for language processing often include stop word lists. Users may apply them without awareness of their surprising omissions (e.g. {``}hasn{'}t{''} but not {``}hadn{'}t{''}) and inclusions ({``}computer{''}), or their incompatibility with a particular tokenizer. Motivated by issues raised about the Scikit-learn stop list, we investigate variation among and consistency within 52 popular English-language stop lists, and propose strategies for mitigating these issues.},
}
@manual{Wickham19,
  author = {Hadley Wickham},
  title  = {{stringr}: Simple{,} Consistent Wrappers for Common String Operations},
  note   = {R package version 1.4.0},
  year   = {2019},
  url    = {https://CRAN.R-project.org/package=stringr},
}
@article{Mullen18,
  author  = {Lincoln A. Mullen and Kenneth Benoit and Os Keyes and Dmitry Selivanov and Jeffrey Arnold},
  title   = {Fast, Consistent Tokenization of Natural Language Text},
  journal = {Journal of Open Source Software},
  volume  = {3},
  number  = {23},
  pages   = {655},
  year    = {2018},
  doi     = {10.21105/joss.00655},
  url     = {https://doi.org/10.21105/joss.00655},
}
@manual{Gagolewski19,
  author = {Marek Gagolewski},
  title  = {{stringi}: Character string processing facilities},
  note   = {R package version 1.6.2},
  year   = {2020},
  url    = {http://www.gagolewski.com/software/stringi/},
}
@article{Silge16,
  author    = {Julia Silge and David Robinson},
  title     = {tidytext: Text Mining and Analysis Using Tidy Data Principles in {R}},
  journal   = {Journal of Open Source Software},
  volume    = {1},
  number    = {3},
  year      = {2016},
  publisher = {The Open Journal},
  doi       = {10.21105/joss.00037},
  url       = {https://doi.org/10.21105/joss.00037},
}
@inproceedings{Zou2006,
  author    = {Zou, Feng and Wang, Fu Lee and Deng, Xiaotie and Han, Song},
  title     = {Evaluation of Stop Word Lists in {C}hinese Language},
  booktitle = {Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC}{'}06)},
  month     = may,
  year      = {2006},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Genoa, Italy},
  url       = {http://www.lrec-conf.org/proceedings/lrec2006/pdf/273_pdf.pdf},
  abstract  = {In modern information retrieval systems, effective indexing can be achieved by removal of stop words. Till now many stop word lists have been developed for English language. However, no standard stop word list has been constructed for Chinese language yet. With the fast development of information retrieval in Chinese language, exploring the evaluation of Chinese stop word lists becomes critical. In this paper, to save the time and release the burden of manual comparison, we propose a novel stop word list evaluation method with a mutual information-based Chinese segmentation methodology. Experiments have been conducted on training texts taken from a recent international Chinese segmentation competition. Results show that effective stop word lists can improve the accuracy of Chinese segmentation significantly.},
}
@inproceedings{Zou2006ACC,
  author    = {Zou, Feng and Wang, Fu Lee and Deng, Xiaotie and Han, Song and Wang, Lu Sheng},
  title     = {Automatic Construction of {C}hinese Stop Word List},
  booktitle = {Proceedings of the 5th WSEAS International Conference on Applied Computer Science},
  series    = {ACOS'06},
  pages     = {1009--1014},
  numpages  = {6},
  year      = {2006},
  publisher = {World Scientific and Engineering Academy and Society (WSEAS)},
  address   = {Stevens Point, Wisconsin},
  location  = {Hangzhou, China},
  isbn      = {960-8457-43-2},
  url       = {http://dl.acm.org/citation.cfm?id=1973598.1973793},
  acmid     = {1973793},
  keywords  = {information theory, statistical modeling, stop word list},
}
@inproceedings{Huston2010,
  author    = {Huston, Samuel and Croft, W. Bruce},
  title     = {Evaluating Verbose Query Processing Techniques},
  booktitle = {Proceedings of the 33rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
  series    = {SIGIR '10},
  pages     = {291--298},
  numpages  = {8},
  year      = {2010},
  publisher = {ACM},
  address   = {New York, NY},
  location  = {Geneva, Switzerland},
  isbn      = {978-1-4503-0153-4},
  doi       = {10.1145/1835449.1835499},
  url       = {http://doi.acm.org/10.1145/1835449.1835499},
  acmid     = {1835499},
  keywords  = {black box, query reformulation, verbose queries},
}
@misc{porter2001snowball,
  author = {Porter, Martin F.},
  title  = {Snowball: A language for stemming algorithms},
  year   = {2001},
  url    = {https://snowballstem.org},
}
@manual{R-scotus,
  author = {Emil Hvitfeldt},
  title  = {{scotus}: Collection of Supreme Court of the United States' Opinions},
  note   = {R package version 1.0.0},
  year   = {2019},
  url    = {https://github.com/EmilHvitfeldt/scotus},
}
@manual{R-hcandersenr,
  author = {Emil Hvitfeldt},
  title  = {{hcandersenr}: {H.C. Andersen's} Fairy Tales},
  note   = {R package version 0.2.0},
  year   = {2019},
  url    = {https://CRAN.R-project.org/package=hcandersenr},
}
@manual{R-stopwords,
  author = {Kenneth Benoit and David Muhr and Kohei Watanabe},
  title  = {{stopwords}: Multilingual Stopword Lists},
  note   = {R package version 2.2},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=stopwords},
}
@article{Caliskan2016,
  author    = {Caliskan, Aylin and Bryson, Joanna J. and Narayanan, Arvind},
  title     = {Semantics derived automatically from language corpora contain human-like biases},
  journal   = {Science},
  volume    = {356},
  number    = {6334},
  pages     = {183--186},
  year      = {2017},
  publisher = {American Association for the Advancement of Science},
  issn      = {0036-8075},
  doi       = {10.1126/science.aal4230},
  url       = {https://science.sciencemag.org/content/356/6334/183},
  eprint    = {https://science.sciencemag.org/content/356/6334/183.full.pdf},
}
@article{Bolukbasi2016,
  author        = {Tolga Bolukbasi and Kai{-}Wei Chang and James Y. Zou and Venkatesh Saligrama and Adam Tauman Kalai},
  title         = {Quantifying and Reducing Stereotypes in Word Embeddings},
  journal       = {CoRR},
  volume        = {abs/1606.06121},
  year          = {2016},
  archivePrefix = {arXiv},
  eprint        = {1606.06121},
  url           = {http://arxiv.org/abs/1606.06121},
  timestamp     = {Mon, 13 Aug 2018 16:48:03 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/BolukbasiCZSK16},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}
@article{Garg2018,
  author    = {Garg, Nikhil and Schiebinger, Londa and Jurafsky, Dan and Zou, James},
  title     = {Word embeddings quantify 100 years of gender and ethnic stereotypes},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = {115},
  number    = {16},
  pages     = {E3635--E3644},
  year      = {2018},
  publisher = {National Academy of Sciences},
  issn      = {0027-8424},
  doi       = {10.1073/pnas.1720347115},
  url       = {https://www.pnas.org/content/115/16/E3635},
  eprint    = {https://www.pnas.org/content/115/16/E3635.full.pdf},
}
@incollection{Lu2018,
  author    = {Lu, Kaiji and Mardziel, Piotr and Wu, Fangjing and Amancharla, Preetam and Datta, Anupam},
  editor    = {Nigam, Vivek and Ban Kirigin, Tajana and Talcott, Carolyn and Guttman, Joshua and Kuznetsov, Stepan and Thau Loo, Boon and Okada, Mitsuhiro},
  title     = {Gender Bias in Neural Natural Language Processing},
  booktitle = {Logic, Language, and Security: Essays Dedicated to Andre Scedrov on the Occasion of His 65th Birthday},
  pages     = {189--202},
  year      = {2020},
  publisher = {Springer International Publishing},
  address   = {Cham},
  isbn      = {978-3-030-62077-6},
  doi       = {10.1007/978-3-030-62077-6_14},
  url       = {https://doi.org/10.1007/978-3-030-62077-6_14},
}
@misc{Speer2017,
  author  = {Robyn Speer},
  title   = {How to make a racist {AI} without really trying},
  journal = {ConceptNet blog},
  month   = jul,
  year    = {2017},
  url     = {http://blog.conceptnet.io/posts/2017/how-to-make-a-racist-ai-without-really-trying/},
}
@inproceedings{Gonen2019,
  author    = {Gonen, Hila and Goldberg, Yoav},
  title     = {Lipstick on a Pig: {D}ebiasing Methods Cover up Systematic Gender Biases in Word Embeddings But do not Remove Them},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  pages     = {609--614},
  month     = jun,
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  address   = {Minneapolis, Minnesota},
  doi       = {10.18653/v1/N19-1061},
  url       = {https://www.aclweb.org/anthology/N19-1061},
}
@inproceedings{Ethayarajh2019,
  author    = {Ethayarajh, Kawin and Duvenaud, David and Hirst, Graeme},
  title     = {Understanding Undesirable Word Embedding Associations},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  pages     = {1696--1705},
  month     = jul,
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  address   = {Florence, Italy},
  doi       = {10.18653/v1/P19-1166},
  url       = {https://www.aclweb.org/anthology/P19-1166},
}
@book{Manning:2008:IIR:1394399,
  author    = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},
  title     = {Introduction to Information Retrieval},
  publisher = {Cambridge University Press},
  address   = {New York, NY},
  year      = {2008},
  isbn      = {0521865719, 9780521865715},
}
@misc{Moody2017,
  author    = {Moody, Chris},
  title     = {Stop Using {word2vec}},
  journal   = {Multithreaded},
  publisher = {StitchFix},
  month     = oct,
  year      = {2017},
  url       = {https://multithreaded.stitchfix.com/blog/2017/10/18/stop-using-word2vec/},
}
@book{Boehmke2019,
  author    = {Boehmke, Brad and Greenwell, Brandon M.},
  title     = {Hands-On Machine Learning with {R}},
  publisher = {CRC Press},
  address   = {Boca Raton},
  year      = {2019},
  isbn      = {9781138495685},
}
@book{Wickham2017,
  author    = {Wickham, Hadley and Grolemund, Garrett},
  title     = {R for Data Science: Import, Tidy, Transform, Visualize, and Model Data},
  publisher = {O'Reilly Media, Inc.},
  address   = {Sebastopol},
  year      = {2017},
  isbn      = {1491910399},
}
@inproceedings{Levy2014,
  author    = {Levy, Omer and Goldberg, Yoav},
  title     = {Dependency-Based Word Embeddings},
  booktitle = {Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages     = {302--308},
  month     = jun,
  year      = {2014},
  publisher = {Association for Computational Linguistics},
  address   = {Baltimore, Maryland},
  doi       = {10.3115/v1/P14-2050},
  url       = {https://www.aclweb.org/anthology/P14-2050},
}
@inproceedings{Sheng2019,
  author    = {Sheng, Emily and Chang, Kai-Wei and Natarajan, Premkumar and Peng, Nanyun},
  title     = {The Woman Worked as a Babysitter: On Biases in Language Generation},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  pages     = {3407--3412},
  month     = nov,
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  address   = {Hong Kong},
  doi       = {10.18653/v1/D19-1339},
  url       = {https://www.aclweb.org/anthology/D19-1339},
}
@manual{Vaughan2020,
  author = {Davis Vaughan},
  title  = {{slider}: Sliding Window Functions},
  note   = {R package version 0.2.1},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=slider},
}
@manual{Vaughan2018,
  author = {Davis Vaughan and Matt Dancho},
  title  = {{furrr}: Apply Mapping Functions in Parallel using Futures},
  note   = {R package version 0.2.2},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=furrr},
}
@article{Wagner2016,
  author    = {Wagner, Claudia and Graells-Garrido, Eduardo and Garcia, David and Menczer, Filippo},
  title     = {Women through the glass ceiling: gender asymmetries in {W}ikipedia},
  journal   = {EPJ Data Science},
  volume    = {5},
  number    = {1},
  pages     = {5},
  year      = {2016},
  publisher = {SpringerOpen},
  doi       = {10.1140/epjds/s13688-016-0066-4},
  url       = {https://doi.org/10.1140/epjds/s13688-016-0066-4},
}
@inproceedings{Pennington2014,
  author    = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
  title     = {{GloVe}: Global Vectors for Word Representation},
  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  pages     = {1532--1543},
  month     = oct,
  year      = {2014},
  publisher = {Association for Computational Linguistics},
  address   = {Doha, Qatar},
  doi       = {10.3115/v1/D14-1162},
  url       = {https://www.aclweb.org/anthology/D14-1162},
}
@manual{Hvitfeldt2020,
  author = {Emil Hvitfeldt},
  title  = {{textdata}: Download and Load Various Text Datasets},
  note   = {R package version 0.4.1},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=textdata},
}
@manual{Selivanov2018,
  author = {Dmitriy Selivanov and Manuel Bickel and Qing Wang},
  title  = {{text2vec}: Modern Text Mining Framework for {R}},
  note   = {R package version 0.6},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=text2vec},
}
@misc{Mikolov2013,
  author = {Tomas Mikolov and Kai Chen and Greg S. Corrado and Jeffrey Dean},
  title  = {Efficient Estimation of Word Representations in Vector Space},
  year   = {2013},
  url    = {http://arxiv.org/abs/1301.3781},
}
@article{Bojanowski2016,
  author  = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  title   = {Enriching Word Vectors with Subword Information},
  journal = {Transactions of the Association for Computational Linguistics},
  volume  = {5},
  pages   = {135--146},
  year    = {2017},
  doi     = {10.1162/tacl_a_00051},
  url     = {https://www.aclweb.org/anthology/Q17-1010},
}
@inproceedings{Le2014,
  author    = {Quoc Le and Tomas Mikolov},
  editor    = {Eric P. Xing and Tony Jebara},
  title     = {Distributed Representations of Sentences and Documents},
  booktitle = {Proceedings of the 31st International Conference on Machine Learning},
  series    = {Proceedings of Machine Learning Research},
  volume    = {32},
  number    = {2},
  pages     = {1188--1196},
  month     = jun,
  year      = {2014},
  publisher = {PMLR},
  address   = {Beijing, China},
  pdf       = {http://proceedings.mlr.press/v32/le14.pdf},
  url       = {http://proceedings.mlr.press/v32/le14.html},
  abstract  = {Many machine learning algorithms require the input to be represented as a fixed length feature vector. When it comes to texts, one of the most common representations is bag-of-words. Despite their popularity, bag-of-words models have two major weaknesses: they lose the ordering of the words and they also ignore semantics of the words. For example, "powerful," "strong" and "Paris" are equally distant. In this paper, we propose an unsupervised algorithm that learns vector representations of sentences and text documents. This algorithm represents each document by a dense vector which is trained to predict words in the document. Its construction gives our algorithm the potential to overcome the weaknesses of bag-of-words models. Empirical results show that our technique outperforms bag-of-words models as well as other techniques for text representations. Finally, we achieve new state-of-the-art results on several text classification and sentiment analysis tasks.},
}
@inproceedings{Howard2018,
  author    = {Howard, Jeremy and Ruder, Sebastian},
  title     = {Universal Language Model Fine-tuning for Text Classification},
  booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {328--339},
  month     = jul,
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Melbourne, Australia},
  doi       = {10.18653/v1/P18-1031},
  url       = {https://www.aclweb.org/anthology/P18-1031},
}
@inproceedings{Peters2018,
  author    = {Peters, Matthew and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
  title     = {Deep Contextualized Word Representations},
  booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  pages     = {2227--2237},
  month     = jun,
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, Louisiana},
  doi       = {10.18653/v1/N18-1202},
  url       = {https://www.aclweb.org/anthology/N18-1202},
}
@inproceedings{carlini2018secret,
  author    = {Carlini, Nicholas and Liu, Chang and Erlingsson, {\'U}lfar and Kos, Jernej and Song, Dawn},
  title     = {The Secret Sharer: Evaluating and Testing Unintended Memorization in Neural Networks},
  booktitle = {Proceedings of the 28th USENIX Conference on Security Symposium},
  series    = {SEC'19},
  pages     = {267--284},
  numpages  = {18},
  year      = {2019},
  publisher = {USENIX Association},
  address   = {USA},
  location  = {Santa Clara, CA},
  isbn      = {9781939133069},
  abstract  = {This paper describes a testing methodology for quantitatively assessing the risk that rare or unique training-data sequences are unintentionally memorized by generative sequence models--a common type of machine-learning model. Because such models are sometimes trained on sensitive data (e.g., the text of users' private messages), this methodology can benefit privacy by allowing deep-learning practitioners to select means of training that minimize such memorization. In experiments, we show that unintended memorization is a persistent, hard-to-avoid issue that can have serious consequences. Specifically, for models trained without consideration of memorization, we describe new, efficient procedures that can extract unique, secret sequences, such as credit card numbers. We show that our testing strategy is a practical and easy-to-use first line of defense, e.g., by describing its application to quantitatively limit data exposure in Google's Smart Compose, a commercial text-completion neural network trained on millions of users' email messages.},
}
@inproceedings{Fredrikson2014,
  author    = {Fredrikson, Matthew and Lantz, Eric and Jha, Somesh and Lin, Simon and Page, David and Ristenpart, Thomas},
  title     = {Privacy in Pharmacogenetics: An End-to-End Case Study of Personalized Warfarin Dosing},
  booktitle = {Proceedings of the 23rd USENIX Conference on Security Symposium},
  series    = {SEC'14},
  pages     = {17--32},
  numpages  = {16},
  year      = {2014},
  publisher = {USENIX Association},
  address   = {USA},
  location  = {San Diego, CA},
  isbn      = {9781931971157},
}
@inproceedings{Fredrikson2015,
  author    = {Fredrikson, Matt and Jha, Somesh and Ristenpart, Thomas},
  title     = {Model Inversion Attacks That Exploit Confidence Information and Basic Countermeasures},
  booktitle = {Proceedings of the 22nd {ACM} {SIGSAC} Conference on Computer and Communications Security},
  series    = {CCS '15},
  pages     = {1322--1333},
  numpages  = {12},
  year      = {2015},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  location  = {Denver, Colorado},
  isbn      = {9781450338325},
  doi       = {10.1145/2810103.2813677},
  url       = {https://doi.org/10.1145/2810103.2813677},
  keywords  = {machine learning, attacks, privacy},
}
@software{spacy2,
  author    = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
  title     = {{spaCy}: Industrial-strength Natural Language Processing in {Python}},
  year      = {2020},
  publisher = {Zenodo},
  doi       = {10.5281/zenodo.1212303},
  url       = {https://doi.org/10.5281/zenodo.1212303},
}
@manual{textrecipes,
  author = {Emil Hvitfeldt},
  title  = {{textrecipes}: Extra `Recipes' for Text Processing},
  note   = {R package version 0.4.1},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=textrecipes},
}
@book{konig2002germanic,
  author    = {K{\"o}nig, E. and van der Auwera, J.},
  title     = {The Germanic Languages},
  series    = {Routledge language family descriptions},
  publisher = {Routledge},
  year      = {2002},
  isbn      = {9780415280792},
  lccn      = {lc92037152},
  url       = {https://books.google.com.do/books?id=whyUQgAACAAJ},
}
@inproceedings{Sugisaki2018,
  author    = {Sugisaki, Kyoko and Tuggener, Don},
  title     = {German compound splitting using the compound productivity of morphemes},
  booktitle = {Proceedings of the 14th Conference on Natural Language Processing ({KONVENS} 2018)},
  year      = {2018},
  publisher = {Verlag der {\"O}sterreichischen Akademie der Wissenschaften},
}
@inproceedings{ma-etal-2018-state,
  author    = {Ma, Ji and Ganchev, Kuzman and Weiss, David},
  title     = {State-of-the-art {C}hinese Word Segmentation with Bi-{LSTM}s},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  pages     = {4902--4908},
  month     = oct # "-" # nov,
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  doi       = {10.18653/v1/D18-1529},
  url       = {https://www.aclweb.org/anthology/D18-1529},
  abstract  = {A wide variety of neural-network architectures have been proposed for the task of Chinese word segmentation. Surprisingly, we find that a bidirectional LSTM model, when combined with standard deep learning techniques and best practices, can achieve better accuracy on many of the popular datasets as compared to models based on more complex neuralnetwork architectures. Furthermore, our error analysis shows that out-of-vocabulary words remain challenging for neural-network models, and many of the remaining errors are unlikely to be fixed through architecture changes. Instead, more effort should be made on exploring resources for further improvement.},
}
@inproceedings{Huang2019,
  author    = {Huang, Weipeng and Cheng, Xingyi and Chen, Kunlong and Wang, Taifeng and Chu, Wei},
  title     = {Towards Fast and Accurate Neural {C}hinese Word Segmentation with Multi-Criteria Learning},
  booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
  pages     = {2062--2072},
  month     = dec,
  year      = {2020},
  publisher = {International Committee on Computational Linguistics},
  address   = {Barcelona, Spain (Online)},
  doi       = {10.18653/v1/2020.coling-main.186},
  url       = {https://www.aclweb.org/anthology/2020.coling-main.186},
}
@inproceedings{Caruana2008,
  author    = {Caruana, Rich and Karampatziakis, Nikos and Yessenalina, Ainur},
  title     = {An Empirical Evaluation of Supervised Learning in High Dimensions},
  booktitle = {Proceedings of the 25th International Conference on Machine Learning},
  series    = {ICML '08},
  pages     = {96--103},
  numpages  = {8},
  year      = {2008},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  location  = {Helsinki, Finland},
  isbn      = {9781605582054},
  doi       = {10.1145/1390156.1390169},
  url       = {https://doi.org/10.1145/1390156.1390169},
}
@inproceedings{Olson2017,
  author    = {Olson, Randal S. and La Cava, William and Mustahsan, Zairah and Varik, Akshay and Moore, Jason H.},
  title     = {Data-Driven Advice for Applying Machine Learning to Bioinformatics Problems},
  booktitle = {Pacific Symposium on Biocomputing 2018: Proceedings of the Pacific Symposium},
  publisher = {World Scientific},
  year      = {2018},
  pages     = {192--203},
  doi       = {10.1142/9789813235533_0018},
  url       = {https://doi.org/10.1142/9789813235533_0018}
}
@inproceedings{Weinberger2009,
  author    = {Weinberger, Kilian and Dasgupta, Anirban and Langford, John and Smola, Alex and Attenberg, Josh},
  title     = {Feature Hashing for Large Scale Multitask Learning},
  booktitle = {Proceedings of the 26th Annual International Conference on Machine Learning},
  series    = {ICML '09},
  location  = {Montreal, Quebec, Canada},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  year      = {2009},
  pages     = {1113--1120},
  numpages  = {8},
  isbn      = {9781605585161},
  doi       = {10.1145/1553374.1553516},
  url       = {https://doi.org/10.1145/1553374.1553516}
}
@misc{appleby2008,
  title  = {MurmurHash},
  author = {Appleby, Austin},
  url    = {https://sites.google.com/site/murmurhash},
  year   = {2008},
}
@inproceedings{NIPS2018_7784,
  author    = {Freksen, Casper and Kamma, Lior and Larsen, Kasper Green},
  title     = {Fully Understanding the Hashing Trick},
  booktitle = {Proceedings of the 32nd International Conference on Neural Information Processing Systems},
  series    = {NIPS'18},
  location  = {Montr{\'e}al, Canada},
  publisher = {Curran Associates Inc.},
  address   = {Red Hook, NY},
  year      = {2018},
  pages     = {5394--5404},
  numpages  = {11},
  doi       = {10.5555/3327345.3327444},
  url       = {https://doi.org/10.5555/3327345.3327444}
}
@inproceedings{Forman2008,
  author    = {Forman, George and Kirshenbaum, Evan},
  title     = {Extremely Fast Text Feature Extraction for Classification and Indexing},
  booktitle = {Proceedings of the 17th ACM Conference on Information and Knowledge Management},
  series    = {CIKM '08},
  location  = {Napa Valley, California},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  year      = {2008},
  pages     = {1221--1230},
  numpages  = {10},
  isbn      = {9781595939913},
  doi       = {10.1145/1458082.1458243},
  url       = {https://doi.org/10.1145/1458082.1458243},
  keywords  = {text mining, feature engineering, bag-of-words, feature extraction, document categorization, text tokenization, text indexing}
}
@article{Vantu2016,
  author  = {Van-Tu, Nguyen and Anh-Cuong, Le},
  title   = {Improving Question Classification by Feature Extraction and Selection},
  journal = {Indian Journal of Science and Technology},
  volume  = {9},
  number  = {17},
  pages   = {1--8},
  month   = may,
  year    = {2016},
  doi     = {10.17485/ijst/2016/v9i17/93160},
  url     = {https://doi.org/10.17485/ijst/2016/v9i17/93160}
}
@inproceedings{Joachims1998,
  author    = {Joachims, Thorsten},
  title     = {Text Categorization with Support Vector Machines: Learning with Many Relevant Features},
  booktitle = {Proceedings of the 10th European Conference on Machine Learning},
  series    = {ECML'98},
  location  = {Chemnitz, Germany},
  publisher = {Springer-Verlag},
  address   = {Berlin, Heidelberg},
  year      = {1998},
  pages     = {137--142},
  numpages  = {6},
  isbn      = {3540644172},
  doi       = {10.1007/BFb0026683},
  url       = {https://doi.org/10.1007/BFb0026683}
}
@article{Harman91,
  author  = {Harman, Donna},
  title   = {How Effective Is Suffixing?},
  journal = {Journal of the American Society for Information Science},
  volume  = {42},
  number  = {1},
  pages   = {7--15},
  year    = {1991},
  doi     = {10.1002/(SICI)1097-4571(199101)42:1<7::AID-ASI2>3.0.CO;2-P},
  url     = {https://doi.org/10.1002/(SICI)1097-4571(199101)42:1<7::AID-ASI2>3.0.CO;2-P},
  eprint  = {https://asistdl.onlinelibrary.wiley.com/doi/pdf/10.1002/%28SICI%291097-4571%28199101%2942%3A1%3C7%3A%3AAID-ASI2%3E3.0.CO%3B2-P}
}
@book{Chollet2018,
  author    = {Chollet, Fran{\c{c}}ois and Allaire, J. J.},
  title     = {Deep Learning with {R}},
  publisher = {Manning Publications},
  address   = {Shelter Island, NY},
  year      = {2018},
  isbn      = {9781617295546},
  lccn      = {2018285360},
  url       = {https://www.manning.com/books/deep-learning-with-r}
}
@inproceedings{Boser1992,
  author    = {Boser, Bernhard E. and Guyon, Isabelle M. and Vapnik, Vladimir N.},
  title     = {A Training Algorithm for Optimal Margin Classifiers},
  booktitle = {Proceedings of the Fifth Annual Workshop on Computational Learning Theory},
  series    = {COLT '92},
  location  = {Pittsburgh, Pennsylvania},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  year      = {1992},
  pages     = {144--152},
  numpages  = {9},
  isbn      = {089791497X},
  doi       = {10.1145/130385.130401},
  url       = {https://doi.org/10.1145/130385.130401}
}
@article{Friedman2010,
  author  = {Friedman, Jerome H. and Hastie, Trevor and Tibshirani, Rob},
  title   = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
  journal = {Journal of Statistical Software},
  volume  = {33},
  number  = {1},
  pages   = {1--22},
  year    = {2010},
  issn    = {1548-7660},
  doi     = {10.18637/jss.v033.i01},
  url     = {https://www.jstatsoft.org/v033/i01}
}
@article{Lex2014,
  author  = {Lex, Alexander and Gehlenborg, Nils and Strobelt, Hendrik and Vuillemot, Romain and Pfister, Hanspeter},
  title   = {{UpSet}: Visualization of Intersecting Sets},
  journal = {IEEE Transactions on Visualization and Computer Graphics},
  volume  = {20},
  number  = {12},
  pages   = {1983--1992},
  year    = {2014},
  doi     = {10.1109/TVCG.2014.2346248},
  url     = {https://doi.org/10.1109/TVCG.2014.2346248}
}
@book{James2013,
  author    = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert},
  title     = {An Introduction to Statistical Learning},
  publisher = {Springer},
  address   = {New York},
  year      = {2013}
}
@manual{Perry2020,
  author = {Perry, Patrick O.},
  title  = {{corpus}: Text Corpus Analysis},
  note   = {R package version 0.10.2},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=corpus}
}
@article{kim2006,
  author  = {Kim, S. and Han, K. and Rim, H. and Myaeng, S. H.},
  title   = {Some Effective Techniques for Naive {Bayes} Text Classification},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  volume  = {18},
  number  = {11},
  pages   = {1457--1466},
  year    = {2006},
  doi     = {10.1109/TKDE.2006.180},
  url     = {https://doi.org/10.1109/TKDE.2006.180}
}
@inproceedings{Kibriya2005,
  author    = {Kibriya, Ashraf M. and Frank, Eibe and Pfahringer, Bernhard and Holmes, Geoffrey},
  editor    = {Webb, Geoffrey I. and Yu, Xinghuo},
  title     = {Multinomial Naive {Bayes} for Text Categorization Revisited},
  booktitle = {AI 2004: Advances in Artificial Intelligence},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2005},
  pages     = {488--499},
  isbn      = {978-3-540-30549-1},
  doi       = {10.1007/978-3-540-30549-1_43},
  url       = {https://doi.org/10.1007/978-3-540-30549-1_43}
}
@inproceedings{Eibe2006,
  author    = {Frank, Eibe and Bouckaert, Remco R.},
  editor    = {F{\"u}rnkranz, Johannes and Scheffer, Tobias and Spiliopoulou, Myra},
  title     = {Naive {Bayes} for Text Classification with Unbalanced Classes},
  booktitle = {Knowledge Discovery in Databases: PKDD 2006},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2006},
  pages     = {503--510},
  isbn      = {978-3-540-46048-0},
  doi       = {10.1007/11871637_49},
  url       = {https://doi.org/10.1007/11871637_49}
}
@manual{R-googleLanguageR,
  author = {Edmondson, Mark},
  title  = {{googleLanguageR}: Call Google's `Natural Language' API, `Cloud Translation' API, `Cloud Speech' API and `Cloud Text-to-Speech' API},
  note   = {R package version 0.3.0},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=googleLanguageR}
}
@inproceedings{Tang2018,