book.bib

@book{Silge2017,
  title     = {Text Mining with {R}: A Tidy Approach},
  author    = {Silge, Julia and Robinson, David},
  publisher = {O'Reilly Media, Inc.},
  address   = {Sebastopol},
  year      = {2017},
  isbn      = {1491981652, 9781491981658},
}

@book{xie2015,
  author    = {Xie, Yihui},
  title     = {Dynamic Documents with {R} and knitr},
  edition   = {2nd},
  publisher = {Chapman and Hall/CRC},
  address   = {Boca Raton, Florida},
  year      = {2015},
  isbn      = {978-1498716963},
  url       = {http://yihui.name/knitr/},
}

@book{Feldman2007,
  author    = {Feldman, R. and Sanger, J.},
  title     = {The text mining handbook},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  year      = {2007},
  isbn      = {9780511546914},
}

@article{Schofield16,
  author  = {Schofield, Alexandra and Mimno, David},
  title   = {Comparing Apples to Apple: The Effects of Stemmers on Topic Models},
  journal = {Transactions of the Association for Computational Linguistics},
  volume  = {4},
  pages   = {287--300},
  year    = {2016},
  doi     = {10.1162/tacl_a_00099},
  url     = {https://doi.org/10.1162/tacl_a_00099},
  eprint  = {https://doi.org/10.1162/tacl_a_00099},
}

@article{Porter80,
  author  = {Porter, Martin F},
  title   = {An algorithm for suffix stripping},
  journal = {Program},
  volume  = {14},
  number  = {3},
  pages   = {130--137},
  year    = {1980},
  doi     = {10.1108/eb046814},
  url     = {https://doi.org/10.1108/eb046814},
}

@article{Lovins68,
  author  = {Lovins, Julie B.},
  title   = {Development of a stemming algorithm},
  journal = {Mechanical Translation and Computational Linguistics},
  volume  = {11},
  pages   = {22--31},
  year    = {1968},
}

@article{Miller95,
  author     = {Miller, George A.},
  title      = {{WordNet}: A Lexical Database for {English}},
  journal    = {Communications of the ACM},
  issue_date = {Nov. 1995},
  volume     = {38},
  number     = {11},
  month      = nov,
  year       = {1995},
  issn       = {0001-0782},
  pages      = {39--41},
  numpages   = {3},
  url        = {http://doi.acm.org/10.1145/219717.219748},
  doi        = {10.1145/219717.219748},
  acmid      = {219748},
  publisher  = {ACM},
  address    = {New York, NY},
}

@article{Arnold17,
  author  = {Arnold, Taylor},
  title   = {A Tidy Data Model for Natural Language Processing using {cleanNLP}},
  journal = {The {R} Journal},
  volume  = {9},
  number  = {2},
  pages   = {248--267},
  year    = {2017},
  doi     = {10.32614/RJ-2017-035},
  url     = {https://doi.org/10.32614/RJ-2017-035},
}

@manual{Benoit19,
  author = {Benoit, Kenneth and Matsuo, Akitaka},
  title  = {{spacyr}: Wrapper to the `spaCy' `NLP' Library},
  note   = {R package version 1.2.1},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=spacyr},
}

@misc{boost_c_libraries,
  key     = {Boost C Libraries},
  journal = {Boost C Libraries},
  year    = {2007},
  url     = {https://www.boost.org/doc/libs/1_44_0/libs/regex/doc/html/boost_regex/syntax/basic_extended.html},
}

@book{levithan2012regular,
  author    = {Goyvaerts, Jan and Levithan, Steven},
  title     = {Regular Expressions Cookbook},
  publisher = {O'Reilly Media, Inc.},
  address   = {Sebastopol},
  year      = {2012},
  isbn      = {9781449327453},
}

@article{Willett06,
  author    = {Willett, P.},
  title     = {The {Porter} stemming algorithm: then and now},
  journal   = {Program: Electronic Library and Information Systems},
  volume    = {40},
  number    = {3},
  pages     = {219--223},
  year      = {2006},
  publisher = {Emerald},
  doi       = {10.1108/00330330610681295},
  keywords  = {conflation, information retrieval, Porter stemming algorithm, stemming algorithm, suffix, word variant},
  url       = {http://eprints.whiterose.ac.uk/1434/},
}

@misc{Briscoe13,
  author = {Briscoe, Ted},
  title  = {Introduction to Linguistics for Natural Language Processing},
  year   = {2013},
  url    = {https://www.cl.cam.ac.uk/teaching/1314/L100/introling.pdf},
}

@article{Bender11,
  author  = {Bender, Emily M},
  title   = {On achieving and evaluating language-independence in {NLP}},
  journal = {Linguistic Issues in Language Technology},
  volume  = {6},
  number  = {3},
  pages   = {1--26},
  year    = {2011},
}

@article{Bender13,
  author    = {Bender, Emily M},
  title     = {Linguistic fundamentals for natural language processing: 100 essentials from morphology and syntax},
  journal   = {Synthesis lectures on human language technologies},
  publisher = {Morgan \& Claypool Publishers},
  volume    = {6},
  number    = {3},
  pages     = {1--184},
  year      = {2013},
}

@inproceedings{Sap19,
  author    = {Sap, Maarten and Card, Dallas and Gabriel, Saadia and Choi, Yejin and Smith, Noah A.},
  title     = {The Risk of Racial Bias in Hate Speech Detection},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Florence, Italy},
  month     = jul,
  year      = {2019},
  pages     = {1668--1678},
  doi       = {10.18653/v1/P19-1163},
  url       = {https://www.aclweb.org/anthology/P19-1163},
}

@misc{McCulloch15,
  author    = {McCulloch, Gretchen},
  title     = {Move over {S}hakespeare, teen girls are the real language disruptors},
  journal   = {Quartz},
  publisher = {Quartz},
  month     = aug,
  year      = {2015},
  url       = {https://qz.com/474671/move-over-shakespeare-teen-girls-are-the-real-language-disruptors/},
}


@article{Luhn1960,
  author   = {Luhn, H. P.},
  title    = {Key word-in-context index for technical literature ({kwic} index)},
  journal  = {American Documentation},
  volume   = {11},
  number   = {4},
  pages    = {288--295},
  year     = {1960},
  doi      = {10.1002/asi.5090110403},
  url      = {https://onlinelibrary.wiley.com/doi/abs/10.1002/asi.5090110403},
  eprint   = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/asi.5090110403},
  abstract = {A distinction is made between bibliographical indexes for new and past literature based on the willingness of the user to trade perfection for currency. Indexes giving keywords in their context are proposed as suitable for disseminating new information. These can be entirely machine-generated and hence kept up-to-date with the current literature. A compatible coding scheme to identify the indexed documents is also proposed. In it elements are automatically extracted from the usual identifiers of the document so that the coded identifier yields a maximum of information while remaining susceptible to normal methods of ordering.},
}

@inproceedings{nothman-etal-2018-stop,
  author    = {Nothman, Joel and Qin, Hanmin and Yurchak, Roman},
  title     = {Stop Word Lists in Free Open-source Software Packages},
  booktitle = {Proceedings of Workshop for {NLP} Open Source Software ({NLP}-{OSS})},
  publisher = {Association for Computational Linguistics},
  address   = {Melbourne, Australia},
  month     = jul,
  year      = {2018},
  pages     = {7--12},
  doi       = {10.18653/v1/W18-2502},
  url       = {https://www.aclweb.org/anthology/W18-2502},
  abstract  = {Open-source software packages for language processing often include stop word lists. Users may apply them without awareness of their surprising omissions (e.g. {``}hasn{'}t{''} but not {``}hadn{'}t{''}) and inclusions ({``}computer{''}), or their incompatibility with a particular tokenizer. Motivated by issues raised about the Scikit-learn stop list, we investigate variation among and consistency within 52 popular English-language stop lists, and propose strategies for mitigating these issues.},
}

@manual{Wickham19,
  author = {Wickham, Hadley},
  title  = {{stringr}: Simple{,} Consistent Wrappers for Common String Operations},
  note   = {R package version 1.4.0},
  year   = {2019},
  url    = {https://CRAN.R-project.org/package=stringr},
}

@article{Mullen18,
  author  = {Mullen, Lincoln A. and Benoit, Kenneth and Keyes, Os and Selivanov, Dmitry and Arnold, Jeffrey},
  title   = {Fast, Consistent Tokenization of Natural Language Text},
  journal = {Journal of Open Source Software},
  volume  = {3},
  number  = {23},
  pages   = {655},
  year    = {2018},
  doi     = {10.21105/joss.00655},
  url     = {https://doi.org/10.21105/joss.00655},
}


@manual{Gagolewski19,
  author = {Gagolewski, Marek},
  title  = {{stringi}: Character string processing facilities},
  note   = {R package version 1.6.2},
  year   = {2020},
  url    = {http://www.gagolewski.com/software/stringi/},
}

@article{Silge16,
  author    = {Silge, Julia and Robinson, David},
  title     = {tidytext: Text Mining and Analysis Using Tidy Data Principles in {R}},
  journal   = {Journal of Open Source Software},
  publisher = {The Open Journal},
  volume    = {1},
  number    = {3},
  year      = {2016},
  doi       = {10.21105/joss.00037},
  url       = {https://doi.org/10.21105/joss.00037},
}

@inproceedings{Zou2006,
  author    = {Zou, Feng and Wang, Fu Lee and Deng, Xiaotie and Han, Song},
  title     = {Evaluation of Stop Word Lists in {C}hinese Language},
  booktitle = {Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC}{'}06)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Genoa, Italy},
  month     = may,
  year      = {2006},
  url       = {http://www.lrec-conf.org/proceedings/lrec2006/pdf/273_pdf.pdf},
  abstract  = {In modern information retrieval systems, effective indexing can be achieved by removal of stop words. Till now many stop word lists have been developed for English language. However, no standard stop word list has been constructed for Chinese language yet. With the fast development of information retrieval in Chinese language, exploring the evaluation of Chinese stop word lists becomes critical. In this paper, to save the time and release the burden of manual comparison, we propose a novel stop word list evaluation method with a mutual information-based Chinese segmentation methodology. Experiments have been conducted on training texts taken from a recent international Chinese segmentation competition. Results show that effective stop word lists can improve the accuracy of Chinese segmentation significantly.},
}


@inproceedings{Zou2006ACC,
  title     = {Automatic Construction of {C}hinese Stop Word List},
  author    = {Zou, Feng and Wang, Fu Lee and Deng, Xiaotie and Han, Song and Wang, Lu Sheng},
  booktitle = {Proceedings of the 5th WSEAS International Conference on Applied Computer Science},
  series    = {ACOS'06},
  publisher = {World Scientific and Engineering Academy and Society (WSEAS)},
  address   = {Stevens Point, Wisconsin},
  location  = {Hangzhou, China},
  year      = {2006},
  pages     = {1009--1014},
  numpages  = {6},
  isbn      = {960-8457-43-2},
  acmid     = {1973793},
  url       = {http://dl.acm.org/citation.cfm?id=1973598.1973793},
  keywords  = {information theory, statistical modeling, stop word list},
}

@inproceedings{Huston2010,
  title     = {Evaluating Verbose Query Processing Techniques},
  author    = {Huston, Samuel and Croft, W. Bruce},
  booktitle = {Proceedings of the 33rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
  series    = {SIGIR '10},
  publisher = {ACM},
  address   = {New York, NY},
  location  = {Geneva, Switzerland},
  year      = {2010},
  pages     = {291--298},
  numpages  = {8},
  isbn      = {978-1-4503-0153-4},
  acmid     = {1835499},
  doi       = {10.1145/1835449.1835499},
  url       = {http://doi.acm.org/10.1145/1835449.1835499},
  keywords  = {black box, query reformulation, verbose queries},
}

@misc{porter2001snowball,
  author = {Porter, Martin F},
  title  = {Snowball: A language for stemming algorithms},
  year   = {2001},
  url    = {https://snowballstem.org},
}

@manual{R-scotus,
  author = {Hvitfeldt, Emil},
  title  = {{scotus}: Collection of Supreme Court of the United States' Opinions},
  note   = {R package version 1.0.0},
  year   = {2019},
  url    = {https://github.com/EmilHvitfeldt/scotus},
}

@manual{R-hcandersenr,
  author = {Hvitfeldt, Emil},
  title  = {{hcandersenr}: {H.C. Andersen's} Fairy Tales},
  note   = {R package version 0.2.0},
  year   = {2019},
  url    = {https://CRAN.R-project.org/package=hcandersenr},
}

@manual{R-stopwords,
  author = {Benoit, Kenneth and Muhr, David and Watanabe, Kohei},
  title  = {{stopwords}: Multilingual Stopword Lists},
  note   = {R package version 2.2},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=stopwords},
}

@article{Caliskan2016,
  title     = {Semantics derived automatically from language corpora contain human-like biases},
  author    = {Caliskan, Aylin and Bryson, Joanna J. and Narayanan, Arvind},
  journal   = {Science},
  publisher = {American Association for the Advancement of Science},
  volume    = {356},
  number    = {6334},
  pages     = {183--186},
  year      = {2017},
  issn      = {0036-8075},
  doi       = {10.1126/science.aal4230},
  url       = {https://science.sciencemag.org/content/356/6334/183},
  eprint    = {https://science.sciencemag.org/content/356/6334/183.full.pdf},
}

@article{Bolukbasi2016,
  title         = {Quantifying and Reducing Stereotypes in Word Embeddings},
  author        = {Bolukbasi, Tolga and Chang, Kai{-}Wei and Zou, James Y. and Saligrama, Venkatesh and Kalai, Adam Tauman},
  journal       = {CoRR},
  volume        = {abs/1606.06121},
  year          = {2016},
  archivePrefix = {arXiv},
  eprint        = {1606.06121},
  url           = {http://arxiv.org/abs/1606.06121},
  timestamp     = {Mon, 13 Aug 2018 16:48:03 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/BolukbasiCZSK16},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

@article{Garg2018,
  title     = {Word embeddings quantify 100 years of gender and ethnic stereotypes},
  author    = {Garg, Nikhil and Schiebinger, Londa and Jurafsky, Dan and Zou, James},
  journal   = {Proceedings of the National Academy of Sciences},
  publisher = {National Academy of Sciences},
  volume    = {115},
  number    = {16},
  pages     = {E3635--E3644},
  year      = {2018},
  issn      = {0027-8424},
  doi       = {10.1073/pnas.1720347115},
  url       = {https://www.pnas.org/content/115/16/E3635},
  eprint    = {https://www.pnas.org/content/115/16/E3635.full.pdf},
}

@incollection{Lu2018,
  author    = {Lu, Kaiji and Mardziel, Piotr and Wu, Fangjing and Amancharla, Preetam and Datta, Anupam},
  title     = {Gender Bias in Neural Natural Language Processing},
  booktitle = {Logic, Language, and Security: Essays Dedicated to Andre Scedrov on the Occasion of His 65th Birthday},
  editor    = {Nigam, Vivek and Ban Kirigin, Tajana and Talcott, Carolyn and Guttman, Joshua and Kuznetsov, Stepan and Thau Loo, Boon and Okada, Mitsuhiro},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2020},
  pages     = {189--202},
  isbn      = {978-3-030-62077-6},
  doi       = {10.1007/978-3-030-62077-6_14},
  url       = {https://doi.org/10.1007/978-3-030-62077-6_14},
}

@misc{Speer2017,
  author  = {Speer, Robyn},
  title   = {How to make a racist {AI} without really trying},
  journal = {ConceptNet blog},
  month   = jul,
  year    = {2017},
  url     = {http://blog.conceptnet.io/posts/2017/how-to-make-a-racist-ai-without-really-trying/},
}

@inproceedings{Gonen2019,
  author    = {Gonen, Hila and Goldberg, Yoav},
  title     = {Lipstick on a Pig: {D}ebiasing Methods Cover up Systematic Gender Biases in Word Embeddings But do not Remove Them},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Minneapolis, Minnesota},
  month     = jun,
  year      = {2019},
  pages     = {609--614},
  doi       = {10.18653/v1/N19-1061},
  url       = {https://www.aclweb.org/anthology/N19-1061},
}

@inproceedings{Ethayarajh2019,
  author    = {Ethayarajh, Kawin and Duvenaud, David and Hirst, Graeme},
  title     = {Understanding Undesirable Word Embedding Associations},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Florence, Italy},
  month     = jul,
  year      = {2019},
  pages     = {1696--1705},
  doi       = {10.18653/v1/P19-1166},
  url       = {https://www.aclweb.org/anthology/P19-1166},
}

@book{Manning:2008:IIR:1394399,
  author    = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},
  title     = {Introduction to Information Retrieval},
  publisher = {Cambridge University Press},
  address   = {New York, NY},
  year      = {2008},
  isbn      = {0521865719, 9780521865715},
}


@misc{Moody2017,
  author    = {Moody, Chris},
  title     = {Stop Using {word2vec}},
  journal   = {Multithreaded},
  publisher = {StitchFix},
  month     = oct,
  year      = {2017},
  url       = {https://multithreaded.stitchfix.com/blog/2017/10/18/stop-using-word2vec/},
}

@book{Boehmke2019,
  author    = {Boehmke, Brad and Greenwell, Brandon M.},
  title     = {Hands-on Machine Learning with {R}},
  publisher = {CRC Press},
  address   = {Boca Raton},
  year      = {2019},
  isbn      = {9781138495685},
}

@book{Wickham2017,
  title     = {R for Data Science: Import, Tidy, Transform, Visualize, and Model Data},
  author    = {Wickham, Hadley and Grolemund, Garrett},
  publisher = {O'Reilly Media, Inc.},
  address   = {Sebastopol},
  year      = {2017},
  isbn      = {1491910399},
}

@inproceedings{Levy2014,
  author    = {Levy, Omer and Goldberg, Yoav},
  title     = {Dependency-Based Word Embeddings},
  booktitle = {Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Baltimore, Maryland},
  month     = jun,
  year      = {2014},
  pages     = {302--308},
  doi       = {10.3115/v1/P14-2050},
  url       = {https://www.aclweb.org/anthology/P14-2050},
}

@inproceedings{Sheng2019,
  author    = {Sheng, Emily and Chang, Kai-Wei and Natarajan, Premkumar and Peng, Nanyun},
  title     = {The Woman Worked as a Babysitter: On Biases in Language Generation},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  publisher = {Association for Computational Linguistics},
  address   = {Hong Kong},
  month     = nov,
  year      = {2019},
  pages     = {3407--3412},
  doi       = {10.18653/v1/D19-1339},
  url       = {https://www.aclweb.org/anthology/D19-1339},
}

@manual{Vaughan2020,
  author = {Vaughan, Davis},
  title  = {{slider}: Sliding Window Functions},
  note   = {R package version 0.2.1},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=slider},
}

@manual{Vaughan2018,
  author = {Vaughan, Davis and Dancho, Matt},
  title  = {{furrr}: Apply Mapping Functions in Parallel using Futures},
  note   = {R package version 0.2.2},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=furrr},
}

@article{Wagner2016,
  author    = {Wagner, Claudia and Graells-Garrido, Eduardo and Garcia, David and Menczer, Filippo},
  title     = {Women through the glass ceiling: gender asymmetries in {W}ikipedia},
  journal   = {EPJ Data Science},
  publisher = {SpringerOpen},
  volume    = {5},
  number    = {1},
  pages     = {5},
  year      = {2016},
  doi       = {10.1140/epjds/s13688-016-0066-4},
  url       = {https://doi.org/10.1140/epjds/s13688-016-0066-4},
}

@inproceedings{Pennington2014,
  author    = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
  title     = {{G}lo{V}e: Global Vectors for Word Representation},
  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  publisher = {Association for Computational Linguistics},
  address   = {Doha, Qatar},
  month     = oct,
  year      = {2014},
  pages     = {1532--1543},
  doi       = {10.3115/v1/D14-1162},
  url       = {https://www.aclweb.org/anthology/D14-1162},
}

@manual{Hvitfeldt2020,
  author = {Hvitfeldt, Emil},
  title  = {{textdata}: Download and Load Various Text Datasets},
  note   = {R package version 0.4.1},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=textdata},
}

@manual{Selivanov2018,
  author = {Selivanov, Dmitriy and Bickel, Manuel and Wang, Qing},
  title  = {{text2vec}: Modern Text Mining Framework for {R}},
  note   = {R package version 0.6},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=text2vec},
}

@misc{Mikolov2013,
  author = {Mikolov, Tomas and Chen, Kai and Corrado, Greg S. and Dean, Jeffrey},
  title  = {Efficient Estimation of Word Representations in Vector Space},
  year   = {2013},
  url    = {http://arxiv.org/abs/1301.3781},
}

@article{Bojanowski2016,
  author  = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  title   = {Enriching Word Vectors with Subword Information},
  journal = {Transactions of the Association for Computational Linguistics},
  volume  = {5},
  pages   = {135--146},
  year    = {2017},
  doi     = {10.1162/tacl_a_00051},
  url     = {https://www.aclweb.org/anthology/Q17-1010},
}

 @inproceedings{Le2014,
   author    = {Le, Quoc and Mikolov, Tomas},
   title     = {Distributed Representations of Sentences and Documents},
   booktitle = {Proceedings of the 31st International Conference on Machine Learning},
   editor    = {Xing, Eric P. and Jebara, Tony},
   series    = {Proceedings of Machine Learning Research},
   volume    = {32},
   number    = {2},
   pages     = {1188--1196},
   publisher = {PMLR},
   address   = {Beijing, China},
   month     = jun,
   year      = {2014},
   pdf       = {http://proceedings.mlr.press/v32/le14.pdf},
   url       = {http://proceedings.mlr.press/v32/le14.html},
   abstract  = {Many machine learning algorithms require the input to be represented as a fixed length feature vector. When it comes to texts, one of the most common representations is bag-of-words. Despite their popularity, bag-of-words models have two major weaknesses: they lose the ordering of the words and they also ignore semantics of the words. For example, "powerful," "strong" and "Paris" are equally distant. In this paper, we propose an unsupervised algorithm that learns vector representations of sentences and text documents. This algorithm represents each document by a dense vector which is trained to predict words in the document. Its construction gives our algorithm the potential to overcome the weaknesses of bag-of-words models. Empirical results show that our technique outperforms bag-of-words models as well as other techniques for text representations. Finally, we achieve new state-of-the-art results on several text classification and sentiment analysis tasks.},
 }

@inproceedings{Howard2018,
  author    = {Howard, Jeremy and Ruder, Sebastian},
  title     = {Universal Language Model Fine-tuning for Text Classification},
  booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Melbourne, Australia},
  month     = jul,
  year      = {2018},
  pages     = {328--339},
  doi       = {10.18653/v1/P18-1031},
  url       = {https://www.aclweb.org/anthology/P18-1031},
}

@inproceedings{Peters2018,
  author    = {Peters, Matthew and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
  title     = {Deep Contextualized Word Representations},
  booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, Louisiana},
  month     = jun,
  year      = {2018},
  pages     = {2227--2237},
  doi       = {10.18653/v1/N18-1202},
  url       = {https://www.aclweb.org/anthology/N18-1202},
}

@inproceedings{carlini2018secret,
  author    = {Carlini, Nicholas and Liu, Chang and Erlingsson, {\'U}lfar and Kos, Jernej and Song, Dawn},
  title     = {The Secret Sharer: Evaluating and Testing Unintended Memorization in Neural Networks},
  booktitle = {Proceedings of the 28th USENIX Conference on Security Symposium},
  series    = {SEC'19},
  publisher = {USENIX Association},
  address   = {USA},
  location  = {Santa Clara, CA},
  year      = {2019},
  pages     = {267--284},
  numpages  = {18},
  isbn      = {9781939133069},
  abstract  = {This paper describes a testing methodology for quantitatively assessing the risk that rare or unique training-data sequences are unintentionally memorized by generative sequence models--a common type of machine-learning model. Because such models are sometimes trained on sensitive data (e.g., the text of users' private messages), this methodology can benefit privacy by allowing deep-learning practitioners to select means of training that minimize such memorization. In experiments, we show that unintended memorization is a persistent, hard-to-avoid issue that can have serious consequences. Specifically, for models trained without consideration of memorization, we describe new, efficient procedures that can extract unique, secret sequences, such as credit card numbers. We show that our testing strategy is a practical and easy-to-use first line of defense, e.g., by describing its application to quantitatively limit data exposure in Google's Smart Compose, a commercial text-completion neural network trained on millions of users' email messages.},
}

@inproceedings{Fredrikson2014,
  author    = {Fredrikson, Matthew and Lantz, Eric and Jha, Somesh and Lin, Simon and Page, David and Ristenpart, Thomas},
  title     = {Privacy in Pharmacogenetics: An End-to-End Case Study of Personalized Warfarin Dosing},
  booktitle = {Proceedings of the 23rd USENIX Conference on Security Symposium},
  series    = {SEC'14},
  publisher = {USENIX Association},
  address   = {USA},
  location  = {San Diego, CA},
  year      = {2014},
  pages     = {17--32},
  numpages  = {16},
  isbn      = {9781931971157},
}

@inproceedings{Fredrikson2015,
  author    = {Fredrikson, Matt and Jha, Somesh and Ristenpart, Thomas},
  title     = {Model Inversion Attacks That Exploit Confidence Information and Basic Countermeasures},
  booktitle = {Proceedings of the 22nd {ACM} {SIGSAC} Conference on Computer and Communications Security},
  series    = {CCS '15},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  location  = {Denver, Colorado},
  year      = {2015},
  pages     = {1322--1333},
  numpages  = {12},
  isbn      = {9781450338325},
  doi       = {10.1145/2810103.2813677},
  url       = {https://doi.org/10.1145/2810103.2813677},
  keywords  = {machine learning, attacks, privacy},
}

@software{spacy2,
  author    = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
  title     = {{spaCy}: Industrial-strength Natural Language Processing in {Python}},
  publisher = {Zenodo},
  year      = {2020},
  doi       = {10.5281/zenodo.1212303},
  url       = {https://doi.org/10.5281/zenodo.1212303},
}

@manual{textrecipes,
  author = {Hvitfeldt, Emil},
  title  = {{textrecipes}: Extra `Recipes' for Text Processing},
  note   = {R package version 0.4.1},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=textrecipes},
}

@book{konig2002germanic,
  author    = {K{\"o}nig, E. and van der Auwera, J.},
  title     = {The Germanic Languages},
  series    = {Routledge language family descriptions},
  publisher = {Routledge},
  year      = {2002},
  isbn      = {9780415280792},
  lccn      = {lc92037152},
  url       = {https://books.google.com.do/books?id=whyUQgAACAAJ},
}

@inproceedings{Sugisaki2018,
  author    = {Sugisaki, Kyoko and Tuggener, Don},
  title     = {German compound splitting using the compound productivity of morphemes},
  booktitle = {Proceedings of the 14th Conference on Natural Language Processing ({KONVENS} 2018)},
  publisher = {Verlag der {\"O}sterreichischen Akademie der Wissenschaften},
  year      = {2018},
}

@inproceedings{ma-etal-2018-state,
  author    = {Ma, Ji and Ganchev, Kuzman and Weiss, David},
  title     = {State-of-the-art {C}hinese Word Segmentation with Bi-{LSTM}s},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct # "-" # nov,
  year      = {2018},
  pages     = {4902--4908},
  doi       = {10.18653/v1/D18-1529},
  url       = {https://www.aclweb.org/anthology/D18-1529},
  abstract  = {A wide variety of neural-network architectures have been proposed for the task of Chinese word segmentation. Surprisingly, we find that a bidirectional LSTM model, when combined with standard deep learning techniques and best practices, can achieve better accuracy on many of the popular datasets as compared to models based on more complex neuralnetwork architectures. Furthermore, our error analysis shows that out-of-vocabulary words remain challenging for neural-network models, and many of the remaining errors are unlikely to be fixed through architecture changes. Instead, more effort should be made on exploring resources for further improvement.},
}

@inproceedings{Huang2019,
  author    = {Huang, Weipeng and Cheng, Xingyi and Chen, Kunlong and Wang, Taifeng and Chu, Wei},
  title     = {Towards Fast and Accurate Neural {C}hinese Word Segmentation with Multi-Criteria Learning},
  booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
  publisher = {International Committee on Computational Linguistics},
  address   = {Barcelona, Spain (Online)},
  month     = dec,
  year      = {2020},
  pages     = {2062--2072},
  doi       = {10.18653/v1/2020.coling-main.186},
  url       = {https://www.aclweb.org/anthology/2020.coling-main.186},
}

@inproceedings{Caruana2008,
  author    = {Caruana, Rich and Karampatziakis, Nikos and Yessenalina, Ainur},
  title     = {An Empirical Evaluation of Supervised Learning in High Dimensions},
  booktitle = {Proceedings of the 25th International Conference on Machine Learning},
  series    = {ICML '08},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  location  = {Helsinki, Finland},
  year      = {2008},
  pages     = {96--103},
  numpages  = {8},
  isbn      = {9781605582054},
  doi       = {10.1145/1390156.1390169},
  url       = {https://doi.org/10.1145/1390156.1390169},
}

@inproceedings{Olson2017,
  author       = {Olson, Randal S and La Cava, William and Mustahsan, Zairah and Varik, Akshay and Moore, Jason H},
  title        = {Data-driven advice for applying machine learning to bioinformatics problems},
  booktitle    = {Pacific Symposium on Biocomputing 2018: Proceedings of the Pacific Symposium},
  organization = {World Scientific},
  year         = {2018},
  pages        = {192--203},
  doi          = {10.1142/9789813235533_0018},
  url          = {https://doi.org/10.1142/9789813235533_0018},
}


@inproceedings{Weinberger2009,
  author    = {Weinberger, Kilian and Dasgupta, Anirban and Langford, John and Smola, Alex and Attenberg, Josh},
  title     = {Feature Hashing for Large Scale Multitask Learning},
  booktitle = {Proceedings of the 26th Annual International Conference on Machine Learning},
  series    = {ICML '09},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  location  = {Montreal, Quebec, Canada},
  year      = {2009},
  pages     = {1113--1120},
  numpages  = {8},
  isbn      = {9781605585161},
  doi       = {10.1145/1553374.1553516},
  url       = {https://doi.org/10.1145/1553374.1553516},
}

@misc{appleby2008,
  title  = {MurmurHash},
  author = {Appleby, Austin},
  year   = {2008},
  url    = {https://sites.google.com/site/murmurhash},
}

@inproceedings{NIPS2018_7784,
  author    = {Freksen, Casper and Kamma, Lior and Larsen, Kasper Green},
  title     = {Fully Understanding the Hashing Trick},
  booktitle = {Proceedings of the 32nd International Conference on Neural Information Processing Systems},
  series    = {NIPS'18},
  publisher = {Curran Associates Inc.},
  address   = {Red Hook, NY},
  location  = {Montr{\'e}al, Canada},
  year      = {2018},
  pages     = {5394--5404},
  numpages  = {11},
  doi       = {10.5555/3327345.3327444},
  url       = {https://doi.org/10.5555/3327345.3327444},
}

@inproceedings{Forman2008,
  author    = {Forman, George and Kirshenbaum, Evan},
  title     = {Extremely Fast Text Feature Extraction for Classification and Indexing},
  booktitle = {Proceedings of the 17th ACM Conference on Information and Knowledge Management},
  series    = {CIKM '08},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  location  = {Napa Valley, California},
  year      = {2008},
  pages     = {1221--1230},
  numpages  = {10},
  isbn      = {9781595939913},
  doi       = {10.1145/1458082.1458243},
  url       = {https://doi.org/10.1145/1458082.1458243},
  keywords  = {text mining, feature engineering, bag-of-words, feature extraction, document categorization, text tokenization, text indexing},
}
@article{Vantu2016,
  author  = {Van-Tu, Nguyen and Anh-Cuong, Le},
  title   = {Improving Question Classification by Feature Extraction and Selection},
  journal = {Indian Journal of Science and Technology},
  volume  = {9},
  number  = {17},
  pages   = {1--8},
  month   = may,
  year    = {2016},
  doi     = {10.17485/ijst/2016/v9i17/93160},
  url     = {https://doi.org/10.17485/ijst/2016/v9i17/93160}
}

@inproceedings{Joachims1998,
  author    = {Joachims, Thorsten},
  title     = {Text Categorization with Support Vector Machines: Learning with Many Relevant Features},
  booktitle = {Proceedings of the 10th European Conference on Machine Learning},
  series    = {ECML'98},
  location  = {Chemnitz, Germany},
  publisher = {Springer-Verlag},
  address   = {Berlin, Heidelberg},
  year      = {1998},
  pages     = {137--142},
  numpages  = {6},
  isbn      = {3540644172},
  doi       = {10.1007/BFb0026683},
  url       = {https://doi.org/10.1007/BFb0026683}
}

@article{Harman91,
  author  = {Harman, Donna},
  title   = {How effective is suffixing?},
  journal = {Journal of the American Society for Information Science},
  volume  = {42},
  number  = {1},
  pages   = {7--15},
  year    = {1991},
  doi     = {10.1002/(SICI)1097-4571(199101)42:1<7::AID-ASI2>3.0.CO;2-P},
  url     = {https://doi.org/10.1002/(SICI)1097-4571(199101)42:1<7::AID-ASI2>3.0.CO;2-P},
  eprint  = {https://asistdl.onlinelibrary.wiley.com/doi/pdf/10.1002/%28SICI%291097-4571%28199101%2942%3A1%3C7%3A%3AAID-ASI2%3E3.0.CO%3B2-P}
}

@book{Chollet2018,
  author    = {Chollet, Fran{\c{c}}ois and Allaire, J. J.},
  title     = {Deep Learning with {R}},
  publisher = {Manning Publications},
  address   = {Shelter Island, NY},
  year      = {2018},
  isbn      = {9781617295546},
  lccn      = {2018285360},
  url       = {https://www.manning.com/books/deep-learning-with-r}
}

@inproceedings{Boser1992,
  author    = {Boser, Bernhard E. and Guyon, Isabelle M. and Vapnik, Vladimir N.},
  title     = {A Training Algorithm for Optimal Margin Classifiers},
  booktitle = {Proceedings of the Fifth Annual Workshop on Computational Learning Theory},
  series    = {COLT '92},
  location  = {Pittsburgh, Pennsylvania},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  year      = {1992},
  pages     = {144--152},
  numpages  = {9},
  isbn      = {089791497X},
  doi       = {10.1145/130385.130401},
  url       = {https://doi.org/10.1145/130385.130401}
}

@article{Friedman2010,
  author  = {Jerome H. Friedman and Trevor Hastie and Rob Tibshirani},
  title   = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
  journal = {Journal of Statistical Software},
  volume  = {33},
  number  = {1},
  pages   = {1--22},
  year    = {2010},
  issn    = {1548-7660},
  doi     = {10.18637/jss.v033.i01},
  url     = {https://www.jstatsoft.org/v033/i01}
}

@article{Lex2014,
  author  = {Lex, Alexander and Gehlenborg, Nils and Strobelt, Hendrik and Vuillemot, Romain and Pfister, Hanspeter},
  title   = {UpSet: Visualization of Intersecting Sets},
  journal = {{IEEE} Transactions on Visualization and Computer Graphics},
  volume  = {20},
  number  = {12},
  pages   = {1983--1992},
  year    = {2014},
  doi     = {10.1109/TVCG.2014.2346248},
  url     = {https://doi.org/10.1109/TVCG.2014.2346248}
}


@book{James2013,
  author    = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert},
  title     = {An Introduction to Statistical Learning: With Applications in {R}},
  publisher = {Springer},
  address   = {New York},
  year      = {2013}
}

@manual{Perry2020,
  author = {Patrick O. Perry},
  title  = {{corpus}: Text Corpus Analysis},
  note   = {R package version 0.10.2},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=corpus}
}


@article{kim2006,
  author  = {Kim, S. and Han, K. and Rim, H. and Myaeng, S. H.},
  title   = {Some Effective Techniques for Naive Bayes Text Classification},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  volume  = {18},
  number  = {11},
  pages   = {1457--1466},
  year    = {2006},
  doi     = {10.1109/TKDE.2006.180},
  url     = {https://doi.org/10.1109/TKDE.2006.180}
}

@inproceedings{Kibriya2005,
  author    = {Kibriya, Ashraf M. and Frank, Eibe and Pfahringer, Bernhard and Holmes, Geoffrey},
  editor    = {Webb, Geoffrey I. and Yu, Xinghuo},
  title     = {Multinomial Naive Bayes for Text Categorization Revisited},
  booktitle = {AI 2004: Advances in Artificial Intelligence},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2005},
  pages     = {488--499},
  isbn      = {978-3-540-30549-1},
  doi       = {10.1007/978-3-540-30549-1_43},
  url       = {https://doi.org/10.1007/978-3-540-30549-1_43},
}

@inproceedings{Eibe2006,
  author    = {Frank, Eibe and Bouckaert, Remco R.},
  editor    = {F{\"u}rnkranz, Johannes and Scheffer, Tobias and Spiliopoulou, Myra},
  title     = {Naive Bayes for Text Classification with Unbalanced Classes},
  booktitle = {Knowledge Discovery in Databases: PKDD 2006},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2006},
  pages     = {503--510},
  isbn      = {978-3-540-46048-0},
  doi       = {10.1007/11871637_49},
  url       = {https://doi.org/10.1007/11871637_49},
}

@manual{R-googleLanguageR,
  author = {Mark Edmondson},
  title  = {{googleLanguageR}: Call {Google's} `{Natural Language}' {API}, `{Cloud Translation}' {API}, `{Cloud Speech}' {API} and `{Cloud Text-to-Speech}' {API}},
  year   = {2020},
  note   = {R package version 0.3.0},
  url    = {https://CRAN.R-project.org/package=googleLanguageR},
}

@inproceedings{Tang2018,
  author    = {Tang, Cheng and Garreau, Damien and von Luxburg, Ulrike},
  title     = {When Do Random Forests Fail?},
  booktitle = {Proceedings of the 32nd International Conference on Neural Information Processing Systems},
  series    = {NIPS'18},
  location  = {Montr{\'e}al, Canada},
  publisher = {Curran Associates Inc.},
  address   = {Red Hook, NY},
  year      = {2018},
  pages     = {2987--2997},
  numpages  = {11}
}
  
@manual{R-Matrix,
  author = {Douglas Bates and Martin Maechler},
  title  = {Matrix: Sparse and Dense Matrix Classes and Methods},
  note   = {R package version 1.3-2},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=Matrix}
}

@techreport{Sweeney2000,
  author      = {Sweeney, Latanya},
  title       = {Simple demographics often identify people uniquely},
  type        = {Data Privacy Working Paper},
  number      = {3},
  institution = {Carnegie Mellon University},
  year        = {2000},
  url         = {https://dataprivacylab.org/projects/identifiability/}
}

% Key says 2020 while the year field is 2021; the key is kept so existing citations still resolve.
@article{Minaee2020,
  author     = {Minaee, Shervin and Kalchbrenner, Nal and Cambria, Erik and Nikzad, Narjes and Chenaghlu, Meysam and Gao, Jianfeng},
  title      = {Deep Learning--Based Text Classification: A Comprehensive Review},
  journal    = {ACM Computing Surveys},
  volume     = {54},
  number     = {3},
  articleno  = {62},
  numpages   = {40},
  month      = apr,
  year       = {2021},
  issue_date = {April 2021},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY},
  issn       = {0360-0300},
  doi        = {10.1145/3439726},
  url        = {https://doi.org/10.1145/3439726},
  keywords   = {Text classification, sentiment analysis, deep learning, natural language inference, news categorization, question answering, topic classification}
}

@inproceedings{Zhang2015,
  author    = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
  title     = {Character-Level Convolutional Networks for Text Classification},
  booktitle = {Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 1},
  series    = {NIPS'15},
  location  = {Montreal, Canada},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2015},
  pages     = {649--657},
  numpages  = {9},
  abstract  = {This article offers an empirical exploration on the use of character-level convolutional networks (ConvNets) for text classification. We constructed several large-scale datasets to show that character-level convolutional networks could achieve state-of-the-art or competitive results. Comparisons are offered against traditional models such as bag of words, n-grams and their TFIDF variants, and deep learning models such as word-based ConvNets and recurrent neural networks.}
}

@inproceedings{Vosoughi2016,
  author    = {Vosoughi, Soroush and Vijayaraghavan, Prashanth and Roy, Deb},
  title     = {{Tweet2Vec}: Learning Tweet Embeddings Using Character-Level {CNN-LSTM} Encoder-Decoder},
  booktitle = {Proceedings of the 39th International ACM SIGIR Conference on Research and Development in Information Retrieval},
  series    = {SIGIR '16},
  location  = {Pisa, Italy},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  year      = {2016},
  pages     = {1041--1044},
  numpages  = {4},
  isbn      = {9781450340694},
  doi       = {10.1145/2911451.2914762},
  url       = {https://doi.org/10.1145/2911451.2914762},
  keywords  = {encoder-decoder, tweet, embedding, twitter, lstm, cnn, tweet2vec, convolutional neural networks}
}

@misc{shwartzziv2017opening,
  author        = {Ravid Shwartz-Ziv and Naftali Tishby},
  title         = {Opening the Black Box of Deep Neural Networks via Information},
  year          = {2017},
  archivePrefix = {arXiv},
  eprint        = {1703.00810},
  primaryClass  = {cs.LG},
}


@article{guidotti2018survey,
  title      = {A Survey of Methods for Explaining Black Box Models},
  author     = {Guidotti, Riccardo and Monreale, Anna and Ruggieri, Salvatore and Turini, Franco and Giannotti, Fosca and Pedreschi, Dino},
  journal    = {ACM Computing Surveys},
  volume     = {51},
  number     = {5},
  articleno  = {93},
  numpages   = {42},
  month      = aug,
  year       = {2018},
  issue_date = {January 2019},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY},
  issn       = {0360-0300},
  doi        = {10.1145/3236009},
  url        = {https://doi.org/10.1145/3236009},
  keywords   = {Open the black box, interpretability, transparent models, explanations},
}

@misc{lampinen2018oneshot,
  author        = {Andrew K. Lampinen and James L. McClelland},
  title         = {One-shot and few-shot learning of word embeddings},
  year          = {2018},
  archivePrefix = {arXiv},
  eprint        = {1710.10280},
  primaryClass  = {cs.CL},
}

@incollection{bengio2012practical,
  author    = {Bengio, Yoshua},
  editor    = {Montavon, Gr{\'e}goire and Orr, Genevi{\`e}ve B. and M{\"u}ller, Klaus-Robert},
  title     = {Practical Recommendations for Gradient-Based Training of Deep Architectures},
  booktitle = {Neural Networks: Tricks of the Trade: Second Edition},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2012},
  pages     = {437--478},
  isbn      = {978-3-642-35289-8},
  doi       = {10.1007/978-3-642-35289-8_26},
  url       = {https://doi.org/10.1007/978-3-642-35289-8_26},
  abstract  = {Learning algorithms related to artificial neural networks and in particular for Deep Learning may seem to involve many bells and whistles, called hyper-parameters. This chapter is meant as a practical guide with recommendations for some of the most commonly used hyperparameters, in particular in the context of learning algorithms based on back-propagated gradient and gradient-based optimization. It also discusses how to deal with the fact that more interesting results can be obtained when allowing one to adjust many hyper-parameters. Overall, it describes elements of the practice used to successfully and efficiently train and debug large-scale and often deep multi-layer neural networks. It closes with open questions about the training difficulties observed with deeper architectures.}
}

@article{Gage1994ANA,
  author  = {Gage, Philip},
  title   = {A new algorithm for data compression},
  journal = {The C Users Journal},
  volume  = {12},
  pages   = {23--38},
  year    = {1994}
}

@manual{R-reticulate,
  author = {Kevin Ushey and JJ Allaire and Yuan Tang},
  title  = {{reticulate}: Interface to `{Python}'},
  year   = {2021},
  note   = {R package version 1.20},
  url    = {https://CRAN.R-project.org/package=reticulate},
}

@inproceedings{ribeiro2016why,
  author    = {Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos},
  title     = {`{Why} Should {I} Trust You?': Explaining the Predictions of Any Classifier},
  booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  series    = {KDD '16},
  location  = {San Francisco, California},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  year      = {2016},
  pages     = {1135--1144},
  numpages  = {10},
  isbn      = {9781450342322},
  doi       = {10.1145/2939672.2939778},
  url       = {https://doi.org/10.1145/2939672.2939778},
  keywords  = {explaining machine learning, interpretability, black box classifier, interpretable machine learning}
}

@article{Ramineni2018,
  author   = {Ramineni, Chaitanya and Williamson, David},
  title    = {Understanding Mean Score Differences Between the e-rater® Automated Scoring Engine and Humans for Demographically Based Groups in the {GRE}® General Test},
  journal  = {ETS Research Report Series},
  volume   = {2018},
  number   = {1},
  pages    = {1--31},
  year     = {2018},
  doi      = {10.1002/ets2.12192},
  url      = {https://onlinelibrary.wiley.com/doi/abs/10.1002/ets2.12192},
  keywords = {Automated scoring, essay scoring, GRE® writing, subgroup differences, shell text, CART}
}

 @misc{Feathers2019,
   author    = {Feathers, Todd},
   title     = {Flawed Algorithms Are Grading Millions of Students' Essays},
   journal   = {Motherboard},
   publisher = {VICE},
   month     = aug,
   year      = {2019},
   url       = {https://www.vice.com/en/article/pa7dj9/flawed-algorithms-are-grading-millions-of-students-essays}
 }

@article{Mohammad13,
  author  = {Mohammad, Saif M. and Turney, Peter D.},
  title   = {Crowdsourcing a Word--Emotion Association Lexicon},
  journal = {Computational Intelligence},
  volume  = {29},
  number  = {3},
  pages   = {436--465},
  year    = {2013},
  doi     = {10.1111/j.1467-8640.2012.00460.x},
  url     = {https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x},
  eprint  = {https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x}
}

@manual{R-wordsalad,
  author = {Emil Hvitfeldt},
  title  = {{wordsalad}: Provide Tools to Extract and Analyze Word Vectors},
  note   = {R package version 0.2.0},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=wordsalad}
}
  
@manual{R-pdftools,
  author = {Jeroen Ooms},
  title  = {{pdftools}: Text Extraction, Rendering and Converting of {PDF} Documents},
  year   = {2020},
  note   = {R package version 2.3.1},
  url    = {https://CRAN.R-project.org/package=pdftools},
}

@manual{R-httr,
  author = {Hadley Wickham},
  title  = {{httr}: Tools for Working with {URLs} and {HTTP}},
  year   = {2020},
  note   = {R package version 1.4.2},
  url    = {https://CRAN.R-project.org/package=httr},
}

@article{Johnson1999,
  author    = {Johnson, Stephen B.},
  title     = {A semantic lexicon for medical language processing},
  journal   = {Journal of the American Medical Informatics Association},
  volume    = {6},
  number    = {3},
  pages     = {205--218},
  year      = {1999},
  publisher = {BMJ Group},
  address   = {London},
  doi       = {10.1136/jamia.1999.0060205},
  url       = {https://doi.org/10.1136/jamia.1999.0060205}
}

@inproceedings{Bender2021,
  author    = {Bender, Emily M. and Gebru, Timnit and McMillan-Major, Angelina and Shmitchell, Shmargaret},
  title     = {On the Dangers of Stochastic Parrots: Can Language Models Be Too Big? 🦜},
  booktitle = {Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency},
  series    = {FAccT '21},
  location  = {Virtual Event, Canada},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY},
  year      = {2021},
  pages     = {610--623},
  numpages  = {14},
  isbn      = {9781450383097},
  doi       = {10.1145/3442188.3445922},
  url       = {https://doi.org/10.1145/3442188.3445922}
}

@article{Golub1970,
  author     = {Golub, G. H. and Reinsch, C.},
  title      = {Singular Value Decomposition and Least Squares Solutions},
  journal    = {Numerische Mathematik},
  volume     = {14},
  number     = {5},
  pages      = {403--420},
  numpages   = {18},
  month      = apr,
  year       = {1970},
  issue_date = {April 1970},
  publisher  = {Springer-Verlag},
  address    = {Berlin, Heidelberg},
  issn       = {0029-599X},
  doi        = {10.1007/BF02163027},
  url        = {https://doi.org/10.1007/BF02163027}
}

@article{ELMAN1990179,
  author   = {Jeffrey L. Elman},
  title    = {Finding structure in time},
  journal  = {Cognitive Science},
  volume   = {14},
  number   = {2},
  pages    = {179--211},
  year     = {1990},
  issn     = {0364-0213},
  doi      = {10.1207/s15516709cog1402_1},
  url      = {https://doi.org/10.1207/s15516709cog1402_1},
  abstract = {Time underlies many interesting human behaviors. Thus, the question of how to represent time in connectionist models is very important. One approach is to represent time implicitly by its effects on processing rather than explicitly (as in a spatial representation). The current report develops a proposal along these lines first described by Jordan (1986) which involves the use of recurrent links in order to provide networks with a dynamic memory. In this approach, hidden unit patterns are fed back to themselves; the internal representations which develop thus reflect task demands in the context of prior internal states. A set of simulations is reported which range from relatively simple problems (temporal version of XOR) to discovering syntactic/semantic features for words. The networks are able to learn interesting internal representations which incorporate task demands with memory demands; indeed, in this approach the notion of memory is inextricably bound up with task processing. These representations reveal a rich structure, which allows them to be highly context-dependent, while also expressing generalizations across classes of items. These representations suggest a method for representing lexical categories and the type/token distinction.}
}

@article{Hochreiter1997,
  author     = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  title      = {Long Short-Term Memory},
  journal    = {Neural Computation},
  volume     = {9},
  number     = {8},
  pages      = {1735--1780},
  numpages   = {46},
  month      = nov,
  year       = {1997},
  issue_date = {November 15, 1997},
  publisher  = {MIT Press},
  address    = {Cambridge, MA},
  issn       = {0899-7667},
  doi        = {10.1162/neco.1997.9.8.1735},
  url        = {https://doi.org/10.1162/neco.1997.9.8.1735},
  abstract   = {Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient based method called long short-term memory (LSTM). Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O(1). Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms.}
}

@inproceedings{kim2014,
  author    = {Kim, Yoon},
  title     = {Convolutional Neural Networks for Sentence Classification},
  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  publisher = {Association for Computational Linguistics},
  address   = {Doha, Qatar},
  month     = oct,
  year      = {2014},
  pages     = {1746--1751},
  doi       = {10.3115/v1/D14-1181},
  url       = {https://www.aclweb.org/anthology/D14-1181}
}

@article{Srivastava2014,
  author  = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov},
  title   = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
  journal = {Journal of Machine Learning Research},
  volume  = {15},
  number  = {56},
  pages   = {1929--1958},
  year    = {2014},
  url     = {http://jmlr.org/papers/v15/srivastava14a.html}
}

% Key says 2014 while the year field is 2004; the key is kept so existing citations still resolve.
@article{Lewis2014,
  title   = {{RCV1}: A New Benchmark Collection for Text Categorization Research},
  author  = {Lewis, David D. and Yang, Yiming and Rose, Tony G. and Li, Fan},
  journal = {Journal of Machine Learning Research},
  year    = {2004},
  volume  = {5},
  pages   = {361--397},
  issn    = {1532-4435},
  url     = {https://www.jmlr.org/papers/volume5/lewis04a/lewis04a.pdf}
}

% Key says 2019 while the year field is 2017 (series ICML'17); the key is kept so existing citations still resolve.
@inproceedings{shrikumar2019learning,
  author    = {Shrikumar, Avanti and Greenside, Peyton and Kundaje, Anshul},
  title     = {Learning Important Features through Propagating Activation Differences},
  booktitle = {Proceedings of the 34th International Conference on Machine Learning - Volume 70},
  series    = {ICML'17},
  location  = {Sydney, NSW, Australia},
  publisher = {JMLR.org},
  year      = {2017},
  pages     = {3145--3153},
  numpages  = {9}
}

@manual{R-LiblineaR,
  author = {Thibault Helleputte},
  title  = {{LiblineaR}: Linear Predictive Models Based on the {LIBLINEAR C/C++} Library},
  year   = {2021},
  note   = {R package version 2.10-12},
  url    = {https://CRAN.R-project.org/package=LiblineaR},
}

@article{Tibshirani1996,
  author    = {Robert Tibshirani},
  title     = {Regression Shrinkage and Selection via the {Lasso}},
  journal   = {Journal of the Royal Statistical Society. Series B (Methodological)},
  volume    = {58},
  number    = {1},
  pages     = {267--288},
  year      = {1996},
  publisher = {Royal Statistical Society, Wiley},
  issn      = {0035-9246},
  url       = {http://www.jstor.org/stable/2346178}
}

@book{Breiman1984,
  author    = {Breiman, Leo and Friedman, Jerome and Olshen, Richard A. and Stone, Charles J.},
  title     = {Classification and Regression Trees},
  publisher = {CRC Press},
  address   = {Boca Raton},
  year      = {1984}
}

@manual{R-textfeatures,
  author = {Michael W. Kearney},
  title  = {{textfeatures}: Extracts Features from Text},
  note   = {R package version 0.3.3},
  year   = {2019},
  url    = {https://CRAN.R-project.org/package=textfeatures}
}

@manual{R-keras,
  author = {JJ Allaire and François Chollet},
  title  = {{keras}: {R} Interface to `{Keras}'},
  year   = {2021},
  note   = {R package version 2.4.0},
  url    = {https://CRAN.R-project.org/package=keras},
}

@misc{kingma2017adam,
  author        = {Diederik P. Kingma and Jimmy Ba},
  title         = {Adam: A Method for Stochastic Optimization},
  year          = {2017},
  archivePrefix = {arXiv},
  eprint        = {1412.6980},
  primaryClass  = {cs.LG},
}

@article{bender2019rule,
  author  = {Bender, Emily M.},
  title   = {The \#{BenderRule}: On Naming the Languages We Study and Why It Matters},
  journal = {The Gradient},
  year    = {2019},
  url     = {https://thegradient.pub/the-benderrule-on-naming-the-languages-we-study-and-why-it-matters/}
}

@manual{R-SnowballC,
  author = {Milan Bouchet-Valat},
  title  = {{SnowballC}: {Snowball} Stemmers Based on the {C} `{libstemmer}' {UTF-8} Library},
  year   = {2020},
  note   = {R package version 0.7.0},
  url    = {https://CRAN.R-project.org/package=SnowballC},
}

@manual{R-hunspell,
  author = {Jeroen Ooms},
  title  = {{hunspell}: High-Performance Stemmer, Tokenizer, and Spell Checker},
  note   = {R package version 3.0.1},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=hunspell}
}

@article{R-quanteda,
  author  = {Kenneth Benoit and Kohei Watanabe and Haiyan Wang and Paul Nulty and Adam Obeng and Stefan Müller and Akitaka Matsuo},
  title   = {{quanteda}: An {R} package for the quantitative analysis of textual data},
  journal = {Journal of Open Source Software},
  volume  = {3},
  number  = {30},
  pages   = {774},
  year    = {2018},
  doi     = {10.21105/joss.00774},
  url     = {https://doi.org/10.21105/joss.00774}
}

@manual{R-widyr,
  author = {David Robinson},
  title  = {{widyr}: Widen, Process, then Re-Tidy Data},
  note   = {R package version 0.1.3},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=widyr}
}

@misc{R-tidymodels,
  author    = {Max Kuhn and Hadley Wickham},
  title     = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles},
  publisher = {RStudio PBC},
  year      = {2021},
  url       = {https://www.tidymodels.org},
}

@article{Wickham2019,
  author    = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
  title     = {Welcome to the Tidyverse},
  journal   = {Journal of Open Source Software},
  publisher = {The Open Journal},
  volume    = {4},
  number    = {43},
  pages     = {1686},
  year      = {2019},
  doi       = {10.21105/joss.01686},
  url       = {https://doi.org/10.21105/joss.01686},
}

@manual{R-recipes,
  author = {Max Kuhn and Hadley Wickham},
  title  = {{recipes}: Preprocessing Tools to Create Design Matrices},
  note   = {R package version 0.1.16},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=recipes},
}

@manual{R-workflows,
  author = {Davis Vaughan},
  title  = {{workflows}: Modeling Workflows},
  note   = {R package version 0.2.2},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=workflows},
}

@manual{R-rsample,
  author = {Julia Silge and Fanny Chow and Max Kuhn and Hadley Wickham},
  title  = {{rsample}: General Resampling Infrastructure},
  note   = {R package version 0.1.0},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=rsample},
}

@manual{R-parsnip,
  author = {Max Kuhn and Davis Vaughan},
  title  = {{parsnip}: A Common {API} to Modeling and Analysis Functions},
  year   = {2021},
  note   = {R package version 0.1.6},
  url    = {https://CRAN.R-project.org/package=parsnip},
}

@manual{R-hardhat,
  author = {Davis Vaughan and Max Kuhn},
  title  = {{hardhat}: Construct Modeling Packages},
  note   = {R package version 0.1.5},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=hardhat}
}

@manual{R-themis,
  author = {Emil Hvitfeldt},
  title  = {{themis}: Extra Recipe Steps for Dealing with Unbalanced Data},
  note   = {R package version 0.1.4},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=themis}
}

@manual{R-dials,
  author = {Max Kuhn},
  title  = {{dials}: Tools for Creating Tuning Parameter Values},
  note   = {R package version 0.0.9},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=dials}
}

@manual{R-yardstick,
  author = {Max Kuhn and Davis Vaughan},
  title  = {{yardstick}: Tidy Characterizations of Model Performance},
  note   = {R package version 0.0.8},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=yardstick}
}

@manual{R-lime,
  author = {Thomas Lin Pedersen and Michaël Benesty},
  title  = {{lime}: Local Interpretable Model-Agnostic Explanations},
  note   = {R package version 0.5.2},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=lime}
}

@misc{Verge2020,
  author    = {Chin, Monica},
  title     = {These students figured out their tests were graded by AI},
  publisher = {The Verge},
  month     = sep,
  year      = {2020},
  url       = {https://www.theverge.com/2020/9/2/21419012/edgenuity-online-class-ai-grading-keyword-mashing-students-school-cheating-algorithm-glitch}
}

@manual{R-readr,
  author = {Hadley Wickham and Jim Hester},
  title  = {{readr}: Read Rectangular Text Data},
  note   = {R package version 1.4.0},
  year   = {2020},
  url    = {https://CRAN.R-project.org/package=readr}
}

@manual{R-jiebaR,
  author = {Qin Wenfeng and Wu Yanyi},
  title  = {{jiebaR}: {Chinese} Text Segmentation},
  year   = {2019},
  note   = {R package version 0.11},
  url    = {https://CRAN.R-project.org/package=jiebaR},
}