
Recognize "@" in gender-neutral word endings as part of the token #60

Merged · 4 commits · Jul 24, 2018
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,5 @@
## Version 2.2

Contributor:

It looks like this section still needs to be written?

Owner Author:

Ah yeah. I started thinking I should write it, then changed my mind because I didn't know what date to put on it, and decided I'd write it after the PR was merged. But then I left it like this. I could just write the section and put tomorrow's date on it.

## Version 2.1 (2018-06-18)

Data changes:
23 changes: 17 additions & 6 deletions README.md
@@ -48,13 +48,13 @@ frequency as a decimal between 0 and 1.
1.07e-05

>>> word_frequency('café', 'en')
- 5.89e-06
+ 5.75e-06

>>> word_frequency('cafe', 'fr')
1.51e-06

>>> word_frequency('café', 'fr')
- 5.25e-05
+ 5.13e-05


`zipf_frequency` is a variation on `word_frequency` that aims to return the
@@ -78,10 +78,10 @@ one occurrence per billion words.
5.29

>>> zipf_frequency('frequency', 'en')
- 4.42
+ 4.43

>>> zipf_frequency('zipf', 'en')
- 1.55
+ 1.57

>>> zipf_frequency('zipf', 'en', wordlist='small')
0.0
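For intuition about the scale: a Zipf value is just the base-10 log of the word's frequency per billion words, so zipf_frequency can be sketched in terms of word_frequency. A minimal sketch under that definition (not wordfreq's internal code):

    import math
    from wordfreq import word_frequency, zipf_frequency

    def zipf_from_freq(freq):
        # Zipf value: log10 of the frequency per billion words
        return round(math.log10(freq) + 9.0, 2) if freq > 0 else 0.0

    print(zipf_frequency('frequency', 'en'))                   # 4.43
    print(zipf_from_freq(word_frequency('frequency', 'en')))   # should match, up to rounding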
@@ -276,7 +276,8 @@ produces tokens that follow the recommendations in [Unicode
Annex #29, Text Segmentation][uax29], including the optional rule that
splits words between apostrophes and vowels.

- There are language-specific exceptions:
+ There are exceptions where we change the tokenization to work better
+ with certain languages:

- In Arabic and Hebrew, it additionally normalizes ligatures and removes
combining marks.
@@ -288,19 +289,29 @@ There are language-specific exceptions:
- In Chinese, it uses the external Python library `jieba`, another optional
dependency.

- While the @ sign is usually considered a symbol and not part of a word,
wordfreq will allow a word to end with "@" or "@s". This is one way of
writing gender-neutral words in Spanish and Portuguese.

[uax29]: http://unicode.org/reports/tr29/
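As an aside, the new @ rule can be pictured as a small regex. This is a toy illustration only; wordfreq's real tokenizer implements the Unicode segmentation rules above, with cases this sketch omits:

    import re

    # Toy pattern: word characters, optionally ending in "@" or "@s", where
    # the "@" may not be followed by further word characters. The standalone
    # form "@s" and pairs like "all:all" need extra cases, omitted here.
    AT_WORD_RE = re.compile(r"\w+(?:@s?)?(?!\w)")

    print(AT_WORD_RE.findall("l@s niñ@s"))    # ['l@s', 'niñ@s']
    print(AT_WORD_RE.findall("all:all@all"))  # ['all', 'all', 'all']: a mid-word '@' does not attach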

When wordfreq's frequency lists are built in the first place, the words are
tokenized according to this function.

>>> from wordfreq import tokenize
>>> tokenize('l@s niñ@s', 'es')
['l@s', 'niñ@s']
>>> zipf_frequency('l@s', 'es')
2.8

Because tokenization in the real world is far from consistent, wordfreq will
also try to deal gracefully when you query it with texts that actually break
into multiple tokens:

>>> zipf_frequency('New York', 'en')
5.28
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
- 3.57
+ 3.61

The word frequencies are combined with the half-harmonic-mean function in order
to provide an estimate of what their combined frequency would be. In Chinese,
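A sketch of that combination step, assuming the combined estimate is simply 1 / (1/f1 + ... + 1/fn) over the token frequencies (half the harmonic mean when there are two tokens; any extra penalties, such as the Chinese one mentioned above, are ignored here):

    from wordfreq import word_frequency

    def half_harmonic_mean(freqs):
        # For two equal frequencies f, this returns f / 2: half their harmonic mean.
        return 1.0 / sum(1.0 / f for f in freqs)

    estimate = half_harmonic_mean([
        word_frequency('new', 'en'),
        word_frequency('york', 'en'),
    ])
    print(estimate)  # comparable to word_frequency('New York', 'en')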
2 changes: 1 addition & 1 deletion setup.py
@@ -35,7 +35,7 @@

setup(
    name="wordfreq",
-     version='2.1.0',
+     version='2.2.0',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
109 changes: 109 additions & 0 deletions tests/test_at_sign.py
@@ -0,0 +1,109 @@
from wordfreq import tokenize, lossy_tokenize, word_frequency


def test_gender_neutral_at():
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la",
        "protección",
        "de",
        "los",
        "derechos",
        "de",
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes"
    ]

    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]
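    # (The "00@" result suggests lossy_tokenize also maps digit sequences to
    # zeroes, so all two-digit numbers share one frequency estimate; that
    # reading is inferred from this assertion rather than stated here.)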

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos",
        "e",
        "deveres",
        "para",
        "@s",
        "membr@s",
        "da",
        "comunidade",
        "virtual"
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]


def test_at_in_corpus():
    # We have a word frequency for "l@s"
    assert word_frequency('l@s', 'es') > 0

    # It's not just treated as a word break
    assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
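    # ("l s" breaks into the very common single-letter tokens "l" and "s",
    # whose combined estimate is high, so "l@s" scoring lower shows it was
    # counted as a word of its own rather than split apart.)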


def test_punctuation_at():
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores",
        "de",
        "canal",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "ao",
        "lado",
        "do",
        "nick"
    ]

    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores",
        "de",
        "canal",
        ",",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "@",
        "ao",
        "lado",
        "do",
        "nick"
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un",
        "archivo",
        "hosts.deny",
        "que",
        "contiene",
        "la",
        "línea",
        "all:all",
        "all"
    ]

    # Make sure not to catch e-mail addresses
    text = "info@something.example"
    assert tokenize(text, "en") == [
        "info",
        "something.example"
    ]
2 changes: 1 addition & 1 deletion tests/test_chinese.py
@@ -59,7 +59,7 @@ def test_tokens():

def test_combination():
    xiexie_freq = word_frequency('谢谢', 'zh')  # "Thanks"
-     assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
+     assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
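The arithmetic behind the / 20, as a sketch: the half-harmonic-mean of two equal frequencies f is f / 2, and the remaining factor of 10 is assumed here to be wordfreq's penalty for the word break it has to infer inside '谢谢谢谢'.

    from wordfreq import word_frequency

    f = word_frequency('谢谢', 'zh')
    # 1 / (1/f + 1/f) = f / 2; the extra factor of 10 is an assumed
    # penalty for the inferred word break (inferred from the test above).
    estimate = (1.0 / (1.0 / f + 1.0 / f)) / 10
    print(estimate)  # should be close to word_frequency('谢谢谢谢', 'zh')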


def test_alternate_codes():