From 28fcc9fa6e6bb2b68db3db2a259d735c72d79d4d Mon Sep 17 00:00:00 2001 From: David Liao Date: Mon, 29 Nov 2021 19:48:17 +0000 Subject: [PATCH 1/7] add encoding --- liwc/__init__.py | 5 +++-- liwc/dic.py | 32 +++++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/liwc/__init__.py b/liwc/__init__.py index ffd36b6..3c0c069 100644 --- a/liwc/__init__.py +++ b/liwc/__init__.py @@ -9,7 +9,7 @@ __version__ = None -def load_token_parser(filepath): +def load_token_parser(filepath, encoding = "utf-8"): """ Reads a LIWC lexicon from a file in the .dic format, returning a tuple of (parse, category_names), where: @@ -17,8 +17,9 @@ def load_token_parser(filepath): empty) of matching categories * `category_names` is a list of strings representing all LIWC categories in the lexicon + add encoding with utf-8 by default """ - lexicon, category_names = read_dic(filepath) + lexicon, category_names = read_dic(filepath, encoding = encoding) trie = build_trie(lexicon) def parse_token(token): diff --git a/liwc/dic.py b/liwc/dic.py index b9d4f0c..9b7d9f8 100644 --- a/liwc/dic.py +++ b/liwc/dic.py @@ -33,13 +33,27 @@ def read_dic(filepath): * `lexicon` is a dict mapping string patterns to lists of category names * `category_names` is a list of category names (as strings) """ - with open(filepath) as lines: + try: + with open(filepath) as lines: # read up to first "%" (should be very first line of file) - for line in lines: - if line.strip() == "%": - break - # read categories (a mapping from integer string to category name) - category_mapping = dict(_parse_categories(lines)) - # read lexicon (a mapping from matching string to a list of category names) - lexicon = dict(_parse_lexicon(lines, category_mapping)) - return lexicon, list(category_mapping.values()) + for line in lines: + if line.strip() == "%": + break + # read categories (a mapping from integer string to category name) + category_mapping = dict(_parse_categories(lines)) + # read lexicon (a 
mapping from matching string to a list of category names) + lexicon = dict(_parse_lexicon(lines, category_mapping)) + return lexicon, list(category_mapping.values()) + except UnicodeDecodeError: + # decode with European languages with windows-1252 Danish, Dutch, English, French, German, Italian, Norwegian, + # Portuguese, Swedish + with open(filepath, encoding= "windows-1252") as lines: + # read up to first "%" (should be very first line of file) + for line in lines: + if line.strip() == "%": + break + # read categories (a mapping from integer string to category name) + category_mapping = dict(_parse_categories(lines)) + # read lexicon (a mapping from matching string to a list of category names) + lexicon = dict(_parse_lexicon(lines, category_mapping)) + return lexicon, list(category_mapping.values()) From 9eb193433894341871979f576e682fd0eb300fbb Mon Sep 17 00:00:00 2001 From: David Liao Date: Mon, 29 Nov 2021 19:51:41 +0000 Subject: [PATCH 2/7] Update __init__.py add encoding --- liwc/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/liwc/__init__.py b/liwc/__init__.py index 3c0c069..deb80fb 100644 --- a/liwc/__init__.py +++ b/liwc/__init__.py @@ -17,7 +17,7 @@ def load_token_parser(filepath, encoding = "utf-8"): empty) of matching categories * `category_names` is a list of strings representing all LIWC categories in the lexicon - add encoding with utf-8 by default + add default encoding is utf-8. 
`encoding` can be overwritten by other encoding such as "windows-1252" """ lexicon, category_names = read_dic(filepath, encoding = encoding) trie = build_trie(lexicon) From fa488026bf74c1cea42ee1c6bafde7fed34f2976 Mon Sep 17 00:00:00 2001 From: David Liao Date: Mon, 29 Nov 2021 22:39:22 +0000 Subject: [PATCH 3/7] Update dic.py make default encoding utf-8 --- liwc/dic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/liwc/dic.py b/liwc/dic.py index 9b7d9f8..4fac424 100644 --- a/liwc/dic.py +++ b/liwc/dic.py @@ -26,7 +26,7 @@ def _parse_lexicon(lines, category_mapping): yield parts[0], [category_mapping[category_id] for category_id in parts[1:]] -def read_dic(filepath): +def read_dic(filepath, encoding = "utf-8"): """ Reads a LIWC lexicon from a file in the .dic format, returning a tuple of (lexicon, category_names), where: From 0acc840147b78f0bde275d091e27e4dd14044842 Mon Sep 17 00:00:00 2001 From: David Liao Date: Tue, 30 Nov 2021 04:42:56 +0000 Subject: [PATCH 4/7] add note --- liwc/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/liwc/__init__.py b/liwc/__init__.py index deb80fb..d368f34 100644 --- a/liwc/__init__.py +++ b/liwc/__init__.py @@ -17,7 +17,9 @@ def load_token_parser(filepath, encoding = "utf-8"): empty) of matching categories * `category_names` is a list of strings representing all LIWC categories in the lexicon - add default encoding is utf-8. `encoding` can be overwritten by other encoding such as "windows-1252" + `encoding = "utf-8"` can be overwritten by other encoding such as "EUC-JP" for Janpanese "IOS-2022" for + Simplified Chinese. The second default is "windows-1252" when the load_token_parser encounters non utf-8 + encoding. 
""" lexicon, category_names = read_dic(filepath, encoding = encoding) trie = build_trie(lexicon) From 6bcafc3c975a88bc79529ce3e0cc59e676e93dfc Mon Sep 17 00:00:00 2001 From: David Liao Date: Tue, 30 Nov 2021 04:43:20 +0000 Subject: [PATCH 5/7] clean code --- liwc/dic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/liwc/dic.py b/liwc/dic.py index 4fac424..252dc3d 100644 --- a/liwc/dic.py +++ b/liwc/dic.py @@ -47,7 +47,7 @@ def read_dic(filepath, encoding = "utf-8"): except UnicodeDecodeError: # decode with European languages with windows-1252 Danish, Dutch, English, French, German, Italian, Norwegian, # Portuguese, Swedish - with open(filepath, encoding= "windows-1252") as lines: + with open(filepath, encoding="windows-1252") as lines: # read up to first "%" (should be very first line of file) for line in lines: if line.strip() == "%": @@ -57,3 +57,6 @@ def read_dic(filepath, encoding = "utf-8"): # read lexicon (a mapping from matching string to a list of category names) lexicon = dict(_parse_lexicon(lines, category_mapping)) return lexicon, list(category_mapping.values()) + except UnicodeDecodeError as e: + print("encoding requires correct encoding") + From 77bfc23a7fbe393422f86fee0cffc188da93dc3e Mon Sep 17 00:00:00 2001 From: David Liao Date: Mon, 20 Dec 2021 20:29:20 +0000 Subject: [PATCH 6/7] add description `load_token_parser()` can now read multiple dictionaries from the distributor such as Dutch_LIWC2015_Dictionary, German_LIWC2001_Dictionary, Italian_LIWC2007_Dictionary, Italian_LIWC2007_Dictionary, LIWC2007_English, LIWC2015_English, Spanish_LIWC2007_Dictionary as well as Swedish from the user. 
--- liwc/.ipynb_checkpoints/trie-checkpoint.py | 34 ++++++++++++++++++++++ liwc/__init__.py | 7 +++-- 2 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 liwc/.ipynb_checkpoints/trie-checkpoint.py diff --git a/liwc/.ipynb_checkpoints/trie-checkpoint.py b/liwc/.ipynb_checkpoints/trie-checkpoint.py new file mode 100644 index 0000000..b19f2fc --- /dev/null +++ b/liwc/.ipynb_checkpoints/trie-checkpoint.py @@ -0,0 +1,34 @@ +def build_trie(lexicon): + """ + Build a character-trie from the plain pattern_string -> categories_list + mapping provided by `lexicon`. + + Some LIWC patterns end with a `*` to indicate a wildcard match. + """ + trie = {} + for pattern, category_names in lexicon.items(): + cursor = trie + for char in pattern: + if char == "*": + cursor["*"] = category_names + break + if char not in cursor: + cursor[char] = {} + cursor = cursor[char] + cursor["$"] = category_names + return trie + + +def search_trie(trie, token, token_i=0): + """ + Search the given character-trie for paths that match the `token` string. + """ + if "*" in trie: + return trie["*"] + if "$" in trie and token_i == len(token): + return trie["$"] + if token_i < len(token): + char = token[token_i] + if char in trie: + return search_trie(trie[char], token, token_i + 1) + return [] diff --git a/liwc/__init__.py b/liwc/__init__.py index d368f34..8ca3d91 100644 --- a/liwc/__init__.py +++ b/liwc/__init__.py @@ -17,9 +17,10 @@ def load_token_parser(filepath, encoding = "utf-8"): empty) of matching categories * `category_names` is a list of strings representing all LIWC categories in the lexicon - `encoding = "utf-8"` can be overwritten by other encoding such as "EUC-JP" for Janpanese "IOS-2022" for - Simplified Chinese. The second default is "windows-1252" when the load_token_parser encounters non utf-8 - encoding. + * `encoding = "utf-8"` can be overwritten by other encoding such as "EUC-JP" for Janpanese. 
+ * load_token_parser now can read multiple dictionaries from the distrubor such as Dutch_LIWC2015_Dictionary, + German_LIWC2001_Dictionary, Italian_LIWC2007_Dictionary, Italian_LIWC2007_Dictionary, LIWC2007_English, LIWC2015_English, + Spanish_LIWC2007_Dictionary as well as Swedish from the user. """ lexicon, category_names = read_dic(filepath, encoding = encoding) trie = build_trie(lexicon) From f994494fbaac1f160e12fb4370e9dbfaa3038b05 Mon Sep 17 00:00:00 2001 From: David Liao Date: Mon, 20 Dec 2021 20:30:28 +0000 Subject: [PATCH 7/7] typo --- liwc/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/liwc/__init__.py b/liwc/__init__.py index 8ca3d91..aed6847 100644 --- a/liwc/__init__.py +++ b/liwc/__init__.py @@ -18,7 +18,7 @@ def load_token_parser(filepath, encoding = "utf-8"): * `category_names` is a list of strings representing all LIWC categories in the lexicon * `encoding = "utf-8"` can be overwritten by other encoding such as "EUC-JP" for Janpanese. - * load_token_parser now can read multiple dictionaries from the distrubor such as Dutch_LIWC2015_Dictionary, + * `load_token_parser()` now can read multiple dictionaries from the distributor such as Dutch_LIWC2015_Dictionary, German_LIWC2001_Dictionary, Italian_LIWC2007_Dictionary, Italian_LIWC2007_Dictionary, LIWC2007_English, LIWC2015_English, Spanish_LIWC2007_Dictionary as well as Swedish from the user. """