From 28fcc9fa6e6bb2b68db3db2a259d735c72d79d4d Mon Sep 17 00:00:00 2001
From: David Liao <davidycliao@gmail.com>
Date: Mon, 29 Nov 2021 19:48:17 +0000
Subject: [PATCH 1/7] add encoding

---
 liwc/__init__.py |  5 +++--
 liwc/dic.py      | 32 +++++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/liwc/__init__.py b/liwc/__init__.py
index ffd36b6..3c0c069 100644
--- a/liwc/__init__.py
+++ b/liwc/__init__.py
@@ -9,7 +9,7 @@
     __version__ = None
 
 
-def load_token_parser(filepath):
+def load_token_parser(filepath, encoding = "utf-8"):
     """
     Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
     (parse, category_names), where:
@@ -17,8 +17,9 @@ def load_token_parser(filepath):
       empty) of matching categories
     * `category_names` is a list of strings representing all LIWC categories in
       the lexicon
+    add encoding with utf-8 by default
     """
-    lexicon, category_names = read_dic(filepath)
+    lexicon, category_names = read_dic(filepath, encoding = encoding)
     trie = build_trie(lexicon)
 
     def parse_token(token):
diff --git a/liwc/dic.py b/liwc/dic.py
index b9d4f0c..9b7d9f8 100644
--- a/liwc/dic.py
+++ b/liwc/dic.py
@@ -33,13 +33,27 @@ def read_dic(filepath):
     * `lexicon` is a dict mapping string patterns to lists of category names
     * `category_names` is a list of category names (as strings)
     """
-    with open(filepath) as lines:
+    try:
+        with open(filepath) as lines:
         # read up to first "%" (should be very first line of file)
-        for line in lines:
-            if line.strip() == "%":
-                break
-        # read categories (a mapping from integer string to category name)
-        category_mapping = dict(_parse_categories(lines))
-        # read lexicon (a mapping from matching string to a list of category names)
-        lexicon = dict(_parse_lexicon(lines, category_mapping))
-    return lexicon, list(category_mapping.values())
+            for line in lines:
+                if line.strip() == "%":
+                    break
+            # read categories (a mapping from integer string to category name)
+            category_mapping = dict(_parse_categories(lines))
+            # read lexicon (a mapping from matching string to a list of category names)
+            lexicon = dict(_parse_lexicon(lines, category_mapping))
+        return lexicon, list(category_mapping.values())
+    except UnicodeDecodeError:
+        # decode with European languages with windows-1252 Danish, Dutch, English, French, German, Italian, Norwegian,
+        # Portuguese, Swedish
+        with open(filepath, encoding= "windows-1252") as lines:
+        # read up to first "%" (should be very first line of file)
+            for line in lines:
+                if line.strip() == "%":
+                    break
+            # read categories (a mapping from integer string to category name)
+            category_mapping = dict(_parse_categories(lines))
+            # read lexicon (a mapping from matching string to a list of category names)
+            lexicon = dict(_parse_lexicon(lines, category_mapping))
+        return lexicon, list(category_mapping.values())

From 9eb193433894341871979f576e682fd0eb300fbb Mon Sep 17 00:00:00 2001
From: David Liao <davidycliao@gmail.com>
Date: Mon, 29 Nov 2021 19:51:41 +0000
Subject: [PATCH 2/7] Update __init__.py

add encoding
---
 liwc/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/liwc/__init__.py b/liwc/__init__.py
index 3c0c069..deb80fb 100644
--- a/liwc/__init__.py
+++ b/liwc/__init__.py
@@ -17,7 +17,7 @@ def load_token_parser(filepath, encoding = "utf-8"):
       empty) of matching categories
     * `category_names` is a list of strings representing all LIWC categories in
       the lexicon
-    add encoding with utf-8 by default
+    add default encoding is utf-8. `encoding` can be overwritten by other encoding such as "windows-1252"
     """
     lexicon, category_names = read_dic(filepath, encoding = encoding)
     trie = build_trie(lexicon)

From fa488026bf74c1cea42ee1c6bafde7fed34f2976 Mon Sep 17 00:00:00 2001
From: David Liao <davidycliao@gmail.com>
Date: Mon, 29 Nov 2021 22:39:22 +0000
Subject: [PATCH 3/7] Update dic.py

make default encoding utf-8
---
 liwc/dic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/liwc/dic.py b/liwc/dic.py
index 9b7d9f8..4fac424 100644
--- a/liwc/dic.py
+++ b/liwc/dic.py
@@ -26,7 +26,7 @@ def _parse_lexicon(lines, category_mapping):
         yield parts[0], [category_mapping[category_id] for category_id in parts[1:]]
 
 
-def read_dic(filepath):
+def read_dic(filepath, encoding = "utf-8"):
     """
     Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
     (lexicon, category_names), where:

From 0acc840147b78f0bde275d091e27e4dd14044842 Mon Sep 17 00:00:00 2001
From: David Liao <davidycliao@gmail.com>
Date: Tue, 30 Nov 2021 04:42:56 +0000
Subject: [PATCH 4/7] add note

---
 liwc/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/liwc/__init__.py b/liwc/__init__.py
index deb80fb..d368f34 100644
--- a/liwc/__init__.py
+++ b/liwc/__init__.py
@@ -17,7 +17,9 @@ def load_token_parser(filepath, encoding = "utf-8"):
       empty) of matching categories
     * `category_names` is a list of strings representing all LIWC categories in
       the lexicon
-    add default encoding is utf-8. `encoding` can be overwritten by other encoding such as "windows-1252"
+    `encoding = "utf-8"` can be overwritten by other encoding such as "EUC-JP" for Janpanese "IOS-2022" for
+    Simplified Chinese. The second default is "windows-1252" when the load_token_parser encounters non utf-8
+    encoding.
     """
     lexicon, category_names = read_dic(filepath, encoding = encoding)
     trie = build_trie(lexicon)

From 6bcafc3c975a88bc79529ce3e0cc59e676e93dfc Mon Sep 17 00:00:00 2001
From: David Liao <davidycliao@gmail.com>
Date: Tue, 30 Nov 2021 04:43:20 +0000
Subject: [PATCH 5/7] clean code

---
 liwc/dic.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/liwc/dic.py b/liwc/dic.py
index 4fac424..252dc3d 100644
--- a/liwc/dic.py
+++ b/liwc/dic.py
@@ -47,7 +47,7 @@ def read_dic(filepath, encoding = "utf-8"):
     except UnicodeDecodeError:
         # decode with European languages with windows-1252 Danish, Dutch, English, French, German, Italian, Norwegian,
         # Portuguese, Swedish
-        with open(filepath, encoding= "windows-1252") as lines:
+        with open(filepath, encoding="windows-1252") as lines:
         # read up to first "%" (should be very first line of file)
             for line in lines:
                 if line.strip() == "%":
@@ -57,3 +57,6 @@ def read_dic(filepath, encoding = "utf-8"):
             # read lexicon (a mapping from matching string to a list of category names)
             lexicon = dict(_parse_lexicon(lines, category_mapping))
         return lexicon, list(category_mapping.values())
+    except UnicodeDecodeError as e:
+        print("encoding requires correct encoding")
+

From 77bfc23a7fbe393422f86fee0cffc188da93dc3e Mon Sep 17 00:00:00 2001
From: David Liao <davidycliao@gmail.com>
Date: Mon, 20 Dec 2021 20:29:20 +0000
Subject: [PATCH 6/7] add description

`load_token_parser()` can now read multiple dictionaries from the distributor, such as Dutch_LIWC2015_Dictionary,
    German_LIWC2001_Dictionary, Italian_LIWC2007_Dictionary, LIWC2007_English, LIWC2015_English,
    Spanish_LIWC2007_Dictionary, as well as a Swedish dictionary supplied by the user.
---
 liwc/.ipynb_checkpoints/trie-checkpoint.py | 34 ++++++++++++++++++++++
 liwc/__init__.py                           |  7 +++--
 2 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100644 liwc/.ipynb_checkpoints/trie-checkpoint.py

diff --git a/liwc/.ipynb_checkpoints/trie-checkpoint.py b/liwc/.ipynb_checkpoints/trie-checkpoint.py
new file mode 100644
index 0000000..b19f2fc
--- /dev/null
+++ b/liwc/.ipynb_checkpoints/trie-checkpoint.py
@@ -0,0 +1,34 @@
+def build_trie(lexicon):
+    """
+    Build a character-trie from the plain pattern_string -> categories_list
+    mapping provided by `lexicon`.
+
+    Some LIWC patterns end with a `*` to indicate a wildcard match.
+    """
+    trie = {}
+    for pattern, category_names in lexicon.items():
+        cursor = trie
+        for char in pattern:
+            if char == "*":
+                cursor["*"] = category_names
+                break
+            if char not in cursor:
+                cursor[char] = {}
+            cursor = cursor[char]
+        cursor["$"] = category_names
+    return trie
+
+
+def search_trie(trie, token, token_i=0):
+    """
+    Search the given character-trie for paths that match the `token` string.
+    """
+    if "*" in trie:
+        return trie["*"]
+    if "$" in trie and token_i == len(token):
+        return trie["$"]
+    if token_i < len(token):
+        char = token[token_i]
+        if char in trie:
+            return search_trie(trie[char], token, token_i + 1)
+    return []
diff --git a/liwc/__init__.py b/liwc/__init__.py
index d368f34..8ca3d91 100644
--- a/liwc/__init__.py
+++ b/liwc/__init__.py
@@ -17,9 +17,10 @@ def load_token_parser(filepath, encoding = "utf-8"):
       empty) of matching categories
     * `category_names` is a list of strings representing all LIWC categories in
       the lexicon
-    `encoding = "utf-8"` can be overwritten by other encoding such as "EUC-JP" for Janpanese "IOS-2022" for
-    Simplified Chinese. The second default is "windows-1252" when the load_token_parser encounters non utf-8
-    encoding.
+    * `encoding = "utf-8"` can be overwritten by other encoding such as "EUC-JP" for Janpanese. 
+    * load_token_parser now can read multiple dictionaries from the distrubor such as Dutch_LIWC2015_Dictionary,
+    German_LIWC2001_Dictionary, Italian_LIWC2007_Dictionary, Italian_LIWC2007_Dictionary, LIWC2007_English, LIWC2015_English,
+    Spanish_LIWC2007_Dictionary as well as Swedish from the user.
     """
     lexicon, category_names = read_dic(filepath, encoding = encoding)
     trie = build_trie(lexicon)

From f994494fbaac1f160e12fb4370e9dbfaa3038b05 Mon Sep 17 00:00:00 2001
From: David Liao <davidycliao@gmail.com>
Date: Mon, 20 Dec 2021 20:30:28 +0000
Subject: [PATCH 7/7] typo

---
 liwc/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/liwc/__init__.py b/liwc/__init__.py
index 8ca3d91..aed6847 100644
--- a/liwc/__init__.py
+++ b/liwc/__init__.py
@@ -18,7 +18,7 @@ def load_token_parser(filepath, encoding = "utf-8"):
     * `category_names` is a list of strings representing all LIWC categories in
       the lexicon
     * `encoding = "utf-8"` can be overwritten by other encoding such as "EUC-JP" for Janpanese. 
-    * load_token_parser now can read multiple dictionaries from the distrubor such as Dutch_LIWC2015_Dictionary,
+    * `load_token_parser()` now can read multiple dictionaries from the distributor such as Dutch_LIWC2015_Dictionary,
     German_LIWC2001_Dictionary, Italian_LIWC2007_Dictionary, Italian_LIWC2007_Dictionary, LIWC2007_English, LIWC2015_English,
     Spanish_LIWC2007_Dictionary as well as Swedish from the user.
     """