Urgent fix to remove LIWC lexicons from public repo (#279)
* delete small test lexicons

* move .pkl files to assets and remove from GH

* filesystem cleanup

* update certainty pickle location

* remove unpickling certainty

* remove lexicons from pyproject

* change lexical pkl path

* add error handling when lexicons are not found

* update warning message

* add legal caveat and update name of certainty pkl to be correct
xehu authored Aug 15, 2024
1 parent f8f51a8 commit 34ea876
Showing 72 changed files with 85 additions and 42,960 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -13,6 +13,8 @@ We are a research project created by the [Computational Social Science Lab at UP

[![View - Documentation](https://img.shields.io/badge/view-Documentation-blue?style=for-the-badge)](https://conversational-featurizer.readthedocs.io/en/latest/ "Go to project documentation")

+The Team Communication Toolkit is an academic project and is intended to be used for academic purposes only.

</div>

# Getting Started
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "team_comm_tools"
version = "0.1.1"
version = "0.1.2"
requires-python = ">= 3.10"
dependencies = [
"chardet>=3.0.4",
@@ -61,6 +61,8 @@ where = ["src"]

[tool.setuptools.package-data]
"team_comm_tools" = [
'features/lexicons/**/*',
'features/lexicons/dale_chall.txt',
'features/lexicons/function_words.txt',
'features/lexicons/question_words.txt',
'features/assets/*'
]
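
The entries above mean that, after this change, only the three plain-text lexicons and the `features/assets/` directory ship with the package. As a minimal, hedged sketch of how such packaged data files can be read at runtime, here is one approach using `importlib.resources`; the helper name `load_packaged_lexicon` and the assumption that `features/lexicons` is an importable subpackage are illustrative, not taken from the repository, which builds `__file__`-relative paths instead.

```python
# Illustrative sketch (not from the repository): reading a lexicon file that
# setuptools bundles via [tool.setuptools.package-data].
from importlib.resources import files


def load_packaged_lexicon(name: str) -> list[str]:
    """Return the lines of a lexicon text file shipped inside team_comm_tools."""
    # Assumes team_comm_tools.features.lexicons is an importable package;
    # the toolkit itself resolves __file__-relative paths instead.
    resource = files("team_comm_tools.features.lexicons").joinpath(name)
    return resource.read_text(encoding="utf-8").splitlines()


# e.g. function_words = load_packaged_lexicon("function_words.txt")
```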
22 changes: 11 additions & 11 deletions src/team_comm_tools/features/certainty.py
@@ -7,17 +7,6 @@

# Note: This feature requires the message WITH punctuation.

-# parse certainty lexicon, compile into master regex, delimited by |
-# Construct the absolute path to certainty.txt using the current script directory
-current_dir = os.path.dirname(__file__)
-certainty_file_pkl_path = os.path.join(current_dir, './lexicons/certainty.pkl')
-certainty_file_pkl_path = os.path.abspath(certainty_file_pkl_path)
-with open(certainty_file_pkl_path, 'rb') as f:
-    certainty_data = pickle.load(f) # Load pickled data
-certainty = pd.read_csv(io.StringIO(certainty_data), sep = ",")
-certainty = certainty.sort_values(["NumWords", "NumCharacters"], ascending=False)
-master_regex = certainty["Word"].str.cat(sep='\\b|') + "\\b"

def get_certainty(chat):
""" Calculates a score of how "certain" a given expression is, using the Certainty Lexicon.
@@ -41,6 +30,17 @@ def get_certainty(chat):
        float: The certainty score of the utterance.
    """

+    # parse certainty lexicon, compile into master regex, delimited by |
+    # Construct the absolute path to certainty.txt using the current script directory
+    current_dir = os.path.dirname(__file__)
+    certainty_file_pkl_path = os.path.join(current_dir, './assets/certainty.pkl')
+    certainty_file_pkl_path = os.path.abspath(certainty_file_pkl_path)
+    with open(certainty_file_pkl_path, 'rb') as f:
+        certainty_data = pickle.load(f) # Load pickled data
+    certainty = pd.read_csv(io.StringIO(certainty_data), sep = ",")
+    certainty = certainty.sort_values(["NumWords", "NumCharacters"], ascending=False)
+    master_regex = certainty["Word"].str.cat(sep='\\b|') + "\\b"

    # default certainty value is 4.5; aka a "neutral" statement in the event we don't find anything
    DEFAULT_CERTAINTY = 4.5

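The rest of `get_certainty` is collapsed in this view, so the snippet below is only a hedged sketch of how the lexicon loaded above and the 4.5 default could yield a score: match each lexicon pattern against the utterance and average the ratings of the patterns that hit, falling back to the neutral default. The rating column name `Certainty`, the per-row matching (rather than the single `master_regex`), and the averaging rule are all assumptions, not the toolkit's implementation.

```python
# Hypothetical sketch only; the real get_certainty body is hidden in this diff.
import re

import pandas as pd


def score_certainty(chat: str, certainty: pd.DataFrame, default: float = 4.5) -> float:
    """Average the ratings of every lexicon entry whose pattern appears in `chat`."""
    matched = [
        row["Certainty"]  # assumed name of the column holding each entry's rating
        for _, row in certainty.iterrows()
        if re.search(row["Word"] + r"\b", chat, flags=re.IGNORECASE)
    ]
    # No hits: fall back to the "neutral" default (DEFAULT_CERTAINTY above).
    return sum(matched) / len(matched) if matched else default
```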
31 changes: 17 additions & 14 deletions src/team_comm_tools/features/lexical_features_v2.py
@@ -42,17 +42,20 @@ def liwc_features(chat_df: pd.DataFrame, message_col) -> pd.DataFrame:
        pd.DataFrame: Dataframe of the lexical features stacked as columns.
    """
    # Load the preprocessed lexical regular expressions
-    current_dir = os.path.dirname(__file__)
-    lexicon_pkl_file_path = os.path.join(current_dir, './assets/lexicons_dict.pkl')
-    lexicon_pkl_file_path = os.path.abspath(lexicon_pkl_file_path)
-    with open(lexicon_pkl_file_path, "rb") as lexicons_pickle_file:
-        lexicons_dict = pickle.load(lexicons_pickle_file)
-
-    # Return the lexical features stacked as columns
-    return pd.concat(
-        # Finding the # of occurrences of lexicons of each type for all the messages.
-        [pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_rate(regex, chat)))\
-            .rename({message_col + "_original": lexicon_type + "_lexical_per_100"}, axis=1)\
-            for lexicon_type, regex in lexicons_dict.items()],
-        axis=1
-    )
+    try:
+        current_dir = os.path.dirname(__file__)
+        lexicon_pkl_file_path = os.path.join(current_dir, './assets/lexicons_dict.pkl')
+        lexicon_pkl_file_path = os.path.abspath(lexicon_pkl_file_path)
+        with open(lexicon_pkl_file_path, "rb") as lexicons_pickle_file:
+            lexicons_dict = pickle.load(lexicons_pickle_file)
+
+        # Return the lexical features stacked as columns
+        return pd.concat(
+            # Finding the # of occurrences of lexicons of each type for all the messages.
+            [pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_rate(regex, chat)))\
+                .rename({message_col + "_original": lexicon_type + "_lexical_per_100"}, axis=1)\
+                for lexicon_type, regex in lexicons_dict.items()],
+            axis=1
+        )
+    except:
+        print("WARNING: Lexicons not found. Skipping feature...")
