update documentation

Watts-Lab · Sep 23, 2024 · a8a9dee · a8a9dee
1 parent baa5112
commit a8a9dee
Show file tree

Hide file tree

Showing 16 changed files with 37 additions and 23 deletions.
diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle
diff --git a/docs/build/doctrees/examples.doctree b/docs/build/doctrees/examples.doctree
diff --git a/docs/build/doctrees/feature_builder.doctree b/docs/build/doctrees/feature_builder.doctree
diff --git a/docs/build/doctrees/index.doctree b/docs/build/doctrees/index.doctree
diff --git a/docs/build/doctrees/utils/check_embeddings.doctree b/docs/build/doctrees/utils/check_embeddings.doctree
diff --git a/docs/build/html/_sources/examples.rst.txt b/docs/build/html/_sources/examples.rst.txt
@@ -90,7 +90,7 @@ Now we are ready to call the FeatureBuilder on our data. All we need to do is de
  output_file_path_conv_level = "./jury_output_conversation_level.csv",
  turns = True
  )
- jury_feature_builder.featurize(col="message")
+ jury_feature_builder.featurize()
 
 Basic Input Columns
 ^^^^^^^^^^^^^^^^^^^^

diff --git a/docs/build/html/_sources/index.rst.txt b/docs/build/html/_sources/index.rst.txt
@@ -76,7 +76,7 @@ Once you import the tool, you will be able to declare a FeatureBuilder object, w
  )
 
  # this line of code runs the FeatureBuilder on your data
- my_feature_builder.featurize(col="message")
+ my_feature_builder.featurize()
 
 Use the Table of Contents below to learn more about our tool. We recommend that you begin in the "Introduction" section, then explore other sections of the documentation as they become relevant to you. We recommend reading :ref:`basics` for a high-level overview of the requirements and parameters, and then reading through the :ref:`examples` for a detailed walkthrough and discussion of considerations.
 

diff --git a/docs/build/html/examples.html b/docs/build/html/examples.html
@@ -160,7 +160,7 @@ <h3>Configuring the FeatureBuilder<a class="headerlink" href="#configuring-the-f
  <span class="n">output_file_path_conv_level</span> <span class="o">=</span> <span class="s2">&quot;./jury_output_conversation_level.csv&quot;</span><span class="p">,</span>
  <span class="n">turns</span> <span class="o">=</span> <span class="kc">True</span>
 <span class="p">)</span>
-<span class="n">jury_feature_builder</span><span class="o">.</span><span class="n">featurize</span><span class="p">(</span><span class="n">col</span><span class="o">=</span><span class="s2">&quot;message&quot;</span><span class="p">)</span>
+<span class="n">jury_feature_builder</span><span class="o">.</span><span class="n">featurize</span><span class="p">()</span>
 </pre></div>
 </div>
 <section id="basic-input-columns">

diff --git a/docs/build/html/feature_builder.html b/docs/build/html/feature_builder.html
@@ -174,22 +174,19 @@
 
 <dl class="py method">
 <dt class="sig sig-object py" id="feature_builder.FeatureBuilder.featurize">
-<span class="sig-name descname"><span class="pre">featurize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">col</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'message'</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#feature_builder.FeatureBuilder.featurize" title="Link to this definition"></a></dt>
+<span class="sig-name descname"><span class="pre">featurize</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#feature_builder.FeatureBuilder.featurize" title="Link to this definition"></a></dt>
 <dd><p>Main driver function for feature generation.</p>
 <p>This function creates chat-level features, generates features for different
 truncation percentages of the data if specified, and produces user-level and
 conversation-level features. Finally, the features are saved into the
 designated output files.</p>
 <dl class="field-list simple">
-<dt class="field-odd">Parameters<span class="colon">:</span></dt>
-<dd class="field-odd"><p><strong>col</strong> (<em>str</em><em>, </em><em>optional</em>) – Column to preprocess, defaults to “message”</p>
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p>None</p>
 </dd>
-<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
 <dd class="field-even"><p>None</p>
 </dd>
-<dt class="field-odd">Return type<span class="colon">:</span></dt>
-<dd class="field-odd"><p>None</p>
-</dd>
 </dl>
 </dd></dl>
 

diff --git a/docs/build/html/index.html b/docs/build/html/index.html
@@ -139,7 +139,7 @@ <h2>Using the Package<a class="headerlink" href="#using-the-package" title="Link
 <span class="p">)</span>
 
 <span class="c1"># this line of code runs the FeatureBuilder on your data</span>
-<span class="n">my_feature_builder</span><span class="o">.</span><span class="n">featurize</span><span class="p">(</span><span class="n">col</span><span class="o">=</span><span class="s2">&quot;message&quot;</span><span class="p">)</span>
+<span class="n">my_feature_builder</span><span class="o">.</span><span class="n">featurize</span><span class="p">()</span>
 </pre></div>
 </div>
 <p>Use the Table of Contents below to learn more about our tool. We recommend that you begin in the “Introduction” section, then explore other sections of the documentation as they become relevant to you. We recommend reading <a class="reference internal" href="basics.html#basics"><span class="std std-ref">The Basics</span></a> for a high-level overview of the requirements and parameters, and then reading through the <a class="reference internal" href="examples.html#examples"><span class="std std-ref">Worked Example</span></a> for a detailed walkthrough and discussion of considerations.</p>

diff --git a/docs/build/html/searchindex.js b/docs/build/html/searchindex.js
diff --git a/docs/build/html/utils/check_embeddings.html b/docs/build/html/utils/check_embeddings.html
@@ -132,14 +132,15 @@
 
 <dl class="py function">
 <dt class="sig sig-object py" id="utils.check_embeddings.generate_bert">
-<span class="sig-prename descclassname"><span class="pre">utils.check_embeddings.</span></span><span class="sig-name descname"><span class="pre">generate_bert</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">chat_data</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">message_col</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#utils.check_embeddings.generate_bert" title="Link to this definition"></a></dt>
+<span class="sig-prename descclassname"><span class="pre">utils.check_embeddings.</span></span><span class="sig-name descname"><span class="pre">generate_bert</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">chat_data</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">message_col</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">64</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#utils.check_embeddings.generate_bert" title="Link to this definition"></a></dt>
 <dd><p>Generates RoBERTa sentiment scores for the given chat data and saves them to a CSV file.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>chat_data</strong> (<em>pd.DataFrame</em>) – Contains message data to be analyzed for sentiments.</p></li>
 <li><p><strong>output_path</strong> (<em>str</em>) – Path to save the CSV file containing sentiment scores.</p></li>
 <li><p><strong>message_col</strong> (<em>str</em><em>, </em><em>optional</em>) – A string representing the column name that should be selected as the message. Defaults to “message”.</p></li>
+<li><p><strong>batch_size</strong> (<em>int</em>) – The size of each batch for processing sentiment analysis. Defaults to 64.</p></li>
 </ul>
 </dd>
 <dt class="field-even">Raises<span class="colon">:</span></dt>
@@ -224,17 +225,17 @@
 
 <dl class="py function">
 <dt class="sig sig-object py" id="utils.check_embeddings.get_sentiment">
-<span class="sig-prename descclassname"><span class="pre">utils.check_embeddings.</span></span><span class="sig-name descname"><span class="pre">get_sentiment</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">text</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#utils.check_embeddings.get_sentiment" title="Link to this definition"></a></dt>
-<dd><p>Analyzes the sentiment of the given text using a BERT model and returns the scores for positive, negative, and neutral sentiments.</p>
+<span class="sig-prename descclassname"><span class="pre">utils.check_embeddings.</span></span><span class="sig-name descname"><span class="pre">get_sentiment</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">texts</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#utils.check_embeddings.get_sentiment" title="Link to this definition"></a></dt>
+<dd><p>Analyzes the sentiment of the given list of texts using a BERT model and returns a DataFrame with scores for positive, negative, and neutral sentiments.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
-<dd class="field-odd"><p><strong>text</strong> (<em>str</em><em> or </em><em>None</em>) – The input text to analyze.</p>
+<dd class="field-odd"><p><strong>texts</strong> (<em>list</em><em> of </em><em>str</em>) – The list of input texts to analyze.</p>
 </dd>
 <dt class="field-even">Returns<span class="colon">:</span></dt>
-<dd class="field-even"><p>A dictionary with sentiment scores.</p>
+<dd class="field-even"><p>A DataFrame with sentiment scores.</p>
 </dd>
 <dt class="field-odd">Return type<span class="colon">:</span></dt>
-<dd class="field-odd"><p>dict</p>
+<dd class="field-odd"><p>pd.DataFrame</p>
 </dd>
 </dl>
 </dd></dl>

diff --git a/docs/source/basics.rst b/docs/source/basics.rst
@@ -49,6 +49,10 @@ Package Assumptions
 
 8. **Vector Data Cache**: Your data's vector data will be cached in **vector_directory**. This directory will be created if it doesn’t exist, but its contents should be reserved for cached vector files.
 
+ * Note: v0.1.3 and earlier compute vectors using *preprocessed* text by default, which drops capitalization and punctuation. However, this can affect the interpretation of sentiment vectors; for example, "Hello!" has more positive sentiment than "hello." Consequently, from v0.1.4 onwards, we compute vectors using the raw input text, including punctuation and capitalization. To restore the earlier behavior (computing vectors from preprocessed text), please set **compute_vectors_from_preprocessed** to True.
+
+ * Additionally, we assume that empty messages are equivalent to the "NaN vector," defined `here <https://raw.githubusercontent.com/Watts-Lab/team_comm_tools/refs/heads/main/src/team_comm_tools/features/assets/nan_vector.txt>`_.
+
 9. **Output Files**: We generate three outputs: **output_file_path_chat_level** (Utterance- or Chat-Level Features), **output_file_path_user_level** (Speaker- or User-Level Features), and **output_file_path_conv_level** (Conversation-Level Features).
 
  * This should be a *path*, not just a filename. For example, "./my_file.csv", not just "my_file.csv."
@@ -79,4 +83,6 @@ Here are some parameters that can be customized. For more details, refer to the
 
 4. **ner_training_df** and **ner_cutoff**: Measure the number of named entities in each utterance (see :ref:`named_entity_recognition`).
 
-5. **regenerate_vectors**: Force-regenerate vector data even if it already exists.
+5. **regenerate_vectors**: Force-regenerate vector data even if it already exists.
+
+6. **compute_vectors_from_preprocessed**: Computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v0.1.3 and earlier, but we now default to computing vectors on the unpreprocessed text (which INCLUDES capitalization and punctuation), and this parameter now defaults to False.
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -90,7 +90,7 @@ Now we are ready to call the FeatureBuilder on our data. All we need to do is de
  output_file_path_conv_level = "./jury_output_conversation_level.csv",
  turns = True
  )
- jury_feature_builder.featurize(col="message")
+ jury_feature_builder.featurize()
 
 Basic Input Columns
 ^^^^^^^^^^^^^^^^^^^^

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -76,7 +76,7 @@ Once you import the tool, you will be able to declare a FeatureBuilder object, w
  )
 
  # this line of code runs the FeatureBuilder on your data
- my_feature_builder.featurize(col="message")
+ my_feature_builder.featurize()
 
 Use the Table of Contents below to learn more about our tool. We recommend that you begin in the "Introduction" section, then explore other sections of the documentation as they become relevant to you. We recommend reading :ref:`basics` for a high-level overview of the requirements and parameters, and then reading through the :ref:`examples` for a detailed walkthrough and discussion of considerations.
 

diff --git a/src/team_comm_tools/feature_builder.py b/src/team_comm_tools/feature_builder.py
@@ -85,6 +85,9 @@ class FeatureBuilder:
  :param regenerate_vectors: If true, will regenerate vector data even if it already exists. Defaults to False.
  :type regenerate_vectors: bool, optional
 
+ :param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v0.1.3 and earlier, but we now default to computing vectors on the unpreprocessed text (which INCLUDES capitalization and punctuation). Defaults to False.
+ :type compute_vectors_from_preprocessed: bool, optional
+
  :return: The FeatureBuilder doesn't return anything; instead, it writes the generated features to files in the specified paths. It will also print out its progress, so you should see "All Done!" in the terminal, which will indicate that the features have been generated.
  :rtype: None
 
@@ -108,14 +111,16 @@ def __init__(
  within_task = False,
  ner_training_df: pd.DataFrame = None,
  ner_cutoff: int = 0.9,
- regenerate_vectors: bool = False
+ regenerate_vectors: bool = False,
+ compute_vectors_from_preprocessed: bool = False
  ) -> None:
 
  # Defining input and output paths.
  self.chat_data = input_df.copy()
  self.orig_data = input_df.copy()
  self.ner_training = ner_training_df
  self.vector_directory = vector_directory
+
  print("Initializing Featurization...")
  self.output_file_path_conv_level = output_file_path_conv_level
  self.output_file_path_user_level = output_file_path_user_level
@@ -218,6 +223,11 @@ def __init__(
  self.ner_cutoff = ner_cutoff
  self.regenerate_vectors = regenerate_vectors
 
+ if(compute_vectors_from_preprocessed == True):
+ self.vector_colname = self.message_col # because the message col will eventually get preprocessed
+ else:
+ self.vector_colname = self.message_col + "_original" # because this contains the original message
+
  # check grouping rules
  if self.conversation_id_col not in self.chat_data.columns and len(self.grouping_keys)==0:
  if(self.conversation_id_col == "conversation_num"):
@@ -338,7 +348,7 @@ def __init__(
  if(not need_sentiment and feature_dict[feature]["bert_sentiment_data"]):
  need_sentiment = True
 
- check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, message_col = self.message_col + "_original")
+ check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, message_col = self.vector_colname)
 
  if(need_sentence):
  self.vect_data = pd.read_csv(self.vect_path, encoding='mac_roman')