Skip to content

Commit

Permalink
update documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
xehu committed Sep 23, 2024
1 parent baa5112 commit a8a9dee
Show file tree
Hide file tree
Showing 16 changed files with 37 additions and 23 deletions.
Binary file modified docs/build/doctrees/environment.pickle
Binary file not shown.
Binary file modified docs/build/doctrees/examples.doctree
Binary file not shown.
Binary file modified docs/build/doctrees/feature_builder.doctree
Binary file not shown.
Binary file modified docs/build/doctrees/index.doctree
Binary file not shown.
Binary file modified docs/build/doctrees/utils/check_embeddings.doctree
Binary file not shown.
2 changes: 1 addition & 1 deletion docs/build/html/_sources/examples.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ Now we are ready to call the FeatureBuilder on our data. All we need to do is de
output_file_path_conv_level = "./jury_output_conversation_level.csv",
turns = True
)
jury_feature_builder.featurize(col="message")
jury_feature_builder.featurize()
Basic Input Columns
^^^^^^^^^^^^^^^^^^^^
Expand Down
2 changes: 1 addition & 1 deletion docs/build/html/_sources/index.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ Once you import the tool, you will be able to declare a FeatureBuilder object, w
)
# this line of code runs the FeatureBuilder on your data
my_feature_builder.featurize(col="message")
my_feature_builder.featurize()
Use the Table of Contents below to learn more about our tool. We recommend that you begin in the "Introduction" section, then explore other sections of the documentation as they become relevant to you. We recommend reading :ref:`basics` for a high-level overview of the requirements and parameters, and then reading through the :ref:`examples` for a detailed walkthrough and discussion of considerations.

Expand Down
2 changes: 1 addition & 1 deletion docs/build/html/examples.html
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ <h3>Configuring the FeatureBuilder<a class="headerlink" href="#configuring-the-f
<span class="n">output_file_path_conv_level</span> <span class="o">=</span> <span class="s2">&quot;./jury_output_conversation_level.csv&quot;</span><span class="p">,</span>
<span class="n">turns</span> <span class="o">=</span> <span class="kc">True</span>
<span class="p">)</span>
<span class="n">jury_feature_builder</span><span class="o">.</span><span class="n">featurize</span><span class="p">(</span><span class="n">col</span><span class="o">=</span><span class="s2">&quot;message&quot;</span><span class="p">)</span>
<span class="n">jury_feature_builder</span><span class="o">.</span><span class="n">featurize</span><span class="p">()</span>
</pre></div>
</div>
<section id="basic-input-columns">
Expand Down
11 changes: 4 additions & 7 deletions docs/build/html/feature_builder.html
Original file line number Diff line number Diff line change
Expand Up @@ -174,22 +174,19 @@

<dl class="py method">
<dt class="sig sig-object py" id="feature_builder.FeatureBuilder.featurize">
<span class="sig-name descname"><span class="pre">featurize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">col</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'message'</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#feature_builder.FeatureBuilder.featurize" title="Link to this definition"></a></dt>
<span class="sig-name descname"><span class="pre">featurize</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#feature_builder.FeatureBuilder.featurize" title="Link to this definition"></a></dt>
<dd><p>Main driver function for feature generation.</p>
<p>This function creates chat-level features, generates features for different
truncation percentages of the data if specified, and produces user-level and
conversation-level features. Finally, the features are saved into the
designated output files.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>col</strong> (<em>str</em><em>, </em><em>optional</em>) – Column to preprocess, defaults to “message”</p>
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>None</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p>None</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>

Expand Down
2 changes: 1 addition & 1 deletion docs/build/html/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ <h2>Using the Package<a class="headerlink" href="#using-the-package" title="Link
<span class="p">)</span>

<span class="c1"># this line of code runs the FeatureBuilder on your data</span>
<span class="n">my_feature_builder</span><span class="o">.</span><span class="n">featurize</span><span class="p">(</span><span class="n">col</span><span class="o">=</span><span class="s2">&quot;message&quot;</span><span class="p">)</span>
<span class="n">my_feature_builder</span><span class="o">.</span><span class="n">featurize</span><span class="p">()</span>
</pre></div>
</div>
<p>Use the Table of Contents below to learn more about our tool. We recommend that you begin in the “Introduction” section, then explore other sections of the documentation as they become relevant to you. We recommend reading <a class="reference internal" href="basics.html#basics"><span class="std std-ref">The Basics</span></a> for a high-level overview of the requirements and parameters, and then reading through the <a class="reference internal" href="examples.html#examples"><span class="std std-ref">Worked Example</span></a> for a detailed walkthrough and discussion of considerations.</p>
Expand Down
2 changes: 1 addition & 1 deletion docs/build/html/searchindex.js

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions docs/build/html/utils/check_embeddings.html
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,15 @@

<dl class="py function">
<dt class="sig sig-object py" id="utils.check_embeddings.generate_bert">
<span class="sig-prename descclassname"><span class="pre">utils.check_embeddings.</span></span><span class="sig-name descname"><span class="pre">generate_bert</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">chat_data</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">message_col</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#utils.check_embeddings.generate_bert" title="Link to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">utils.check_embeddings.</span></span><span class="sig-name descname"><span class="pre">generate_bert</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">chat_data</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">message_col</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">64</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#utils.check_embeddings.generate_bert" title="Link to this definition"></a></dt>
<dd><p>Generates RoBERTa sentiment scores for the given chat data and saves them to a CSV file.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>chat_data</strong> (<em>pd.DataFrame</em>) – Contains message data to be analyzed for sentiments.</p></li>
<li><p><strong>output_path</strong> (<em>str</em>) – Path to save the CSV file containing sentiment scores.</p></li>
<li><p><strong>message_col</strong> (<em>str</em><em>, </em><em>optional</em>) – A string representing the column name that should be selected as the message. Defaults to “message”.</p></li>
<li><p><strong>batch_size</strong> (<em>int</em>) – The size of each batch for processing sentiment analysis. Defaults to 64.</p></li>
</ul>
</dd>
<dt class="field-even">Raises<span class="colon">:</span></dt>
Expand Down Expand Up @@ -224,17 +225,17 @@

<dl class="py function">
<dt class="sig sig-object py" id="utils.check_embeddings.get_sentiment">
<span class="sig-prename descclassname"><span class="pre">utils.check_embeddings.</span></span><span class="sig-name descname"><span class="pre">get_sentiment</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">text</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#utils.check_embeddings.get_sentiment" title="Link to this definition"></a></dt>
<dd><p>Analyzes the sentiment of the given text using a BERT model and returns the scores for positive, negative, and neutral sentiments.</p>
<span class="sig-prename descclassname"><span class="pre">utils.check_embeddings.</span></span><span class="sig-name descname"><span class="pre">get_sentiment</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">texts</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#utils.check_embeddings.get_sentiment" title="Link to this definition"></a></dt>
<dd><p>Analyzes the sentiment of the given list of texts using a BERT model and returns a DataFrame with scores for positive, negative, and neutral sentiments.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>text</strong> (<em>str</em><em> or </em><em>None</em>) – The input text to analyze.</p>
<dd class="field-odd"><p><strong>texts</strong> (<em>list</em><em> of </em><em>str</em>) – The list of input texts to analyze.</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>A dictionary with sentiment scores.</p>
<dd class="field-even"><p>A DataFrame with sentiment scores.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>dict</p>
<dd class="field-odd"><p>pd.DataFrame</p>
</dd>
</dl>
</dd></dl>
Expand Down
8 changes: 7 additions & 1 deletion docs/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ Package Assumptions

8. **Vector Data Cache**: Your data's vector data will be cached in **vector_directory**. This directory will be created if it doesn’t exist, but its contents should be reserved for cached vector files.

* Note: v0.1.3 and earlier compute vectors using *preprocessed* text by default, which drops capitalization and punctuation. However, this can affect the interpretation of sentiment vectors; for example, "Hello!" has more positive sentiment than "hello." Consequently, from v0.1.4 onwards, we compute vectors using the raw input text, including punctuation and capitalization. To restore the earlier (v0.1.3) behavior of computing vectors from preprocessed text, please set **compute_vectors_from_preprocessed** to True.

* Additionally, we assume that empty messages are equivalent to the "NaN vector," defined `here <https://raw.githubusercontent.com/Watts-Lab/team_comm_tools/refs/heads/main/src/team_comm_tools/features/assets/nan_vector.txt>`_.

9. **Output Files**: We generate three outputs: **output_file_path_chat_level** (Utterance- or Chat-Level Features), **output_file_path_user_level** (Speaker- or User-Level Features), and **output_file_path_conv_level** (Conversation-Level Features).

* This should be a *path*, not just a filename. For example, "./my_file.csv", not just "my_file.csv."
Expand Down Expand Up @@ -79,4 +83,6 @@ Here are some parameters that can be customized. For more details, refer to the

4. **ner_training_df** and **ner_cutoff**: Measure the number of named entities in each utterance (see :ref:`named_entity_recognition`).

5. **regenerate_vectors**: Force-regenerate vector data even if it already exists.
5. **regenerate_vectors**: Force-regenerate vector data even if it already exists.

6. **compute_vectors_from_preprocessed**: Computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v0.1.3 and earlier, but we now default to computing vectors on the unpreprocessed text (which INCLUDES capitalization and punctuation), so this parameter defaults to False.
2 changes: 1 addition & 1 deletion docs/source/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ Now we are ready to call the FeatureBuilder on our data. All we need to do is de
output_file_path_conv_level = "./jury_output_conversation_level.csv",
turns = True
)
jury_feature_builder.featurize(col="message")
jury_feature_builder.featurize()
Basic Input Columns
^^^^^^^^^^^^^^^^^^^^
Expand Down
2 changes: 1 addition & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ Once you import the tool, you will be able to declare a FeatureBuilder object, w
)
# this line of code runs the FeatureBuilder on your data
my_feature_builder.featurize(col="message")
my_feature_builder.featurize()
Use the Table of Contents below to learn more about our tool. We recommend that you begin in the "Introduction" section, then explore other sections of the documentation as they become relevant to you. We recommend reading :ref:`basics` for a high-level overview of the requirements and parameters, and then reading through the :ref:`examples` for a detailed walkthrough and discussion of considerations.

Expand Down
14 changes: 12 additions & 2 deletions src/team_comm_tools/feature_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ class FeatureBuilder:
:param regenerate_vectors: If true, will regenerate vector data even if it already exists. Defaults to False.
:type regenerate_vectors: bool, optional
:param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation). Defaults to False.
:type compute_vectors_from_preprocessed: bool, optional
:return: The FeatureBuilder doesn't return anything; instead, it writes the generated features to files in the specified paths. It will also print out its progress, so you should see "All Done!" in the terminal, which will indicate that the features have been generated.
:rtype: None
Expand All @@ -108,14 +111,16 @@ def __init__(
within_task = False,
ner_training_df: pd.DataFrame = None,
ner_cutoff: int = 0.9,
regenerate_vectors: bool = False
regenerate_vectors: bool = False,
compute_vectors_from_preprocessed: bool = False
) -> None:

# Defining input and output paths.
self.chat_data = input_df.copy()
self.orig_data = input_df.copy()
self.ner_training = ner_training_df
self.vector_directory = vector_directory

print("Initializing Featurization...")
self.output_file_path_conv_level = output_file_path_conv_level
self.output_file_path_user_level = output_file_path_user_level
Expand Down Expand Up @@ -218,6 +223,11 @@ def __init__(
self.ner_cutoff = ner_cutoff
self.regenerate_vectors = regenerate_vectors

if(compute_vectors_from_preprocessed == True):
self.vector_colname = self.message_col # because the message col will eventually get preprocessed
else:
self.vector_colname = self.message_col + "_original" # because this contains the original message

# check grouping rules
if self.conversation_id_col not in self.chat_data.columns and len(self.grouping_keys)==0:
if(self.conversation_id_col == "conversation_num"):
Expand Down Expand Up @@ -338,7 +348,7 @@ def __init__(
if(not need_sentiment and feature_dict[feature]["bert_sentiment_data"]):
need_sentiment = True

check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, message_col = self.message_col + "_original")
check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, message_col = self.vector_colname)

if(need_sentence):
self.vect_data = pd.read_csv(self.vect_path, encoding='mac_roman')
Expand Down

0 comments on commit a8a9dee

Please sign in to comment.