GH-673: Create Logistic Regression Library in Rust 🚀 #703

Open · wants to merge 13 commits into base: main
22 changes: 22 additions & 0 deletions .github/workflows/crates-build.yml
@@ -0,0 +1,22 @@
on:
  pull_request:
    branches: [ main ]

jobs:
  crates-build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Setup Rust
        uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          override: true
      - name: Build Release
        working-directory: extensions/underthesea_core
        run: |
          cargo test
          cargo build --release


34 changes: 34 additions & 0 deletions .github/workflows/crates-release.yml
@@ -0,0 +1,34 @@
on:
  push:
    branches: [ main ]

jobs:
  crates-release:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Setup Rust
        uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          override: true
      - name: Check if version already exists on crates.io
        working-directory: extensions/underthesea_core
        id: check-version
        run: |
          CURRENT_VERSION=$(cargo pkgid | cut -d# -f2 | cut -d: -f2)
          # "|| true" keeps the step alive when grep finds no match; grep exits non-zero and the step shell runs with -e
          EXISTS=$(cargo search "underthesea_core" --limit 100 | grep "$CURRENT_VERSION" || true)
          # note: ::set-output is deprecated on newer runners; echo "exists=..." >> "$GITHUB_OUTPUT" is its replacement
          if [[ -z "$EXISTS" ]]; then
            echo "The version $CURRENT_VERSION does not exist on crates.io"
            echo "::set-output name=exists::false"
          else
            echo "The version $CURRENT_VERSION already exists on crates.io"
            echo "::set-output name=exists::true"
          fi
      - name: Publish to crates.io
        if: steps.check-version.outputs.exists == 'false'
        working-directory: extensions/underthesea_core
        env:
          CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
        run: cargo publish
1 change: 1 addition & 0 deletions examples/classification/.gitignore
@@ -0,0 +1 @@
outputs
42 changes: 1 addition & 41 deletions examples/classification/README.md
@@ -1,41 +1 @@
# Vietnamese Text Classification with underthesea

```
Author: Vu Anh
Date: July 27, 2023
```

Vietnamese text classification is a core task in Natural Language Processing (NLP) for the Vietnamese language. The objective of text classification is to assign predefined labels or categories to a given text based on its content. This report presents an overview of the challenges, methodologies, and advancements in Vietnamese text classification.

## Methodologies and Approaches

Zero-shot classification is a technique that allows models to categorize data into previously unseen classes without direct training. By leveraging large language models such as GPT-3 and GPT-4, it is invaluable when labeled data is scarce or when it is impractical to gather annotations for every conceivable class. Such models use their vast training knowledge to generalize across tasks, making them versatile and adaptable to new classification challenges.

## Results

The following table presents Vietnamese text classification results obtained with large language models (LLMs):

| Dataset | LLMs    | F1 Metric |
|---------|---------|-----------|
|         | GPT-3.5 |           |
|         | GPT-4   |           |

# Classification
17 changes: 17 additions & 0 deletions examples/classification/preprocess_data.py
@@ -0,0 +1,17 @@
from datasets import load_dataset

# Load the UIT-NLP Vietnamese students' feedback corpus from the Hugging Face Hub
dataset = load_dataset("uit-nlp/vietnamese_students_feedback")
print(dataset)

print(dataset["train"][0])

# Collect the first 10 training sentences
sentences = []
for i in range(10):
    item = dataset["train"][i]
    sentence = item["sentence"]
    sentences.append(sentence)
    print(item)

# Write one sentence per line; UTF-8 so Vietnamese diacritics survive on any platform
with open("tmp/train.txt", "w", encoding="utf-8") as f:
    content = "\n".join(sentences)
    f.write(content)
41 changes: 41 additions & 0 deletions examples/classification/technical_report.md
@@ -0,0 +1,41 @@
# Vietnamese Text Classification with underthesea

```
Author: Vu Anh
Date: July 27, 2023
```

Vietnamese text classification is a core task in Natural Language Processing (NLP) for the Vietnamese language. The objective of text classification is to assign predefined labels or categories to a given text based on its content. This report presents an overview of the challenges, methodologies, and advancements in Vietnamese text classification.

## Methodologies and Approaches

Zero-shot classification is a technique that allows models to categorize data into previously unseen classes without direct training. By leveraging large language models such as GPT-3 and GPT-4, it is invaluable when labeled data is scarce or when it is impractical to gather annotations for every conceivable class. Such models use their vast training knowledge to generalize across tasks, making them versatile and adaptable to new classification challenges.
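For illustration, a minimal zero-shot prompt in this setting might look like the sketch below; the exact prompts, models, and label sets used in these experiments are not documented here, so every detail in the sketch is an assumption (the sample sentence means roughly "the lecturer teaches clearly and is very enthusiastic").

```
Classify the following Vietnamese sentence into exactly one of these
categories: positive, negative, neutral.

Sentence: "giảng viên dạy dễ hiểu và rất nhiệt tình"

Answer with the category name only.
```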

## Results

The following table presents Vietnamese text classification results obtained with large language models (LLMs):

| Dataset | LLMs    | F1 Metric |
|---------|---------|-----------|
|         | GPT-3.5 |           |
|         | GPT-4   |           |

2 changes: 2 additions & 0 deletions examples/classification/tmp/.gitignore
@@ -0,0 +1,2 @@
*
!.gitignore
1 change: 1 addition & 0 deletions extensions/underthesea_core/Cargo.toml
@@ -29,6 +29,7 @@ serde = { version = "1.0", features = [ "derive" ] }
regex = "1"
rayon = "1.5"
crfs = "0.1"
nalgebra = "0.29"

[dependencies.pyo3]
version = "0.15.0"
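The new nalgebra dependency is presumably groundwork for the logistic-regression library the PR title announces; the logistic module itself is still commented out in lib.rs further down. As a hedged sketch of the kind of computation nalgebra enables here (the function name and signature are assumptions, not part of this diff):

```rust
use nalgebra::DVector;

// Hypothetical helper: probability that x belongs to the positive class,
// that is, the sigmoid of a linear score, the core of logistic regression.
fn predict_proba(weights: &DVector<f64>, x: &DVector<f64>, bias: f64) -> f64 {
    let z = weights.dot(x) + bias;
    1.0 / (1.0 + (-z).exp())
}
```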
6 changes: 6 additions & 0 deletions extensions/underthesea_core/HISTORY.rst
@@ -2,6 +2,12 @@
History
================================================================================

1.0.5 (2023-07-31)
--------------------------------------------------------------------------------

* Add metrics::cosine_similarity
* Add features::tfidf_featurizer

1.0.4 (2023-04-28)
--------------------------------------------------------------------------------

3 changes: 3 additions & 0 deletions extensions/underthesea_core/src/features/mod.rs
@@ -0,0 +1,3 @@
mod tfidf_featurizer;

pub use tfidf_featurizer::TfidfFeaturizer;
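The tests in the new TF-IDF featurizer below call crate::metrics::cosine_similarity, and HISTORY.rst lists it in the same release, but its definition is not part of this diff. A minimal sketch consistent with how the tests call it (the real signature may differ):

```rust
/// Hypothetical sketch of metrics::cosine_similarity; not taken from this diff.
pub fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    let dot: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a = a.iter().map(|x| x * x).sum::<f64>().sqrt();
    let norm_b = b.iter().map(|x| x * x).sum::<f64>().sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0; // degenerate case: treat zero vectors as dissimilar
    }
    dot / (norm_a * norm_b)
}
```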
160 changes: 160 additions & 0 deletions extensions/underthesea_core/src/features/tfidf_featurizer.rs
@@ -0,0 +1,160 @@
//! tfidf_featurizer.rs
//!
//! Provides functionality for computing Term Frequency-Inverse Document Frequency (TFIDF) vectors.
//!
//! Author: Vu Anh
//! Date: 2023-07-29
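//!
//! Weighting scheme implemented below (unsmoothed):
//! tf(t, d) = count(t, d) / |d| and idf(t) = ln(N / df(t)),
//! where N is the number of training documents and df(t) is the number of
//! documents containing t; the stored weight for term t in document d is tf * idf.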

use std::collections::{HashMap, HashSet};

pub struct TfidfFeaturizer {
    idf: Vec<f64>,
    term_to_index: HashMap<String, usize>,
}

impl TfidfFeaturizer {
    pub fn new() -> Self {
        TfidfFeaturizer {
            idf: Vec::new(),
            term_to_index: HashMap::new(),
        }
    }

    pub fn get_idf(&self) -> &Vec<f64> {
        &self.idf
    }

    fn compute_idf(&mut self, documents: &[Vec<String>]) {
        let n = documents.len() as f64;

        // document frequency: how many documents each term appears in
        let mut word_freq = HashMap::new();

        for doc in documents.iter() {
            let mut seen_terms = HashSet::new();

            for term in doc {
                if !seen_terms.contains(term) {
                    // assign each newly seen term the next free column index
                    let idx = match self.term_to_index.get(term) {
                        Some(&existing_idx) => existing_idx,
                        None => {
                            let new_idx = self.term_to_index.len();
                            self.term_to_index.insert(term.clone(), new_idx);
                            new_idx
                        }
                    };
                    *word_freq.entry(idx).or_insert(0.0) += 1.0;
                    seen_terms.insert(term.clone());
                }
            }
        }

        self.idf.resize(self.term_to_index.len(), 0.0);
        for (&idx, &freq) in &word_freq {
            self.idf[idx] = (n / freq).ln();
        }
    }

    pub fn train(&mut self, texts: &[&str]) {
        let documents: Vec<Vec<String>> = texts.iter().map(|text| {
            text.split_whitespace().map(|word| word.to_string()).collect()
        }).collect();

        self.compute_idf(&documents);
    }

    pub fn predict(&self, texts: &[&str]) -> Vec<Vec<f64>> {
        texts.iter().map(|text| {
            let words: Vec<String> = text.split_whitespace().map(|word| word.to_string()).collect();
            let mut tfidf_vector = vec![0.0; self.term_to_index.len()];

            // compute term frequency for this text
            let mut tf = HashMap::new();
            for word in &words {
                *tf.entry(word).or_insert(0.0) += 1.0;
            }

            // normalize raw counts by the document length
            let len = words.len() as f64;
            for freq in tf.values_mut() {
                *freq /= len;
            }

            // compute tfidf values
            for (word, &index) in &self.term_to_index {
                if let Some(&term_freq) = tf.get(word) {
                    tfidf_vector[index] = term_freq * self.idf[index];
                }
            }

            tfidf_vector
        }).collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::metrics::cosine_similarity;

    #[test]
    fn test_constructor() {
        TfidfFeaturizer::new();
    }

    #[test]
    fn test_train_tfidf() {
        let mut tfidf_featurizer = TfidfFeaturizer::new();
        let texts = vec![
            "i love you",
            "you hate me",
            "me too"
        ];

        // Train tfidf vectorizer
        tfidf_featurizer.train(&texts);

        // vocab: i love you hate me too

        let idf_actual = tfidf_featurizer.get_idf();
        assert_eq!(idf_actual.len(), 6);

        let idf_expected = vec![
            (3.0f64/1.0f64).ln(),
            (3.0f64/1.0f64).ln(),
            (3.0f64/2.0f64).ln(),
            (3.0f64/1.0f64).ln(),
            (3.0f64/2.0f64).ln(),
            (3.0f64/1.0f64).ln(),
        ];
        assert!((cosine_similarity(&idf_actual, &idf_expected) - 1.0).abs() < 1e-9);

        // Predict tfidf values
        let output = tfidf_featurizer.predict(&texts);
        assert_eq!(output.len(), 3);

        // Document 1: "i love you"
        let doc1_actual = output[0].clone();
        let doc1_expected = vec![
            (1.0f64 / 3.0f64) * (3.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(),
            0.0f64,
            0.0f64,
            0.0f64
        ];
        assert!((cosine_similarity(&doc1_actual, &doc1_expected) - 1.0).abs() < 1e-9);

        // Document 2: "you hate me"
        let doc2_actual = output[1].clone();
        let doc2_expected = vec![
            0.0f64,
            0.0f64,
            (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64 / 1.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(),
            0.0f64
        ];
        assert!((cosine_similarity(&doc2_actual, &doc2_expected) - 1.0).abs() < 1e-9);
    }
}
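Taken together, a minimal end-to-end use of the featurizer might look like the sketch below, assuming the crate is consumed under its package name underthesea_core; this example is not part of the diff:

```rust
use underthesea_core::features::TfidfFeaturizer;

fn main() {
    let mut featurizer = TfidfFeaturizer::new();

    // Fit vocabulary and IDF weights on whitespace-tokenized documents.
    featurizer.train(&["i love you", "you hate me", "me too"]);

    // Each output vector is indexed by first-seen term order from training.
    let vectors = featurizer.predict(&["i love you"]);
    println!("{:?}", vectors[0]);

    // Terms never seen during training contribute nothing to the vector.
    let unseen = featurizer.predict(&["completely new words"]);
    assert!(unseen[0].iter().all(|&v| v == 0.0));
}
```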
4 changes: 4 additions & 0 deletions extensions/underthesea_core/src/lib.rs
@@ -5,6 +5,10 @@ use pyo3::prelude::*;
use std::collections::HashSet;

pub mod featurizers;
pub mod features;
pub mod metrics;

// pub mod logistic;

#[pyclass]
pub struct CRFFeaturizer {