From c19e2690a5065620b4c771bf3a5a2395df13259c Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Fri, 14 Jul 2023 23:45:38 +0700 Subject: [PATCH 01/12] GH-673: add logistic struct --- extensions/underthesea_core/Cargo.toml | 1 + extensions/underthesea_core/src/lib.rs | 2 + extensions/underthesea_core/src/logistic.rs | 90 +++++++++++++++++++++ extensions/underthesea_core/tests/models.rs | 2 +- 4 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 extensions/underthesea_core/src/logistic.rs diff --git a/extensions/underthesea_core/Cargo.toml b/extensions/underthesea_core/Cargo.toml index 07822dfd..6b3747b8 100644 --- a/extensions/underthesea_core/Cargo.toml +++ b/extensions/underthesea_core/Cargo.toml @@ -29,6 +29,7 @@ serde = { version = "1.0", features = [ "derive" ] } regex = "1" rayon = "1.5" crfs = "0.1" +ndarray = { version = "0.15", features = ["approx"] } [dependencies.pyo3] version = "0.15.0" diff --git a/extensions/underthesea_core/src/lib.rs b/extensions/underthesea_core/src/lib.rs index 472462c3..a491bb16 100644 --- a/extensions/underthesea_core/src/lib.rs +++ b/extensions/underthesea_core/src/lib.rs @@ -6,6 +6,8 @@ use std::collections::HashSet; pub mod featurizers; +pub mod logistic; + #[pyclass] pub struct CRFFeaturizer { pub object: featurizers::CRFFeaturizer diff --git a/extensions/underthesea_core/src/logistic.rs b/extensions/underthesea_core/src/logistic.rs new file mode 100644 index 00000000..29373b09 --- /dev/null +++ b/extensions/underthesea_core/src/logistic.rs @@ -0,0 +1,90 @@ +extern crate ndarray; + +use ndarray::{Array1, Array2}; +use ndarray::prelude::*; + +pub struct LogisticRegression { + weights: Array1, + learning_rate: f64, + iterations: usize, +} + +impl LogisticRegression { + // Constructor + pub fn new() -> Self { + LogisticRegression { + weights: Array1::zeros(1), + learning_rate: 0.01, + iterations: 1000, + } + } + + // Method to set learning_rate and iterations + pub fn with_hyperparams(mut self, learning_rate: f64, iterations: usize) -> Self { + self.learning_rate = learning_rate; + self.iterations = iterations; + self + } + + // Sigmoid function + fn sigmoid(z: f64) -> f64 { + 1.0 / (1.0 + (-z).exp()) + } + + pub fn fit(&mut self, x_train: &Array2, y_train: &Array1) { + let m = x_train.nrows(); // number of samples + let n = x_train.ncols(); // number of features + self.weights = Array1::zeros(n); // initializing weights + + // Gradient Descent + for _ in 0..self.iterations { + let mut gradient = Array1::zeros(n); // initialize gradient + + // calculate gradient for each sample + for j in 0..m { + let x = x_train.row(j).to_owned(); + let h = Self::sigmoid(x.dot(&self.weights)); + let error = h - y_train[j]; + gradient = gradient + error * &x; + } + + // update weights + self.weights = &self.weights - self.learning_rate * gradient / m as f64; + } + } + + // Predict function for predicting an output with the learned weights + pub fn predict(&self, x: &Array1) -> f64 { + let z = x.dot(&self.weights); + Self::sigmoid(z) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use ndarray::array; + + #[test] + fn logistic_regression_test() { + // Initialize logistic regression model + let mut model = LogisticRegression::new().with_hyperparams(0.01, 20000); + + // Training data + let x_train = array![[0., 1., 2.], [1., 2., 3.], [2., 3., 4.], [3., 4., 5.]]; + let y_train = array![0., 0., 1., 1.]; + + // Fit model + model.fit(&x_train, &y_train); + + // Test data + let x_test = array![2., 3., 4.]; + + // Predict + let prediction = model.predict(&x_test); + + // Test that the model's prediction is close to the expected value + print!("Error {}", (prediction - 1.).abs()); + assert!((prediction - 0.1).abs() < 1.0); + } +} \ No newline at end of file diff --git a/extensions/underthesea_core/tests/models.rs b/extensions/underthesea_core/tests/models.rs index eddfa2ad..216f53bb 100644 --- a/extensions/underthesea_core/tests/models.rs +++ b/extensions/underthesea_core/tests/models.rs @@ -5,6 +5,6 @@ mod tests { #[test] fn test_crfs(){ let buf = fs::read("tests/wt_crf_2018_09_13.bin").unwrap(); - let model = crfs::Model::new(&buf).unwrap(); + let _model = crfs::Model::new(&buf).unwrap(); } } \ No newline at end of file From c84fd54570ab2225b76079c392dc6ef2e4937157 Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 15:29:11 +0700 Subject: [PATCH 02/12] GH-673: add TfidfFeaturizer --- examples/classification/.gitignore | 1 + examples/classification/README.md | 42 +---- examples/classification/preprocess_data.py | 17 ++ examples/classification/technical_report.md | 41 +++++ examples/classification/tmp/.gitignore | 2 + extensions/underthesea_core/Cargo.toml | 1 + extensions/underthesea_core/HISTORY.rst | 5 + extensions/underthesea_core/src/lib.rs | 2 + .../src/metrics/cosine_similarity.rs | 35 ++++ .../underthesea_core/src/metrics/mod.rs | 3 + extensions/underthesea_core/src/tfidf.rs | 160 ++++++++++++++++++ extensions/underthesea_core/tests/models.rs | 2 +- 12 files changed, 269 insertions(+), 42 deletions(-) create mode 100644 examples/classification/.gitignore create mode 100644 examples/classification/preprocess_data.py create mode 100644 examples/classification/technical_report.md create mode 100644 examples/classification/tmp/.gitignore create mode 100644 extensions/underthesea_core/src/metrics/cosine_similarity.rs create mode 100644 extensions/underthesea_core/src/metrics/mod.rs create mode 100644 extensions/underthesea_core/src/tfidf.rs diff --git a/examples/classification/.gitignore b/examples/classification/.gitignore new file mode 100644 index 00000000..e6d35e74 --- /dev/null +++ b/examples/classification/.gitignore @@ -0,0 +1 @@ +outputs \ No newline at end of file diff --git a/examples/classification/README.md b/examples/classification/README.md index 9d0a1891..f9f7e7b0 100644 --- a/examples/classification/README.md +++ b/examples/classification/README.md @@ -1,41 +1 @@ -# Vietnamese Text Classification with underthesea - -``` -Author: Vu Anh -Date: July 27, 2023 -``` - -Vietnamese text classification is an integral domain in Natural Language Processing (NLP) concerning the Vietnamese language. The objective of text classification is to assign predefined labels or categories to a given text based on its content. This report presents an overview of the challenges, methodologies, and advancements in Vietnamese text classification. - -## Methodologies and Approaches - -Zero-shot classification is a technique allowing models to categorize data into previously unseen classes without direct training. Leveraging large language models like GPT-3 and GPT-4, this method is invaluable when there's a shortage of labeled data or when it's impractical to gather annotations for every conceivable class. Such models use their vast training knowledge to generalize across tasks, making them versatile and adaptable to new classification challenges. - -## Results - -The following table presents the outcomes of the Vietnamese Text Classification endeavor when utilizing Large Language Models (LLMs): - -``` - - - - - - - - - - - - - - - - - - - - -
DatasetLLMsF1 Metric
GPT-3.5
GPT-4
-``` - +# Classification \ No newline at end of file diff --git a/examples/classification/preprocess_data.py b/examples/classification/preprocess_data.py new file mode 100644 index 00000000..93284192 --- /dev/null +++ b/examples/classification/preprocess_data.py @@ -0,0 +1,17 @@ +from datasets import load_dataset + +dataset = load_dataset("uit-nlp/vietnamese_students_feedback") +print(dataset) + +print(dataset["train"][0]) + +sentences = [] +for i in range(10): + item = dataset["train"][i] + sentence = item["sentence"] + sentences.append(sentence) + print(item) + +with open("tmp/train.txt", "w") as f: + content = "\n".join(sentences) + f.write(content) diff --git a/examples/classification/technical_report.md b/examples/classification/technical_report.md new file mode 100644 index 00000000..9d0a1891 --- /dev/null +++ b/examples/classification/technical_report.md @@ -0,0 +1,41 @@ +# Vietnamese Text Classification with underthesea + +``` +Author: Vu Anh +Date: July 27, 2023 +``` + +Vietnamese text classification is an integral domain in Natural Language Processing (NLP) concerning the Vietnamese language. The objective of text classification is to assign predefined labels or categories to a given text based on its content. This report presents an overview of the challenges, methodologies, and advancements in Vietnamese text classification. + +## Methodologies and Approaches + +Zero-shot classification is a technique allowing models to categorize data into previously unseen classes without direct training. Leveraging large language models like GPT-3 and GPT-4, this method is invaluable when there's a shortage of labeled data or when it's impractical to gather annotations for every conceivable class. Such models use their vast training knowledge to generalize across tasks, making them versatile and adaptable to new classification challenges. + +## Results + +The following table presents the outcomes of the Vietnamese Text Classification endeavor when utilizing Large Language Models (LLMs): + +``` + + + + + + + + + + + + + + + + + + + + +
DatasetLLMsF1 Metric
GPT-3.5
GPT-4
+``` + diff --git a/examples/classification/tmp/.gitignore b/examples/classification/tmp/.gitignore new file mode 100644 index 00000000..c96a04f0 --- /dev/null +++ b/examples/classification/tmp/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file diff --git a/extensions/underthesea_core/Cargo.toml b/extensions/underthesea_core/Cargo.toml index 07822dfd..dcb37373 100644 --- a/extensions/underthesea_core/Cargo.toml +++ b/extensions/underthesea_core/Cargo.toml @@ -29,6 +29,7 @@ serde = { version = "1.0", features = [ "derive" ] } regex = "1" rayon = "1.5" crfs = "0.1" +nalgebra = "0.29" [dependencies.pyo3] version = "0.15.0" diff --git a/extensions/underthesea_core/HISTORY.rst b/extensions/underthesea_core/HISTORY.rst index 00cc332c..7953f36c 100644 --- a/extensions/underthesea_core/HISTORY.rst +++ b/extensions/underthesea_core/HISTORY.rst @@ -2,6 +2,11 @@ History ================================================================================ +1.0.5 (2023-07-31) +-------------------------------------------------------------------------------- + +* Add metrics::consine_similarity + 1.0.4 (2023-04-28) -------------------------------------------------------------------------------- diff --git a/extensions/underthesea_core/src/lib.rs b/extensions/underthesea_core/src/lib.rs index 472462c3..e1a0caf2 100644 --- a/extensions/underthesea_core/src/lib.rs +++ b/extensions/underthesea_core/src/lib.rs @@ -5,6 +5,8 @@ use pyo3::prelude::*; use std::collections::HashSet; pub mod featurizers; +pub mod tfidf; +pub mod metrics; #[pyclass] pub struct CRFFeaturizer { diff --git a/extensions/underthesea_core/src/metrics/cosine_similarity.rs b/extensions/underthesea_core/src/metrics/cosine_similarity.rs new file mode 100644 index 00000000..d030141b --- /dev/null +++ b/extensions/underthesea_core/src/metrics/cosine_similarity.rs @@ -0,0 +1,35 @@ +//! Cosine similarity +//! +//! Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. +//! +//! # Author: Vu Anh +//! # Date: 2023-07-30 +use nalgebra::DVector; + +pub fn cosine_similarity(a: &Vec, b: &Vec) -> f64 { + let va = DVector::from_vec(a.clone()); + let vb = DVector::from_vec(b.clone()); + + va.dot(&vb) / (va.norm() * vb.norm()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cosine_similarity_1() { + let a = vec![1.0, 2.0, 3.0]; + let b = vec![1.0, 2.0, 3.0]; + + assert!((cosine_similarity(&a, &b) - 1.0).abs() < f64::EPSILON); + } + + #[test] + fn test_cosine_similarity_orthogonal() { + let a = vec![1.0, 0.0]; + let b = vec![0.0, 1.0]; + + assert!((cosine_similarity(&a, &b)).abs() < f64::EPSILON); + } +} \ No newline at end of file diff --git a/extensions/underthesea_core/src/metrics/mod.rs b/extensions/underthesea_core/src/metrics/mod.rs new file mode 100644 index 00000000..2d12bd3b --- /dev/null +++ b/extensions/underthesea_core/src/metrics/mod.rs @@ -0,0 +1,3 @@ +mod cosine_similarity; + +pub use cosine_similarity::cosine_similarity; \ No newline at end of file diff --git a/extensions/underthesea_core/src/tfidf.rs b/extensions/underthesea_core/src/tfidf.rs new file mode 100644 index 00000000..a4512c05 --- /dev/null +++ b/extensions/underthesea_core/src/tfidf.rs @@ -0,0 +1,160 @@ +//! tfidf.rs +//! +//! Provides functionality for computing Term Frequency-Inverse Document Frequency (TFIDF) vectors. +//! +//! Author: Vu Anh +//! Date: 2023-07-29 + +use std::collections::{HashMap, HashSet}; + +pub struct TfidfFeaturizer { + idf: Vec, + term_to_index: HashMap +} + +impl TfidfFeaturizer { + pub fn new() -> Self { + TfidfFeaturizer { + idf: Vec::new(), + term_to_index: HashMap::new() + } + } + + pub fn get_idf(&self) -> &Vec { + &self.idf + } + + fn compute_idf(&mut self, documents: &[Vec]) { + let n = documents.len() as f64; + + let mut word_freq = HashMap::new(); + + for doc in documents.iter() { + let mut seen_terms = HashSet::new(); + + for term in doc { + if !seen_terms.contains(term) { + let idx = match self.term_to_index.get(term) { + Some(&existing_idx) => existing_idx, + None => { + let new_idx = self.term_to_index.len(); + self.term_to_index.insert(term.clone(), new_idx); + new_idx + } + }; + *word_freq.entry(idx).or_insert(0.0) += 1.0; + seen_terms.insert(term.clone()); + } + } + } + + self.idf.resize(self.term_to_index.len(), 0.0); + for(&idx, &freq) in &word_freq { + self.idf[idx] = (n / freq).ln(); + } + } + + pub fn train(&mut self, texts: &[&str]){ + let documents: Vec> = texts.iter().map(|text| { + text.split_whitespace().map(|word| word.to_string()).collect() + }).collect(); + + self.compute_idf(&documents); + } + + pub fn predict(&self, texts: &Vec<&str>) -> Vec> { + texts.iter().map(|text| { + let words: Vec = text.split_whitespace().map(|word| word.to_string()).collect(); + let mut tfidf_vector = vec![0.0; self.term_to_index.len()]; + + // compute term frequence for this text + let mut tf = HashMap::new(); + for word in &words { + *tf.entry(word).or_insert(0.0) += 1.0; + } + + let keys: Vec<_> = tf.keys().cloned().collect(); + for word in keys { + if let Some(freq) = tf.get_mut(&word) { + *freq /= words.len() as f64; + } + } + + // compute tfidf values + for (word, &index) in &self.term_to_index { + if let Some(&term_freq) = tf.get(word) { + tfidf_vector[index] = term_freq * self.idf[index]; + } + } + + tfidf_vector + }).collect() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metrics::cosine_similarity; + + #[test] + fn test_constructor(){ + TfidfFeaturizer::new(); + } + + #[test] + fn test_train_tfidf() { + let mut tfidf_featurizer = TfidfFeaturizer::new(); + let texts = vec![ + "i love you", + "you hate me", + "me too" + ]; + + // Train tfidf vectorizer + tfidf_featurizer.train(&texts); + + // vocab: i love you hate me too + + let idf_actual = tfidf_featurizer.get_idf(); + assert_eq!(idf_actual.len(), 6); + + let idf_expected = vec![ + (3.0f64/1.0f64).ln(), + (3.0f64/1.0f64).ln(), + (3.0f64/2.0f64).ln(), + (3.0f64/1.0f64).ln(), + (3.0f64/2.0f64).ln(), + (3.0f64/1.0f64).ln(), + ]; + assert!((cosine_similarity(&idf_actual, &idf_expected) - 1.0).abs() < 1e-9); + + // Predict tfidf values + let output = tfidf_featurizer.predict(&texts); + assert!(output.len() == 3); + + // Document 1: "i love you" + let doc1_actual = output[0].clone(); + let doc1_expected = vec![ + (1.0f64 / 3.0f64) * (3.0f64).ln() , + (1.0f64 / 3.0f64) * (3.0f64).ln() , + (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln() , + 0.0f64, + 0.0f64, + 0.0f64 + ]; + assert!((cosine_similarity(&doc1_actual, &doc1_expected) - 1.0).abs() < 1e-9); + + // Document 2: "you hate me" + let doc2_actual = output[1].clone(); + let doc2_expected = vec![ + 0.0f64, + 0.0f64, + (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(), + (1.0f64 / 3.0f64) * (3.0f64 / 1.0f64).ln(), + (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(), + 0.0f64 + ]; + assert!((cosine_similarity(&doc2_actual, &doc2_expected) - 1.0).abs() < 1e-9); + } +} \ No newline at end of file diff --git a/extensions/underthesea_core/tests/models.rs b/extensions/underthesea_core/tests/models.rs index eddfa2ad..216f53bb 100644 --- a/extensions/underthesea_core/tests/models.rs +++ b/extensions/underthesea_core/tests/models.rs @@ -5,6 +5,6 @@ mod tests { #[test] fn test_crfs(){ let buf = fs::read("tests/wt_crf_2018_09_13.bin").unwrap(); - let model = crfs::Model::new(&buf).unwrap(); + let _model = crfs::Model::new(&buf).unwrap(); } } \ No newline at end of file From 15fe483d5b4f4ddae4a1441321449837a370f6bc Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 15:35:05 +0700 Subject: [PATCH 03/12] release crates --- .github/workflows/release-crates.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .github/workflows/release-crates.yaml diff --git a/.github/workflows/release-crates.yaml b/.github/workflows/release-crates.yaml new file mode 100644 index 00000000..fcb633b3 --- /dev/null +++ b/.github/workflows/release-crates.yaml @@ -0,0 +1,13 @@ +on: + pull_request: + branches: [ main ] + types: + - labeled + - synchronize + +jobs: + release-crates: + runs-on: ubuntu-latest + steps: + - name: Checkout + if: contains(env.PR_LABELS, 'ci-prompt') \ No newline at end of file From dd2707558e97e0cd31248d6e4d1ec7513737436e Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 15:36:35 +0700 Subject: [PATCH 04/12] update --- .github/workflows/release-crates.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/release-crates.yaml b/.github/workflows/release-crates.yaml index fcb633b3..fa2646b3 100644 --- a/.github/workflows/release-crates.yaml +++ b/.github/workflows/release-crates.yaml @@ -1,9 +1,6 @@ on: pull_request: branches: [ main ] - types: - - labeled - - synchronize jobs: release-crates: From e50023194887ca64f86173dd0a133d1c0465b3c5 Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 15:36:49 +0700 Subject: [PATCH 05/12] update --- .github/workflows/release-crates.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-crates.yaml b/.github/workflows/release-crates.yaml index fa2646b3..7c60491d 100644 --- a/.github/workflows/release-crates.yaml +++ b/.github/workflows/release-crates.yaml @@ -7,4 +7,4 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - if: contains(env.PR_LABELS, 'ci-prompt') \ No newline at end of file + uses: actions/checkout@v2 \ No newline at end of file From 63a2603db7ffc4bfa2bd69dd5d082df42ecf6d02 Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 15:40:44 +0700 Subject: [PATCH 06/12] update --- .github/workflows/release-crates.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release-crates.yaml b/.github/workflows/release-crates.yaml index 7c60491d..ac3b41f6 100644 --- a/.github/workflows/release-crates.yaml +++ b/.github/workflows/release-crates.yaml @@ -7,4 +7,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 \ No newline at end of file + uses: actions/checkout@v2 + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + - name: Build Release + working-directory: underthesea/extensions/underthesea_core + run: | + cargo build --release + \ No newline at end of file From cc53790baea795a57cf8ceef71a81bbb39f341dc Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 15:44:15 +0700 Subject: [PATCH 07/12] update --- .github/workflows/release-crates.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release-crates.yaml b/.github/workflows/release-crates.yaml index ac3b41f6..b16f6d8a 100644 --- a/.github/workflows/release-crates.yaml +++ b/.github/workflows/release-crates.yaml @@ -14,7 +14,8 @@ jobs: toolchain: stable override: true - name: Build Release - working-directory: underthesea/extensions/underthesea_core + working-directory: extensions/underthesea_core run: | + cargo login cargo build --release \ No newline at end of file From 8537e651e3e4585234f7116a2f0fb30eec8b9cbf Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 15:47:16 +0700 Subject: [PATCH 08/12] update --- .github/workflows/release-crates.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release-crates.yaml b/.github/workflows/release-crates.yaml index b16f6d8a..70db490e 100644 --- a/.github/workflows/release-crates.yaml +++ b/.github/workflows/release-crates.yaml @@ -15,6 +15,8 @@ jobs: override: true - name: Build Release working-directory: extensions/underthesea_core + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} run: | cargo login cargo build --release From 56d310a0b77c87ce1e51fac8099485185e9609ed Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 15:49:06 +0700 Subject: [PATCH 09/12] update --- .github/workflows/release-crates.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release-crates.yaml b/.github/workflows/release-crates.yaml index 70db490e..01608d96 100644 --- a/.github/workflows/release-crates.yaml +++ b/.github/workflows/release-crates.yaml @@ -18,6 +18,5 @@ jobs: env: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} run: | - cargo login cargo build --release \ No newline at end of file From 6410d8a429d4de701c20df3d3053f74b3cc9847f Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 15:54:01 +0700 Subject: [PATCH 10/12] update --- .github/workflows/release-crates.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release-crates.yaml b/.github/workflows/release-crates.yaml index 01608d96..00d89412 100644 --- a/.github/workflows/release-crates.yaml +++ b/.github/workflows/release-crates.yaml @@ -19,4 +19,5 @@ jobs: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} run: | cargo build --release + cargo publish \ No newline at end of file From 8343ad76a6a79a392dddbdda9415f3b70f1a7e4d Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 16:19:36 +0700 Subject: [PATCH 11/12] update --- .github/workflows/crates-build.yml | 22 +++++++++++++++++++ ...release-crates.yaml => crates-release.yml} | 6 ++--- extensions/underthesea_core/HISTORY.rst | 1 + .../underthesea_core/src/features/mod.rs | 3 +++ .../tfidf_featurizer.rs} | 2 +- extensions/underthesea_core/src/lib.rs | 2 +- 6 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/crates-build.yml rename .github/workflows/{release-crates.yaml => crates-release.yml} (87%) create mode 100644 extensions/underthesea_core/src/features/mod.rs rename extensions/underthesea_core/src/{tfidf.rs => features/tfidf_featurizer.rs} (99%) diff --git a/.github/workflows/crates-build.yml b/.github/workflows/crates-build.yml new file mode 100644 index 00000000..43c59f0f --- /dev/null +++ b/.github/workflows/crates-build.yml @@ -0,0 +1,22 @@ +on: + pull_request: + branches: [ main ] + +jobs: + crates-build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + - name: Build Release + working-directory: extensions/underthesea_core + run: | + cargo test + cargo build --release + + \ No newline at end of file diff --git a/.github/workflows/release-crates.yaml b/.github/workflows/crates-release.yml similarity index 87% rename from .github/workflows/release-crates.yaml rename to .github/workflows/crates-release.yml index 00d89412..9a0a1a95 100644 --- a/.github/workflows/release-crates.yaml +++ b/.github/workflows/crates-release.yml @@ -1,9 +1,9 @@ on: - pull_request: + push: branches: [ main ] jobs: - release-crates: + crates-build: runs-on: ubuntu-latest steps: - name: Checkout @@ -18,6 +18,6 @@ jobs: env: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} run: | - cargo build --release cargo publish + \ No newline at end of file diff --git a/extensions/underthesea_core/HISTORY.rst b/extensions/underthesea_core/HISTORY.rst index 7953f36c..5fc041eb 100644 --- a/extensions/underthesea_core/HISTORY.rst +++ b/extensions/underthesea_core/HISTORY.rst @@ -6,6 +6,7 @@ History -------------------------------------------------------------------------------- * Add metrics::consine_similarity +* Add features::tfidf_featurizer 1.0.4 (2023-04-28) -------------------------------------------------------------------------------- diff --git a/extensions/underthesea_core/src/features/mod.rs b/extensions/underthesea_core/src/features/mod.rs new file mode 100644 index 00000000..5074f844 --- /dev/null +++ b/extensions/underthesea_core/src/features/mod.rs @@ -0,0 +1,3 @@ +mod tfidf_featurizer; + +pub use tfidf_featurizer::TfidfFeaturizer; \ No newline at end of file diff --git a/extensions/underthesea_core/src/tfidf.rs b/extensions/underthesea_core/src/features/tfidf_featurizer.rs similarity index 99% rename from extensions/underthesea_core/src/tfidf.rs rename to extensions/underthesea_core/src/features/tfidf_featurizer.rs index a4512c05..2c0a0f01 100644 --- a/extensions/underthesea_core/src/tfidf.rs +++ b/extensions/underthesea_core/src/features/tfidf_featurizer.rs @@ -1,4 +1,4 @@ -//! tfidf.rs +//! tfidf_featurizer.rs //! //! Provides functionality for computing Term Frequency-Inverse Document Frequency (TFIDF) vectors. //! diff --git a/extensions/underthesea_core/src/lib.rs b/extensions/underthesea_core/src/lib.rs index 8eb03899..e9c22239 100644 --- a/extensions/underthesea_core/src/lib.rs +++ b/extensions/underthesea_core/src/lib.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; use std::collections::HashSet; pub mod featurizers; -pub mod tfidf; +pub mod features; pub mod metrics; // pub mod logistic; From 25dc4e17ea8971ac8c734a90f6258161029ff8dd Mon Sep 17 00:00:00 2001 From: Vu Anh Date: Mon, 31 Jul 2023 16:33:37 +0700 Subject: [PATCH 12/12] update --- .github/workflows/crates-release.yml | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/workflows/crates-release.yml b/.github/workflows/crates-release.yml index 9a0a1a95..4a5563f2 100644 --- a/.github/workflows/crates-release.yml +++ b/.github/workflows/crates-release.yml @@ -3,7 +3,7 @@ on: branches: [ main ] jobs: - crates-build: + crates-release: runs-on: ubuntu-latest steps: - name: Checkout @@ -13,11 +13,22 @@ jobs: with: toolchain: stable override: true + - name: Check if version already exists on crates.io + working-directory: extensions/underthesea_core + id: check-version + run: | + CURRENT_VERSION=$(cargo pkgid | cut -d# -f2 | cut -d: -f2) + EXISTS=$(cargo search "underthesea_core" --limit 100 | grep "$CURRENT_VERSION") + if [[ -z "$EXISTS" ]]; then + echo "The version $CURRENT_VERSION does not exist on crates.io" + echo "::set-output name=exists::false" + else + echo "The version $CURRENT_VERSION already exists on crates.io" + echo "::set-output name=exists::true" + fi - name: Build Release + if: steps.check-version.outputs.exists == 'false' working-directory: extensions/underthesea_core env: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} - run: | - cargo publish - - \ No newline at end of file + run: cargo publish \ No newline at end of file