(load-string (slurp "https://raw.githubusercontent.com/scicloj/clojure-data-tutorials/main/header.edn"))

^:kindly/hide-code
(ns index
  (:require
   [clojure.data.csv :as csv]
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.metamorph.ml.text :as text]
   [tablecloth.api :as tc]
   [tablecloth.column.api :as tcc]
   [scicloj.metamorph.ml :as ml]
   [tech.v3.dataset.modelling :as ds-mod]
   [scicloj.clay.v2.api :as clay]))

(comment
  (require '[scicloj.clay.v2.api :as clay])
  (clay/start!)
  (clay/make! {:source-path "notebooks/index.clj"
               :show false}))

;; The following code shows how to perform text classification on a Kaggle
;; dataset and produce a submission file, ready to be uploaded to
;; Kaggle for scoring.
;;
;; It makes use of the tidy text / TFIDF functionality present in `metamorph.ml`
;; and the ability of the xgboost model to handle tidy text data as input.
;;
;; First we need a function to tokenize a line of text.
;; The simplest such function is:
(defn- tokenize-fn [text]
  (str/split text #" "))
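
;; For example, applied to a made-up sentence, it simply splits on single spaces:
(tokenize-fn "a simple example sentence")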


;; It does not do any text normalization, which is usually needed in NLP tasks
;; in order to obtain a more general model.
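;;
;; As a minimal sketch (not used in the rest of this notebook), a normalizing
;; tokenizer could lower-case the text and split on runs of non-alphanumeric
;; characters; the function name here is our own invention:
(defn- normalizing-tokenize-fn [text]
  (->> (str/split (str/lower-case text) #"[^a-z0-9]+")
       (remove str/blank?)))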
;;
;; The following reads a file from disk line by line and converts it on the fly
;; to the `tidy text` representation, in which each word
;; is a row in a dataset.
;;
;; `line-parse-fn` needs to split an input line into [text meta],
;; and the `text` is then further handled by `tokenize-fn` and split into tokens.
;; In this dataset, the text is in field 4 and the label in field 5.

;; We ignore all other columns for now:
(defn- line-parse-fn [line]
  [(nth line 3)
   (Integer/parseInt (nth line 4))])
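
;; For example, applied to a made-up parsed CSV row with five fields
;; (only fields 4 and 5 are used):
(line-parse-fn ["1" "field-2" "field-3" "some tweet text" "0"])
;; => ["some tweet text" 0]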

;; This triggers the parsing and produces a (seq of) "long" datasets
;; (one for our small text)
;; and the vocabulary obtained during parsing.
(def tidy-train
  (text/->tidy-text (csv/read-csv (io/reader "train.csv"))
                    seq
                    line-parse-fn
                    tokenize-fn
                    :skip-lines 1))

(def tidy-train-ds
  (-> tidy-train :datasets first))
;; The combination of the columns :document, :token-pos and :token-index,
;; together with the vocabulary table, is an exact representation of the text,
;; unless we normalize it as part of the `tokenize-fn`.
;;
;; `meta` is any other per-row information to be kept, usually the "label"
;; in the case of training data.

tidy-train-ds

;; The lookup table allows converting from :token-idx to words and
;; back, if needed.
(def train--token-lookup-table (:token-lookup-table tidy-train))
(map str (take 20 train--token-lookup-table))
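
;; Assuming the lookup table maps tokens to their :token-idx (as the entries
;; above suggest), we can invert it to go from an index back to the word;
;; `idx->token` is our own helper name:
(require '[clojure.set :as set])
(def idx->token (set/map-invert train--token-lookup-table))
;; e.g. the token stored at index 1 (the index is arbitrary here):
(idx->token 1)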

;; As we can see, the tokens are not cleaned / standardized at all.
;; This also results in a rather large vocabulary size of:
(count train--token-lookup-table)


;; Now we convert the text into the bag-of-words format, which loses
;; any word order, and calculates a metric which is known to work well
;; for text classification: the so-called TFIDF score.
(def train-tfidf
  (text/->tfidf tidy-train-ds))

;; The resulting table conceptually represents three "sparse matrices",
;; where :document and :token-idx are the x,y coordinates and the matrix cell
;; values are :token-count, term frequency (:tf) or TFIDF.
;;
;; Rows not present (the large majority) stand for cells with value 0.

;; A subset of machine learning algorithms can deal with such sparse matrices
;; directly, without the need to convert them into
;; dense matrices first, which is in most cases impossible due to the memory
;; consumption.
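
;; To make this triplet ("coordinate") representation concrete, here is a toy
;; example: the 2x3 matrix [[0 5 0] [2 0 0]] stored as one row per non-zero cell:
(tc/dataset {:document    [0 1]
             :token-idx   [1 0]
             :token-count [5 2]})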

;; The train-tfidf dataset therefore represents 3 sparse matrices with
;; dimensions
(tcc/reduce-max (:document train-tfidf))
;; times
(tcc/reduce-max (:token-idx train-tfidf))
;; times 3
;; =

(* (tcc/reduce-max (:document train-tfidf))
   (tcc/reduce-max (:token-idx train-tfidf))
   3)

;; while only having shape:

(tc/shape train-tfidf)

;; This is because most matrix elements are 0, as
;; any text does "not contain" most words.
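;;
;; We can quantify this: the fraction of cells actually present is roughly the
;; row count of `train-tfidf` divided by the cell count of one of the matrices
;; (a rough estimate, ignoring off-by-one in the dimensions):
(double
 (/ (tc/row-count train-tfidf)
    (* (tcc/reduce-max (:document train-tfidf))
       (tcc/reduce-max (:token-idx train-tfidf)))))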
;;
;; As TFIDF (and its variants) is one of the most common numeric representations for text,
;; support for "sparse matrices" in a model is a prerequisite for NLP.
;;
;; Only in recent years have we had "dense text representations" based on "embeddings",
;; which will not be discussed here.

;; Now we get the data ready for training.

(def train-ds
  (-> train-tfidf
      (tc/rename-columns {:meta :label})
      (tc/select-columns [:document :token-idx :tfidf :label]) ;; we only need those
      (ds-mod/set-inference-target [:label])))

train-ds

(def n-sparse-columns (inc (tcc/reduce-max (train-ds :token-idx))))

;; The model used is from the library `scicloj.ml.xgboost`: the well-known xgboost model
;; behind a wrapper which makes it work with tidy text data.
;;
;; We use the :tfidf column as the "feature".

(require '[scicloj.ml.xgboost])
;; registers the model under the key :xgboost/classification

(def model
  (ml/train train-ds {:model-type :xgboost/classification
                      :sparse-column :tfidf
                      :seed 123
                      :num-class 2
                      :n-sparse-columns n-sparse-columns}))


;; Now we have a trained model, which we can use for prediction on the test data.

;; This time we do the parsing and the TFIDF conversion in one go.
;;
;; Important here:
;;
;; We pass in the vocabulary obtained before, in order to be sure that
;; :token-idx maps to the same words in both datasets. "New tokens"
;; are ignored and mapped to a special token, "[UNKNOWN]".
(def tfidf-test-ds
  (->
   (text/->tidy-text (csv/read-csv (io/reader "test.csv"))
                     seq
                     (fn [line]
                       [(nth line 3) {:id (first line)}])
                     tokenize-fn
                     :skip-lines 1
                     :new-token-behaviour :as-unknown
                     :token->index-map train--token-lookup-table)
   :datasets
   first
   text/->tfidf
   (tc/select-columns [:document :token-idx :tfidf :meta])
   ;; the :id is needed for Kaggle
   (tc/add-column
    :id (fn [df] (map :id (:meta df))))
   (tc/drop-columns [:meta])))

;; This gives us a dataset which can be passed into the `predict` function of `metamorph.ml`.
tfidf-test-ds

(def prediction
  (ml/predict tfidf-test-ds model))

;; The raw predictions contain the "document" each prediction refers to.
;; We can use this to match the predictions with the input "ids" in order to
;; produce the format required by Kaggle.
prediction

(->
 (tc/right-join prediction tfidf-test-ds :document)
 (tc/unique-by [:id :label])
 (tc/select-columns [:id :label])
 (tc/update-columns {:label (partial map int)})
 (tc/rename-columns {:label :target})
 (tc/write-csv! "submission.csv"))

;; The produced CSV file can be uploaded to Kaggle for scoring.

(->>
 (io/reader "submission.csv")
 line-seq
 (take 10))