Skip to content

Commit

Permalink
new text classification tutorial (#6)
Browse files Browse the repository at this point in the history
  • Loading branch information
behrica authored Nov 3, 2024
1 parent eb1a498 commit 4d46111
Show file tree
Hide file tree
Showing 12 changed files with 19,088 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ classes
.calva
.vscode
docs/

8 changes: 8 additions & 0 deletions notebooks/toc.edn
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,12 @@
:cmd "clj render.clj"
:tags [:onnx :ml]}

{:created "2024-11-03"
:updated "2024-11-03"
:title "Text classification with metamorph.ml and xgboost"
:url "projects/ml/text-classification/index.html"
:source-path "projects/ml/text-classification"
:cmd "clj render.clj"
:tags [:nlp :ml]}

]
32 changes: 32 additions & 0 deletions projects/ml/text-classification/.devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Base image is parameterized so the devcontainer can pin a different
# Clojure/JDK combination (see devcontainer.json "build.args").
ARG BASE_IMAGE=temurin-21-tools-deps-jammy
FROM clojure:${BASE_IMAGE}

# Non-root user; UID/GID default to 1000 and can be overridden to match
# the host user so bind-mounted files keep correct ownership.
ARG USERNAME=vscode
ARG USER_UID=1000
ARG USER_GID=$USER_UID

# Create the user
RUN groupadd --gid $USER_GID $USERNAME \
    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
    #
    # [Optional] Add sudo support. Omit if you don't need to install software after connecting.
    && apt-get update \
    && apt-get install -y --no-install-recommends sudo \
    && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
    && chmod 0440 /etc/sudoers.d/$USERNAME \
    # Remove the apt package lists to keep the image layer small.
    && rm -rf /var/lib/apt/lists/*


# [Optional] Set the default user. Omit if you want to keep the default as root.
USER $USERNAME
SHELL ["/bin/bash", "-ec"]
ENTRYPOINT ["bash"]


# Prepare clojure tools
RUN clojure -Ttools list && \
    clojure -Ttools install io.github.seancorfield/clj-new '{:git/tag "v1.2.404" :git/sha "d4a6508"}' :as clj-new && \
    clojure -Ttools install-latest :lib io.github.seancorfield/deps-new :as new && \
    clojure -Ttools list

# libgomp1: OpenMP runtime — presumably required by xgboost's native library
# (this project depends on scicloj.ml.xgboost); TODO confirm.
RUN sudo apt-get update && \
    sudo apt-get install -y --no-install-recommends lsb-release libgomp1 && \
    sudo rm -rf /var/lib/apt/lists/*
26 changes: 26 additions & 0 deletions projects/ml/text-classification/.devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/scicloj/devcontainer-templates/tree/main/src/basecloj
{
  "name": "text classification",
  "build": {
    "dockerfile": "Dockerfile",
    "args": {
      // Must match a tag of the official `clojure` Docker image.
      "BASE_IMAGE": "temurin-21-tools-deps-jammy",
      // Reuse the host user name inside the container so bind-mounted files
      // keep correct ownership; falls back to "vscode" when $USER is unset.
      "USERNAME": "${localEnv:USER:vscode}"
    }
  },
  "remoteUser": "${localEnv:USER:vscode}",
  "containerUser": "${localEnv:USER:vscode}",
  "features": {
    "ghcr.io/devcontainers/features/git:1": {},
    // Quarto CLI — used by Clay to render the notebook to HTML (see render.clj).
    "ghcr.io/rocker-org/devcontainer-features/quarto-cli:1": {}

  },
  "customizations": {
    "vscode": {
      "extensions": [
        // Calva: Clojure/ClojureScript IDE support for VS Code.
        "betterthantomorrow.calva"
      ]
    }
  }
}
1 change: 1 addition & 0 deletions projects/ml/text-classification/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
nlp-getting-started.zip
11 changes: 11 additions & 0 deletions projects/ml/text-classification/deps.edn
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
;; Dependencies for the text-classification tutorial.
{:deps {;; xgboost wrapper that accepts tidy-text / sparse data (git dep, pinned by sha).
        scicloj.ml.xgboost {:git/url "https://github.com/scicloj/scicloj.ml.xgboost"
                            :git/sha "db8af224d349db51e2cce0444d0caef2abc26f81"}

        org.clojure/clojure {:mvn/version "1.12.0"}
        ;; Clay renders notebooks/index.clj to HTML (driven by render.clj).
        org.scicloj/clay {:mvn/version "2-alpha78"}
        scicloj/tablecloth {:mvn/version "7.029.2"}
        org.clojure/data.csv {:mvn/version "1.1.0"}
        ;; metamorph.ml provides the tidy-text / TFIDF helpers used in the notebook.
        org.scicloj/metamorph.ml {:git/url "https://github.com/scicloj/metamorph.ml"
                                  :git/sha "6cd751f7c06dd2964a9ea108204f059ee29bb66c"}}
 ;; Headless JVM so Clay does not try to open AWT windows during rendering.
 :aliases {:dev {:jvm-opts ["-Djava.awt.headless=true"]}}
}
208 changes: 208 additions & 0 deletions projects/ml/text-classification/notebooks/index.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
;; Fetch and evaluate the shared tutorial header at load time.
;; NOTE(review): `load-string` on remote content executes whatever the URL
;; serves — acceptable for a tutorial, but worth being aware of.
(load-string (slurp "https://raw.githubusercontent.com/scicloj/clojure-data-tutorials/main/header.edn"))

^:kindly/hide-code
(ns index
  (:require
   [clojure.data.csv :as csv]
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.metamorph.ml.text :as text]
   [tablecloth.api :as tc]
   [tablecloth.column.api :as tcc]
   [scicloj.metamorph.ml :as ml]
   [tech.v3.dataset.modelling :as ds-mod]
   [scicloj.clay.v2.api :as clay]))

;; REPL helpers: start Clay and render this notebook interactively.
(comment
  (require '[scicloj.clay.v2.api :as clay])
  (clay/start!)
  (clay/make! {:source-path "notebooks/index.clj"
               :show false
               }))

;; The following code shows how to perform text classification from a Kaggle
;; dataset and make a submission file, ready to get uploaded to
;;Kaggle for scoring.
;;
;; It makes use of the tidy text / TFIDF functionality present in `metamorph.ml`
;; and the ability of the xgboost model to handle tidy text data as input.
;;
;; First we need a fn to tokenize a line of text
;; The simplest such function is:
(defn- tokenize-fn
  "Splits `text` into tokens at single space characters.
  Performs no normalization (no lowercasing, no punctuation handling)."
  [text]
  (str/split text #" "))


;; It does not do any text normalization, which is always required in NLP tasks
;; in order to have a more general model.
;;
;; The following reads line-by-line a file from disk and converts it on the fly
;; to the `tidy text` representation, in which each word
;; is a row in a dataset.
;;
;; `line-parse-fn` needs to split an input line into [text meta],
;; and the `text` is then further handled by `tokenize-fn` and split into tokens.
;; The format of the data has the text in field 4 and the label in 5.

;; We ignore all other columns so far:
(defn- line-parse-fn
  "Extracts [text label] from one CSV row: the text is column 4,
  the label in column 5 is parsed to an int."
  [line]
  (let [[_ _ _ text label] line]
    [text (Integer/parseInt label)]))

;; This triggers the parsing and produces a (seq of) "long" datasets
;; (1 for our small text)
;; and the vocabulary obtained during parsing.
(def tidy-train
  (text/->tidy-text (csv/read-csv (io/reader "train.csv")) ;; parses train.csv line by line
                    seq
                    line-parse-fn
                    tokenize-fn
                    :skip-lines 1)) ;; skip the CSV header row

(def tidy-train-ds
  (-> tidy-train :datasets first))
;; The combination of columns :document, :token-pos and :token-index
;; together with the vocabulary table is an exact representation of the text,
;; unless we normalize it as part of the `tokenize-fn`.
;;
;; `meta` is any other information of a row to be kept, usually the "label"
;; in case of training data.

tidy-train-ds

;; The lookup table allows converting from :token-idx to words and
;; back if needed.
(def train--token-lookup-table (:token-lookup-table tidy-train))
(map str (take 20 train--token-lookup-table))

;; As we can see, the tokens are not cleaned / standardized at all.
;; This gives as well a large vocabulary size of
(count train--token-lookup-table)


;; Now we convert the text into a bag-of-words format, which loses
;; any word order and calculates a metric which is known to work well
;; for text classification, the so-called TFIDF score.
(def train-tfidf
  (text/->tfidf tidy-train-ds))

;; The resulting table represents conceptually well three "sparse matrices",
;; where :document and :token-idx are x,y coordinates and matrix cell values
;; are :token-count, term-frequency (:tf) or TFIDF
;;
;; Rows not present (the large majority) are 0 values.

;; A subset of machine learning algorithms can deal with sparse matrices,
;; without the need to convert them into
;; dense matrices first, which is in most cases impossible due to the memory
;; consumption

;; The train-tfidf dataset represents therefore 3 sparse matrices with
;; dimensions
(tcc/reduce-max (:document train-tfidf)) ;; highest :document id
;; times
(tcc/reduce-max (:token-idx train-tfidf)) ;; highest :token-idx (vocabulary size)
;; times 3
;; =

(* (tcc/reduce-max (:document train-tfidf))
   (tcc/reduce-max (:token-idx train-tfidf))
   3)


;; while only having shape:

(tc/shape train-tfidf)

;; This is because most matrix elements are 0, as
;; any text does "not contain" most words.
;;
;; As TFIDF (and its variants) are one of the most common numeric representations for text,
;; "sparse matrices" and models supporting them are a prerequisite for NLP.
;;
;; Only since a few years we have "dense text representations" based on "embeddings",
;; which will not be discussed here today.

;; Now we get the data ready for training.

(def train-ds
  (-> train-tfidf
      (tc/rename-columns {:meta :label})
      (tc/select-columns [:document :token-idx :tfidf :label]) ;; we only need those
      (ds-mod/set-inference-target [:label])))

train-ds

;; Number of sparse feature columns = highest :token-idx + 1 (indices are 0-based).
(def n-sparse-columns (inc (tcc/reduce-max (train-ds :token-idx))))

;; The model used is from library `scicloj.ml.xgboost`, which is the well known xgboost model
;; behind a wrapper to make it work with tidy text data.
;;
;; We use the :tfidf column as the "feature".

(require '[scicloj.ml.xgboost])
;; registers the model under key :xgboost/classification

(def model
  (ml/train train-ds {:model-type :xgboost/classification
                      :sparse-column :tfidf ;; column holding the sparse feature values
                      :seed 123             ;; fixed seed for reproducible training
                      :num-class 2          ;; two target classes
                      :n-sparse-columns n-sparse-columns}))


;; Now we have a trained model, which we can use for prediction on the test data.

;; This time we do parsing and tfidf in one go.
;;
;; Important here:
;;
;; We pass the vocabulary "obtained before" in order to be sure, that
;; :token-idx maps to the same words in both datasets. In case of "new tokens",
;; we ignore them and map them to a special token, "[UNKNOWN]"
(def tfidf-test-ds
  (->
   (text/->tidy-text (csv/read-csv (io/reader "test.csv"))
                     seq
                     (fn [line]
                       ;; test.csv has no label column; keep the row id as meta instead
                       [(nth line 3) {:id (first line)}])
                     tokenize-fn
                     :skip-lines 1
                     :new-token-behaviour :as-unknown
                     :token->index-map train--token-lookup-table)
   :datasets
   first
   text/->tfidf
   (tc/select-columns [:document :token-idx :tfidf :meta])
   ;; extract the :id for Kaggle out of the :meta map
   (tc/add-column
    :id (fn [df] (map
                  #(:id %)
                  (:meta df))))
   (tc/drop-columns [:meta])))

;; This gives the dataset which can be passed into the `predict` function of `metamorph.ml`
tfidf-test-ds

(def prediction
  (ml/predict tfidf-test-ds model))

;; The raw predictions contain the "document" each prediction is about.
;; This we can use to match predictions and the input "ids" in order to produce the format
;; required by Kaggle
prediction

(->
 (tc/right-join prediction tfidf-test-ds :document)
 (tc/unique-by [:id :label])                    ;; collapse to one row per document
 (tc/select-columns [:id :label])
 (tc/update-columns {:label (partial map int)}) ;; Kaggle expects integer labels
 (tc/rename-columns {:label :target})
 (tc/write-csv! "submission.csv"))

;; The produced CSV file can be uploaded to Kaggle for scoring.

;; Preview the first lines of the submission file:
(->>
 (io/reader "submission.csv")
 line-seq
 (take 10))
11 changes: 11 additions & 0 deletions projects/ml/text-classification/render.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
;; Build script: renders notebooks/index.clj to docs/ as Quarto-based HTML.
;; Invoked as `clj render.clj` (see notebooks/toc.edn :cmd).
(ns render
  (:require [scicloj.clay.v2.api :as clay]))

(clay/make! {:format [:quarto :html]
             :show false ;; do not open a browser
             :base-source-path "notebooks"
             :source-path ["index.clj"]
             :base-target-path "docs"
             :clean-up-target-dir true})
;; Exit explicitly: Clay may leave non-daemon threads alive, which would
;; otherwise keep the JVM (and a CI job) running.
(System/exit 0)

Loading

0 comments on commit 4d46111

Please sign in to comment.