Commit 4d46111

new text classification tutorial (#6)

1 parent eb1a498 commit 4d46111

12 files changed: +19088 -0 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -9,3 +9,4 @@ classes
 .calva
 .vscode
 docs/
+

notebooks/toc.edn

Lines changed: 8 additions & 0 deletions

@@ -53,4 +53,12 @@
   :cmd "clj render.clj"
   :tags [:onnx :ml]}

+ {:created "2024-11-03"
+  :updated "2024-11-03"
+  :title "Text classification with metamorph.ml and xgboost"
+  :url "projects/ml/text-classification/index.html"
+  :source-path "projects/ml/text-classification"
+  :cmd "clj render.clj"
+  :tags [:nlp :ml]}
+
 ]
Dockerfile

Lines changed: 32 additions & 0 deletions

ARG BASE_IMAGE=temurin-21-tools-deps-jammy
FROM clojure:${BASE_IMAGE}

ARG USERNAME=vscode
ARG USER_UID=1000
ARG USER_GID=$USER_UID

# Create the user
RUN groupadd --gid $USER_GID $USERNAME \
    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
    #
    # [Optional] Add sudo support. Omit if you don't need to install software after connecting.
    && apt-get update \
    && apt-get install -y sudo \
    && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
    && chmod 0440 /etc/sudoers.d/$USERNAME


# [Optional] Set the default user. Omit if you want to keep the default as root.
USER $USERNAME
SHELL ["/bin/bash", "-ec"]
ENTRYPOINT ["bash"]


# Prepare clojure tools
RUN clojure -Ttools list && \
    clojure -Ttools install io.github.seancorfield/clj-new '{:git/tag "v1.2.404" :git/sha "d4a6508"}' :as clj-new && \
    clojure -Ttools install-latest :lib io.github.seancorfield/deps-new :as new && \
    clojure -Ttools list

RUN sudo apt-get update && \
    sudo apt-get install -y lsb-release libgomp1
devcontainer.json

Lines changed: 26 additions & 0 deletions

// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/scicloj/devcontainer-templates/tree/main/src/basecloj
{
  "name": "text classification",
  "build": {
    "dockerfile": "Dockerfile",
    "args": {
      "BASE_IMAGE": "temurin-21-tools-deps-jammy",
      "USERNAME": "${localEnv:USER:vscode}"
    }
  },
  "remoteUser": "${localEnv:USER:vscode}",
  "containerUser": "${localEnv:USER:vscode}",
  "features": {
    "ghcr.io/devcontainers/features/git:1": {},
    "ghcr.io/rocker-org/devcontainer-features/quarto-cli:1": {}
  },
  "customizations": {
    "vscode": {
      "extensions": [
        "betterthantomorrow.calva"
      ]
    }
  }
}
Lines changed: 1 addition & 0 deletions

nlp-getting-started.zip
deps.edn

Lines changed: 11 additions & 0 deletions

{:deps {scicloj.ml.xgboost {:git/url "https://github.com/scicloj/scicloj.ml.xgboost"
                            :git/sha "db8af224d349db51e2cce0444d0caef2abc26f81"}

        org.clojure/clojure {:mvn/version "1.12.0"}
        org.scicloj/clay {:mvn/version "2-alpha78"}
        scicloj/tablecloth {:mvn/version "7.029.2"}
        org.clojure/data.csv {:mvn/version "1.1.0"}
        org.scicloj/metamorph.ml {:git/url "https://github.com/scicloj/metamorph.ml"
                                  :git/sha "6cd751f7c06dd2964a9ea108204f059ee29bb66c"}}
 :aliases {:dev {:jvm-opts ["-Djava.awt.headless=true"]}}}
notebooks/index.clj

Lines changed: 208 additions & 0 deletions

(load-string (slurp "https://raw.githubusercontent.com/scicloj/clojure-data-tutorials/main/header.edn"))

^:kindly/hide-code
(ns index
  (:require
   [clojure.data.csv :as csv]
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.metamorph.ml.text :as text]
   [tablecloth.api :as tc]
   [tablecloth.column.api :as tcc]
   [scicloj.metamorph.ml :as ml]
   [tech.v3.dataset.modelling :as ds-mod]
   [scicloj.clay.v2.api :as clay]))

(comment
  (require '[scicloj.clay.v2.api :as clay])
  (clay/start!)
  (clay/make! {:source-path "notebooks/index.clj"
               :show false}))
;; The following code shows how to perform text classification on a Kaggle
;; dataset and produce a submission file, ready to be uploaded to
;; Kaggle for scoring.
;;
;; It makes use of the tidy text / TFIDF functionality present in `metamorph.ml`
;; and the ability of the xgboost model to handle tidy text data as input.
;;
;; First we need a fn to tokenize a line of text.
;; The simplest such function is:
(defn- tokenize-fn [text]
  (str/split text #" "))

;; It does not do any text normalization, which NLP tasks usually require
;; in order to obtain a more general model.
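;; As a sketch of what such normalization could look like (illustrative only;
;; this helper is hypothetical and not used in the pipeline below), a
;; tokenizer might lower-case the text and strip punctuation before splitting:
(comment
  (defn- normalizing-tokenize-fn [text]
    (-> text
        str/lower-case
        (str/replace #"[^a-z0-9@# ]" " ")
        (str/split #"\s+"))))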
;;
;; The following reads a file from disk line by line and converts it on the fly
;; to the `tidy text` representation, in which each word
;; is a row in a dataset.
;;
;; `line-parse-fn` needs to split an input line into [text meta];
;; the `text` is then further handled by `tokenize-fn` and split into tokens.
;; The format of the data has the text in field 4 and the label in field 5.
;; We ignore all other columns so far:
(defn- line-parse-fn [line]
  [(nth line 3)
   (Integer/parseInt (nth line 4))])
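;; For orientation: the Kaggle "nlp-getting-started" data has the columns
;; id, keyword, location, text, target (assuming the standard layout of that
;; competition), so zero-based index 3 is the tweet text and index 4 the
;; 0/1 target label.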
;; This triggers the parsing and produces a (seq of) "long" datasets
;; (one for our small text)
;; and the vocabulary obtained during parsing.
(def tidy-train
  (text/->tidy-text (csv/read-csv (io/reader "train.csv"))
                    seq
                    line-parse-fn
                    tokenize-fn
                    :skip-lines 1))

(def tidy-train-ds
  (-> tidy-train :datasets first))

;; The combination of the columns :document, :token-pos and :token-idx,
;; together with the vocabulary table, is an exact representation of the text,
;; unless we normalize it as part of the `tokenize-fn`.
;;
;; `meta` is any other information of a row to be kept, usually the "label"
;; in case of training data.

tidy-train-ds

;; The lookup table allows converting from :token-idx back to words
;; if needed.
(def train--token-lookup-table (:token-lookup-table tidy-train))
(map str (take 20 train--token-lookup-table))

;; As we can see, the tokens are not cleaned / standardized at all.
;; This also results in a large vocabulary size of
(count train--token-lookup-table)
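;; A minimal sketch of working with the lookup table (assuming it behaves as a
;; token->index map, which its use as :token->index-map further below
;; suggests): inverting it translates a :token-idx back into its word.
(comment
  (def index->token
    (into {}
          (map (fn [[token idx]] [idx token]))
          train--token-lookup-table))
  (index->token 1))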
;; Now we convert the text into a bag-of-words format, which loses
;; any word order and calculates a metric which is known to work well
;; for text classification: the so-called TFIDF score.
(def train-tfidf
  (text/->tfidf tidy-train-ds))
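;; For reference, a common TFIDF variant is computed per token t and document
;; d as follows (the exact variant `->tfidf` uses may differ in details):
;;
;;   tf(t, d)    = count of t in d / number of tokens in d
;;   idf(t)      = log(N / df(t)), with N the number of documents and df(t)
;;                 the number of documents containing t
;;   tfidf(t, d) = tf(t, d) * idf(t)
;;
;; A toy calculation, purely illustrative:
(comment
  (let [tf     (/ 2.0 10.0) ;; token occurs 2 times in a 10-token document
        n-docs 100.0
        df     5.0]         ;; token occurs in 5 of the 100 documents
    (* tf (Math/log (/ n-docs df)))))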
;; The resulting table conceptually represents three "sparse matrices",
;; where :document and :token-idx are the x,y coordinates and the matrix cell
;; values are :token-count, term frequency (:tf) or TFIDF.
;;
;; Rows not present (the large majority) stand for 0 values.

;; A subset of machine learning algorithms can deal with sparse matrices
;; directly, without the need to convert them into
;; dense matrices first, which is in most cases impossible due to the memory
;; consumption.
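;; This row-per-nonzero-value layout is essentially the "coordinate" (COO)
;; sparse format. As a toy illustration (hypothetical values, not taken from
;; the dataset), the dense 2x4 matrix
;;
;;   [[0 0 3 0]
;;    [1 0 0 0]]
;;
;; would be stored as just two rows:
;;
;;   | :document | :token-idx | :token-count |
;;   |-----------|------------|--------------|
;;   |         0 |          2 |            3 |
;;   |         1 |          0 |            1 |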
;; The train-tfidf dataset therefore represents 3 sparse matrices with
;; dimensions
(tcc/reduce-max (:document train-tfidf))
;; times
(tcc/reduce-max (:token-idx train-tfidf))
;; times 3
;; =
(* (tcc/reduce-max (:document train-tfidf))
   (tcc/reduce-max (:token-idx train-tfidf))
   3)

;; while only having shape:
(tc/shape train-tfidf)

;; This is because most matrix elements are 0, as
;; any text does "not contain" most words.
;;
;; As TFIDF (and its variants) is one of the most common numeric
;; representations for text, "sparse matrices" and models supporting them
;; are a prerequisite for NLP.
;;
;; Only in recent years have "dense text representations" based on
;; "embeddings" become available; they will not be discussed here.
;; Now we get the data ready for training.

(def train-ds
  (-> train-tfidf
      (tc/rename-columns {:meta :label})
      (tc/select-columns [:document :token-idx :tfidf :label]) ;; we only need these
      (ds-mod/set-inference-target [:label])))

train-ds

(def n-sparse-columns (inc (tcc/reduce-max (train-ds :token-idx))))
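;; (The sparse matrix has columns 0 .. max :token-idx, so `inc` of the maximum
;; index gives the total column count the model needs to know about.)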
;; The model used is from the library `scicloj.ml.xgboost`: the well-known
;; xgboost model behind a wrapper which makes it work with tidy text data.
;;
;; We use the :tfidf column as the "feature".

(require '[scicloj.ml.xgboost])
;; registers the model under key :xgboost/classification
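;; A quick way to inspect what got registered (a sketch; it assumes
;; `ml/model-definition-names` from metamorph.ml, which lists the keys of all
;; registered model types):
(comment
  (filter #(= "xgboost" (namespace %))
          (ml/model-definition-names)))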
(def model
  (ml/train train-ds {:model-type :xgboost/classification
                      :sparse-column :tfidf
                      :seed 123
                      :num-class 2
                      :n-sparse-columns n-sparse-columns}))
;; Now we have a trained model, which we can use for prediction on the test
;; data.

;; This time we do parsing and tfidf in one go.
;;
;; Important here:
;;
;; We pass the vocabulary obtained before, in order to make sure that
;; :token-idx maps to the same words in both datasets. "New tokens" not
;; present in the training vocabulary are mapped to a special token,
;; "[UNKNOWN]".
(def tfidf-test-ds
  (->
   (text/->tidy-text (csv/read-csv (io/reader "test.csv"))
                     seq
                     (fn [line]
                       [(nth line 3) {:id (first line)}])
                     tokenize-fn
                     :skip-lines 1
                     :new-token-behaviour :as-unknown
                     :token->index-map train--token-lookup-table)
   :datasets
   first
   text/->tfidf
   (tc/select-columns [:document :token-idx :tfidf :meta])
   ;; keep the :id for Kaggle
   (tc/add-column
    :id (fn [df] (map :id (:meta df))))
   (tc/drop-columns [:meta])))
;; This gives the dataset which can be passed into the `predict` function of
;; `metamorph.ml`.
tfidf-test-ds

(def prediction
  (ml/predict tfidf-test-ds model))

;; The raw predictions contain the "document" each prediction is about.
;; We can use this to match predictions to the input "ids" in order to produce
;; the format required by Kaggle.
prediction
(->
 (tc/right-join prediction tfidf-test-ds :document)
 (tc/unique-by [:id :label])
 (tc/select-columns [:id :label])
 (tc/update-columns {:label (partial map int)})
 (tc/rename-columns {:label :target})
 (tc/write-csv! "submission.csv"))

;; The produced CSV file can be uploaded to Kaggle for scoring.

(->>
 (io/reader "submission.csv")
 line-seq
 (take 10))
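;; From here the file can be submitted on the Kaggle website, or (assuming the
;; Kaggle CLI is installed and the competition slug matches the data file name
;; above) from a shell:
;;
;;   kaggle competitions submit -c nlp-getting-started -f submission.csv -m "tfidf + xgboost"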
render.clj

Lines changed: 11 additions & 0 deletions

(ns render
  (:require [scicloj.clay.v2.api :as clay]))

(clay/make! {:format [:quarto :html]
             :show false
             :base-source-path "notebooks"
             :source-path ["index.clj"]
             :base-target-path "docs"
             :clean-up-target-dir true})
(System/exit 0)
