(load-string (slurp "https://raw.githubusercontent.com/scicloj/clojure-data-tutorials/main/header.edn"))

^:kindly/hide-code
(ns index
  (:require
   [clojure.data.csv :as csv]
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.metamorph.ml.text :as text]
   [tablecloth.api :as tc]
   [tablecloth.column.api :as tcc]
   [scicloj.metamorph.ml :as ml]
   [tech.v3.dataset.modelling :as ds-mod]
   [scicloj.clay.v2.api :as clay]))

(comment
  (require '[scicloj.clay.v2.api :as clay])
  (clay/start!)
  (clay/make! {:source-path "notebooks/index.clj"
               :show false}))

;; The following code shows how to perform text classification on a Kaggle
;; dataset and produce a submission file, ready to be uploaded to
;; Kaggle for scoring.
;;
;; It makes use of the tidy text / TFIDF functionality present in `metamorph.ml`
;; and the ability of the xgboost model to handle tidy text data as input.
;;
;; First we need a fn to tokenize a line of text.
;; The simplest such function is:
(defn- tokenize-fn [text]
  (str/split text #" "))
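
;; For example (with a hypothetical input line):
(tokenize-fn "this is an example line")
;; => ["this" "is" "an" "example" "line"]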


;; It does not do any text normalization, which is usually required in NLP
;; tasks in order to obtain a more general model.
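;;
;; As an illustration (not used in the rest of this tutorial), here is a
;; minimal sketch of a normalizing tokenizer, which lower-cases the text
;; and strips punctuation before splitting:
(defn- normalizing-tokenize-fn [text]
  (-> text
      str/lower-case
      (str/replace #"[^a-z0-9\s]" " ") ; replace punctuation/symbols with spaces
      str/trim
      (str/split #"\s+")))             ; split on any whitespace run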
;;
;; The following reads a file from disk line by line and converts it on the
;; fly to the `tidy text` representation, in which each word
;; is a row in a dataset.
;;
;; `line-parse-fn` needs to split an input line into [text meta],
;; and the `text` is then further handled by `tokenize-fn` and split into tokens.
;; The format of the data has the text in field 4 and the label in field 5.

;; We ignore all other columns so far:
(defn- line-parse-fn [line]
  [(nth line 3)
   (Integer/parseInt (nth line 4))])
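
;; For example, on a hypothetical, already CSV-parsed row it returns the
;; [text meta] pair:
(line-parse-fn ["id-1" "col2" "col3" "some example text" "1"])
;; => ["some example text" 1]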

;; This triggers the parsing and produces a (seq of) "long" datasets
;; (1 for our small text)
;; and the vocabulary obtained during parsing.
(def tidy-train
  (text/->tidy-text (csv/read-csv (io/reader "train.csv"))
                    seq
                    line-parse-fn
                    tokenize-fn
                    :skip-lines 1))

(def tidy-train-ds
  (-> tidy-train :datasets first))
;; The combination of the columns :document, :token-pos and :token-index,
;; together with the vocabulary table, is an exact representation of the text,
;; unless we normalize it as part of the `tokenize-fn`.
;;
;; `meta` is any other information of a row to be kept, usually the "label"
;; in case of training data.

tidy-train-ds

;; The lookup table allows converting from :token-idx to words and
;; back, if needed.
(def train--token-lookup-table (:token-lookup-table tidy-train))
(map str (take 20 train--token-lookup-table))
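
;; If we need the reverse direction as a map lookup, we can invert the
;; table (a sketch, assuming it maps token -> index, as its use as
;; :token->index-map further below suggests):
(def index->token
  (into {} (map (juxt val key)) train--token-lookup-table))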

;; As we can see, the tokens are not cleaned / standardized at all.
;; This also results in a rather large vocabulary size of
(count train--token-lookup-table)


;; Now we convert the text into a bag-of-words format, which loses
;; any word order and calculates a metric which is known to work well
;; for text classification: the so-called TFIDF score.
(def train-tfidf
  (text/->tfidf tidy-train-ds))
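
;; To make the metric concrete, here is a minimal sketch of the textbook
;; TFIDF definition (the exact variant computed by `text/->tfidf` may
;; differ in details such as smoothing):
(defn- tfidf-sketch [token-count doc-length n-docs docs-with-token]
  (* (/ (double token-count) doc-length)              ; term frequency (:tf)
     (Math/log (/ (double n-docs) docs-with-token)))) ; inverse document frequency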

;; The resulting table conceptually represents three "sparse matrices",
;; where :document and :token-idx are the x,y coordinates and the matrix
;; cell values are :token-count, term frequency (:tf) or TFIDF.
;;
;; Rows not present (the large majority) are 0 values.

;; A subset of machine learning algorithms can deal with sparse matrices
;; directly, without the need to convert them into
;; dense matrices first, which is in most cases impossible due to the memory
;; consumption.

;; The train-tfidf dataset therefore represents 3 sparse matrices with
;; dimensions
(tcc/reduce-max (:document train-tfidf))
;; times
(tcc/reduce-max (:token-idx train-tfidf))
;; times 3
;; =

(* (tcc/reduce-max (:document train-tfidf))
   (tcc/reduce-max (:token-idx train-tfidf))
   3)


;; while only having shape:

(tc/shape train-tfidf)

;; This is because most matrix elements are 0, as
;; any text does "not contain" most words.
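;; We can make this concrete by relating the number of rows actually present
;; to the number of cells one dense matrix would have (a rough sketch):
(let [[n-rows _n-cols] (tc/shape train-tfidf)
      n-cells (* (tcc/reduce-max (:document train-tfidf))
                 (tcc/reduce-max (:token-idx train-tfidf)))]
  (double (/ n-rows n-cells)))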
;;
;; As TFIDF (and its variants) is one of the most common numeric
;; representations for text, "sparse matrices" and models supporting them
;; are a prerequisite for NLP.
;;
;; Only in recent years have we had "dense text representations" based on
;; "embeddings", which will not be discussed here today.

;; Now we get the data ready for training.

(def train-ds
  (-> train-tfidf
      (tc/rename-columns {:meta :label})
      (tc/select-columns [:document :token-idx :tfidf :label]) ;; we only need these
      (ds-mod/set-inference-target [:label])))

train-ds

;; The model needs to know the number of sparse columns,
;; which is one more than the largest :token-idx:
(def n-sparse-columns (inc (tcc/reduce-max (train-ds :token-idx))))

;; The model used is from the library `scicloj.ml.xgboost`: the well-known
;; xgboost model behind a wrapper which makes it work with tidy text data.
;;
;; We use the :tfidf column as the "feature".

(require '[scicloj.ml.xgboost])
;; registers the model under the key :xgboost/classification

(def model
  (ml/train train-ds {:model-type :xgboost/classification
                      :sparse-column :tfidf
                      :seed 123
                      :num-class 2
                      :n-sparse-columns n-sparse-columns}))


;; Now we have a trained model, which we can use for prediction on the
;; test data.

;; This time we do the parsing and the tfidf calculation in one go.
;;
;; Important here:
;;
;; We pass the vocabulary "obtained before" in order to be sure that
;; :token-idx maps to the same words in both datasets. "New tokens" are
;; ignored and mapped to a special token, "[UNKNOWN]".
(def tfidf-test-ds
  (->
   (text/->tidy-text (csv/read-csv (io/reader "test.csv"))
                     seq
                     (fn [line]
                       [(nth line 3) {:id (first line)}])
                     tokenize-fn
                     :skip-lines 1
                     :new-token-behaviour :as-unknown
                     :token->index-map train--token-lookup-table)
   :datasets
   first
   text/->tfidf
   (tc/select-columns [:document :token-idx :tfidf :meta])
   ;; keep the :id for Kaggle
   (tc/add-column
    :id (fn [df] (map :id (:meta df))))
   (tc/drop-columns [:meta])))

;; This gives the dataset which can be passed into the `predict` function
;; of `metamorph.ml`.
tfidf-test-ds

(def prediction
  (ml/predict tfidf-test-ds model))

;; The raw predictions contain the "document" each prediction is about.
;; We can use this to match predictions with the input "ids" in order to
;; produce the format required by Kaggle.
prediction

(->
 (tc/right-join prediction tfidf-test-ds :document)
 (tc/unique-by [:id :label])
 (tc/select-columns [:id :label])
 (tc/update-columns {:label (partial map int)})
 (tc/rename-columns {:label :target})
 (tc/write-csv! "submission.csv"))

;; The produced CSV file can be uploaded to Kaggle for scoring.

(->>
 (io/reader "submission.csv")
 line-seq
 (take 10))
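
;; As a last sanity check, we can look at the distribution of the predicted
;; labels (a sketch, assuming tablecloth reads the CSV back with
;; keywordized column names):
(-> (tc/dataset "submission.csv" {:key-fn keyword})
    (tc/group-by [:target])
    (tc/aggregate {:n tc/row-count}))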