(load-string (slurp "https://raw.githubusercontent.com/scicloj/clojure-data-tutorials/main/header.edn"))

^:kindly/hide-code
(ns index
  (:require
   [clojure.data.csv :as csv]
   [clojure.java.io :as io]
   [clojure.string :as str]
   [scicloj.metamorph.ml.text :as text]
   [tablecloth.api :as tc]
   [tablecloth.column.api :as tcc]
   [scicloj.metamorph.ml :as ml]
   [tech.v3.dataset.modelling :as ds-mod]
   [scicloj.clay.v2.api :as clay]))

(comment
  (require '[scicloj.clay.v2.api :as clay])
  (clay/start!)
  (clay/make! {:source-path "notebooks/index.clj"
               :show false}))

;; The following code shows how to perform text classification on a Kaggle
;; dataset and produce a submission file, ready to be uploaded to
;; Kaggle for scoring.
;;
;; It makes use of the tidy text / TFIDF functionality present in `metamorph.ml`
;; and the ability of the xgboost model to handle tidy text data as input.
;;
;; First we need a function to tokenize a line of text.
;; The simplest such function is:
(defn- tokenize-fn [text]
  (str/split text #" "))
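
;; For example, applied to a made-up sentence, it simply splits on single spaces:
(tokenize-fn "a simple example sentence")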


;; It does not do any text normalization, which is usually needed in NLP tasks
;; in order to obtain a more general model.
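;;
;; As a minimal sketch (not used in the rest of this notebook), a normalizing
;; tokenizer could lower-case the text and split on runs of non-alphanumeric
;; characters; the function name here is our own invention:
(defn- normalizing-tokenize-fn [text]
  (->> (str/split (str/lower-case text) #"[^a-z0-9]+")
       (remove str/blank?)))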
;;
;; The following reads a file from disk line by line and converts it on the fly
;; to the `tidy text` representation, in which each word
;; is a row in a dataset.
;;
;; `line-parse-fn` needs to split an input line into [text meta],
;; and the `text` is then further handled by `tokenize-fn` and split into tokens.
;; In this dataset, the text is in field 4 and the label in field 5.

;; We ignore all other columns for now:
(defn- line-parse-fn [line]
  [(nth line 3)
   (Integer/parseInt (nth line 4))])
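
;; For example, applied to a made-up parsed CSV row with five fields
;; (only fields 4 and 5 are used):
(line-parse-fn ["1" "field-2" "field-3" "some tweet text" "0"])
;; => ["some tweet text" 0]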

;; This triggers the parsing and produces a (seq of) "long" datasets
;; (one for our small text)
;; and the vocabulary obtained during parsing.
(def tidy-train
  (text/->tidy-text (csv/read-csv (io/reader "train.csv"))
                    seq
                    line-parse-fn
                    tokenize-fn
                    :skip-lines 1))

(def tidy-train-ds
  (-> tidy-train :datasets first))
;; The combination of the columns :document, :token-pos and :token-index,
;; together with the vocabulary table, is an exact representation of the text,
;; unless we normalize it as part of the `tokenize-fn`.
;;
;; `meta` is any other per-row information to be kept, usually the "label"
;; in the case of training data.

tidy-train-ds

;; The lookup table allows converting from :token-idx to words and
;; back, if needed.
(def train--token-lookup-table (:token-lookup-table tidy-train))
(map str (take 20 train--token-lookup-table))
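
;; Assuming the lookup table maps tokens to their :token-idx (as the entries
;; above suggest), we can invert it to go from an index back to the word;
;; `idx->token` is our own helper name:
(require '[clojure.set :as set])
(def idx->token (set/map-invert train--token-lookup-table))
;; e.g. the token stored at index 1 (the index is arbitrary here):
(idx->token 1)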

;; As we can see, the tokens are not cleaned / standardized at all.
;; This also results in a rather large vocabulary size of:
(count train--token-lookup-table)


;; Now we convert the text into the bag-of-words format, which loses
;; any word order, and calculates a metric which is known to work well
;; for text classification: the so-called TFIDF score.
(def train-tfidf
  (text/->tfidf tidy-train-ds))

;; The resulting table conceptually represents three "sparse matrices",
;; where :document and :token-idx are the x,y coordinates and the matrix cell
;; values are :token-count, term frequency (:tf) or TFIDF.
;;
;; Rows not present (the large majority) stand for cells with value 0.

;; A subset of machine learning algorithms can deal with such sparse matrices
;; directly, without the need to convert them into
;; dense matrices first, which is in most cases impossible due to the memory
;; consumption.
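
;; To make this triplet ("coordinate") representation concrete, here is a toy
;; example: the 2x3 matrix [[0 5 0] [2 0 0]] stored as one row per non-zero cell:
(tc/dataset {:document    [0 1]
             :token-idx   [1 0]
             :token-count [5 2]})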

;; The train-tfidf dataset therefore represents 3 sparse matrices with
;; dimensions
(tcc/reduce-max (:document train-tfidf))
;; times
(tcc/reduce-max (:token-idx train-tfidf))
;; times 3
;; =

(* (tcc/reduce-max (:document train-tfidf))
   (tcc/reduce-max (:token-idx train-tfidf))
   3)

;; while only having shape:

(tc/shape train-tfidf)

;; This is because most matrix elements are 0, as
;; any text does "not contain" most words.
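;;
;; We can quantify this: the fraction of cells actually present is roughly the
;; row count of `train-tfidf` divided by the cell count of one of the matrices
;; (a rough estimate, ignoring off-by-one in the dimensions):
(double
 (/ (tc/row-count train-tfidf)
    (* (tcc/reduce-max (:document train-tfidf))
       (tcc/reduce-max (:token-idx train-tfidf)))))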
;;
;; As TFIDF (and its variants) is one of the most common numeric representations for text,
;; support for "sparse matrices" in a model is a prerequisite for NLP.
;;
;; Only in recent years have we had "dense text representations" based on "embeddings",
;; which will not be discussed here.

;; Now we get the data ready for training.

(def train-ds
  (-> train-tfidf
      (tc/rename-columns {:meta :label})
      (tc/select-columns [:document :token-idx :tfidf :label]) ;; we only need those
      (ds-mod/set-inference-target [:label])))

train-ds

(def n-sparse-columns (inc (tcc/reduce-max (train-ds :token-idx))))

;; The model used is from the library `scicloj.ml.xgboost`: the well-known xgboost model
;; behind a wrapper which makes it work with tidy text data.
;;
;; We use the :tfidf column as the "feature".

(require '[scicloj.ml.xgboost])
;; registers the model under the key :xgboost/classification

(def model
  (ml/train train-ds {:model-type :xgboost/classification
                      :sparse-column :tfidf
                      :seed 123
                      :num-class 2
                      :n-sparse-columns n-sparse-columns}))


;; Now we have a trained model, which we can use for prediction on the test data.

;; This time we do the parsing and the TFIDF conversion in one go.
;;
;; Important here:
;;
;; We pass in the vocabulary obtained before, in order to be sure that
;; :token-idx maps to the same words in both datasets. "New tokens"
;; are ignored and mapped to a special token, "[UNKNOWN]".
(def tfidf-test-ds
  (->
   (text/->tidy-text (csv/read-csv (io/reader "test.csv"))
                     seq
                     (fn [line]
                       [(nth line 3) {:id (first line)}])
                     tokenize-fn
                     :skip-lines 1
                     :new-token-behaviour :as-unknown
                     :token->index-map train--token-lookup-table)
   :datasets
   first
   text/->tfidf
   (tc/select-columns [:document :token-idx :tfidf :meta])
   ;; the :id is needed for Kaggle
   (tc/add-column
    :id (fn [df] (map :id (:meta df))))
   (tc/drop-columns [:meta])))

;; This gives us a dataset which can be passed into the `predict` function of `metamorph.ml`.
tfidf-test-ds

(def prediction
  (ml/predict tfidf-test-ds model))

;; The raw predictions contain the "document" each prediction refers to.
;; We can use this to match the predictions with the input "ids" in order to
;; produce the format required by Kaggle.
prediction

(->
 (tc/right-join prediction tfidf-test-ds :document)
 (tc/unique-by [:id :label])
 (tc/select-columns [:id :label])
 (tc/update-columns {:label (partial map int)})
 (tc/rename-columns {:label :target})
 (tc/write-csv! "submission.csv"))

;; The produced CSV file can be uploaded to Kaggle for scoring.

(->>
 (io/reader "submission.csv")
 line-seq
 (take 10))