From bcd323de21f946afa063116329afba7618175f60 Mon Sep 17 00:00:00 2001
From: Anshuman Chhabra
Date: Wed, 18 Jul 2018 02:22:06 +0530
Subject: [PATCH] Added fast direct CSV-to-matrix functionality

---
 c_src/Tensorflex.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/nifs.ex        |  4 ++
 lib/tensorflex.ex  | 40 +++++++++++++++++++
 test/sample1.csv   |  3 ++
 test/sample2.csv   |  4 ++
 5 files changed, 147 insertions(+)
 create mode 100644 test/sample1.csv
 create mode 100644 test/sample2.csv

diff --git a/c_src/Tensorflex.c b/c_src/Tensorflex.c
index 5a46d90..731742b 100644
--- a/c_src/Tensorflex.c
+++ b/c_src/Tensorflex.c
@@ -22,6 +22,7 @@ typedef union
 } mx_t;
 
 #define POS(MX, ROW, COL) ((MX)->data[(ROW)* (MX)->ncols + (COL)])
+#define BUF_SIZE 500000
 
 static int get_number(ErlNifEnv* env, ERL_NIF_TERM term, double* dp);
 static Matrix* alloc_matrix(ErlNifEnv* env, unsigned nrows, unsigned ncols);
@@ -726,6 +727,101 @@ static ERL_NIF_TERM load_image_as_tensor(ErlNifEnv *env, int argc, const ERL_NIF
 }
 
 
+/*
+ * load_csv_as_matrix(filepath, header, delimiter)
+ *
+ * argv[0]: path of the CSV file (binary)
+ * argv[1]: header flag (atom); when it is `true` the first non-blank line is skipped
+ * argv[2]: delimiter characters (binary), handed to strtok() as a separator set
+ *
+ * Parses every remaining non-blank line with atof() and returns the values as a
+ * Matrix resource. Raises badarg when an argument has the wrong type, a buffer
+ * cannot be allocated, or the file cannot be opened.
+ */
+static ERL_NIF_TERM load_csv_as_matrix(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
+{
+  ERL_NIF_TERM mat_ret = 0;
+  ErlNifBinary filepath, delimiter;
+  char header_atom[6]; /* large enough for "false" plus the terminating NUL */
+  char *file = NULL, *delimiter_str = NULL, *buf = NULL;
+  char *val, *line;
+  FILE *f = NULL;
+  unsigned nrows = 0, ncols = 0, i = 0, j;
+  int has_header, skip;
+  int ok = 0;
+  mx_t mx;
+
+  if(!enif_inspect_binary(env, argv[0], &filepath) ||
+     !enif_get_atom(env, argv[1], header_atom, sizeof(header_atom), ERL_NIF_LATIN1) ||
+     !enif_inspect_binary(env, argv[2], &delimiter)) {
+    return enif_make_badarg(env);
+  }
+  has_header = (strcmp(header_atom, "true") == 0);
+
+  /* The line buffer lives on the heap: BUF_SIZE bytes is too much for a thread stack. */
+  file = enif_alloc(filepath.size + 1);
+  delimiter_str = enif_alloc(delimiter.size + 3);
+  buf = enif_alloc(BUF_SIZE);
+  if(file == NULL || delimiter_str == NULL || buf == NULL) goto done;
+
+  memcpy(file, filepath.data, filepath.size);
+  file[filepath.size] = '\0';
+  /* Treat line endings as separators so they never end up inside a token
+   * and blank lines yield no tokens at all. */
+  memcpy(delimiter_str, delimiter.data, delimiter.size);
+  memcpy(delimiter_str + delimiter.size, "\r\n", 3);
+
+  f = fopen(file, "rb");
+  if(f == NULL) goto done;
+
+  /* First pass: count the data rows; the first data row fixes the column count. */
+  skip = has_header;
+  while((line = fgets(buf, BUF_SIZE, f)) != NULL) {
+    j = 0;
+    for(val = strtok(line, delimiter_str); val != NULL; val = strtok(NULL, delimiter_str)) {
+      j++;
+    }
+    if(j == 0) continue;
+    if(skip) {
+      skip = 0;
+      continue;
+    }
+    if(nrows == 0) ncols = j;
+    nrows++;
+  }
+
+  mx.p = alloc_matrix(env, nrows, ncols);
+
+  /* Second pass: fill the matrix. Short rows are padded with 0.0 and extra
+   * fields are dropped so a ragged file cannot write outside the matrix. */
+  rewind(f);
+  skip = has_header;
+  while(i < nrows && (line = fgets(buf, BUF_SIZE, f)) != NULL) {
+    val = strtok(line, delimiter_str);
+    if(val == NULL) continue;
+    if(skip) {
+      skip = 0;
+      continue;
+    }
+    for(j = 0; j < ncols; j++) {
+      POS(mx.p, i, j) = (val != NULL) ? atof(val) : 0.0;
+      if(val != NULL) val = strtok(NULL, delimiter_str);
+    }
+    i++;
+  }
+
+  mat_ret = enif_make_resource(env, mx.p);
+  enif_release_resource(mx.p);
+  ok = 1;
+
+done:
+  if(f != NULL) fclose(f);
+  if(buf != NULL) enif_free(buf);
+  if(delimiter_str != NULL) enif_free(delimiter_str);
+  if(file != NULL) enif_free(file);
+  return ok ? mat_ret : enif_make_badarg(env);
+}
+
 
 static ErlNifFunc nif_funcs[] =
 {
@@ -747,6 +843,7 @@ static ErlNifFunc nif_funcs[] =
   { "float32_tensor_alloc", 1, float32_tensor_alloc },
   { "run_session", 5, run_session },
   { "load_image_as_tensor", 1, load_image_as_tensor },
+  { "load_csv_as_matrix", 3, load_csv_as_matrix },
 };
 
 ERL_NIF_INIT(Elixir.Tensorflex.NIFs, nif_funcs, res_loader, NULL, NULL, NULL)
diff --git a/lib/nifs.ex b/lib/nifs.ex
index 5410841..7135661 100644
--- a/lib/nifs.ex
+++ b/lib/nifs.ex
@@ -73,6 +73,10 @@ defmodule Tensorflex.NIFs do
     raise "NIF load_image_as_tensor/1 not implemented"
   end
 
+  def load_csv_as_matrix(_filepath, _header, _delimiter) do
+    raise "NIF load_csv_as_matrix/3 not implemented"
+  end
+
   def run_session(_graph, _input_tensor, _output_tensor, _input_opname, _output_opname) do
     raise "NIF run_session/5 not implemented"
   end
diff --git a/lib/tensorflex.ex b/lib/tensorflex.ex
index b8c7924..302eab1 100644
--- a/lib/tensorflex.ex
+++ b/lib/tensorflex.ex
@@ -105,6 +105,46 @@ defmodule Tensorflex do
     {:ok, %Tensor{datatype: :tf_uint8, tensor: ref}}
   end
 
+  @doc """
+  Loads the numeric CSV file at `filepath` into a `%Matrix{}` through a NIF.
+
+  Options:
+
+    * `:header` - `true` (default) when the first line holds column names
+      and must be skipped, `false` otherwise
+    * `:delimiter` - non-empty string of field separator characters
+      (default `","`)
+
+  Raises `ArgumentError` when the file is missing, does not have the `.csv`
+  extension, or an option is invalid.
+  """
+  def load_csv_as_matrix(filepath, opts \\ []) do
+    unless File.exists?(filepath) do
+      raise ArgumentError, "csv file does not exist"
+    end
+
+    unless Path.extname(filepath) == ".csv" do
+      raise ArgumentError, "file is not a CSV file"
+    end
+
+    defaults = [header: true, delimiter: ","]
+    opts = Keyword.merge(defaults, opts) |> Enum.into(%{})
+    %{header: header, delimiter: delimiter} = opts
+
+    unless is_boolean(header) do
+      raise ArgumentError, "header indicator atom must be either :true or :false"
+    end
+
+    # The NIF reads the delimiter as a binary; reject anything else up front.
+    unless is_binary(delimiter) and delimiter != "" do
+      raise ArgumentError, "delimiter must be a non-empty string"
+    end
+
+    ref = NIFs.load_csv_as_matrix(filepath, header, delimiter)
+    {nrows, ncols} = NIFs.size_of_matrix(ref)
+    %Matrix{nrows: nrows, ncols: ncols, data: ref}
+  end
+
   def run_session(%Graph{def: graphdef, name: filepath}, %Tensor{datatype: input_datatype, tensor: input_ref}, %Tensor{datatype: output_datatype, tensor: output_ref}, input_opname, output_opname) do
     NIFs.run_session(graphdef, input_ref, output_ref, input_opname, output_opname)
   end
diff --git a/test/sample1.csv b/test/sample1.csv
new file mode 100644
index 0000000..1a24f59
--- /dev/null
+++ b/test/sample1.csv
@@ -0,0 +1,3 @@
+1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
diff --git a/test/sample2.csv b/test/sample2.csv
new file mode 100644
index 0000000..6b67579
--- /dev/null
+++ b/test/sample2.csv
@@ -0,0 +1,4 @@
+col1-col2-col3-col4
+1-2-3-4
+5-6-7-8
+9-10-11-12