From bcd323de21f946afa063116329afba7618175f60 Mon Sep 17 00:00:00 2001
From: Anshuman Chhabra <anshuman.lalakers@gmail.com>
Date: Wed, 18 Jul 2018 02:22:06 +0530
Subject: [PATCH] Added fast direct CSV-to-matrix functionality

---
 c_src/Tensorflex.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/nifs.ex        |  4 +++
 lib/tensorflex.ex  | 22 ++++++++++++++
 test/sample1.csv   |  3 ++
 test/sample2.csv   |  4 +++
 5 files changed, 104 insertions(+)
 create mode 100644 test/sample1.csv
 create mode 100644 test/sample2.csv

diff --git a/c_src/Tensorflex.c b/c_src/Tensorflex.c
index 5a46d90..731742b 100644
--- a/c_src/Tensorflex.c
+++ b/c_src/Tensorflex.c
@@ -22,6 +22,7 @@ typedef union
 } mx_t;
 
 #define POS(MX, ROW, COL) ((MX)->data[(ROW)* (MX)->ncols + (COL)])
+#define BUF_SIZE 500000 /* line buffer size in bytes used by load_csv_as_matrix */
 
 static int get_number(ErlNifEnv* env, ERL_NIF_TERM term, double* dp);
 static Matrix* alloc_matrix(ErlNifEnv* env, unsigned nrows, unsigned ncols);
@@ -726,6 +727,139 @@ static ERL_NIF_TERM load_image_as_tensor(ErlNifEnv *env, int argc, const ERL_NIF
 
 }
 
+/*
+ * Returns the next token in *cursor, splitting on any byte listed in delims,
+ * or NULL when no token is left. The splitting rules match strtok (runs of
+ * delimiters collapse, so empty fields are skipped), but the scan position is
+ * kept in *cursor rather than in hidden global state, so concurrent calls
+ * cannot disturb each other.
+ */
+static char* csv_next_token(char** cursor, const char* delims)
+{
+  char* start = *cursor + strspn(*cursor, delims);
+  char* end;
+
+  if (*start == '\0') {
+    *cursor = start;
+    return NULL;
+  }
+  end = start + strcspn(start, delims);
+  if (*end != '\0') *end++ = '\0';
+  *cursor = end;
+  return start;
+}
+
+/*
+ * load_csv_as_matrix(Filepath, Header, Delimiter) -> matrix resource
+ *
+ *   argv[0]  binary  path of the file to read
+ *   argv[1]  atom    'true' skips the first line as a header; any other
+ *                    atom keeps it
+ *   argv[2]  binary  separator bytes; each byte in it separates fields
+ *
+ * The file is read twice: once to size the matrix, once to fill it. Fields
+ * are converted with atof. Blank lines are ignored, the column count is that
+ * of the widest row, and shorter rows are padded with 0.0. A line longer than
+ * BUF_SIZE - 1 bytes is split by fgets and parsed as several rows.
+ *
+ * Raises badarg when an argument has the wrong type, the file cannot be
+ * opened, or a buffer cannot be allocated.
+ */
+static ERL_NIF_TERM load_csv_as_matrix(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
+{
+  ErlNifBinary filepath, delimiter;
+  ERL_NIF_TERM mat_ret;
+  char header_atom[8];
+  char *file = NULL, *delims = NULL, *buf = NULL, *cursor, *val;
+  FILE *f = NULL;
+  unsigned nrows = 0, ncols = 0, row = 0, col;
+  int skip_header, pending_header;
+  mx_t mx;
+
+  if (argc != 3 ||
+      !enif_inspect_binary(env, argv[0], &filepath) ||
+      !enif_is_atom(env, argv[1]) ||
+      !enif_inspect_binary(env, argv[2], &delimiter))
+    return enif_make_badarg(env);
+
+  /* A 0 result means the atom did not fit the buffer, so it is not 'true'. */
+  skip_header = enif_get_atom(env, argv[1], header_atom, sizeof(header_atom), ERL_NIF_LATIN1) > 0 &&
+                strcmp(header_atom, "true") == 0;
+
+  /* Everything lives on the heap: two BUF_SIZE arrays on the stack would need
+   * about 1 MB, which can exceed the stack of the thread running the NIF. */
+  file = enif_alloc(filepath.size + 1);
+  delims = enif_alloc(delimiter.size + 3);
+  buf = enif_alloc(BUF_SIZE);
+  if (file == NULL || delims == NULL || buf == NULL) {
+    mat_ret = enif_make_badarg(env);
+    goto cleanup;
+  }
+  memcpy(file, filepath.data, filepath.size);
+  file[filepath.size] = '\0';
+  /* Line endings count as separators too, so "\n" or "\r\n" never ends up in
+   * a field and an empty line yields no field at all. */
+  memcpy(delims, "\r\n", 2);
+  memcpy(delims + 2, delimiter.data, delimiter.size);
+  delims[delimiter.size + 2] = '\0';
+
+  f = fopen(file, "rb");
+  if (f == NULL) {
+    mat_ret = enif_make_badarg(env);
+    goto cleanup;
+  }
+
+  /* Pass 1: count the data rows and find the widest one. */
+  pending_header = skip_header;
+  while (fgets(buf, BUF_SIZE, f) != NULL) {
+    if (pending_header) {
+      pending_header = 0;
+      continue;
+    }
+    col = 0;
+    cursor = buf;
+    while (csv_next_token(&cursor, delims) != NULL) col++;
+    if (col == 0) continue;
+    if (col > ncols) ncols = col;
+    nrows++;
+  }
+  rewind(f);
+
+  mx.p = alloc_matrix(env, nrows, ncols);
+
+  /* Pass 2: fill the matrix. The bounds checks keep every write inside the
+   * nrows x ncols area even if the file changed between the two passes. */
+  pending_header = skip_header;
+  while (row < nrows && fgets(buf, BUF_SIZE, f) != NULL) {
+    if (pending_header) {
+      pending_header = 0;
+      continue;
+    }
+    col = 0;
+    cursor = buf;
+    while ((val = csv_next_token(&cursor, delims)) != NULL) {
+      if (col < ncols) POS(mx.p, row, col) = atof(val);
+      col++;
+    }
+    if (col == 0) continue;
+    for (; col < ncols; col++) POS(mx.p, row, col) = 0.0;
+    row++;
+  }
+  /* Zero the rows pass 2 could not read so no cell stays uninitialized. */
+  for (; row < nrows; row++)
+    for (col = 0; col < ncols; col++) POS(mx.p, row, col) = 0.0;
+
+  mat_ret = enif_make_resource(env, mx.p);
+  enif_release_resource(mx.p);
+
+cleanup:
+  if (f != NULL) fclose(f);
+  if (buf != NULL) enif_free(buf);
+  if (delims != NULL) enif_free(delims);
+  if (file != NULL) enif_free(file);
+  return mat_ret;
+}
+
 
 static ErlNifFunc nif_funcs[] =
   {
@@ -747,6 +817,7 @@ static ErlNifFunc nif_funcs[] =
     { "float32_tensor_alloc", 1, float32_tensor_alloc },
     { "run_session", 5, run_session },
     { "load_image_as_tensor", 1, load_image_as_tensor },
+    { "load_csv_as_matrix", 3, load_csv_as_matrix },
   };
 
 ERL_NIF_INIT(Elixir.Tensorflex.NIFs, nif_funcs, res_loader, NULL, NULL, NULL)
diff --git a/lib/nifs.ex b/lib/nifs.ex
index 5410841..7135661 100644
--- a/lib/nifs.ex
+++ b/lib/nifs.ex
@@ -73,6 +73,10 @@ defmodule Tensorflex.NIFs do
     raise "NIF load_image_as_tensor/1 not implemented"
   end
   
+  def load_csv_as_matrix(_filepath, _header, _delimiter) do # Elixir-side placeholder for the C NIF
+    raise RuntimeError, "NIF load_csv_as_matrix/3 not implemented"
+  end
+
   def run_session(_graph, _input_tensor, _output_tensor, _input_opname, _output_opname) do
     raise "NIF run_session/5 not implemented"
   end
diff --git a/lib/tensorflex.ex b/lib/tensorflex.ex
index b8c7924..302eab1 100644
--- a/lib/tensorflex.ex
+++ b/lib/tensorflex.ex
@@ -105,6 +105,42 @@ defmodule Tensorflex do
     {:ok, %Tensor{datatype: :tf_uint8, tensor: ref}}
   end
 
+  # Loads the CSV file at `filepath` into a `%Matrix{}` through the
+  # `load_csv_as_matrix/3` NIF.
+  #
+  # Options (keyword list):
+  #   * `:header`    - `true` (default) to skip the first line, `false` to keep it
+  #   * `:delimiter` - string of separator character(s), default `","`
+  #
+  # Raises `ArgumentError` when the path is not an existing regular file, when
+  # its extension is not ".csv", or when an option value is invalid.
+  def load_csv_as_matrix(filepath, opts \\ []) do
+    if not File.regular?(filepath) do
+      raise ArgumentError, "csv file does not exist"
+    end
+
+    if Path.extname(filepath) != ".csv" do
+      raise ArgumentError, "file is not a CSV file"
+    end
+
+    %{header: header, delimiter: delimiter} =
+      [header: true, delimiter: ","] |> Keyword.merge(opts) |> Enum.into(%{})
+
+    if not is_boolean(header) do
+      raise ArgumentError, "header indicator atom must be either :true or :false"
+    end
+
+    # The NIF reads the path and the delimiter with enif_inspect_binary, so only
+    # binaries may reach it: reject other delimiters, convert charlist paths.
+    if not is_binary(delimiter) do
+      raise ArgumentError, "delimiter must be a string"
+    end
+
+    ref = NIFs.load_csv_as_matrix(IO.chardata_to_string(filepath), header, delimiter)
+    {nrows, ncols} = NIFs.size_of_matrix(ref)
+    %Matrix{nrows: nrows, ncols: ncols, data: ref}
+  end
+
   def run_session(%Graph{def: graphdef, name: filepath}, %Tensor{datatype: input_datatype, tensor: input_ref}, %Tensor{datatype: output_datatype, tensor: output_ref}, input_opname, output_opname) do
     NIFs.run_session(graphdef, input_ref, output_ref, input_opname, output_opname)
   end
diff --git a/test/sample1.csv b/test/sample1.csv
new file mode 100644
index 0000000..1a24f59
--- /dev/null
+++ b/test/sample1.csv
@@ -0,0 +1,3 @@
+1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
diff --git a/test/sample2.csv b/test/sample2.csv
new file mode 100644
index 0000000..6b67579
--- /dev/null
+++ b/test/sample2.csv
@@ -0,0 +1,4 @@
+col1-col2-col3-col4
+1-2-3-4
+5-6-7-8
+9-10-11-12