Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added fast C based direct CSV-to-matrix functionality with options #23

Merged
merged 1 commit into from
Jul 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions c_src/Tensorflex.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ typedef union
} mx_t;

/* Element accessor for a matrix whose cells live in one flat `data` array:
 * cell (ROW, COL) is at offset ROW * ncols + COL. Expands to an lvalue, so it
 * can be both read and assigned. */
#define POS(MX, ROW, COL) ((MX)->data[(ROW)* (MX)->ncols + (COL)])
/* Capacity, in bytes, of a line buffer used when load_csv_as_matrix reads a
 * CSV file with fgets. */
#define BUF_SIZE 500000

/* Forward declarations of file-local (static) helpers. */
static int get_number(ErlNifEnv* env, ERL_NIF_TERM term, double* dp);
static Matrix* alloc_matrix(ErlNifEnv* env, unsigned nrows, unsigned ncols);
Expand Down Expand Up @@ -726,6 +727,75 @@ static ERL_NIF_TERM load_image_as_tensor(ErlNifEnv *env, int argc, const ERL_NIF

}

static ERL_NIF_TERM load_csv_as_matrix(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
ERL_NIF_TERM mat_ret;
ErlNifBinary filepath;
enif_inspect_binary(env,argv[0], &filepath);
char* file = enif_alloc(filepath.size+1);
memset(file, 0, filepath.size+1);
memcpy(file, (void *) filepath.data, filepath.size);
char buf_init[BUF_SIZE], buf[BUF_SIZE];
char *val_init, *line_init, *val, *line;

unsigned int header_atom_len;
enif_get_atom_length(env, argv[1], &header_atom_len, ERL_NIF_LATIN1);
char* header_atom = (char*)enif_alloc(header_atom_len + 1);
enif_get_atom(env, argv[1], header_atom, header_atom_len + 1, ERL_NIF_LATIN1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should avoid creating atoms from user input such as files. We should return binaries here.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't quite understand what you meant. The above code is just for obtaining the header atom. The user specifies whether the header is present or not as :true or :false in Elixir and I am just reading that value in C here. Do you mean that instead of an atom, I should ask the user to send a string as argument?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The path to the file is specified in arg[0] which is read as a binary in C

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@anshuman23 oh, I thought you were reading from the file. This is basically the conversion of true and false to C values? Sorry the confusion then. :)

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes this is just that.
Not at all, thanks for your inputs! :)

PS: I'll brainstorm a bit on what to do next and give you an update via email soon


ErlNifBinary delimiter;
enif_inspect_binary(env,argv[2], &delimiter);
char* delimiter_str = enif_alloc(delimiter.size+1);
memset(delimiter_str, 0, delimiter.size+1);
memcpy(delimiter_str, (void *) delimiter.data, delimiter.size);

FILE *f_init = fopen(file, "rb");
unsigned i = 0, j = 0;
while((line_init=fgets(buf_init,sizeof(buf_init),f_init))!=NULL) {
j = 0;
val_init = strtok(line_init,delimiter_str);
while(val_init != NULL) {
val_init = strtok(NULL,delimiter_str);
j++;
}
i++;
}
fclose(f_init);

int flag = 0;
if(strcmp(header_atom, "true") == 0) {
i--;
flag = 1;
}

mx_t mx;
mx.p = alloc_matrix(env, i, j);
FILE *f = fopen(file, "rb");
i = 0;
while((line=fgets(buf,sizeof(buf),f))!=NULL) {
j = 0;
val = strtok(line,delimiter_str);
while(val != NULL) {
if(flag == 0) {
POS(mx.p, i, j) = atof(val);
j++;
}
val = strtok(NULL,delimiter_str);
}

if(flag == 1){
flag = 0;
i--;
}
i++;
}
fclose(f);

mat_ret = enif_make_resource(env, mx.p);
enif_release_resource(mx.p);
return mat_ret;
}


static ErlNifFunc nif_funcs[] =
{
Expand All @@ -747,6 +817,7 @@ static ErlNifFunc nif_funcs[] =
{ "float32_tensor_alloc", 1, float32_tensor_alloc },
{ "run_session", 5, run_session },
{ "load_image_as_tensor", 1, load_image_as_tensor },
{ "load_csv_as_matrix", 3, load_csv_as_matrix },
};

ERL_NIF_INIT(Elixir.Tensorflex.NIFs, nif_funcs, res_loader, NULL, NULL, NULL)
Expand Down
4 changes: 4 additions & 0 deletions lib/nifs.ex
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ defmodule Tensorflex.NIFs do
raise "NIF load_image_as_tensor/1 not implemented"
end

# Elixir-side placeholder for the native load_csv_as_matrix/3; reaching this
# body means the NIF implementation did not replace it.
def load_csv_as_matrix(_filepath, _header, _delimiter),
  do: raise("NIF load_csv_as_matrix/3 not implemented")

# Elixir-side placeholder for the native run_session/5; reaching this body
# means the NIF implementation did not replace it.
def run_session(_graph, _input_tensor, _output_tensor, _input_opname, _output_opname),
  do: raise("NIF run_session/5 not implemented")
Expand Down
22 changes: 22 additions & 0 deletions lib/tensorflex.ex
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,28 @@ defmodule Tensorflex do
{:ok, %Tensor{datatype: :tf_uint8, tensor: ref}}
end

@doc """
Loads the numeric contents of a CSV file into a `Matrix` via the native loader.

## Options

  * `:header` - `true` (default) when the first line is a header row that
    must be skipped, `false` otherwise.
  * `:delimiter` - string separating the fields, `","` by default.

Raises `ArgumentError` when the file does not exist, does not have the `.csv`
extension, `:header` is not a boolean, or `:delimiter` is not a string.
"""
def load_csv_as_matrix(filepath, opts \\ []) do
  if not File.exists?(filepath) do
    raise ArgumentError, "csv file does not exist"
  end

  if Path.extname(filepath) != ".csv" do
    raise ArgumentError, "file is not a CSV file"
  end

  defaults = [header: true, delimiter: ","]
  %{header: header, delimiter: delimiter} = defaults |> Keyword.merge(opts) |> Map.new()

  if not is_boolean(header) do
    raise ArgumentError, "header indicator atom must be either :true or :false"
  end

  # The NIF reads the delimiter as a binary; reject anything else here so a
  # charlist or atom never reaches native code.
  if not is_binary(delimiter) do
    raise ArgumentError, "delimiter must be a string"
  end

  ref = NIFs.load_csv_as_matrix(filepath, header, delimiter)
  {nrows, ncols} = NIFs.size_of_matrix(ref)
  %Matrix{nrows: nrows, ncols: ncols, data: ref}
end

# Runs the graph through the native run_session/5. The struct patterns still
# enforce the argument shapes; fields that are matched but not forwarded are
# underscore-prefixed so they do not trigger unused-variable warnings.
def run_session(
      %Graph{def: graphdef, name: _filepath},
      %Tensor{datatype: _input_datatype, tensor: input_ref},
      %Tensor{datatype: _output_datatype, tensor: output_ref},
      input_opname,
      output_opname
    ) do
  NIFs.run_session(graphdef, input_ref, output_ref, input_opname, output_opname)
end
Expand Down
3 changes: 3 additions & 0 deletions test/sample1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
4 changes: 4 additions & 0 deletions test/sample2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
col1-col2-col3-col4
1-2-3-4
5-6-7-8
9-10-11-12