Comparing changes

base repository: tmbdev/clstm
base: a588c8
head repository: tmbdev/clstm
compare: master

Commits on Oct 1, 2015

  1. e2d4efd (verified; created and signed on GitHub.com)

Commits on Oct 2, 2015

  1. 9296d9b

Commits on Oct 4, 2015

  1. b4f9e90
  2. Added simple C++ test case. (tmbdev, 7bd1ba0)

Commits on Oct 6, 2015

  1. 57caa03
  2. Reformatting. (tmbdev, aa41b08)
  3. Cleanup. (tmbdev, 721db07)
  4. 5cb3aa7
  5. baeaaf1
  6. Fixed full1 (tmbdev, 6b4e820)
  7. Converted Reversed. (tmbdev, 37c36c3)
  8. d22e92e
  9. Cleanup. (tmbdev, 3a2f564)

Commits on Oct 7, 2015

  1. Cleanup of attributes. (tmbdev, 2dd4527)
  2. Cleanup. (tmbdev, 962795a)
  3. Cleanup. (tmbdev, 4d438b9)
  4. Got rid of stringval. (tmbdev, b8d52e3)
  5. Cleanup. (tmbdev, 684b491)
  6. 9ee6965
  7. 83b6857

Commits on Oct 8, 2015

  1. Cleanup. (tmbdev, 244680e)
  2. Cleanup (tmbdev, b8eb2e5)
  3. Cleanup (tmbdev, c3dbedd)
  4. Cleanup (tmbdev, 6552630)
  5. ca3a41b
  6. Added test script (tmbdev, d4ff58f)
  7. Fixed test. (tmbdev, a6a33a8)
  8. Hooks for registering states. (tmbdev, 9d16777)
  9. Changing parameter handling. (tmbdev, 3b89c18)
  10. Cleanup of kind/name (tmbdev, 41c4424)
  11. Removed unused methods. (tmbdev, e458a66)
  12. More cleanup. (tmbdev, c732cf5)
  13. Removed myweights/weights (tmbdev, f6d290d)
  14. cleanup (tmbdev, aa86690)
  15. Cleanup (tmbdev, d9c9e8c)
  16. 417fc85

Commits on Oct 9, 2015

  1. Added missing file enroll.h (tmbdev, d04fd14)
  2. 44ea5b2
  3. Removed ITrainable; cleanup. (tmbdev, a8ccfbf)
  4. Refactored codecs. (tmbdev, ec2c472)
  5. cleanup (tmbdev, 1712246)
  6. c32b8cf
  7. 1ad8d4b
  8. Cleanup (tmbdev, 710a380)
  9. Update README.md (tmbdev, dc471c2)
  10. Fix in text learning. (tmbdev, 7ac3a5f)

Commits on Oct 10, 2015

  1. e4af8cd
  2. fec3156
  3. Cleanup of states etc. (tmbdev, 996aa08)
  4. 2973f1e

Showing with 5,680 additions and 2,647 deletions.
  1. +13 −1 .gitignore
  2. +7 −0 Makefile
  3. +3 −3 { → OLD}/clstm-hdf5-to-proto.cc
  4. 0 { → OLD}/clstm_extras.i
  5. +3 −4 { → OLD}/clstm_hdf5.cc
  6. +4 −4 { → OLD}/clstmconv.cc
  7. +3 −5 { → OLD}/clstmctc.cc
  8. 0 { → OLD}/clstmimg.cc
  9. 0 { → OLD}/clstmseq.cc
  10. +11 −14 { → OLD}/clstmtext.cc
  11. +3 −3 { → OLD}/h5eigen.h
  12. 0 { → OLD}/h5multi.h
  13. 0 { → OLD}/h5tensor.h
  14. 0 { → OLD}/lstm_test.i
  15. +1 −1 { → OLD}/multidim.h
  16. 0 { → OLD}/pyeigen.h
  17. 0 { → OLD}/pymulti.h
  18. +124 −0 OLD/test-timing.cc
  19. +166 −59 README.md
  20. +99 −89 SConstruct
  21. +14 −0 TODO.md
  22. +101 −0 batches.cc
  23. +174 −0 batches.h
  24. +642 −682 clstm.cc
  25. +235 −389 clstm.h
  26. +101 −238 clstm.i
  27. +564 −0 clstm_compute.cc
  28. +106 −0 clstm_compute.h
  29. +9 −0 clstm_compute_cuda.cc
  30. +83 −22 clstm_prefab.cc
  31. +68 −102 clstm_proto.cc
  32. +9 −20 clstmfilter.cc
  33. +42 −32 clstmfiltertrain.cc
  34. +129 −59 clstmhl.h
  35. +37 −44 clstmocr.cc
  36. +156 −153 clstmocrtrain.cc
  37. +74 −77 ctc.cc
  38. +6 −0 curun
  39. 0 display_server.py
  40. +48 −0 docker/16.04-cuda/Dockerfile
  41. +21 −0 docker/16.04/Dockerfile
  42. +8 −0 docker/Makefile
  43. +34 −0 enroll.h
  44. +157 −301 extras.cc
  45. +10 −305 extras.h
  46. +17 −17 misc/lstm-delay.ipynb
  47. BIN misc/textline.bin.png
  48. +1 −0 misc/textline.gt.txt
  49. +107 −0 numpyarray.h
  50. +2 −0 nvcc-wrapper
  51. +2 −2 pstring.h
  52. +6 −5 pytensor.h
  53. +1 −1 run-cmu
  54. +27 −0 run-gprof
  55. +36 −0 run-profile
  56. +27 −0 run-tests
  57. +1 −0 run-uw3-500
  58. +40 −12 setup.py
  59. +3 −0 tensor.cc
  60. +377 −0 tensor.h
  61. +162 −0 test-2d.cc
  62. +145 −0 test-batchlstm.cc
  63. +431 −0 test-cderiv.cc
  64. +114 −0 test-ctc.cc
  65. +210 −0 test-deriv.cc
  66. +3 −3 test-edit.cc
  67. +11 −0 test-filter.sh
  68. +153 −0 test-lstm.cc
  69. +34 −0 test-lstm.py
  70. +142 −0 test-lstm2.cc
  71. +10 −0 test-ocr.sh
  72. +28 −0 test.cu
  73. +325 −0 utils.h
14 changes: 13 additions & 1 deletion .gitignore
@@ -1,4 +1,4 @@
OLD
DONE
JUNK
.hg
book/
@@ -13,3 +13,15 @@ build/
*.os
*.a
*.so
.scons*
clstm.pb.cc
clstm.pb.h
clstmtrain
clstmfilter
clstmfiltertrain
clstmocr
clstmocrtrain
test-lstm
test-lstm2
clstm.py
clstm_wrap.cc
7 changes: 7 additions & 0 deletions Makefile
@@ -0,0 +1,7 @@
clstm_compute_cuda.o: clstm_compute_cuda.cc
scons clstm_compute_cuda.o
test.o: test.cu
/usr/local/cuda/bin/nvcc --std=c++11 -x cu -DEIGEN_USE_GPU --expt-relaxed-constexpr -I/usr/local/include/eigen3 -c test.cu
DONE: Dockerfile
docker build -t tmbdev/ubuntu-cuda .
touch DONE
6 changes: 3 additions & 3 deletions clstm-hdf5-to-proto.cc → OLD/clstm-hdf5-to-proto.cc
@@ -190,7 +190,7 @@ void save_net_raw(const char *fname, INetwork *net) {
h5->open(fname, true);
net->attributes["clstm-version"] = "1";
for (auto &kv : net->attributes) {
h5->setAttr(kv.first, kv.second);
h5->attr.set(kv.first, kv.second);
}
save_codec(h5.get(), "codec", net->codec, net->noutput());
save_codec(h5.get(), "icodec", net->icodec, net->ninput());
@@ -209,7 +209,7 @@ void load_net_raw(INetwork *net, const char *fname) {
using namespace h5eigen;
unique_ptr<HDF5> h5(make_HDF5());
h5->open(fname);
h5->getAttrs(net->attributes);
h5->attr.gets(net->attributes);
load_codec(net->icodec, h5.get(), "icodec");
load_codec(net->codec, h5.get(), "codec");
net->weights("", [&h5](const string &prefix, VecMat a, VecMat da) {
@@ -227,7 +227,7 @@ void load_attributes(map<string, string> &attributes, const string &fname) {
using namespace h5eigen;
unique_ptr<HDF5> h5(make_HDF5());
h5->open(fname.c_str());
h5->getAttrs(attributes);
h5->attr.gets(attributes);
}

Network load_net_hdf5(const string &fname) {
File renamed without changes.
7 changes: 3 additions & 4 deletions clstm_hdf5.cc → OLD/clstm_hdf5.cc
@@ -9,7 +9,6 @@
#include <string>
#include <memory>
#include <math.h>
#include <Eigen/Dense>
#include "clstm.pb.h"
#include <stdarg.h>

@@ -60,7 +59,7 @@ void save_net_raw(const char *fname, INetwork *net) {
h5->open(fname, true);
net->attributes["clstm-version"] = "1";
for (auto &kv : net->attributes) {
h5->setAttr(kv.first, kv.second);
h5->attr.set(kv.first, kv.second);
}
save_codec(h5.get(), "codec", net->codec, net->noutput());
save_codec(h5.get(), "icodec", net->icodec, net->ninput());
@@ -79,7 +78,7 @@ void load_net_raw(INetwork *net, const char *fname) {
using namespace h5eigen;
unique_ptr<HDF5> h5(make_HDF5());
h5->open(fname);
h5->getAttrs(net->attributes);
h5->attr.gets(net->attributes);
load_codec(net->icodec, h5.get(), "icodec");
load_codec(net->codec, h5.get(), "codec");
net->weights("", [&h5](const string &prefix, VecMat a, VecMat da) {
@@ -97,7 +96,7 @@ void load_attributes(map<string, string> &attributes, const string &fname) {
using namespace h5eigen;
unique_ptr<HDF5> h5(make_HDF5());
h5->open(fname.c_str());
h5->getAttrs(attributes);
h5->attr.gets(attributes);
}

Network load_net(const string &fname) {
8 changes: 4 additions & 4 deletions clstmconv.cc → OLD/clstmconv.cc
@@ -135,10 +135,10 @@ int main_seq(int argc, char **argv) {
int nhidden = getrenv("nhidden", getrenv("hidden", 100));
int nhidden2 = getrenv("nhidden2", getrenv("hidden2", -1));
net = make_net(net_type, {
{"ninput", dataset.nin},
{"noutput", dataset.nout},
{"nhidden", nhidden},
{"nhidden2", nhidden2},
{"ninput", dataset.nin},
{"noutput", dataset.nout},
{"nhidden", nhidden},
{"nhidden2", nhidden2},
});

double lrate = getdenv("lrate", 1e-4);
8 changes: 3 additions & 5 deletions clstmctc.cc → OLD/clstmctc.cc
@@ -526,7 +526,7 @@ int main(int argc, char **argv) {
print(string(argv[0]) + " " + usage);
exit(1);
}
try {
TRY {
string mode = getsenv("mode", "train");
if (getienv("eval", 0)) { // for old scripts
return main_eval(argc, argv);
@@ -542,11 +542,9 @@ int main(int argc, char **argv) {
} else {
return main_eval(argc, argv);
}
#ifndef NOEXCEPTION
} catch (const char *msg) {
} CATCH (const char *msg) {
print("EXCEPTION", msg);
#endif
} catch (...) {
} CATCH (...) {
print("UNKNOWN EXCEPTION");
}
}
File renamed without changes.
File renamed without changes.
25 changes: 11 additions & 14 deletions clstmtext.cc → OLD/clstmtext.cc
@@ -106,12 +106,12 @@ void set_inputs_with_eps(INetwork *net, wstring &s, int neps) {
Sequence &seq = net->inputs;
int d = net->ninput();
seq.clear();
seq.resize(cs.size() * (neps+1) + neps);
seq.resize(cs.size() * (neps + 1) + neps);
for (int i = 0; i < neps; i++) seq[i].setZero(d, 1);
for (int pos = 0; pos < cs.size(); pos++) {
seq[pos].setZero(d, 1);
seq[pos](cs[pos], 0) = 1.0;
for (int i = 0; i < neps; i++) seq[pos+1+i].setZero(d, 1);
for (int i = 0; i < neps; i++) seq[pos + 1 + i].setZero(d, 1);
}
}

@@ -205,7 +205,7 @@ int main_train(int argc, char **argv) {
net = load_net(load_name);
nclasses = net->codec.size();
iclasses = net->icodec.size();
neps = stoi(net->attributes["neps"]);
neps = net->attr.get("neps");
} else {
vector<int> icodec, codec;
get_codec(icodec, samples, &Sample::in);
@@ -240,8 +240,7 @@ int main_train(int argc, char **argv) {
double start_time = now();
double best_erate = 1e38;

int start =
stoi(getdef(net->attributes, "trial", getsenv("start", "-1"))) + 1;
int start = net->attr.get("trial", getienv("start", -1)) + 1;
if (start > 0) print("start", start);
for (int trial = start; trial < ntrain; trial++) {
bool report = (report_every > 0) && (trial % report_every == 0);
@@ -251,7 +250,7 @@ int main_train(int argc, char **argv) {
char fname[4096];
sprintf(fname, save_name.c_str(), trial);
print("saving", fname);
net->attributes["trial"] = to_string(trial);
net->attr.set("trial", trial);
save_net(fname, net);
if (after_save != "") system(after_save.c_str());
cout.flush();
@@ -265,8 +264,8 @@ int main_train(int argc, char **argv) {
if (save_every == 0 && erate < best_erate) {
best_erate = erate;
print("saving", save_name, "at", erate);
net->attributes["trial"] = to_string(trial);
net->attributes["last_err"] = to_string(best_erate);
net->attr.set("trial", trial);
net->attr.set("last_err", best_erate);
save_net(save_name, net);
if (after_save != "") system(after_save.c_str());
}
@@ -352,7 +351,7 @@ int main_filter(int argc, char **argv) {
if (load_name == "") THROW("must give load= parameter");
Network net;
net = load_net(load_name);
int neps = stoi(net->attributes["neps"]);
int neps = net->attr.get("neps");
dprint("codec", net->codec.size(), "icodec", net->icodec.size(), "neps",
neps);

@@ -413,18 +412,16 @@ int main(int argc, char **argv) {
print(string(argv[0]) + " " + usage);
exit(1);
}
try {
TRY {
string mode = getsenv("mode", "train");
if (mode == "train") {
return main_train(argc, argv);
} else if (mode == "filter") {
return main_filter(argc, argv);
}
#ifndef NOEXCEPTION
} catch (const char *msg) {
} CATCH (const char *msg) {
print("EXCEPTION", msg);
#endif
} catch (...) {
} CATCH (...) {
print("UNKNOWN EXCEPTION");
}
}
6 changes: 3 additions & 3 deletions h5eigen.h → OLD/h5eigen.h
@@ -125,15 +125,15 @@ struct HDF5 {
for (int i = 0; i < N; i++) a(i, 0) = data[i];
dataset.vlenReclaim(dtype, mspace, DSetMemXferPropList::DEFAULT, vl);
}
void setAttr(string name, string value) {
void attr.set(string name, string value) {
Group root(h5->openGroup("/"));
StrType strdatatype(PredType::C_S1, 256);
DataSpace attr_dataspace = DataSpace(H5S_SCALAR);
H5std_string buffer(value);
root.createAttribute(name, strdatatype, attr_dataspace)
.write(strdatatype, buffer);
}
string getAttr(string name) {
string attr.get(string name) {
Group root(h5->openGroup("/"));
StrType strdatatype(PredType::C_S1, 256);
DataSpace attr_dataspace = DataSpace(H5S_SCALAR);
@@ -142,7 +142,7 @@ struct HDF5 {
.read(strdatatype, buffer);
return buffer;
}
void getAttrs(map<string, string> &result) {
void attr.gets(map<string, string> &result) {
Group root(h5->openGroup("/"));
StrType strdatatype(PredType::C_S1, 256);
DataSpace attr_dataspace = DataSpace(H5S_SCALAR);
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion multidim.h → OLD/multidim.h
@@ -97,7 +97,7 @@ struct mdarray {
int rank() {
for (int i = 0; i < MAXRANK + 1; i++)
if (!dims[i]) return i;
THROW("bad rank");
MDCHECK(MAXRANK < MAXRANK);
}

// total number of elements in linearized array
File renamed without changes.
File renamed without changes.
124 changes: 124 additions & 0 deletions OLD/test-timing.cc
@@ -0,0 +1,124 @@
#include <assert.h>
#include <math.h>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "clstm.h"
#include "clstm_compute.h"
#include "extras.h"
#include "utils.h"

using namespace std;
using std::vector;
using std::shared_ptr;
using std::unique_ptr;
using std::to_string;
using std::make_pair;
using std::cout;
using std::stoi;
using namespace Eigen;
using namespace ocropus;
using std_string = std::string;
#define string std_string

typedef vector<Params> ParamVec;

double sqr(double x) { return x * x; }

double randu() {
static double state = 0.23498023948923408293248;
state = 179.93489901293380918 * state + 0.719408230890328424;
state -= floor(state);
return state;
}

double uniform(double lo = 0.0, double hi = 1.0) {
double x = fabs(randu());
double result = (hi - lo) * x + lo;
PRINT(result);
return result;
}
double exp_uniform(double lo = 1.0, double hi = 100.0) {
assert(lo > 0 && hi > lo);
double result = exp(uniform(log(lo), log(hi)));
PRINT(result);
return result;
}

void randten(Tensor2 &a, int n, int m) {
a.resize(n, m);
for (int i = 0; i < n; i++) {
for (int j = 0; j < m; j++) {
a(i, j) = randu();
}
}
}

vector<vector<int>> conditions;

struct Timing {
string prefix;
Context *context;
Tensor2 a, b, c;
Timing() {}
Timing(string prefix, Context *context) : prefix(prefix), context(context) {}
void operator<<=(function<void(Tensor2 &, Tensor2 &, Tensor2 &)> f) {
for (int i = 0; i < conditions.size(); i++) {
double total = 0.0;
double count = 0;
int n = conditions[i][0];
int l = conditions[i][1];
int m = conditions[i][2];
assert(n > 0 && n < 100000);
assert(l > 0 && l < 100000);
assert(m > 0 && m < 100000);
for (int k = 0; k < 10; k++) {
a.context = context;
b.context = context;
c.context = context;
randten(a, n, l);
randten(b, l, m);
c.resize(n, m);
double start = now();
f(c, a, b);
double finish = now();
total += finish - start;
count++;
}
print(prefix, n, l, m, total / count);
}
}
};

inline Eigen::array<Eigen::IndexPair<int>, 1> axispairs(int i, int j) {
Eigen::array<Eigen::IndexPair<int>, 1> result = {Eigen::IndexPair<int>(i, j)};
return result;
}

int main(int argc, char **argv) {
int ntrial = getienv("ntrial", 1000);
int maxmat = getienv("maxmat", 1000);
for (int i = 0; i < ntrial; i++) {
int n, l, m;
n = exp_uniform(1, maxmat);
l = exp_uniform(1, maxmat);
m = exp_uniform(1, maxmat);
assert(n > 0 && n < 100000);
assert(l > 0 && l < 100000);
assert(m > 0 && m < 100000);
vector<int> v{n, l, m};
conditions.push_back(v);
}
TRY {
Timing nocontext("none", new Context());
nocontext <<= [](Tensor2 &c, Tensor2 &a, Tensor2 &b) {
c = a().contract(b(), axispairs(1, 0));
};
Timing threaded("threaded", new ThreadedContext(4));
threaded <<= [](Tensor2 &c, Tensor2 &a, Tensor2 &b) {
c = a().contract(b(), axispairs(1, 0));
};
}
CATCH(const char *message) { print("ERROR", message); }
}
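
For orientation, the operation the Timing harness above measures is an ordinary Eigen tensor contraction. The following is an illustrative sketch in plain Eigen, not code from this diff; the repository's Tensor2 and Context wrappers are replaced by raw Eigen types, and the thread count of 4 simply mirrors the ThreadedContext(4) call above.

```c++
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int n = 64, l = 32, m = 16;
  Eigen::Tensor<float, 2> a(n, l), b(l, m), c(n, m);
  a.setRandom();
  b.setRandom();

  // Contract a's second index with b's first index, i.e. a matrix product;
  // this is the same axispairs(1, 0) contraction the benchmark times.
  Eigen::array<Eigen::IndexPair<int>, 1> axes = {Eigen::IndexPair<int>(1, 0)};

  c = a.contract(b, axes);              // default, single-threaded evaluation

  Eigen::ThreadPool pool(4);            // 4 threads, as in ThreadedContext(4)
  Eigen::ThreadPoolDevice dev(&pool, 4);
  c.device(dev) = a.contract(b, axes);  // same contraction on the thread pool

  std::cout << "c(0,0) = " << c(0, 0) << "\n";
}
```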
225 changes: 166 additions & 59 deletions README.md
@@ -1,43 +1,152 @@
# clstm

[![Join the chat at https://gitter.im/tmbdev/clstm](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/tmbdev/clstm?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![CircleCI](https://circleci.com/gh/tmbdev/clstm/tree/master.svg?style=svg)](https://circleci.com/gh/tmbdev/clstm/tree/master)

To build a standalone C library, run
CLSTM is an implementation of the
[LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory) recurrent neural
network model in C++, using the [Eigen](http://eigen.tuxfamily.org) library for
numerical computations.

scons
sudo scons install

Prerequisites:
# Status and scope

CLSTM is mainly in maintenance mode now. It was created at a time when there weren't a lot of good LSTM
implementations around, but several good options have become available over the last year. Nevertheless, if
you need a small library for text line recognition with few dependencies, CLSTM is still a good option.

# Installation using Docker

You can train and run clstm without installation to the local machine using the
docker image, which is based on Ubuntu 16.04. This is the best option for
running clstm on a Windows host.

You can either run the [last version of the clstm
image](https://hub.docker.com/r/kbai/clstm) from Docker Hub or build the Docker
image from the repo (see [`./docker/Dockerfile`](./docker/Dockerfile)).

The command line syntax differs from a native installation:

```
docker run --rm -it -e [VARIABLES...] kbai/clstm BINARY [ARGS...]
```

is equivalent to

```
[VARIABLES...] BINARY [ARGS...]
```

For example:

- scons, Eigen
```
docker run --rm -it -e ntrain=1000 kbai/clstm clstmocrtrain traininglist.txt
```

is equivalent to

```
ntrain=1000 clstmocrtrain traininglist.txt
```

# Installation from source

## Prerequisites

- scons, swig, Eigen
- protocol buffer library and compiler
- HDF5 libraries and C++, Python bindings (optional, for HDF5 I/O)
- ZMQ libraries and C++, Python bindings (optional, for display)
- libpng
- Optional: HDF5, ZMQ, Python

```sh
# Ubuntu 15.04, 16.04 / Debian 8, 9
sudo apt-get install scons libprotobuf-dev protobuf-compiler libpng-dev libeigen3-dev swig

# Ubuntu 14.04:
sudo apt-get install scons libprotobuf-dev protobuf-compiler libpng-dev swig
```

The Debian repositories jessie-backports and stretch include sufficiently new libeigen3-dev packages.

It is also possible to download [Eigen](http://eigen.tuxfamily.org) with Tensor support (> v3.3-beta1)
and copy the header files to an `include` path:

```sh
# with wget
wget 'https://github.com/RLovelett/eigen/archive/3.3-rc1.tar.gz'
tar xf 3.3-rc1.tar.gz
rm -f /usr/local/include/eigen3
mv eigen-3.3-rc1 /usr/local/include/eigen3
# or with git:
sudo git clone --depth 1 --single-branch --branch 3.3-rc1 \
"https://github.com/RLovelett/eigen" /usr/local/include/eigen3
```

To use the [visual debugging methods](#user-content-display), additionally:

On Ubuntu, this means:
```sh
# Ubuntu 15.04:
sudo apt-get install libzmq3-dev libzmq3 libzmqpp-dev libzmqpp3 libpng12-dev
```

For [HDF5](#user-content-hdf5), additionally:

```sh
# Ubuntu 15.04:
sudo apt-get install hdf5-helpers libhdf5-8 libhdf5-cpp-8 libhdf5-dev python-h5py

# Ubuntu 14.04:
sudo apt-get install hdf5-helpers libhdf5-7 libhdf5-dev python-h5py
```

## Building

To build a standalone C library, run

scons
sudo scons install

sudo apt-get install libeigen3-dev \
hdf5-helpers libhdf5-8 libhdf5-cpp-8 libhdf5-dev python-h5py \
libprotobuf-dev libprotobuf9 protobuf-compiler \
libzmq3-dev libzmq3 libzmqpp-dev libzmqpp3 libpng12-dev

There are a bunch of options:

- `debug=1` build with debugging options, no optimization
- `display=1` build with display support for debugging (requires ZMQ, Python)
- <a id="display">`display=1`</a> build with display support for debugging (requires ZMQ, Python)
- `prefix=...` install under a different prefix (untested)
- `eigen=...` where to look for Eigen include files (should contain `Eigen/Eigen`)
- `hdf5lib=hdf5` what HDF5 library to use; enables HDF5 command line programs (may need `hdf5_serial` in some environments)
- `openmp=...` build with multi-processing support. Set the
[`OMP_NUM_THREADS`](https://eigen.tuxfamily.org/dox/TopicMultiThreading.html)
environment variable to the number of threads for Eigen to use.
- <a id="hdf5">`hdf5lib=hdf5`</a> what HDF5 library to use; enables HDF5 command line
programs (may need `hdf5_serial` in some environments)

## Running the tests

After building the executables, you can run two simple test runs as follows:

- `run-cmu` will train an English-to-IPA LSTM
- `run-uw3-500` will download a small OCR training/test set and train an OCR LSTM

There is a full set of tests in the current version of clstm; just
run them with:

```sh
./run-tests
```

This will check:

- gradient checkers for layers and compute steps
- training a simple model through the C++ API
- training a simple model through the Python API
- checking the command line training tools, including loading and saving

## Python bindings

To build the Python extension, run

python setup.py build
sudo python setup.py install
```sh
python setup.py build
sudo python setup.py install
```

(this is currently broken)

# Documentation / Examples

@@ -51,35 +160,36 @@ http://nbviewer.ipython.org/github/tmbdev/clstm/tree/master/misc/

The `clstm` library operates on the Sequence type as its fundamental
data type, representing variable length sequences of fixed length vectors.
Internally, this is represented as an STL vector of Eigen dynamic vectors

typedef stl::vector<Eigen::VectorXf> Sequence;

NB: This will be changed to an Eigen::Tensor
The underlying Sequence type is a rank 4 tensor with accessors for
individual rank-2 tensors at different time steps.

Networks are built from objects implementing the `INetwork` interface.
The `INetwork` interface contains:

struct INetwork {
Sequence inputs, d_inputs; // input sequence, input deltas
Sequence outputs, d_outputs; // output sequence, output deltas
void forward(); // propagate inputs to outputs
void backward(); // propagate d_outputs to d_inputs
void update(); // update weights from the last backward() step
void setLearningRate(Float,Float); // set learning rates
...
};

Network structures can be hierarchical and there are some network
```c++
struct INetwork {
Sequence inputs, d_inputs; // input sequence, input deltas
Sequence outputs, d_outputs; // output sequence, output deltas
void forward(); // propagate inputs to outputs
void backward(); // propagate d_outputs to d_inputs
void update(); // update weights from the last backward() step
void setLearningRate(Float,Float); // set learning rates
...
};
```
Network structures can be hierarchical and there are some network
implementations whose purpose it is to combine other networks into more
complex structures.
struct INetwork {
...
vector<shared_ptr<INetwork>> sub;
void add(shared_ptr<INetwork> net);
...
};
```c++
struct INetwork {
...
vector<shared_ptr<INetwork>> sub;
void add(shared_ptr<INetwork> net);
...
};
```

At its lowest level, layers are created by:

@@ -96,7 +206,7 @@ There are three different functions for constructing layers and networks:
- `make_net(kind,args)` initializes a whole collection of layers at once
- `make_net_init(kind,params)` is like `make_net`, but parameters are given in string form

The `layer(kind,ninput,noutput,args,sub)` function will perform
The `layer(kind,ninput,noutput,args,sub)` function will perform
these steps in sequence.

Layers and networks are usually passed around as `shared_ptr<INetwork>`;
@@ -106,10 +216,12 @@ This can be used to construct network architectures in C++ pretty
easily. For example, the following creates a network that stacks
a softmax output layer on top of a standard LSTM layer:

Network net = layer("Stacked", ninput, noutput, {}, {
layer("LSTM", ninput, nhidden,{},{}),
layer("SoftmaxLayer", nhidden, noutput,{},{})
});
```c++
Network net = layer("Stacked", ninput, noutput, {}, {
layer("LSTM", ninput, nhidden,{},{}),
layer("SoftmaxLayer", nhidden, noutput,{},{})
});
```

Note that you need to make sure that the number of input and
output units are consistent between layers.
@@ -128,8 +240,8 @@ and testing code, for plotting, and for HDF5 input/output. Unlike Eigen,
it uses standard C/C++ row major element order, as libraries like
HDF5 expect. (NB: This will be replaced with Eigen::Tensor.)

LSTM models are stored in protocol buffer format (`clstm.proto`),
although adding new formats is easy. There is an older HDF5-based
LSTM models are stored in protocol buffer format (`clstm.proto`),
although adding new formats is easy. There is an older HDF5-based
storage format.

# Python API
@@ -138,7 +250,7 @@ The `clstm.i` file implements a simple Python interface to clstm, plus
a wrapper that makes an INetwork mostly a replacement for the lstm.py
implementation from ocropy.

# Comand Line Drivers
# Command Line Drivers

There are several command line drivers:

@@ -148,7 +260,7 @@ There are several command line drivers:
- `clstmocrtrain training-images test-images` learns OCR (or image-to-text) transformations;
- input files are lists of text line images; the corresponding UTF-8 ground truth is expected in the corresponding `.gt.txt` file
- `clstmocr` applies learned OCR models

In addition, you get the following HDF5-based commands:

- clstmseq learns sequence-to-sequence mappings
@@ -157,15 +269,10 @@ There are several command line drivers:

Note that most parameters are passed through the environment:

lrate=3e-5 clstmctc uw3-dew.h5

```
lrate=3e-5 clstmctc uw3-dew.h5
```

See the notebooks in the `misc/` subdirectory for documentation on the parameters and examples of usage.

(You can find all parameters via `grep 'get.env' *.cc`.)

# TODO / UPCOMING

- Lua and Torch bindings
- more recurrent network types
- replacement of mdarray with Eigen Tensors
- 2D LSTM support
188 changes: 99 additions & 89 deletions SConstruct
@@ -1,163 +1,173 @@
# -*- Python -*-
import os,sys,os.path
import distutils.sysconfig

if os.path.isdir(".hg"):
hgversion = os.popen("hg -q id").read().strip()
elif os.path.isdir(".git"):
hgversion = os.popen("git rev-list HEAD | sed 1q").read().strip()
else:
hgversion = os.popen("date").read().strip()
print "version",hgversion
# CLSTM requires C++11, and installs in /usr/local by default

import os
import sys
import os.path
import distutils.sysconfig

# A bunch of utility functions to make the rest of the SConstruct file a little simpler.
# A bunch of utility functions to make the rest of the SConstruct file a
# little simpler.

def die(msg):
sys.stderr.write("ERROR "+msg+"\n")
sys.stderr.write("ERROR " + msg + "\n")
Exit(1)

def option(name,dflt):
return (ARGUMENTS.get(name) or os.environ.get(name,dflt))

def findonpath(fname,path):
def option(name, dflt):
result = (ARGUMENTS.get(name) or os.environ.get(name, dflt))
if type(dflt)==int: result = int(result)
return result
def findonpath(fname, path):
for dir in path:
if os.path.exists(os.path.join(dir,fname)):
if os.path.exists(os.path.join(dir, fname)):
return dir
raise die("%s: not found" % fname)
die("%s: not found" % fname)

# A protocol buffer builder.

def protoc(target, source, env):
os.system("protoc %s --cpp_out=." % source[0])
def protoemitter(target, source, env):
for s in source:
base,_ = os.path.splitext(str(s))
target.extend([base+".pb.cc", base+".pb.h"])
base, _ = os.path.splitext(str(s))
target.extend([base + ".pb.cc", base + ".pb.h"])
return target, source

protoc_builder = Builder(action=protoc,
emitter=protoemitter,
src_suffix=".proto")

# CLSTM requires C++11, and installes in /usr/local by default

prefix = option('prefix', "/usr/local")

env = Environment()
env.Append(CPPDEFINES={"HGVERSION" : '\\"'+hgversion+'\\"'})
env.Append(CPPDEFINES={'THROW' : 'throw'})
env.Append(CPPDEFINES={'THROW': 'throw', 'CATCH': 'catch', 'TRY': 'try'})
env.Append(CPPDEFINES={'CLSTM_ALL_TENSOR': '1'})
env["BUILDERS"]["Protoc"] = protoc_builder

if option("double",0):
env.Append(CPPDEFINES={'LSTM_DOUBLE' : '1'})
options = option("options", "")
env["CXX"] = option("CXX", "g++") + " --std=c++11 -Wno-unused-result "+options

# With omp=1 support, Eigen and other parts of the code may use multi-threading.
if option("double", 0):
env.Append(CPPDEFINES={'LSTM_DOUBLE': '1'})

if option("omp",0):
env["CXX"] = option("CXX", "g++") + " --std=c++11 -Wno-unused-result -fopenmp"
else:
env["CXX"] = option("CXX", "g++") + " --std=c++11 -Wno-unused-result"

# With profile=1, the code will be compiled suitable for profiling and debug.
# With debug=1, the code will be compiled suitable for debugging.

if option("profile",0):
env.Append(CXXFLAGS="-g -pg -fno-inline".split())
env.Append(CCFLAGS="-g -pg".split())
profile = option("profile", 0)
debug = option("debug", 0)

if profile>0:
#env.Append(CXXFLAGS="-g -pg -O2".split())
env.Append(CCFLAGS="-g -pg -O2".split())
env.Append(LINKFLAGS="-g -pg".split())
elif option("debug",0):
env.Append(CXXFLAGS="-g -fno-inline".split())
elif debug>1:
#env.Append(CXXFLAGS="-g -fno-inline".split())
env.Append(CCFLAGS="-g".split())
env.Append(LINKFLAGS="-g".split())
else:
env.Append(CXXFLAGS="-g -O3 -finline".split())
elif debug>0:
#env.Append(CXXFLAGS="-g".split())
env.Append(CCFLAGS="-g".split())

# Extra layers (old layers or testing)

if option("extras",0):
env.Append(CPPDEFINES={'CLSTM_EXTRAS' : 1})
env.Append(LINKFLAGS="-g".split())
elif debug==0:
#env.Append(CXXFLAGS="-g -O3 -DEIGEN_NO_DEBUG".split())
env.Append(CCFLAGS="-g -O3 -DEIGEN_NO_DEBUG".split())
elif debug<0:
env.Append(CCFLAGS="-g -Ofast -DEIGEN_NO_DEBUG -finline -ffast-math -fno-signaling-nans -funsafe-math-optimizations -ffinite-math-only -march=native".split())

# Try to locate the Eigen include files (they are in different locations
# on different systems); you can specify an include path for Eigen with
# `eigen=/mypath/include`

if option("eigen","")=="":
inc = findonpath("Eigen/Eigen","""
/usr/include
if option("eigen", "") == "":
inc = findonpath("Eigen/Eigen", """
/usr/local/include
/usr/local/include/eigen3
/usr/include
/usr/include/eigen3""".split())
else:
inc = findonpath("Eigen/Eigen",[option("eigen")])
inc = findonpath("Eigen/Eigen", [option("eigen", "")])

env.Append(CPPPATH=[inc])
env.Append(LIBS=["png", "protobuf"])

# You can enable display debugging with `display=1`
# You can enable display debugging with `display=1` (probably not working right now)

if option("display",0):
env.Append(LIBS=["zmqpp","zmq"])
env.Append(CPPDEFINES={'add_raw' : option("add_raw",'add')})
if option("display", 0):
env.Append(LIBS=["zmqpp", "zmq"])
env.Append(CPPDEFINES={'add_raw': option("add_raw", 'add')})
else:
env.Append(CPPDEFINES={'NODISPLAY' : 1})
env.Append(CPPDEFINES={'NODISPLAY': 1})

env.Append(LIBS=["png","protobuf"])
if option("openmp", 0):
env.Append(CCFLAGS="-fopenmp")

# We need to compile the protocol buffer definition as part of the build.

env.Protoc("clstm.proto")

cuda = env.Object("clstm_compute_cuda.o", "clstm_compute_cuda.cc",
CXX="./nvcc-wrapper")

# Build the CLSTM library.

libs = env["LIBS"]
libsrc = ["clstm.cc", "ctc.cc", "clstm_proto.cc", "clstm_prefab.cc",
"extras.cc", "clstm.pb.cc"]
libclstm = env.StaticLibrary("clstm", source = libsrc)
"tensor.cc", "batches.cc", "extras.cc", "clstm.pb.cc",
"clstm_compute.cc"]
if option("gpu", 0):
env.Append(LIBS=["cudart","cublas","cuda"])
env.Append(LIBPATH=["/usr/local/cuda/lib64"])
env.Append(CPPPATH=["/usr/local/cuda/include"])
env.Append(CPPDEFINES={'CLSTM_CUDA' : 1, 'EIGEN_USE_GPU' : 1})
libsrc = [cuda] + libsrc

programs = """clstmtext clstmfilter clstmfiltertrain clstmocr clstmocrtrain""".split()
libs = env["LIBS"]
libclstm = env.StaticLibrary("clstm", libsrc)

all = [libclstm]

programs = """clstmfilter clstmfiltertrain clstmocr clstmocrtrain""".split()
for program in programs:
env.Program(program,[program+".cc"],LIBS=[libclstm]+libs)
all += [env.Program(program, [program + ".cc"], LIBS=[libclstm] + libs)]
Default(program)

env.Program("test-forward",["test-forward.cc"],LIBS=[libclstm]+libs)

# env.Program("fstfun", "fstfun.cc", LIBS=[libclstm]+libs+["fst","dl"])

Alias('install-lib',
Install(os.path.join(prefix,"lib"), libclstm))
Install(os.path.join(prefix, "lib"), libclstm))
Alias('install-include',
Install(os.path.join(prefix,"include"), ["clstm.h"]))
Install(os.path.join(prefix, "include"), ["clstm.h"]))
Alias('install',
['install-lib', 'install-include'])

# If you have HDF5 installed, set hdf5lib=hdf5_serial (or something like that)
# and you will get a bunch of command line programs that can be trained from
# HDF5 data files. This code is messy and may get deprecated eventually.
# A simple test of the C++ LSTM implementation.
all += [env.Program("test-lstm", ["test-lstm.cc"], LIBS=[libclstm] + libs)]
all += [env.Program("test-lstm2", ["test-lstm2.cc"], LIBS=[libclstm] + libs)]
all += [env.Program("test-batchlstm", ["test-batchlstm.cc"], LIBS=[libclstm] + libs)]
all += [env.Program("test-deriv", ["test-deriv.cc"], LIBS=[libclstm] + libs)]
all += [env.Program("test-cderiv", ["test-cderiv.cc"], LIBS=[libclstm] + libs)]
all += [env.Program("test-ctc", ["test-ctc.cc"], LIBS=[libclstm] + libs)]
all += [env.Program("test-2d", ["test-2d.cc"], LIBS=[libclstm] + libs)]

if option("hdf5lib", "")!="":
h5env = env.Clone()
inc = findonpath("hdf5.h","""
/usr/include
/usr/local/include/hdf5/serial
/usr/local/include/hdf5
/usr/include/hdf5/serial
/usr/include/hdf5""".split())
h5env.Append(CPPPATH=[inc])
h5env.Append(LIBS=["hdf5_cpp"])
h5env.Append(LIBS=[option("hdf5lib", "hdf5_serial")])
h5env.Prepend(LIBS=[libclstm])
for program in "clstmctc clstmseq clstmconv".split():
h5env.Program(program,[program+".cc"])


# You can construct the Python extension from scons using `pyswig=1`; however,
# You can construct the Python extension from scons using the `pyswig` target; however,
# the recommended way of compiling it is with "python setup.py build"

if option("pyswig", 0):
swigenv = env.Clone( SWIGFLAGS=["-python","-c++"], SHLIBPREFIX="")
swigenv.Append(CPPPATH=[distutils.sysconfig.get_python_inc()])
swigenv.SharedLibrary("_clstm.so",
["clstm.i", "clstm.cc", "extras.cc", "clstm.pb.cc"],
LIBS=libs)

swigenv = env.Clone()
swigenv.Tool("swig")
swigenv.Append(SWIG="3.0")
swigenv.Append(CPPPATH=[distutils.sysconfig.get_python_inc()])
pyswig = swigenv.SharedLibrary("_clstm.so",
["clstm.i", "clstm.cc", "clstm_proto.cc", "extras.cc",
"clstm.pb.cc", "clstm_compute.cc",
"clstm_prefab.cc", "ctc.cc"],
SWIGFLAGS=['-python', '-c++'],
SHLIBPREFIX="",
LIBS=libs)
Alias('pyswig', [pyswig])

destlib = distutils.sysconfig.get_config_var("DESTLIB")
Alias('pyinstall',
Install(os.path.join(destlib, "site-packages"),
["_clstm.so", "clstm.py"]))

Alias('all', [all])
14 changes: 14 additions & 0 deletions TODO.md
@@ -0,0 +1,14 @@
TODO:

- 2D primitives: switch_batch_time, stack_neighbors
- 2D command line
- single mat option
- implement GRU, RNN, IIR+log
- implement per-class or per-step weights
- OMP parallel training
- add convolutional layers

Experiments:

- different initializations
- other update rules
101 changes: 101 additions & 0 deletions batches.cc
@@ -0,0 +1,101 @@
#include "batches.h"
#include <string>

// random initialization of sequences etc.

namespace {

// very simple "random" number generator; this
// is just used for initializations

double state = getenv("seed") ? atof(getenv("seed")) : 0.1;

inline double randu() {
state = 189843.9384938 * state + 0.328340981343;
state -= floor(state);
return state;
}

inline double randn() {
double u1 = randu();
double u2 = randu();
double r = -2 * log(u1);
double theta = 2 * M_PI * u2;
double z0 = r * cos(theta);
return z0;
}
}

namespace ocropus {

// Random initializations with different distributions.

void rinit(TensorMap2 a, Float s, const char *mode_, Float offset) {
int n = a.dimension(0), m = a.dimension(1);
std::string mode(mode_);
if (mode == "unif") {
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) a(i, j) = 2 * s * randu() - s + offset;
} else if (mode == "negbiased") {
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) a(i, j) = 3 * s * randu() - 2 * s + offset;
} else if (mode == "pos") {
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) a(i, j) = s * randu() + offset;
} else if (mode == "neg") {
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) a(i, j) = -s * randu() + offset;
} else if (mode == "normal") {
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) a(i, j) = s * randn() + offset;
}
}

void rinit(Tensor2 &t, int r, int c, Float s, const char *mode_, Float offset) {
// use a temporary so that initialization of GPU tensors works
Tensor2 temp;
temp.resize(r, c);
rinit(temp(), s, mode_, offset);
t = temp;
}

void rinit(Batch &m, int r, int c, Float s, const char *mode, Float offset) {
rinit(m.v, r, c, s, mode, offset);
m.zeroGrad();
}

void rinit(Sequence &m, int N, int r, int c, Float s, const char *mode,
Float offset) {
m.steps.resize(N);
for (int t = 0; t < N; t++) rinit(m[t], r, c, s, mode, offset);
}

// checking for NaNs in different objects

bool anynan(TensorMap2 a) {
for (int j = 0; j < a.dimension(0); j++) {
for (int k = 0; k < a.dimension(1); k++) {
float x = a(j, k);
if (std::isnan(x)) return true;
}
}
return false;
}

bool anynan(Batch &a) {
if(anynan(a.v())) return true;
if(anynan(a.d())) return true;
return false;
}
bool anynan(Params &a) {
if (anynan(a.v())) return true;
if (anynan(a.d())) return true;
return false;
}

bool anynan(Sequence &a) {
for (int i = 0; i < a.size(); i++)
if (anynan(a[i])) return true;
return false;
}
}
174 changes: 174 additions & 0 deletions batches.h
@@ -0,0 +1,174 @@
#ifndef ocropus_batches__
#define ocropus_batches__

#include <array>
#include <vector>
#include "tensor.h"
#include "utils.h"

namespace ocropus {
using std::vector;

struct Batch {
Tensor2 v;
Tensor2 d;
virtual ~Batch() {}
int rows() const { return v.dimension(0); }
int cols() const { return v.dimension(1); }
int getGpu() { return v.getGpu(); }
void clear() {
v.setZero();
d.setZero();
}
void zeroGrad() { d.setZero(rows(), cols()); }
};

struct BatchStorage : Batch {
void setGpu(int n) {
v.setGpu(n);
d.setGpu(n);
}
void like(Batch &other) {
setGpu(other.getGpu());
resize(other.rows(), other.cols());
}
void setZero(int n, int m) {
v.setZero(n, m);
d.setZero(n, m);
}
void resize(int n, int m) { setZero(n, m); }
};

typedef BatchStorage Params;

// typedef vector<Mat> Sequence;
struct Sequence {
int gpu = -1;
vector<BatchStorage> steps;
Float *data = nullptr;
int dims[4] = {0, 0, 0, 0};

TensorMap4 map4() {
return TensorMap4(data, dims[0], dims[1], dims[2], dims[3]);
}
Sequence() {}
Sequence(int N, int r, int b) { resize(N, r, b); }
Sequence(Sequence &other) {
like(other);
copy(other);
}
Sequence(const Sequence &other) {
like((Sequence &)other);
copy((Sequence &)other);
}
~Sequence() { free_gpu(data, gpu); }
int getGpu() const { return gpu; }
void setGpu(int n) {
gpu = n;
clear();
}
void clear() {
steps.clear();
if (data) free_gpu(data, gpu);
data = nullptr;
dims[0] = 0;
dims[1] = 0;
dims[2] = 0;
dims[3] = 0;
}
void allocate(int N, int n, int m) {
if (data) clear();
dims[0] = n;
dims[1] = m;
dims[2] = 2;
dims[3] = N;
alloc_gpu(data, nbytes(), gpu);
}

int size() const { return dims[3]; }
int rows() const { return dims[0]; }
int cols() const { return dims[1]; }
int total_size() const { return dims[0] * dims[1] * dims[2] * dims[3]; }
int nbytes() const { return total_size() * sizeof *data; }
void check() const {
// the data pointer must be null iff the sequence has zero length
assert(dims[3] == 0 ? !data : true);
assert(!data ? dims[3] == 0 : true);
if (!data) return;
// batches must have non-zero size
assert(steps[0].rows() > 0);
assert(steps[0].cols() > 0);
int N = size();
int n = rows();
int m = cols();
for (int t = 0; t < N; t++) {
// all batches must be displaced to the right locations and consistent
assert(steps[t].v.displaced);
assert(steps[t].d.displaced);
assert(steps[t].v.ptr == data + (n * m) * (2 * t));
assert(steps[t].d.ptr == data + (n * m) * (2 * t + 1));
assert(steps[t].v.getGpu() == getGpu());
assert(steps[t].rows() == steps[0].rows());
assert(steps[t].cols() == steps[0].cols());
}
}
void resize(int N, int n, int m) {
check();
if (N != size() || n != rows() || m != cols()) {
clear();
allocate(N, n, m);
steps.resize(N);
for (int t = 0; t < N; t++) {
steps[t].v.displaceTo(data + (n * m) * (2 * t), n, m, gpu);
steps[t].d.displaceTo(data + (n * m) * (2 * t + 1), n, m, gpu);
}
}
//reset data, whether new or reused
memset(data,0,nbytes());
}
void like(const Sequence &other) {
resize(other.size(), other.rows(), other.cols());
}

void copy(const Sequence &other) {
other.check();
like(other);
check();
memcpy_gpu(data, gpu, other.data, other.gpu, nbytes());
}
void operator=(Sequence &other) { copy(other); }
Batch &operator[](int i) { return steps[i]; }
const Batch &operator[](int i) const { return steps[i]; }
void zero() {
for (int t = 0; t < steps.size(); t++) steps[t].clear();
}
void zeroGrad() {
for (int t = 0; t < steps.size(); t++) steps[t].zeroGrad();
}
};

void rinit(TensorMap2 m, Float s, const char *mode = "unif",
Float offset = 0.0);
void rinit(Batch &m, int no, int ni, Float s, const char *mode = "unif",
Float offset = 0.0);
void rinit(Params &m, int N, int no, int ni, Float s, const char *mode = "pos",
Float offset = 0.0);
void rinit(Sequence &m, int no, int ni, Float s, const char *mode = "unif",
Float offset = 0.0);
bool anynan(Batch &a);
bool anynan(Params &a);
bool anynan(Sequence &a);

inline void check_normalized(Batch &a) {
for (int b = 0; b < a.cols(); b++) {
double total = 0.0;
for (int i = 0; i < a.rows(); i++) total += a.v(i, b);
assert(fabs(total - 1.0) < 1e-5);
}
}
inline void check_normalized(Sequence &a) {
for (int t = 0; t < a.size(); t++) check_normalized(a[t]);
}
}

#endif
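
The invariants that Sequence::resize() and Sequence::check() above maintain (one contiguous allocation holding, per time step, an n*m value block followed by an n*m gradient block) can be illustrated without any of the repository's tensor types. This is a standalone sketch with invented names, not part of the diff:

```c++
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  // One contiguous buffer, as in Sequence::allocate(): for each of the N time
  // steps there is an n*m block of values followed by an n*m block of
  // gradients (the factor 2 in dims[2]).
  const int N = 3, n = 4, m = 2;
  std::vector<float> data(static_cast<std::size_t>(N) * n * m * 2, 0.0f);

  // The offsets Sequence::check() asserts: steps[t].v at (n*m)*(2t),
  // steps[t].d at (n*m)*(2t+1).
  auto value_block = [&](int t) { return data.data() + n * m * (2 * t); };
  auto grad_block = [&](int t) { return data.data() + n * m * (2 * t + 1); };

  for (int t = 0; t < N; t++) {
    assert(grad_block(t) == value_block(t) + n * m);  // d follows v immediately
    if (t > 0)  // consecutive steps are packed back to back
      assert(value_block(t) == grad_block(t - 1) + n * m);
  }
  return 0;
}
```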
1,324 changes: 642 additions & 682 deletions clstm.cc

Large diffs are not rendered by default.

624 changes: 235 additions & 389 deletions clstm.h

Large diffs are not rendered by default.

339 changes: 101 additions & 238 deletions clstm.i

Large diffs are not rendered by default.

564 changes: 564 additions & 0 deletions clstm_compute.cc

Large diffs are not rendered by default.

106 changes: 106 additions & 0 deletions clstm_compute.h
@@ -0,0 +1,106 @@
#ifndef clstm_compute__
#define clstm_compute__

#include <utility>
#include "batches.h"

namespace ocropus {
using namespace std;

constexpr int LIN = 0;
constexpr int SIG = 1;
constexpr int TANH = 2;
constexpr int RELU = 3;
constexpr int LOGMAG = 4;

extern Eigen::DefaultDevice default_device;

inline int gpu_id(Tensor2 &t) { return t.getGpu(); }
inline int gpu_id(Batch &b) { return gpu_id(b.v); }
inline int gpu_id(Sequence &s) { return gpu_id(s[0]); }

// If this has been compiled with CUDA, there is a gpu_device
// function in the CUDA-compiled code; otherwise, we default
// to something that always returns a nullptr for the GPU
// device.

#ifdef CLSTM_CUDA
Eigen::GpuDevice *gpu_device(int id);
#else
inline Eigen::GpuDevice *gpu_device(int id) {
assert(id < 0);
return nullptr;
}
#endif

template <class T>
inline Eigen::GpuDevice *gpu(T arg) {
int id = gpu_id(arg);
return gpu_device(id);
}

// This bit of macro and template magic allows us to
// transparently select between CPU and GPU versions of
// computations. The computations themselves are
// expressed using standard Eigen::Tensor notation and
// devices in clstm_compute.cc. Only clstm_compute.cc
// needs to be compiled with nvcc, greatly cutting down
// on the exposure to incompatibilities and bugs in nvcc.

#ifdef CLSTM_CUDA
#define DEFGENERIC(NAME, ...) \
template <typename Arg, typename... Args> \
void NAME(Arg &&arg, Args &&... args) { \
extern void NAME(Eigen::DefaultDevice *, __VA_ARGS__); \
extern void NAME(Eigen::GpuDevice *, __VA_ARGS__); \
Eigen::GpuDevice *dev = gpu_device(gpu_id(arg)); \
if (dev) { \
NAME(dev, arg, std::forward<Args>(args)...); \
return; \
} \
NAME(&default_device, arg, std::forward<Args>(args)...); \
}
#else
#define DEFGENERIC(NAME, ...) \
template <typename Arg, typename... Args> \
void NAME(Arg &&arg, Args &&... args) { \
extern void NAME(Eigen::DefaultDevice *, __VA_ARGS__); \
NAME(&default_device, arg, std::forward<Args>(args)...); \
}
#endif

DEFGENERIC(forward_nonlin, Batch &, Batch &, int);
DEFGENERIC(backward_nonlin, Batch &, Batch &, int);
DEFGENERIC(forward_nonlin0, Batch &, int);
DEFGENERIC(backward_nonlin0, Batch &, int);
DEFGENERIC(forward_lin1, Batch &, Params &, Batch &);
DEFGENERIC(backward_lin1, Batch &, Params &, Batch &);
DEFGENERIC(forward_full1, Batch &, Params &, Batch &, int);
DEFGENERIC(backward_full1, Batch &, Params &, Batch &, int);
DEFGENERIC(forward_stack, Batch &, Batch &, Batch &);
DEFGENERIC(backward_stack, Batch &, Batch &, Batch &);
DEFGENERIC(forward_stack_delay, Batch &, Batch &, Sequence &, int);
DEFGENERIC(backward_stack_delay, Batch &, Batch &, Sequence &, int);
DEFGENERIC(forward_reverse, Sequence &, Sequence &);
DEFGENERIC(backward_reverse, Sequence &, Sequence &);
DEFGENERIC(forward_btswitch, Sequence &, Sequence &);
DEFGENERIC(backward_btswitch, Sequence &, Sequence &);
DEFGENERIC(forward_batchstack, Sequence &, Sequence &, int pre = 1,
int post = 1);
DEFGENERIC(backward_batchstack, Sequence &, Sequence &, int pre = 1,
int post = 1);
DEFGENERIC(forward_softmax, Batch &, Params &, Batch &);
DEFGENERIC(backward_softmax, Batch &, Params &, Batch &);
DEFGENERIC(forward_statemem, Batch &, Batch &, Batch &, Sequence &, int,
Batch &);
DEFGENERIC(backward_statemem, Batch &, Batch &, Batch &, Sequence &, int,
Batch &);
DEFGENERIC(forward_nonlingate, Batch &, Batch &, Batch &, int);
DEFGENERIC(backward_nonlingate, Batch &, Batch &, Batch &, int);

DEFGENERIC(fill, Tensor2 &, Float value);
DEFGENERIC(clip_gradient, Batch &, Float value);
DEFGENERIC(sgd_update, Params &, Float lr, Float mom);
};

#endif
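
The comment block above explains that DEFGENERIC generates a generic front end which forwards to a CPU or GPU overload depending on the argument's GPU id. The following standalone sketch shows that dispatch pattern with hypothetical stand-in types and no Eigen or CUDA dependency; it illustrates the technique, it is not the repository's code:

```c++
#include <cstdio>
#include <utility>

// Two "device" tags standing in for Eigen::DefaultDevice / Eigen::GpuDevice.
struct CpuDevice {};
struct GpuDevice {};

// A stand-in for Batch; gpu < 0 means the data lives on the CPU.
struct FakeBatch { int gpu = -1; };
inline int gpu_id(const FakeBatch &b) { return b.gpu; }

// Device-specific overloads (what clstm_compute.cc would provide per device).
void scale(CpuDevice *, FakeBatch &, float v) { std::printf("CPU scale by %g\n", v); }
void scale(GpuDevice *, FakeBatch &, float v) { std::printf("GPU scale by %g\n", v); }

// Without CUDA there is never a GPU device, mirroring the #else branch above.
inline GpuDevice *gpu_device(int) { return nullptr; }

// The generic front end DEFGENERIC would generate: pick the overload at run
// time from the first argument's GPU id.
template <typename Arg, typename... Args>
void scale(Arg &&arg, Args &&... args) {
  static CpuDevice cpu;
  if (GpuDevice *dev = gpu_device(gpu_id(arg))) {
    scale(dev, arg, std::forward<Args>(args)...);
    return;
  }
  scale(&cpu, arg, std::forward<Args>(args)...);
}

int main() {
  FakeBatch b;
  scale(b, 2.0f);  // resolves to the CPU overload here
}
```

In the real header the device overloads appear as extern declarations inside the generated wrapper, so only clstm_compute.cc needs to be compiled by nvcc, as the comment above notes.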
9 changes: 9 additions & 0 deletions clstm_compute_cuda.cc
@@ -0,0 +1,9 @@
#define DEVICE Eigen::GpuDevice
#ifndef EIGEN_USE_GPU
#error "EIGEN_USE_GPU not defined"
#endif
#ifndef __CUDACC__
#error "not compiling in CUDA mode"
#endif

#include "clstm_compute.cc"
105 changes: 83 additions & 22 deletions clstm_prefab.cc
@@ -1,11 +1,13 @@
#include "clstm.h"
#include <assert.h>
#include <iostream>
#include <vector>
#include <string>
#include <memory>
#include <math.h>
#include <stdarg.h>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "extras.h"
#include "utils.h"

namespace ocropus {
map<string, INetworkFactory> network_factories;
@@ -19,9 +21,9 @@ string get(const Assoc &params, const string &key, const string &dflt) {
// A 1D unidirectional LSTM with Softmax/Sigmoid output layer.

Network make_lstm1(const Assoc &params) {
int ninput = params.at("ninput");
int nhidden = params.at("nhidden");
int noutput = params.at("noutput");
int ninput = params.get("ninput");
int nhidden = params.get("nhidden");
int noutput = params.get("noutput");
string lstm_type = get(params, "lstm_type", "NPLSTM");
string output_type = get(params, "output_type",
noutput == 1 ? "SigmoidLayer" : "SoftmaxLayer");
@@ -33,9 +35,9 @@ Network make_lstm1(const Assoc &params) {
// A 1D unidirectional reversed LSTM with Softmax/Sigmoid output layer.

Network make_revlstm1(const Assoc &params) {
int ninput = params.at("ninput");
int nhidden = params.at("nhidden");
int noutput = params.at("noutput");
int ninput = params.get("ninput");
int nhidden = params.get("nhidden");
int noutput = params.get("noutput");
string lstm_type = get(params, "lstm_type", "NPLSTM");
string output_type = get(params, "output_type",
noutput == 1 ? "SigmoidLayer" : "SoftmaxLayer");
@@ -48,9 +50,9 @@ Network make_revlstm1(const Assoc &params) {
// A 1D bidirectional LSTM with Softmax/Sigmoid output layer.

Network make_bidi(const Assoc &params) {
int ninput = params.at("ninput");
int nhidden = params.at("nhidden");
int noutput = params.at("noutput");
int ninput = params.get("ninput");
int nhidden = params.get("nhidden");
int noutput = params.get("noutput");
string lstm_type = get(params, "lstm_type", "NPLSTM");
string output_type = get(params, "output_type",
noutput == 1 ? "SigmoidLayer" : "SoftmaxLayer");
@@ -65,13 +67,27 @@ Network make_bidi(const Assoc &params) {
layer(output_type, 2 * nhidden, noutput, params, {})});
}

// A 1D bidirectional LSTM with Softmax/Sigmoid output layer.

Network make_bidi0(const Assoc &params) {
int ninput = params.get("ninput");
int noutput = params.get("noutput");
string lstm_type = get(params, "lstm_type", "NPLSTM");
return layer("Parallel", ninput, 2 * noutput, {},
{
layer(lstm_type, ninput, noutput, params, {}),
layer("Reversed", ninput, ninput, {},
{layer(lstm_type, ninput, noutput, params, {})}),
});
}

// Two stacked 1D bidirectional LSTM with Softmax/Sigmoid output layer.

Network make_bidi2(const Assoc &params) {
int ninput = params.at("ninput");
int nhidden = params.at("nhidden");
int nhidden2 = params.at("nhidden2");
int noutput = params.at("noutput");
int ninput = params.get("ninput");
int nhidden = params.get("nhidden");
int nhidden2 = params.get("nhidden2");
int noutput = params.get("noutput");
string lstm_type = get(params, "lstm_type", "NPLSTM");
string output_type = get(params, "output_type",
noutput == 1 ? "SigmoidLayer" : "SoftmaxLayer");
@@ -92,11 +108,54 @@ Network make_bidi2(const Assoc &params) {
layer(output_type, 2 * nhidden2, noutput, params, {})});
}

Network make_perplstm(const Assoc &params) {
int ninput = params.get("ninput");
int nhidden = params.get("nhidden");
int noutput = params.get("noutput");
string output_type = get(params, "output_type", "SigmoidLayer");
Network vertical = make_bidi({{"ninput", ninput},
{"nhidden", nhidden},
{"noutput", noutput},
{"output_type", output_type}});
return layer("Stacked", ninput, noutput, {},
{
// layer("Btswitch", nhidden2, nhidden2, {}, {}),
vertical,
// layer("Btswitch", noutput, noutput, {}, {})
});
}

// Two dimensional LSTM

Network make_twod(const Assoc &params) {
int ninput = params.get("ninput");
int nhidden = params.get("nhidden");
int nhidden2 = params.get("nhidden2", nhidden);
int nhidden3 = params.get("nhidden3", nhidden2);
int noutput = params.get("noutput");
string output_type = get(params, "output_type",
noutput == 1 ? "SigmoidLayer" : "SoftmaxLayer");
Network horizontal = make_bidi({{"ninput", ninput},
{"nhidden", nhidden},
{"noutput", nhidden2},
{"output_type", "SigmoidLayer"}});
Network vertical = make_bidi({{"ninput", nhidden2},
{"nhidden", nhidden3},
{"noutput", noutput},
{"output_type", output_type}});
return layer("Stacked", ninput, noutput, {},
{horizontal, layer("Btswitch", nhidden2, nhidden2, {}, {}),
vertical, layer("Btswitch", noutput, noutput, {}, {})});
}

void init_clstm_prefab() {
network_factories["lstm1"] = make_lstm1;
network_factories["revlstm1"] = make_revlstm1;
network_factories["bidi"] = make_bidi;
network_factories["bidi0"] = make_bidi0;
network_factories["bidi2"] = make_bidi2;
network_factories["twod"] = make_twod;
network_factories["perplstm"] = make_perplstm;
}

static int init_ = (init_clstm_prefab(), 0);
@@ -106,10 +165,10 @@ Network make_net(const string &kind, const Assoc &args) {
if (network_factories.find(kind) != network_factories.end()) {
result = network_factories[kind](args);
} else {
result = layer(kind, args.at("ninput"), args.at("noutput"), args, {});
result = layer(kind, args.get("ninput"), args.get("noutput"), args, {});
}
if (!result) throwf("%s: no such network or layer", kind.c_str());
result->attributes["kind"] = kind;
if (!result) throwf("no such network or layer: %s", kind.c_str());
result->attr.set("kind", kind);
return result;
}

@@ -120,8 +179,10 @@ Network make_net_init(const string &kind, const string &params) {
using std::cerr;
using std::endl;
Assoc args(params);
for (auto it : args) {
cerr << it.first << ": " << it.second << endl;
if (getienv("verbose_params", 0)) {
for (auto it : args) {
cerr << it.first << ": " << it.second << endl;
}
}
return make_net(kind, args);
}
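
clstm_prefab.cc above registers named constructors such as lstm1, bidi, and twod in network_factories, and make_net looks the requested kind up there, falling back to layer() for plain layer names. The following is a simplified sketch of that registry pattern using invented stand-in types (Net, Args) rather than the repository's Network and Assoc API:

```c++
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

// Invented stand-ins for the repository's Network and Assoc types.
struct Net { std::string kind; };
using NetPtr = std::shared_ptr<Net>;
using Args = std::map<std::string, int>;
using Factory = std::function<NetPtr(const Args &)>;

std::map<std::string, Factory> factories;

NetPtr make_lstm1_sketch(const Args &params) {
  auto net = std::make_shared<Net>();
  net->kind = "lstm1";  // a real factory would wire up layers from params
  return net;
}

// Registration at load time.
void register_factories() { factories["lstm1"] = make_lstm1_sketch; }
static int init_ = (register_factories(), 0);

NetPtr make_net_sketch(const std::string &kind, const Args &args) {
  auto it = factories.find(kind);
  if (it == factories.end()) throw std::runtime_error("no such network: " + kind);
  return it->second(args);
}

int main() {
  auto net = make_net_sketch("lstm1", {{"ninput", 48}, {"nhidden", 100}, {"noutput", 10}});
  std::cout << net->kind << "\n";
}
```

The static init_ variable mirrors the `static int init_ = (init_clstm_prefab(), 0);` line in the diff, which runs the registration before main().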
170 changes: 68 additions & 102 deletions clstm_proto.cc
@@ -3,45 +3,22 @@

#include "clstm.h"
#include <assert.h>
#include <iostream>
#include <vector>
#include <string>
#include <memory>
#include <math.h>
#include <iostream>
#include <fstream>
#include <Eigen/Dense>
#include <stdarg.h>
#include <fstream>
#include <iostream>
#include <iostream>
#include <memory>
#include <string>
#include <typeinfo>
#include <vector>
#include "utils.h"
#ifdef GOOGLE
#include "third_party/clstm/clstm.pb.h"
#include "third_party/clstm/tensor/clstm.pb.h"
#else
#include "clstm.pb.h"
#endif

namespace {
inline void throwf(const char *format, ...) {
static char buf[1024];
va_list arglist;
va_start(arglist, format);
vsprintf(buf, format, arglist);
va_end(arglist);
THROW(buf);
}

inline void print() { std::cout << std::endl; }

template <class T>
inline void print(const T &arg) {
std::cout << arg << std::endl;
}

template <class T, typename... Args>
inline void print(T arg, Args... args) {
std::cout << arg << " ";
print(args...);
}
}

namespace ocropus {
using std::cout;
using std::endl;
@@ -55,51 +32,39 @@ using std::to_string;
bool proto_verbose =
getenv("clstm_proto_verbose") && atoi(getenv("clstm_proto_verbose"));

void proto_of_Mat(clstm::Array *array, Mat &a, bool weights = true) {
array->add_dim(a.rows());
array->add_dim(a.cols());
void proto_of_params(clstm::Array *array, Params &params, bool weights = true) {
Tensor2 temp;
temp = params.v; // copy values in case they are on GPU
TensorMap2 a = temp();
int n = a.dimension(0), m = a.dimension(1);
array->add_dim(n);
array->add_dim(m);
if (!weights) return;
for (int i = 0; i < a.rows(); i++)
for (int j = 0; j < a.cols(); j++) array->add_value(a(i, j));
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) array->add_value(a(i, j));
}

void proto_of_Vec(clstm::Array *array, Vec &a, bool weights = true) {
array->add_dim(a.size());
if (!weights) return;
for (int i = 0; i < a.size(); i++) array->add_value(a(i));
}

void Mat_of_proto(Mat &a, const clstm::Array *array) {
void params_of_proto(Params &params, const clstm::Array *array) {
if (array->dim_size() != 2)
throwf("bad format (Mat, %s, %d)", array->name().c_str(),
array->dim_size());
a.resize(array->dim(0), array->dim(1));
a.setZero();
params.setZero(array->dim(0), array->dim(1));
TensorMap2 a = params.v();
if (array->value_size() > 0) {
if (array->value_size() != a.size()) THROW("bad size (Mat)");
int k = 0;
for (int i = 0; i < a.rows(); i++)
for (int j = 0; j < a.cols(); j++) a(i, j) = array->value(k++);
}
}

void Vec_of_proto(Vec &a, const clstm::Array *array) {
if (array->dim_size() != 1) THROW("bad format (Vec)");
a.resize(array->dim(0));
a.setZero();
if (array->value_size() > 0) {
if (array->value_size() != a.size()) THROW("bad size (Vec)");
int k = 0;
for (int i = 0; i < a.size(); i++) a(i) = array->value(k++);
for (int i = 0; i < a.dimension(0); i++)
for (int j = 0; j < a.dimension(1); j++) a(i, j) = array->value(k++);
}
}

void proto_of_net(clstm::NetworkProto *proto, INetwork *net,
bool weights = true) {
net->preSave();
assert(string("") != net->kind());
proto->set_kind(net->kind());
proto->set_name(net->name);
if (net->kind == "") {
cerr << typeid(*net).name() << endl;
assert(net->kind != "");
}
proto->set_kind(net->kind);
proto->set_ninput(net->ninput());
proto->set_noutput(net->noutput());
assert(proto->kind() != "");
@@ -108,27 +73,24 @@ void proto_of_net(clstm::NetworkProto *proto, INetwork *net,
assert(proto->noutput() >= 0);
assert(proto->noutput() < 1000000);
for (int i = 0; i < net->icodec.size(); i++)
proto->add_icodec(net->icodec[i]);
for (int i = 0; i < net->codec.size(); i++) proto->add_codec(net->codec[i]);
for (auto kv : net->attributes) {
proto->add_icodec(net->icodec.codec[i]);
for (int i = 0; i < net->codec.size(); i++)
proto->add_codec(net->codec.codec[i]);
for (auto kv : net->attr) {
if (kv.first == "name") continue;
if (kv.first == "ninput") continue;
if (kv.first == "noutput") continue;
clstm::KeyValue *kvp = proto->add_attribute();
kvp->set_key(kv.first);
kvp->set_value(kv.second);
}
net->myweights("",
[proto, weights](const string &prefix, VecMat a, VecMat da) {
clstm::Array *array = proto->add_weights();
array->set_name(prefix);
if (a.mat)
proto_of_Mat(array, *a.mat, weights);
else if (a.vec)
proto_of_Vec(array, *a.vec, weights);
else
THROW("oops (save type)");
});
for (auto it : net->parameters) {
Params *a = it.second;
string name = it.first;
clstm::Array *array = proto->add_weights();
array->set_name(name);
proto_of_params(array, *a, weights);
}
for (int i = 0; i < net->sub.size(); i++) {
clstm::NetworkProto *subproto = proto->add_sub();
proto_of_net(subproto, net->sub[i].get(), weights);
@@ -143,31 +105,32 @@ Network net_of_proto(const clstm::NetworkProto *proto) {
assert(proto->noutput() >= 0);
assert(proto->noutput() < 1000000);
net = make_layer(proto->kind());
net->name = proto->name();
net->attributes["ninput"] = to_string(proto->ninput());
net->attributes["noutput"] = to_string(proto->noutput());
net->attr.set("ninput", proto->ninput());
net->attr.set("noutput", proto->noutput());
for (int i = 0; i < proto->attribute_size(); i++) {
const clstm::KeyValue *attr = &proto->attribute(i);
net->attributes[attr->key()] = attr->value();
net->attr.set(attr->key(), std::string(attr->value()));
}
vector<int> icodec;
for (int i = 0; i < proto->icodec_size(); i++)
net->icodec.push_back(proto->icodec(i));
icodec.push_back(proto->icodec(i));
net->icodec.set(icodec);
vector<int> codec;
for (int i = 0; i < proto->codec_size(); i++)
net->codec.push_back(proto->codec(i));
map<string, VecMat> weights;
net->myweights("", [&weights](const string &prefix, VecMat a, VecMat da) {
weights[prefix] = a;
});
codec.push_back(proto->codec(i));
net->codec.set(codec);
map<string, Params *> weights;
for (auto it : net->parameters) {
weights[it.first] = it.second;
}
for (int i = 0; i < proto->weights_size(); i++) {
string key = proto->weights(i).name();
VecMat a = weights[key];
if (a.mat)
Mat_of_proto(*a.mat, &proto->weights(i));
else if (a.vec)
Vec_of_proto(*a.vec, &proto->weights(i));
Params *a = weights[key];
params_of_proto(*a, &proto->weights(i));
}
for (int i = 0; i < proto->sub_size(); i++) {
net->add(net_of_proto(&proto->sub(i)));
net->sub[i]->attr.super = &net->attr;
}
net->postLoad();
return net;
@@ -187,29 +150,32 @@ void debug_as_proto(INetwork *net, bool weights) {
delete proto;
}

void write_as_proto(ostream &output, INetwork *net) {
bool write_as_proto(ostream &output, INetwork *net) {
unique_ptr<clstm::NetworkProto> proto;
proto.reset(new clstm::NetworkProto());
proto_of_net(proto.get(), net);
if (proto->SerializeToOstream(&output) == false) {
THROW("Serializing failed.");
}
return proto->SerializeToOstream(&output);
}

void save_as_proto(const string &fname, INetwork *net) {
bool save_as_proto(const string &fname, INetwork *net) {
ofstream stream;
stream.open(fname, ios::binary);
write_as_proto(stream, net);
return write_as_proto(stream, net);
}

Network load_as_proto(const string &fname) {
ifstream stream;
stream.open(fname, ios::binary);
Network read_as_proto(istream &stream) {
unique_ptr<clstm::NetworkProto> proto;
proto.reset(new clstm::NetworkProto());
if (proto->ParseFromIstream(&stream) == false) {
THROW("Invalid message");
return Network();
}
return net_of_proto(proto.get());
}

Network load_as_proto(const string &fname) {
ifstream stream;
stream.open(fname, ios::binary);
if (!stream) throwf("cannot open: %s", fname.c_str());
return read_as_proto(stream);
}
}
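A minimal round-trip sketch for the revised proto I/O, assuming Network is the usual shared pointer to INetwork; save_as_proto now reports failure through its return value, while load_as_proto still throws if the file cannot be opened.

#include <string>
#include "clstm.h"
using namespace ocropus;

// Save a network through the protobuf serialization and read it back.
bool roundtrip_example(Network &net, const std::string &fname) {
  if (!save_as_proto(fname, net.get()))  // false if serialization failed
    return false;
  Network reloaded = load_as_proto(fname);  // throws if fname cannot be opened
  return bool(reloaded);  // empty if the message could not be parsed
}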
29 changes: 9 additions & 20 deletions clstmfilter.cc
@@ -1,24 +1,20 @@
#include "pstring.h"
#include "clstm.h"
#include "clstmhl.h"
#include <assert.h>
#include <iostream>
#include <vector>
#include <memory>
#include <math.h>
#include <Eigen/Dense>
#include <sstream>
#include <fstream>
#include <iostream>
#include <iostream>
#include <memory>
#include <set>

#include "multidim.h"
#include "pymulti.h"
#include <sstream>
#include <vector>
#include "clstmhl.h"
#include "extras.h"
#include "pstring.h"
#include "utils.h"

using namespace Eigen;
using namespace ocropus;
using namespace pymulti;
using std::vector;
using std::map;
using std::make_pair;
@@ -61,13 +57,6 @@ int main1(int argc, char **argv) {
}

int main(int argc, char **argv) {
#ifdef NOEXCEPTION
return main1(argc, argv);
#else
try {
return main1(argc, argv);
} catch (const char *message) {
cerr << "FATAL: " << message << endl;
}
#endif
TRY { return main1(argc, argv); }
CATCH(const char *message) { cerr << "FATAL: " << message << endl; }
}
74 changes: 42 additions & 32 deletions clstmfiltertrain.cc
@@ -1,24 +1,20 @@
#include "pstring.h"
#include "clstm.h"
#include "clstmhl.h"
#include <assert.h>
#include <iostream>
#include <vector>
#include <memory>
#include <math.h>
#include <Eigen/Dense>
#include <sstream>
#include <fstream>
#include <iostream>
#include <iostream>
#include <memory>
#include <set>

#include "multidim.h"
#include "pymulti.h"
#include <sstream>
#include <vector>
#include "clstmhl.h"
#include "extras.h"
#include "pstring.h"
#include "utils.h"

using namespace Eigen;
using namespace ocropus;
using namespace pymulti;
using std::vector;
using std::map;
using std::make_pair;
@@ -41,7 +37,6 @@ void read_samples(vector<Sample> &samples, const string &fname) {
ifstream stream(fname);
string line;
wstring in, out;
;
samples.clear();
while (getline(stream, line)) {
// skip blank lines and lines starting with a comment
@@ -75,51 +70,73 @@ int main1(int argc, char **argv) {
if (argc > 2) read_samples(test_samples, argv[2]);
print("got", samples.size(), "inputs,", test_samples.size(), "tests");

vector<int> icodec, codec;
get_codec(icodec, samples, &Sample::in);
get_codec(codec, samples, &Sample::out);
string load_name = getsenv("load", "");

CLSTMText clstm;
clstm.createBidi(icodec, codec, getienv("nhidden", 100));
clstm.setLearningRate(getdenv("rate", 1e-4), getdenv("momentum", 0.9));
clstm.net->info("");

int nhidden = -1;
double lrate = getdenv("lrate", 1e-4);
double momentum = getdenv("momentum", 0.9);

if (load_name != "") {
clstm.load(load_name);
} else {
vector<int> icodec, codec;
get_codec(icodec, samples, &Sample::in);
get_codec(codec, samples, &Sample::out);
nhidden = getienv("nhidden", 100);
clstm.createBidi(icodec, codec, nhidden);
clstm.setLearningRate(lrate, momentum);
}
network_info(clstm.net, "");

int ntrain = getienv("ntrain", 10000000);
int save_every = getienv("save_every", 10000);
string save_name = getsenv("save_name", "_filter");
int report_every = getienv("report_every", 100);
int test_every = getienv("test_every", 10000);
bool use_exact = getienv("use_exact", 0);

// Command to execute after testing the network's performance.
string after_test = getsenv("after_test", "");

double best_error = 1e38;
double test_error = 9999.0;
for (int trial = 0; trial < ntrain; trial++) {
int sample = irandom() % samples.size();
int start = clstm.net->attr.get("trial", getienv("start", -1)) + 1;
if (start > 0) print("start", start);
for (int trial = start; trial < ntrain; trial++) {
int sample = lrand48() % samples.size();
if (trial > 0 && test_samples.size() > 0 && test_every > 0 &&
trial % test_every == 0) {
double errors = 0.0;
double count = 0.0;
double exact_matches = 0.0;
for (int test = 0; test < test_samples.size(); test++) {
wstring gt = test_samples[test].out;
wstring pred = clstm.predict(test_samples[test].in);
count += gt.size();
errors += levenshtein(pred, gt);
if (pred == gt) exact_matches++;
}
test_error = errors / count;
print("ERROR", trial, test_error, " ", errors, count);
double exact_test_error = 1.0 - exact_matches / test_samples.size();
print("ERROR", trial, test_error, " ", errors, count, "exact_errors",
exact_test_error, "lrate", lrate, "momentum", momentum, "nhidden",
nhidden);
if (use_exact) test_error = exact_test_error;
if (save_every == 0 && test_error < best_error) {
best_error = test_error;
string fname = save_name + ".clstm";
print("saving best performing network so far", fname,
"error rate: ", best_error);
print("saving best performing network so far", fname, "error rate: ",
best_error);
clstm.net->attr.set("trial", trial);
clstm.save(fname);
}
if (after_test != "") system(after_test.c_str());
}
if (trial > 0 && save_every > 0 && trial % save_every == 0) {
string fname = save_name + "-" + to_string(trial) + ".clstm";
clstm.net->attr.set("trial", trial);
clstm.save(fname);
}
wstring pred = clstm.train(samples[sample].in, samples[sample].out);
@@ -136,13 +153,6 @@ int main1(int argc, char **argv) {
}

int main(int argc, char **argv) {
#ifdef NOEXCEPTION
return main1(argc, argv);
#else
try {
return main1(argc, argv);
} catch (const char *message) {
cerr << "FATAL: " << message << endl;
}
#endif
TRY { return main1(argc, argv); }
CATCH(const char *message) { cerr << "FATAL: " << message << endl; }
}
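A minimal sketch of the resume logic used in the training loop above, assuming the CLSTMText helper from clstmhl.h: the trial counter is stored as a network attribute on save and read back with a fallback after load.

#include "clstmhl.h"
using namespace ocropus;

// Returns the iteration to resume from, or fallback + 1 if no model exists.
int resume_trial_example(CLSTMText &clstm, const std::string &fname,
                         int fallback) {
  if (!clstm.maybe_load(fname)) return fallback + 1;
  return clstm.net->attr.get("trial", fallback) + 1;
}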
188 changes: 129 additions & 59 deletions clstmhl.h
@@ -3,12 +3,13 @@
#ifndef ocropus_clstmhl_
#define ocropus_clstmhl_

#include "pstring.h"
#include "clstm.h"
#include "extras.h"
#include <memory>
#include <string>
#include <vector>
#include "clstm.h"
#include "extras.h"
#include "pstring.h"
#include "tensor.h"

namespace ocropus {

@@ -19,6 +20,7 @@ struct CharPrediction {
float p;
};

// CLSTM network used for text input and output.
struct CLSTMText {
Network net;
int nclasses = -1;
@@ -27,18 +29,44 @@ struct CLSTMText {
Sequence targets;
Sequence aligned;
void setLearningRate(float lr, float mom) { net->setLearningRate(lr, mom); }

// Loads the network from the given file. If the file does not exist
// or the contents of the file cannot be read, throws an exception.
void load(const std::string &fname) {
net = load_net(fname);
if (!maybe_load(fname)) {
THROW("Could not load CLSTMText net from file: " + fname);
}
}

// Tries to load a network from the given file. If the file does not exist
// or the contents of the file cannot be read, returns false.
bool maybe_load(const std::string &fname) {
net = maybe_load_net(fname);

if (!net) {
cerr << "WARNING: could not load CLSTMText net from " << fname;
return false;
}
nclasses = net->codec.size();
iclasses = net->icodec.size();
if (net->attributes["neps"] == "") {
cerr << "WARNING: no neps";
} else {
neps = stoi(net->attributes["neps"]);
int neps = net->attr.get("neps", -1);
if (neps < 0) cerr << "WARNING: no neps\n";
return true;
}

// Saves the network to the given file. If this operation fails, throws an
// exception.
void save(const std::string &fname) {
if (!maybe_save(fname)) {
THROW("Could not save CLSTMText net to file: " + fname);
}
net->makeEncoders();
}
void save(const std::string &fname) { save_net(fname, net); }

// Saves the network to the given file. If this operation fails, returns false.
bool maybe_save(const std::string &fname) {
return maybe_save_net(fname, net);
}

void createBidi(const std::vector<int> &icodec, const std::vector<int> codec,
int nhidden) {
// This is just the simplest case of creating a network. For more complex
@@ -48,133 +76,175 @@ struct CLSTMText {
net = make_net("bidi", {{"ninput", (int)icodec.size()},
{"noutput", (int)codec.size()},
{"nhidden", nhidden}});
net->attributes["neps"] = std::to_string(neps);
net->icodec = icodec;
net->codec = codec;
net->makeEncoders();
net->attr.set("neps", neps);
net->icodec.set(icodec);
net->codec.set(codec);
}
void setInputs(const std::wstring &s) {
Classes cs;
net->iencode(cs, s);
net->icodec.encode(cs, s);
Sequence &seq = net->inputs;
int d = net->ninput();
seq.clear();
seq.resize(cs.size() * (neps+1) + neps);
for (int i = 0; i < neps; i++) seq[i].setZero(d, 1);
seq.resize(cs.size() * (neps + 1) + neps, d, 1);
int index = 0;
for (int i = 0; i < neps; i++) seq[index++].clear();
for (int pos = 0; pos < cs.size(); pos++) {
seq[pos].setZero(d, 1);
seq[pos](cs[pos], 0) = 1.0;
for (int i = 0; i < neps; i++) seq[pos+1+i].setZero(d, 1);
TensorMap2 v = *seq[index++].v;
v.setZero();
v(cs[pos], 0) = 1.0;
for (int i = 0; i < neps; i++) seq[index++].v.setZero();
}
assert(index == seq.size());
seq.check();
}

// Trains the network using the given input and target using backpropagation.
std::wstring train(const std::wstring &in, const std::wstring &target) {
setInputs(in);
net->forward();
Classes transcript;
net->encode(transcript, target);
net->codec.encode(transcript, target);
mktargets(targets, transcript, nclasses);
ctc_align_targets(aligned, net->outputs, targets);
for (int t = 0; t < aligned.size(); t++)
net->outputs[t].d = aligned[t] - net->outputs[t];
net->outputs[t].d() = aligned[t].v() - net->outputs[t].v();
net->backward();
net->update();
sgd_update(net);
Classes output_classes;
trivial_decode(output_classes, net->outputs);
return net->decode(output_classes);
return net->codec.decode(output_classes);
}
std::wstring predict(const std::wstring &in) {
setInputs(in);
net->forward();
Classes output_classes;
trivial_decode(output_classes, net->outputs);
return net->decode(output_classes);
return net->codec.decode(output_classes);
}
void train_utf8(const std::string &in, const std::string &target) {
train(utf8_to_utf32(in), utf8_to_utf32(target));
}
std::string aligned_utf8() {
Classes outputs;
trivial_decode(outputs, aligned);
std::wstring temp = net->decode(outputs);
std::wstring temp = net->codec.decode(outputs);
return utf32_to_utf8(temp);
}
std::string predict_utf8(const std::string &in) {
return utf32_to_utf8(predict(utf8_to_utf32(in)));
}
void get_outputs(mdarray<float> &outputs) {
void get_outputs(Tensor2 &outputs) {
Sequence &o = net->outputs;
outputs.resize(int(o.size()), int(o[0].rows()));
for (int t=0; t < outputs.dim(0); t++)
for (int c=0; c < outputs.dim(1); c++)
outputs(t,c) = net->outputs[t](c,0);
for (int t = 0; t < outputs.dimension(0); t++)
for (int c = 0; c < outputs.dimension(1); c++)
outputs(t, c) = net->outputs[t].v(c, 0);
}
};
};

struct CLSTMOCR {
unique_ptr<INormalizer> normalizer;
shared_ptr<INormalizer> normalizer;
Network net;
int target_height = 48;
int nclasses = -1;
Sequence aligned, targets;
mdarray<float> image;
Tensor2 image;
void setLearningRate(float lr, float mom) { net->setLearningRate(lr, mom); }
void load(const std::string &fname) {
net = load_net(fname);

// Tries to load a network from the given file. If the file does not exist
// or the contents of the file cannot be read, returns false.
bool maybe_load(const std::string &fname) {
net = maybe_load_net(fname);
if (!net) {
cerr << "WARNING: could not load CLSTMOCR net from " << fname;
return false;
}
nclasses = net->codec.size();
target_height = net->ninput();
normalizer.reset(make_CenterNormalizer());
normalizer->target_height = target_height;
return true;
}

// Loads the network from the given file. If the file does not exist
// or the contents of the file cannot be read, throws an exception.
void load(const std::string &fname) {
if (!maybe_load(fname)) {
THROW("Could not load CLSTMOCR net from file: " + fname);
}
}
void save(const std::string &fname) { save_net(fname, net); }

// Saves the network to the given file. If this operation fails, throws an
// exception.
void save(const std::string &fname) {
if (!maybe_save(fname)) {
THROW("Could not save CLSTMOCR net to file: " + fname);
}
}

// Saves the network to the given file. If this operation fails, returns false.
bool maybe_save(const std::string &fname) {
return maybe_save_net(fname, net);
}

void createBidi(const std::vector<int> codec, int nhidden) {
nclasses = codec.size();
net = make_net("bidi", {{"ninput", target_height},
{"noutput", nclasses},
{"nhidden", nhidden}});
net->initialize();
net->codec = codec;
net->makeEncoders();
net->codec.set(codec);
normalizer.reset(make_CenterNormalizer());
normalizer->target_height = target_height;
}
std::wstring train(mdarray<float> &raw, const std::wstring &target) {
std::wstring fwdbwd(TensorMap2 raw, const std::wstring &target) {
normalizer->measure(raw);
image.like(raw);
normalizer->normalize(image, raw);
assign(net->inputs, image);
set_inputs(net, image());
net->forward();
Classes transcript;
net->encode(transcript, target);
net->codec.encode(transcript, target);
mktargets(targets, transcript, nclasses);
ctc_align_targets(aligned, net->outputs, targets);
for (int t = 0; t < aligned.size(); t++)
net->outputs[t].d = aligned[t] - net->outputs[t];
net->outputs[t].d() = aligned[t].v() - net->outputs[t].v();
net->backward();
net->update();
Classes outputs;
trivial_decode(outputs, net->outputs);
return net->decode(outputs);
return net->codec.decode(outputs);
}
void update() { sgd_update(net); }
std::wstring train(TensorMap2 raw, const std::wstring &target) {
std::wstring result = fwdbwd(raw, target);
update();
return result;
}
std::string aligned_utf8() {
Classes outputs;
trivial_decode(outputs, aligned);
std::wstring temp = net->decode(outputs);
std::wstring temp = net->codec.decode(outputs);
return utf32_to_utf8(temp);
}
std::string train_utf8(mdarray<float> &raw, const std::string &target) {
std::string train_utf8(TensorMap2 raw, const std::string &target) {
return utf32_to_utf8(train(raw, utf8_to_utf32(target)));
}
std::wstring predict(mdarray<float> &raw, vector<int> *where = 0) {
std::wstring predict(TensorMap2 raw, vector<int> *where = 0) {
normalizer->measure(raw);
image.like(raw);
normalizer->normalize(image, raw);
assign(net->inputs, image);
set_inputs(net, image());
net->forward();
Classes outputs;
trivial_decode(outputs, net->outputs, 0, where);
return net->decode(outputs);
return net->codec.decode(outputs);
}
void predict(vector<CharPrediction> &preds, mdarray<float> &raw) {
void predict(vector<CharPrediction> &preds, TensorMap2 raw) {
normalizer->measure(raw);
image.like(raw);
normalizer->normalize(image, raw);
assign(net->inputs, image);
set_inputs(net, image());
net->forward();
Classes outputs;
vector<int> where;
@@ -183,23 +253,23 @@ struct CLSTMOCR {
for (int i = 0; i < outputs.size(); i++) {
int t = where[i];
int cls = outputs[i];
wchar_t c = net->decode(outputs[i]);
float p = net->outputs[t](cls, 0);
wchar_t c = net->codec.decode(outputs[i]);
float p = net->outputs[t].v(cls, 0);
CharPrediction pred{i, t, c, p};
preds.push_back(pred);
}
}
std::string predict_utf8(mdarray<float> &raw) {
std::string predict_utf8(TensorMap2 raw) {
return utf32_to_utf8(predict(raw));
}
void get_outputs(mdarray<float> &outputs) {
void get_outputs(Tensor2 &outputs) {
Sequence &o = net->outputs;
outputs.resize(int(o.size()), int(o[0].rows()));
for (int t=0; t < outputs.dim(0); t++)
for (int c=0; c < outputs.dim(1); c++)
outputs(t,c) = net->outputs[t](c,0);
for (int t = 0; t < outputs.dimension(0); t++)
for (int c = 0; c < outputs.dimension(1); c++)
outputs(t, c) = net->outputs[t].v(c, 0);
}
};
};
}

#endif
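A minimal end-to-end sketch of the CLSTMText interface declared above, with placeholder codecs and file names; maybe_load/maybe_save return false instead of throwing, so a fresh network is only created when no saved model is found.

#include "clstmhl.h"
using namespace ocropus;

std::string clstmtext_example() {
  CLSTMText clstm;
  if (!clstm.maybe_load("_filter.clstm")) {
    std::vector<int> icodec = {0, 'a', 'b', 'c'};  // 0 is the blank class
    std::vector<int> codec = {0, 'A', 'B', 'C'};
    clstm.createBidi(icodec, codec, /*nhidden=*/100);
    clstm.setLearningRate(1e-4, 0.9);
  }
  clstm.train_utf8("abc", "ABC");     // one forward/backward/update step
  clstm.maybe_save("_filter.clstm");
  return clstm.predict_utf8("abc");
}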
81 changes: 37 additions & 44 deletions clstmocr.cc
@@ -1,24 +1,20 @@
#include "pstring.h"
#include "clstm.h"
#include "clstmhl.h"
#include <assert.h>
#include <iostream>
#include <vector>
#include <memory>
#include <math.h>
#include <Eigen/Dense>
#include <sstream>
#include <fstream>
#include <iostream>
#include <iostream>
#include <memory>
#include <set>

#include "multidim.h"
#include "pymulti.h"
#include <sstream>
#include <vector>
#include "clstmhl.h"
#include "extras.h"
#include "pstring.h"
#include "utils.h"

using namespace Eigen;
using namespace ocropus;
using namespace pymulti;
using std::vector;
using std::map;
using std::make_pair;
@@ -43,19 +39,23 @@ inline float scaled_log(float x) {
return (l + thresh) / thresh;
}

void write_text(const string fname, const wstring &data) {
string utf8 = utf32_to_utf8(data);
ofstream stream(fname);
stream << utf8 << endl;
}

void write_text(const string fname, const string &data) {
ofstream stream(fname);
stream << data << endl;
int print_usage(char **argv) {
cerr << "Usage: [VAR=VAL...] " << argv[0] << " IMAGEFILE\n";
cerr << "\n";
cerr << " Arguments:\n";
cerr << " IMAGEFILE File listing the images to OCR, one filename per line\n";
cerr << " \n";
cerr << " Variables:\n";
cerr << " load Model to recognize with. Required\n";
cerr << " conf Output character-wise predictions. Default: 0\n";
cerr << " output Output format: 'text', 'logs' or 'posteriors'. Default: 'text'\n";
cerr << " save_text Save text to IMAGEFILE.txt. Default: 1\n";
return EXIT_FAILURE;
}

int main1(int argc, char **argv) {
if (argc != 2) THROW("give text file as an argument");
if (argc != 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))
return print_usage(argv);
const char *fname = argv[1];

string load_name = getsenv("load", "");
@@ -70,40 +70,40 @@ int main1(int argc, char **argv) {
ifstream stream(fname);
string line;
while (getline(stream, line)) {
mdarray<float> raw;
Tensor2 raw;
string fname = line;
string basename = fname.substr(0, fname.find_last_of("."));
read_png(raw, fname.c_str(), true);
for (int i = 0; i < raw.size(); i++) raw[i] = 1 - raw[i];
read_png(raw, fname.c_str());
raw() = -raw() + Float(1.0);
if (!conf) {
string out = clstm.predict_utf8(raw);
string out = clstm.predict_utf8(raw());
cout << line << "\t" << out << endl;
if (save_text) {
write_text(basename+".txt", out);
write_text(basename + ".txt", out);
}
} else {
cout << "file " << line << endl;
vector<CharPrediction> preds;
clstm.predict(preds, raw);
clstm.predict(preds, raw());
for (int i = 0; i < preds.size(); i++) {
CharPrediction p = preds[i];
const char *sep = "\t";
cout << p.i << sep << p.x << sep << p.c << sep << p.p << endl;
}
}
if (output == "text" ) {
if (output == "text") {
// nothing else to do
} else if (output == "logs") {
mdarray<float> outputs;
Tensor2 outputs;
clstm.get_outputs(outputs);
for (int t=0; t<outputs.dim(0); t++)
for (int c=0; c<outputs.dim(1); c++)
outputs(t,c) = scaled_log(outputs(t,c));
write_png((basename+".lp.png").c_str(), outputs);
for (int t = 0; t < outputs.dimension(0); t++)
for (int c = 0; c < outputs.dimension(1); c++)
outputs(t, c) = scaled_log(outputs(t, c));
write_png((basename + ".lp.png").c_str(), outputs());
} else if (output == "posteriors") {
mdarray<float> outputs;
Tensor2 outputs;
clstm.get_outputs(outputs);
write_png((basename+".p.png").c_str(), outputs);
write_png((basename + ".p.png").c_str(), outputs());
} else {
THROW("unknown output format");
}
@@ -112,13 +112,6 @@ int main1(int argc, char **argv) {
}

int main(int argc, char **argv) {
#ifdef NOEXCEPTION
return main1(argc, argv);
#else
try {
return main1(argc, argv);
} catch (const char *message) {
cerr << "FATAL: " << message << endl;
}
#endif
TRY { return main1(argc, argv); }
CATCH(const char *message) { cerr << "FATAL: " << message << endl; }
}
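A minimal sketch of single-line recognition with CLSTMOCR, mirroring the loop above; read_png is assumed to come from extras.h (as included here), and the model path is a placeholder.

#include <string>
#include "clstmhl.h"
#include "extras.h"  // assumed home of read_png, as included above
using namespace ocropus;

std::string ocr_one_example(const std::string &model, const std::string &png) {
  CLSTMOCR clstm;
  if (!clstm.maybe_load(model)) return "";
  Tensor2 raw;
  read_png(raw, png.c_str());
  raw() = -raw() + Float(1.0);  // invert the image (1 - raw), as in the loop above
  return clstm.predict_utf8(raw());
}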
309 changes: 156 additions & 153 deletions clstmocrtrain.cc
@@ -1,25 +1,21 @@
#include "pstring.h"
#include "clstm.h"
#include "clstmhl.h"
#include <assert.h>
#include <iostream>
#include <vector>
#include <memory>
#include <math.h>
#include <Eigen/Dense>
#include <sstream>
#include <fstream>
#include <iostream>
#include <set>
#include <iostream>
#include <memory>
#include <regex>

#include "multidim.h"
#include "pymulti.h"
#include <set>
#include <sstream>
#include <vector>
#include "clstmhl.h"
#include "extras.h"
#include "pstring.h"
#include "utils.h"

using namespace Eigen;
using namespace ocropus;
using namespace pymulti;
using std::vector;
using std::map;
using std::make_pair;
@@ -36,186 +32,193 @@ using std::regex_replace;
#define string std_string
#define wstring std_wstring

string basename(string s) {
int start = 0;
for (;;) {
auto pos = s.find("/", start);
if (pos == string::npos) break;
start = pos + 1;
}
auto pos = s.find(".", start);
if (pos == string::npos)
return s;
else
return s.substr(0, pos);
}

string read_text(string fname, int maxsize = 65536) {
char buf[maxsize];
buf[maxsize - 1] = 0;
ifstream stream(fname);
stream.read(buf, maxsize - 1);
int n = stream.gcount();
while (n > 0 && buf[n - 1] == '\n') n--;
return string(buf, n);
}

wstring read_text32(string fname, int maxsize = 65536) {
char buf[maxsize];
buf[maxsize - 1] = 0;
ifstream stream(fname);
stream.read(buf, maxsize - 1);
int n = stream.gcount();
while (n > 0 && buf[n - 1] == '\n') n--;
return utf8_to_utf32(string(buf, n));
}

void get_codec(vector<int> &codec,
const vector<string> &fnames,
const wstring extra = L"") {
set<int> codes;
codes.insert(0);
for (auto c : extra) codes.insert(int(c));
for (int i = 0; i < fnames.size(); i++) {
string fname = fnames[i];
string base = basename(fname);
wstring text32 = read_text32(base + ".gt.txt");
for (auto c : text32) codes.insert(int(c));
}
codec.clear();
for (auto c : codes) codec.push_back(c);
for (int i = 1; i < codec.size(); i++) assert(codec[i] > codec[i - 1]);
}

void show(PyServer &py, Sequence &s, int subplot = 0) {
mdarray<float> temp;
assign(temp, s);
#ifndef NODISPLAY
void show(PyServer &py, Sequence &s, int subplot = 0, int batch = 0) {
Tensor<float, 2> temp;
temp.resize(s.size(), s.rows());
for (int i = 0; i < s.size(); i++)
for (int j = 0; j < s.rows(); j++) temp(i, j) = s[i].v(j, batch);
if (subplot > 0) py.evalf("subplot(%d)", subplot);
py.imshowT(temp, "cmap=cm.hot");
}

void read_lines(vector<string> &lines, string fname) {
ifstream stream(fname);
string line;
lines.clear();
while (getline(stream, line)) {
lines.push_back(line);
}
}
#endif

wstring separate_chars(const wstring &s, const wstring &charsep) {
if (charsep == L"") return s;
wstring result;
for (int i=0; i<s.size(); i++) {
for (int i = 0; i < s.size(); i++) {
if (i > 0) result.push_back(charsep[0]);
result.push_back(s[i]);
}
return result;
}

int main1(int argc, char **argv) {
srandomize();
struct Dataset {
vector<string> fnames;
wstring charsep = utf8_to_utf32(getsenv("charsep", ""));
int size() { return fnames.size(); }
Dataset() {}
Dataset(string file_list) { readFileList(file_list); }
void readFileList(string file_list) { read_lines(fnames, file_list); }
void getCodec(Codec &codec) {
vector<string> gtnames;
for (auto s : fnames) gtnames.push_back(basename(s) + ".gt.txt");
codec.build(gtnames, charsep);
}
void readSample(Tensor2 &raw, wstring &gt, int index) {
string fname = fnames[index];
string base = basename(fname);
gt = separate_chars(read_text32(base + ".gt.txt"), charsep);
read_png(raw, fname.c_str());
raw() = -raw() + Float(1);
}
};

pair<double, double> test_set_error(CLSTMOCR &clstm, Dataset &testset) {
double count = 0.0;
double errors = 0.0;
for (int test = 0; test < testset.size(); test++) {
Tensor2 raw;
wstring gt;
testset.readSample(raw, gt, test);
wstring pred = clstm.predict(raw());
count += gt.size();
errors += levenshtein(pred, gt);
}
return make_pair(errors, count);
}

int print_usage(char **argv) {
cerr << "Usage: [VAR=VAL...] " << argv[0] << " TRAININGLIST [TESTLIST]\n";
cerr << "\n";
cerr << " Arguments:\n";
cerr << " TRAININGLIST File with filenames to train with\n";
cerr << " TESTLIST File with filenames to evaluate training\n";
cerr << " \n";
cerr << " Variables:\n";
cerr << " load Filename of model file to load. Default: ''\n";
cerr << " save_name Basename of model file to save. Default: '_ocr'\n";
cerr << " nhidden Number of hidden units. Default: 100\n";
cerr << " lrate Learning rate. Default: 1e-4\n";
cerr << " momentum Momentum. Default: 0.9\n";
cerr << " target_height Line height to normalize. Default: 48\n";
cerr << " ntrain Number of iterations. Default: 10000000\n";
cerr << " start Initial iteration. Default: -1\n";
cerr << " charsep Separator between characters in ground truth. Default: ''\n";
cerr << " report_time Set to 1 to report time. Default: 0\n";
cerr << " test_every Evaluate model every n-th iteration. Default: 10000\n";
cerr << " report_every Log current state every n-th iteration. Default: 100\n";
cerr << " save_every Save model with iteration as suffix every n-th\n";
cerr << " iteration. Default: 10000\n";
cerr << " display_every Update display every n-th iteration. Requires compilation\n";
cerr << " with 'scons display=1'. Default: 0\n";
cerr << " params Whether to report variable values on read. Default: 1\n";
return EXIT_FAILURE;
}

int main1(int argc, char **argv) {
if (argc < 2 || argc > 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))
return print_usage(argv);
int ntrain = getienv("ntrain", 10000000);
int save_every = getienv("save_every", 10000);
string save_name = getsenv("save_name", "_ocr");
int report_every = getienv("report_every", 100);
int display_every = getienv("display_every", 0);
int report_time = getienv("report_time", 0);
int test_every = getienv("test_every", 10000);
wstring charsep = utf8_to_utf32(getsenv("charsep", ""));
print("*** charsep", charsep);

if (argc < 2 || argc > 3) THROW("... training [testing]");
vector<string> fnames, test_fnames;
read_lines(fnames, argv[1]);
if (argc > 2) read_lines(test_fnames, argv[2]);
print("got", fnames.size(), "files,", test_fnames.size(), "tests");
Dataset trainingset(argv[1]);
assert(trainingset.size() > 0);
Dataset testset;
if (argc > 2) testset.readFileList(argv[2]);
print("got", trainingset.size(), "files,", testset.size(), "tests");

vector<int> codec;
get_codec(codec, fnames, charsep);
print("got", codec.size(), "classes");
string load_name = getsenv("load", "");

CLSTMOCR clstm;
clstm.target_height = int(getrenv("target_height", 48));
clstm.createBidi(codec, getienv("nhidden", 100));
clstm.setLearningRate(getdenv("rate", 1e-4), getdenv("momentum", 0.9));
clstm.net->info("");

if (load_name != "") {
clstm.load(load_name);
} else {
Codec codec;
trainingset.getCodec(codec);
print("got", codec.size(), "classes");

clstm.target_height = int(getrenv("target_height", 48));
clstm.createBidi(codec.codec, getienv("nhidden", 100));
clstm.setLearningRate(getdenv("lrate", 1e-4), getdenv("momentum", 0.9));
}
network_info(clstm.net, "");

double test_error = 9999.0;
double best_error = 1e38;

#ifndef NODISPLAY
PyServer py;
if (display_every > 0) py.open();
double start = now();
for (int trial = 0; trial < ntrain; trial++) {
if (trial > 0 && test_fnames.size() > 0 && test_every > 0 &&
trial % test_every == 0) {
double errors = 0.0;
double count = 0.0;
for (int test = 0; test < test_fnames.size(); test++) {
string fname = test_fnames[test];
string base = basename(fname);
wstring gt = separate_chars(read_text32(base + ".gt.txt"), charsep);
mdarray<float> raw;
read_png(raw, fname.c_str(), true);
for (int i = 0; i < raw.size(); i++) raw[i] = 1 - raw[i];
wstring pred = clstm.predict(raw);
count += gt.size();
errors += levenshtein(pred, gt);
}
test_error = errors / count;
print("ERROR", trial, test_error, " ", errors, count);
}
if (save_every == 0 && test_error < best_error) {
best_error = test_error;
string fname = save_name + ".clstm";
print("saving best performing network so far", fname,
"error rate: ", best_error);
clstm.save(fname);
}
if (trial > 0 && save_every > 0 && trial % save_every == 0) {
string fname = save_name + "-" + to_string(trial) + ".clstm";
clstm.save(fname);
#endif
double start_time = now();
int start = clstm.net->attr.get("trial", getienv("start", -1)) + 1;
if (start > 0) print("start", start);

Trigger test_trigger(getienv("test_every", 10000), -1, start);
test_trigger.skip0();
Trigger save_trigger(getienv("save_every", 10000), ntrain, start);
save_trigger.enable(save_name != "").skip0();
Trigger report_trigger(getienv("report_every", 100), ntrain, start);
Trigger display_trigger(getienv("display_every", 0), ntrain, start);

for (int trial = start; trial < ntrain; trial++) {
int sample = lrand48() % trainingset.size();
Tensor2 raw;
wstring gt;
trainingset.readSample(raw, gt, sample);
wstring pred = clstm.train(raw(), gt);

if (report_trigger(trial)) {
print(trial);
print("TRU", gt);
print("ALN", clstm.aligned_utf8());
print("OUT", utf32_to_utf8(pred));
if (trial > 0 && report_time)
print("steptime", (now() - start_time) / report_trigger.since());
start_time = now();
}
int sample = irandom() % fnames.size();
string fname = fnames[sample];
string base = basename(fname);
wstring gt = separate_chars(read_text32(base + ".gt.txt"), charsep);
mdarray<float> raw;
read_png(raw, fname.c_str(), true);
for (int i = 0; i < raw.size(); i++) raw[i] = 1 - raw[i];
wstring pred = clstm.train(raw, gt);
if (trial % display_every == 0) {

#ifndef NODISPLAY
if (display_trigger(trial)) {
py.evalf("clf");
show(py, clstm.net->inputs, 411);
show(py, clstm.net->outputs, 412);
show(py, clstm.targets, 413);
show(py, clstm.aligned, 414);
}
if (trial % report_every == 0) {
mdarray<float> temp;
print(trial);
print("TRU", gt);
print("ALN", clstm.aligned_utf8());
print("OUT", utf32_to_utf8(pred));
if (trial > 0 && report_time)
print("steptime", (now()-start) / report_every);
start = now();
#endif

if (test_trigger(trial)) {
auto tse = test_set_error(clstm, testset);
double errors = tse.first;
double count = tse.second;
test_error = errors / count;
print("ERROR", trial, test_error, " ", errors, count);
if (test_error < best_error) {
best_error = test_error;
string fname = save_name + ".clstm";
print("saving best performing network so far", fname, "error rate: ",
best_error);
clstm.net->attr.set("trial", trial);
clstm.save(fname);
}
}

if (save_trigger(trial)) {
string fname = save_name + "-" + to_string(trial) + ".clstm";
print("saving", fname);
clstm.net->attr.set("trial", trial);
clstm.save(fname);
}
}

return 0;
}

int main(int argc, char **argv) {
#ifdef NOEXCEPTION
return main1(argc, argv);
#else
try {
return main1(argc, argv);
} catch (const char *message) {
cerr << "FATAL: " << message << endl;
}
#endif
TRY { main1(argc, argv); }
CATCH(const char *message) { cerr << "FATAL: " << message << endl; }
}
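A minimal sketch of the Trigger helper as it is used in the loop above; its declaration is assumed to live in one of the shared headers (extras.h or utils.h), with the (interval, maxiter, start) constructor and the skip0/enable/since calls seen above.

#include "extras.h"  // assumed location of Trigger
using namespace ocropus;

void trigger_example(int ntrain) {
  int start = 0;
  Trigger report_trigger(100, ntrain, start);  // fires every 100 trials
  Trigger save_trigger(10000, ntrain, start);
  save_trigger.enable(true).skip0();           // never fire at trial 0
  for (int trial = start; trial < ntrain; trial++) {
    if (report_trigger(trial)) {
      // log progress; report_trigger.since() gives trials since the last report
    }
    if (save_trigger(trial)) {
      // checkpoint the model here
    }
  }
}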
151 changes: 74 additions & 77 deletions ctc.cc
@@ -1,12 +1,12 @@
#include "clstm.h"
#include <assert.h>
#include <iostream>
#include <vector>
#include <string>
#include <memory>
#include <math.h>
#include <Eigen/Dense>
#include <stdarg.h>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "clstm.h"
#include "clstm_compute.h"

#ifndef MAXEXP
#define MAXEXP 30
@@ -16,87 +16,80 @@ namespace ocropus {
using namespace std;
using Eigen::Ref;

namespace {
inline Float limexp(Float x) {
#if 1
if (x < -MAXEXP) return exp(-MAXEXP);
if (x > MAXEXP) return exp(MAXEXP);
return exp(x);
#else
return exp(x);
#endif
}

inline Float log_add(Float x, Float y) {
if (abs(x - y) > 10) return fmax(x, y);
return log(exp(x - y) + 1) + y;
}

inline Float log_mul(Float x, Float y) { return x + y; }
}
inline int rows(const TensorMap2 &m) { return m.dimension(0); }
inline int cols(const TensorMap2 &m) { return m.dimension(1); }
inline int rows(const EigenTensor2 &m) { return m.dimension(0); }
inline int cols(const EigenTensor2 &m) { return m.dimension(1); }

void forward_algorithm(Mat &lr, Mat &lmatch, double skip) {
int n = ROWS(lmatch), m = COLS(lmatch);
static void forward_algorithm(EigenTensor2 &lr, EigenTensor2 &lmatch,
double skip = -5) {
int n = rows(lmatch), m = cols(lmatch);
lr.resize(n, m);
Vec v(m), w(m);
EigenTensor1 v(m), w(m);
for (int j = 0; j < m; j++) v(j) = skip * j;
for (int i = 0; i < n; i++) {
w.segment(1, m - 1) = v.segment(0, m - 1);
w(0) = skip * i;
for (int j = 1; j < m; j++) w(j) = v(j - 1);
for (int j = 0; j < m; j++) {
Float same = log_mul(v(j), lmatch(i, j));
Float next = log_mul(w(j), lmatch(i, j));
v(j) = log_add(same, next);
}
lr.row(i) = v;
for (int j = 0; j < m; j++) lr(i, j) = v(j);
}
}

void forwardbackward(Mat &both, Mat &lmatch) {
Mat lr;
static void forwardbackward(EigenTensor2 &both, EigenTensor2 &lmatch) {
int n = rows(lmatch), m = cols(lmatch);
EigenTensor2 lr;
forward_algorithm(lr, lmatch);
Mat rlmatch = lmatch;
rlmatch = rlmatch.rowwise().reverse().eval();
rlmatch = rlmatch.colwise().reverse().eval();
Mat rl;
forward_algorithm(rl, rlmatch);
rl = rl.colwise().reverse().eval();
rl = rl.rowwise().reverse().eval();
EigenTensor2 rlmatch(n, m);
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) rlmatch(i, j) = lmatch(n - i - 1, m - j - 1);
EigenTensor2 rrl;
forward_algorithm(rrl, rlmatch);
EigenTensor2 rl(n, m);
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) rl(i, j) = rrl(n - i - 1, m - j - 1);
both = lr + rl;
}

void ctc_align_targets(Mat &posteriors, Mat &outputs, Mat &targets) {
void ctc_align_targets(EigenTensor2 &posteriors, EigenTensor2 &outputs,
EigenTensor2 &targets) {
double lo = 1e-5;
int n1 = ROWS(outputs);
int n2 = ROWS(targets);
int nc = COLS(targets);
int n1 = rows(outputs);
int n2 = rows(targets);
int nc = cols(targets);
assert(nc == cols(outputs));

// compute log probability of state matches
Mat lmatch;
EigenTensor2 lmatch;
lmatch.resize(n1, n2);
for (int t1 = 0; t1 < n1; t1++) {
Vec out = outputs.row(t1);
out = out.cwiseMax(lo);
out /= out.sum();
EigenTensor1 out(nc);
for (int i = 0; i < nc; i++) out(i) = fmax(lo, outputs(t1, i));
out = out / asum1(out);
for (int t2 = 0; t2 < n2; t2++) {
double value = out.transpose() * targets.row(t2).transpose();
lmatch(t1, t2) = log(value);
double total = 0.0;
for (int k = 0; k < nc; k++) total += out(k) * targets(t2, k);
lmatch(t1, t2) = log(total);
}
}
// compute unnormalized forward backward algorithm
Mat both;
EigenTensor2 both;
forwardbackward(both, lmatch);

// compute normalized state probabilities
Mat epath = (both.array() - both.maxCoeff()).unaryExpr(ptr_fun(limexp));
EigenTensor2 epath = (both - amax2(both)).unaryExpr(ptr_fun(limexp));
for (int j = 0; j < n2; j++) {
double l = epath.col(j).sum();
epath.col(j) /= l == 0 ? 1e-9 : l;
double total = 0.0;
for (int i = 0; i < rows(epath); i++) total += epath(i, j);
total = fmax(1e-9, total);
for (int i = 0; i < rows(epath); i++) epath(i, j) /= total;
}
debugmat = epath;

// compute posterior probabilities for each class and normalize
Mat aligned;
EigenTensor2 aligned;
aligned.resize(n1, nc);
for (int i = 0; i < n1; i++) {
for (int j = 0; j < nc; j++) {
@@ -109,53 +102,57 @@ void ctc_align_targets(Mat &posteriors, Mat &outputs, Mat &targets) {
}
}
for (int i = 0; i < n1; i++) {
aligned.row(i) /= fmax(1e-9, aligned.row(i).sum());
double total = 0.0;
for (int j = 0; j < nc; j++) total += aligned(i, j);
total = fmax(total, 1e-9);
for (int j = 0; j < nc; j++) aligned(i, j) /= total;
}

posteriors = aligned;
}

void ctc_align_targets(Sequence &posteriors, Sequence &outputs,
Sequence &targets) {
assert(COLS(outputs[0]) == 1);
assert(COLS(targets[0]) == 1);
assert(outputs.cols() == 1);
assert(targets.cols() == 1);
assert(outputs.rows() == targets.rows());
int n1 = outputs.size();
int n2 = targets.size();
int nc = targets[0].size();
Mat moutputs(n1, nc);
Mat mtargets(n2, nc);
for (int i = 0; i < n1; i++) moutputs.row(i) = outputs[i].col(0);
for (int i = 0; i < n2; i++) mtargets.row(i) = targets[i].col(0);
Mat aligned;
int nc = targets[0].rows();
EigenTensor2 moutputs(n1, nc);
EigenTensor2 mtargets(n2, nc);
for (int i = 0; i < n1; i++)
for (int j = 0; j < nc; j++) moutputs(i, j) = outputs[i].v(j, 0);
for (int i = 0; i < n2; i++)
for (int j = 0; j < nc; j++) mtargets(i, j) = targets[i].v(j, 0);
EigenTensor2 aligned;
ctc_align_targets(aligned, moutputs, mtargets);
posteriors.resize(n1);
posteriors.resize(n1, nc, 1);
for (int i = 0; i < n1; i++) {
posteriors[i].resize(aligned.row(i).size(), 1);
posteriors[i].col(0) = aligned.row(i);
for (int j = 0; j < nc; j++) posteriors[i].v(j, 0) = aligned(i, j);
}
}

void ctc_align_targets(Sequence &posteriors, Sequence &outputs,
Classes &targets) {
int nclasses = outputs[0].size();
int nclasses = outputs.rows();
Sequence stargets;
stargets.resize(targets.size());
stargets.resize(targets.size(), nclasses, 1);
for (int t = 0; t < stargets.size(); t++) {
stargets[t].resize(nclasses, 1);
stargets[t].fill(0);
stargets[t](targets[t], 0) = 1.0;
stargets[t].v().setConstant(0);
stargets[t].v(targets[t], 0) = 1.0;
}
ctc_align_targets(posteriors, outputs, stargets);
}

void mktargets(Sequence &seq, Classes &transcript, int ndim) {
seq.resize(2 * transcript.size() + 1);
seq.resize(2 * transcript.size() + 1, ndim, 1);
for (int t = 0; t < seq.size(); t++) {
seq[t].setZero(ndim, 1);
seq[t].v.setZero();
if (t % 2 == 1)
seq[t](transcript[(t - 1) / 2]) = 1;
seq[t].v(transcript[(t - 1) / 2], 0) = 1;
else
seq[t](0) = 1;
seq[t].v(0, 0) = 1;
}
}

@@ -169,8 +166,8 @@ void trivial_decode(Classes &cs, Sequence &outputs, int batch,
int mc = -1;
int mt = -1;
while (t < N) {
int index;
float v = outputs[t].col(batch).maxCoeff(&index);
int index = argmax(outputs[t].v().chip(batch, 1));
float v = outputs[t].v(index, batch);
if (index == 0) {
// NB: there should be a 0 at the end anyway
if (mc != -1 && mc != 0) {
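For reference, a sketch of the recursion that forward_algorithm in ctc.cc implements, in the log domain, with $\ell_{i,j}$ = lmatch(i, j) and $s$ = skip:

\alpha_{i,j} = \ell_{i,j} + \log\!\left( e^{\alpha_{i-1,j}} + e^{\alpha_{i-1,j-1}} \right),
\qquad \alpha_{-1,j} = s\,j, \qquad \alpha_{i-1,-1} \equiv s\,i .

Here log_add(x, y) evaluates $\log(e^x + e^y)$ as $\log(e^{x-y} + 1) + y$, falling back to $\max(x, y)$ when the arguments differ by more than 10; forwardbackward adds to this the same recursion run on the match matrix reversed along both axes and reversed back.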
6 changes: 6 additions & 0 deletions curun
@@ -0,0 +1,6 @@
#!/bin/sh
for dev in /dev/nv*; do
devs="$devs --device=$dev:$dev"
done
uid=${uid:-$(id -u)}
docker run --privileged --rm -u $uid -v $(pwd):/work -w=/work -e PATH=.:/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin -i -t tmbdev/ubuntu-cuda "$@"
Empty file modified display_server.py
100644 → 100755
48 changes: 48 additions & 0 deletions docker/16.04-cuda/Dockerfile
@@ -0,0 +1,48 @@
FROM ubuntu:16.04
MAINTAINER Konstantin Baierer <konstantin.baierer@gmail.com>
ENV DEBIAN_FRONTEND noninteractive
ENV CUDA_URL http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.44-1_amd64.deb

WORKDIR /tmp
RUN apt-get -y update \
&& apt-get -y install wget git scons g++ \
libprotobuf-dev libprotobuf9v5 protobuf-compiler libpng12-dev
# RUN apt-get -y install build-essential gdb strace \
RUN git clone --depth 1 --single-branch --branch 3.3-rc1 \
"https://github.com/RLovelett/eigen" /usr/local/include/eigen3
RUN wget -nd $CUDA_URL \
&& dpkg -i cuda-repo-ubuntu*.deb \
&& apt-get -y update \
&& apt-get -y install \
cuda-8-0 \
cuda-command-line-tools-8-0 \
cuda-core-8-0 \
cuda-cublas-8-0 \
cuda-cublas-dev-8-0 \
cuda-cudart-8-0 \
cuda-cudart-dev-8-0 \
cuda-cufft-8-0 \
cuda-cufft-dev-8-0 \
cuda-curand-8-0 \
cuda-curand-dev-8-0 \
cuda-cusolver-8-0 \
cuda-cusolver-dev-8-0 \
cuda-cusparse-8-0 \
cuda-cusparse-dev-8-0 \
cuda-minimal-build-8-0 \
cuda-misc-headers-8-0 \
cuda-npp-8-0 \
cuda-npp-dev-8-0 \
cuda-nvrtc-8-0 \
cuda-nvrtc-dev-8-0 \
cuda-runtime-8-0 \
cuda-toolkit-8-0 \
cuda-visual-tools-8-0 \
cuda-samples-8-0

RUN git clone --depth 1 "https://github.com/tmbdev/clstm"
WORKDIR /tmp/clstm
RUN scons && scons all

VOLUME /work
WORKDIR /work
21 changes: 21 additions & 0 deletions docker/16.04/Dockerfile
@@ -0,0 +1,21 @@
FROM ubuntu:16.04
MAINTAINER Konstantin Baierer <konstantin.baierer@gmail.com>
ENV DEBIAN_FRONTEND noninteractive

RUN apt-get -y update \
&& apt-get install -y \
git \
scons \
g++ \
libprotobuf-dev \
libprotobuf9v5 \
protobuf-compiler \
libpng12-dev \
wget \
&& git clone --depth 1 --single-branch --branch 3.3-rc1 \
"https://github.com/RLovelett/eigen" /usr/local/include/eigen3 \
&& git clone --depth 1 "https://github.com/tmbdev/clstm"

WORKDIR /clstm
RUN scons && scons install && apt-get remove -y g++ scons git
ENV PATH "/clstm:${PATH}"
8 changes: 8 additions & 0 deletions docker/Makefile
@@ -0,0 +1,8 @@
TAG=kbai/clstm
FOLDERS = $(shell find . -mindepth 1 -maxdepth 1 -type d)

.PHONY: $(FOLDERS)
all: $(FOLDERS)

$(FOLDERS): % : %/Dockerfile
cd "$@" && docker build -t "$(TAG):$@" .
34 changes: 34 additions & 0 deletions enroll.h
@@ -0,0 +1,34 @@
#ifndef enroll_h__
#define enroll_h__
#define VA_NUM_ARGS(...) \
VA_NUM_ARGS_IMPL(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
#define VA_NUM_ARGS_IMPL(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N
#define ENROLL(...) ENROLL_(VA_NUM_ARGS(__VA_ARGS__), __VA_ARGS__)
#define ENROLL_(count, ...) ENROLL0(count, __VA_ARGS__)
#define ENROLL0(count, ...) ENROLL##count(__VA_ARGS__)
#define ENROLL1(a) enroll(a, #a)
#define ENROLL2(a, ...) \
enroll(a, #a); \
ENROLL1(__VA_ARGS__)
#define ENROLL3(a, ...) \
enroll(a, #a); \
ENROLL2(__VA_ARGS__)
#define ENROLL4(a, ...) \
enroll(a, #a); \
ENROLL3(__VA_ARGS__)
#define ENROLL5(a, ...) \
enroll(a, #a); \
ENROLL4(__VA_ARGS__)
#define ENROLL6(a, ...) \
enroll(a, #a); \
ENROLL5(__VA_ARGS__)
#define ENROLL7(a, ...) \
enroll(a, #a); \
ENROLL6(__VA_ARGS__)
#define ENROLL8(a, ...) \
enroll(a, #a); \
ENROLL7(__VA_ARGS__)
#define ENROLL9(a, ...) \
enroll(a, #a); \
ENROLL8(__VA_ARGS__)
#endif
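A minimal sketch of how the ENROLL macro above might be used, assuming a class that exposes an enroll(member, name) method like the parameter-registration hooks elsewhere in this change set; ExampleLayer and its members are placeholders.

#include "enroll.h"

struct ExampleLayer {
  int WGI = 0, WGF = 0, WGO = 0;  // stand-ins for real parameter objects
  void enroll(int &p, const char *name) {
    // record &p under `name`, e.g. in a name -> parameter map
  }
  void registerParams() {
    // Expands to enroll(WGI, "WGI"); enroll(WGF, "WGF"); enroll(WGO, "WGO");
    ENROLL(WGI, WGF, WGO);
  }
};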