diff --git a/examples/example_openpose_plus_hao28.cpp b/examples/example_openpose_plus_hao28.cpp
new file mode 100644
index 0000000..3b9052f
--- /dev/null
+++ b/examples/example_openpose_plus_hao28.cpp
@@ -0,0 +1,72 @@
+// https://github.com/tensorlayer/openpose-plus/blob/master/openpose_plus/models/models_hao28_experimental.py
+
+// #define STDNN_OPS_HAVE_CBLAS
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "example_openpose_plus_hao28.hpp"
+
+using std::experimental::range;
+
+// Builds the hao28 model on the weights under $HOME/var/models/openpose and
+// runs it `repeats` times on a tensor that is never filled with image data
+// (image loading and PAF post-processing are still commented out below).
+// Returns 0 on success and 1 when HOME is not set.
+int main(int argc, char *argv[])
+{
+    // std::getenv yields a null pointer for an unset variable, and building a
+    // std::string from a null pointer is undefined behaviour, so check first.
+    const char *const home_env = std::getenv("HOME");
+    if (home_env == nullptr) {
+        std::fprintf(stderr, "HOME is not set\n");
+        return 1;
+    }
+    const std::string home(home_env);
+    const auto prefix = home + "/var/models/openpose";
+    // const auto filename =
+    //     home +
+    //     "/var/data/openpose/examples/media/COCO_val2014_000000000192.jpg";
+
+    openpose_plus_hao28 openpose(prefix);
+
+    // auto paf_runner =
+    //     create_paf_processor(32, 48, openpose.h, openpose.w, 19, 19, 13);
+
+    // Input buffer of shape (1, h, w, 3); its contents are left untouched.
+    auto x = ttl::tensor<float, 4>(1, openpose.h, openpose.w, 3);
+
+    // TODO: input images
+    // auto input = ttl::tensor<uint8_t, 4>(x.shape());
+    // cv::Mat resized_image(cv::Size(openpose.w, openpose.h), CV_8UC(3),
+    //                       input.data());
+    // {
+    //     auto img = cv::imread(filename);
+    //     cv::resize(img, resized_image, resized_image.size(), 0, 0);
+    //     std::transform(input.data(), data_end(input), x.data(),
+    //                    [](uint8_t p) { return p / 255.0; });
+    // }
+
+    int repeats = 5;
+    for (auto i : std::experimental::range(repeats)) {
+        std::printf("inference %d\n", i);
+        // The two outputs are unused until the PAF step below is enabled.
+        auto [l_conf, l_paf] = openpose(ref(x));
+
+        // TODO: run paf process
+        // auto conf = nn::ops::apply<ttl::tensor<float, 4>>(
+        //     nn::ops::to_channels_first(), ref(*l_conf));
+        // auto paf = nn::ops::apply<ttl::tensor<float, 4>>(
+        //     nn::ops::to_channels_first(), ref(*l_paf));
+
+        // auto human = (*paf_runner)(conf.data(), paf.data(), false);
+        // for (auto h : human) {
+        //     h.print();
+        //     draw_human(resized_image, h);
+        // }
+        // cv::imwrite("a.png", resized_image);
+    }
+    return 0;
+}
diff --git a/examples/example_openpose_plus_hao28.hpp b/examples/example_openpose_plus_hao28.hpp
new file mode 100644
index 0000000..a78f1f6
--- /dev/null
+++ b/examples/example_openpose_plus_hao28.hpp
@@ -0,0 +1,211 @@
+#pragma once
+#include <memory>
+#include <string>
+#include <utility>
+
+#include <nn/layers>
+#include <nn/models>
+#include <nn/ops>
+
+// https://github.com/tensorlayer/openpose-plus/blob/master/openpose_plus/models/models_hao28_experimental.py
+//
+// Layer graph of the hao28 OpenPose-Plus model for element type R. Every layer
+// parameter is initialised from the file "<data_dir_>/<ns(name)>.idx".
+template <typename R> struct openpose_plus_hao28_impl {
+    using image_order = nn::ops::nhwc;
+    using filter_order = nn::ops::rscd;
+    using relu = nn::ops::pointwise<nn::ops::relu>;
+    using pool = nn::layers::pool<nn::ops::pool_max, image_order>;
+    using concat = nn::ops::concat_channel4d<image_order>;
+
+    // Scope object that turns a short name into the name used in the file
+    // path; mutable so the const factories below can call ns.with().
+    mutable nn::models::namescope ns;
+    // Directory holding the .idx parameter files.
+    const std::string data_dir_;
+
+    // Returns nn::ops::readfile for "<data_dir_>/<ns(name)>.idx". The path is
+    // composed right here, so it reflects the scope active at call time.
+    auto f(const std::string &name) const
+    {
+        const auto full = data_dir_ + "/" + ns(name) + ".idx";
+        return nn::ops::readfile(full);
+    }
+
+    // Conv layer with ksize(k, k), padding(p, p) and d (presumably the filter
+    // count), created in scope `name`; only a "kernel" initialiser is attached.
+    auto conv(int d, int k, int p, const std::string &name = "conv") const
+    {
+        return ns.with(name, [&] {
+            using conv_layer =
+                nn::layers::conv<image_order, filter_order, false>;
+            const auto l = conv_layer(conv_layer::ksize(k, k), d,
+                                      conv_layer::padding(p, p));
+            return with_init(l, f("kernel"));
+        });
+    }
+
+    // Like conv(), but instantiates the layer with `true` as the last template
+    // argument and attaches both a "kernel" and a "bias" initialiser.
+    auto conv_(int d, int k, int p, const std::string &name = "conv") const
+    {
+        return ns.with(name, [&] {
+            using conv_layer =
+                nn::layers::conv<image_order, filter_order, true>;
+            const auto l = conv_layer(conv_layer::ksize(k, k), d,
+                                      conv_layer::padding(p, p));
+            return with_init(l, f("kernel"), f("bias"));
+        });
+    }
+
+    // Batch-norm layer parameterised on relu, created in scope `name` with
+    // "moving_mean", "moving_variance", "beta" and "gamma" initialisers.
+    auto bn(const std::string &name = "bn") const
+    {
+        return ns.with(name, [&] {
+            using bn_layer = nn::layers::batch_norm<image_order, relu>;
+            return with_init(bn_layer(), f("moving_mean"), f("moving_variance"),
+                             f("beta"), f("gamma"));
+        });
+    }
+
+  public:
+    // prefix is the directory that holds the parameter files.
+    openpose_plus_hao28_impl(const std::string &prefix) : data_dir_(prefix) {}
+
+    // Passed as d to the final conv_ of branch1.
+    const int n_joins = 19;
+    // Doubled and passed as d to the final conv_ of branch2.
+    const int n_connections = 19;
+
+    // Runs cnn(), stage1() and then stage2() twice (scopes "stage5" and
+    // "stage6") inside scope "model", feeding each stage2 the cnn result and
+    // the previous stage's pair. Returns the pair from the "stage6" call.
+    auto operator()(const ttl::tensor_ref<R, 4> &x)
+    {
+        return ns.with("model", [&] {
+            const auto fm = cnn(x);
+            auto p1 = stage1(*fm);
+            auto p5 = [&] {
+                return ns.with("stage5", [&] {
+                    return stage2(*fm, *p1.first, *p1.second);
+                });
+            }();
+            auto p6 = [&] {
+                return ns.with("stage6", [&] {
+                    return stage2(*fm, *p5.first, *p5.second);
+                });
+            }();
+            return p6;
+        });
+    }
+
+  private:
+    // Sequential stack of conv+bn pairs with three pool() layers in between;
+    // returns the result of applying it to x.
+    auto cnn(const ttl::tensor_ref<R, 4> &x)
+    {
+        auto conv_layers =                                //
+            nn::models::make_sequential()                 //
+            << conv(32, 3, 1, "conv1_1") << bn("bn1_1")   //
+            << conv(64, 3, 1, "conv1_2") << bn("bn1_2")   //
+            << pool()                                     //
+            << conv(128, 3, 1, "conv2_1") << bn("bn2_1")  //
+            << conv(128, 3, 1, "conv2_2") << bn("bn2_2")  //
+            << pool()                                     //
+            << conv(200, 3, 1, "conv3_1") << bn("bn3_1")  //
+            << conv(200, 3, 1, "conv3_2") << bn("bn3_2")  //
+            << conv(200, 3, 1, "conv3_3") << bn("bn3_3")  //
+            << pool()                                     //
+            << conv(384, 3, 1, "conv4_1") << bn("bn4_1")  //
+            << conv(384, 3, 1, "conv4_2") << bn("bn4_2")  //
+            << conv(256, 3, 1, "conv4_3") << bn("bn4_3")  //
+            << conv(128, 3, 1, "conv4_4") << bn("bn4_4")  //
+            ;
+        return conv_layers(x);
+    }
+
+    // Builds two branches in scope "stage1", each four conv+bn pairs followed
+    // by a conv_ output layer, and returns {branch1(x), branch2(x)}.
+    // NOTE(review): the branch1 output is named "confs" here but "conf" in
+    // stage2; presumably this mirrors the parameter file names -- confirm.
+    auto stage1(const ttl::tensor_ref<R, 4> &x)
+    {
+        return ns.with("stage1", [&] {
+            auto common = [&] {
+                return                                     //
+                    nn::models::make_sequential()          //
+                    << conv(128, 3, 1, "c1") << bn("bn1")  //
+                    << conv(128, 3, 1, "c2") << bn("bn2")  //
+                    << conv(128, 3, 1, "c3") << bn("bn3")  //
+                    << conv(128, 1, 0, "c4") << bn("bn4")  //
+                    ;
+            };
+            auto left = [&] {
+                return ns.with("branch1", [&] {
+                    return common() << conv_(n_joins, 1, 0, "confs");
+                });
+            }();
+            auto right = [&] {
+                return ns.with("branch2", [&] {
+                    return common() << conv_(2 * n_connections, 1, 0, "pafs");
+                });
+            }();
+            return std::make_pair(left(x), right(x));
+        });
+    }
+
+    // Concatenates x, b1 and b2, then runs two branches (six conv+bn pairs
+    // plus a conv_ output layer each) on the result and returns their outputs
+    // as a pair. The caller supplies the enclosing stage scope.
+    auto stage2(const ttl::tensor_ref<R, 4> &x,  //
+                const ttl::tensor_ref<R, 4> &b1,
+                const ttl::tensor_ref<R, 4> &b2)
+    {
+        using T = ttl::tensor<R, 4>;
+        // Take ownership of the tensor that new_result hands back.
+        auto net =
+            std::unique_ptr<T>(nn::ops::new_result<T>(concat(), x, b1, b2));
+        auto common = [&] {
+            return                                     //
+                nn::models::make_sequential()          //
+                << conv(128, 3, 1, "c1") << bn("bn1")  //
+                << conv(128, 3, 1, "c2") << bn("bn2")  //
+                << conv(128, 3, 1, "c3") << bn("bn3")  //
+                << conv(128, 3, 1, "c4") << bn("bn4")  //
+                << conv(128, 3, 1, "c5") << bn("bn5")  //
+                << conv(128, 1, 0, "c6") << bn("bn6")  //
+                ;
+        };
+        auto left = [&] {
+            return ns.with("branch1", [&] {
+                return common() << conv_(n_joins, 1, 0, "conf");
+            });
+        }();
+        auto right = [&] {
+            return ns.with("branch2", [&] {
+                return common() << conv_(2 * n_connections, 1, 0, "pafs");
+            });
+        }();
+        return std::make_pair(left(ref(*net)), right(ref(*net)));
+    }
+};
+
+// Entry point for hao28 inference; builds a fresh impl<R> on every call.
+struct openpose_plus_hao28 {
+    const std::string data_dir_;  // handed to the impl on each call
+
+  public:
+    const size_t h;  // height argument of the constructor
+    const size_t w;  // width argument of the constructor
+
+    openpose_plus_hao28(const std::string &data_dir, int height = 256,
+                        int width = 384)
+        : data_dir_(data_dir), h(height), w(width) {}
+
+    template <typename R> auto operator()(const ttl::tensor_ref<R, 4> &x)
+    {
+        openpose_plus_hao28_impl<R> model(data_dir_);
+        return model(x);
+    }
+};
diff --git a/include/nn/bits/ops/concat.hpp b/include/nn/bits/ops/concat.hpp
index ac538c0..f6b185d 100644
--- a/include/nn/bits/ops/concat.hpp
+++ b/include/nn/bits/ops/concat.hpp
@@ -33,7 +33,6 @@ template <> class concat_channel4d_impl<nhwc, 3>
 
         const auto c1 = c_1;
         const auto c2 = c_2;
-        const auto c3 = c_3;
 
         for (auto l : range(n)) {
             for (auto i : range(h)) {