diff --git a/cinn/frontend/net_builder.cc b/cinn/frontend/net_builder.cc
index 31ffea4d64..24c086368f 100644
--- a/cinn/frontend/net_builder.cc
+++ b/cinn/frontend/net_builder.cc
@@ -140,6 +140,7 @@ NETBUILDER_UNARY_OP_DEF(Negative, negative)
 NETBUILDER_UNARY_OP_DEF(Sign, sign)
 NETBUILDER_UNARY_OP_DEF(Abs, abs)
 NETBUILDER_UNARY_OP_DEF(Clz, clz)
+NETBUILDER_UNARY_OP_DEF(Popc, popc)
 
 #undef NETBUILDER_UNARY_OP_DEF
diff --git a/cinn/frontend/net_builder.h b/cinn/frontend/net_builder.h
index 1544941580..14bdf2141d 100644
--- a/cinn/frontend/net_builder.h
+++ b/cinn/frontend/net_builder.h
@@ -72,7 +72,8 @@ namespace frontend {
   macro__(Negative)                \
   macro__(Sign)                    \
   macro__(Abs)                     \
-  macro__(Clz)
+  macro__(Clz)                     \
+  macro__(Popc)
 
 // ******************************************* //
 // The op has two input and one output, with a attribute [axis]
diff --git a/cinn/hlir/op/contrib/CMakeLists.txt b/cinn/hlir/op/contrib/CMakeLists.txt
index 061ba8f5dc..1c96145600 100644
--- a/cinn/hlir/op/contrib/CMakeLists.txt
+++ b/cinn/hlir/op/contrib/CMakeLists.txt
@@ -17,6 +17,7 @@ gather_srcs(cinnapi_src SRCS
     lookup_table.cc
     one_hot.cc
     clz.cc
+    popc.cc
 )
 
 cc_test(test_cast SRCS cast_test.cc DEPS cinncore)
@@ -34,3 +35,4 @@ cc_test(test_repeat SRCS repeat_test.cc DEPS cinncore)
 cc_test(test_one_hot SRCS one_hot_test.cc DEPS cinncore)
 cc_test(test_lookup_table SRCS lookup_table_test.cc DEPS cinncore)
 cc_test(test_clz SRCS clz_test.cc DEPS cinncore)
+cc_test(test_popc SRCS popc_test.cc DEPS cinncore)
diff --git a/cinn/hlir/op/contrib/popc.cc b/cinn/hlir/op/contrib/popc.cc
new file mode 100644
index 0000000000..94244fbe5a
--- /dev/null
+++ b/cinn/hlir/op/contrib/popc.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "cinn/common/cas.h"
+#include "cinn/common/common.h"
+#include "cinn/common/context.h"
+#include "cinn/common/macros.h"
+#include "cinn/common/target.h"
+#include "cinn/hlir/framework/node.h"
+#include "cinn/hlir/framework/op.h"
+#include "cinn/hlir/framework/op_strategy.h"
+#include "cinn/hlir/op/contrib/clip.h"
+#include "cinn/hlir/pe/ir_schedule_pe.h"
+#include "cinn/hlir/pe/nn.h"
+#include "cinn/hlir/pe/schedule.h"
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/tensor.h"
+#include "cinn/lang/builtin.h"
+#include "cinn/lang/compute.h"
+#include "gflags/gflags.h"
+
+DECLARE_bool(cinn_ir_schedule);
+
+namespace cinn {
+namespace hlir {
+namespace op {
+
+using common::_CINNValuePack_;
+using common::CINNValue;
+using common::CINNValuePack;
+using framework::OpStrategy;
+using framework::shape_t;
+using framework::StrategyFunction;
+
+ir::Tensor Popc(const ir::Tensor &input, const Target &target, const std::string &output_name) {
+  std::string extern_func = "cinn_";
+  if (target == common::DefaultHostTarget()) {
+    extern_func += "host_";
+  } else if (target == common::DefaultNVGPUTarget()) {
+    extern_func += "nvgpu_";
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+
+  extern_func += "popc";
+
+  if (input->type().is_int(32) || input->type().is_uint(32)) {
+    extern_func += "_int32";
+  } else if (input->type().is_int(64) || input->type().is_uint(64)) {
+    extern_func += "_int64";
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+
+  return Compute(
+      input->shape,
+      [=](const std::vector<Expr> &indices) {
+        Expr e = input(indices);
+        return lang::CallExtern(extern_func, {e});
+      },
+      output_name);
+}
+
+std::shared_ptr<OpStrategy> StrategyForPopc(const framework::NodeAttr &attrs,
+                                            const std::vector<ir::Tensor> &inputs,
+                                            const std::vector<Type> &out_type,
+                                            const std::vector<std::vector<int>> &output_shapes,
+                                            const Target &target) {
+  std::string op_name("popc");
+
+  framework::CINNCompute popc_compute([=](lang::Args args, lang::RetValue *ret) {
+    CHECK(!args.empty()) << "The input argument of " << op_name << " compute is empty! Please check.\n";
+    CINNValuePack pack_args = args[0];
+    CHECK(!pack_args.empty()) << "at least one input tensor for " << op_name << " compute\n";
+
+    std::string tensor_name = UniqName(op_name + "_Out");
+    if (FLAGS_cinn_ir_schedule) {
+      CHECK_EQ(pack_args.size(), 2);
+      CHECK(pack_args[1].is_string());
+      tensor_name = pack_args[1].operator std::string();
+    }
+
+    Expr A_expr = pack_args[0];
+    CHECK(A_expr.as_tensor());
+    ir::Tensor A = A_expr.as_tensor_ref();
+    auto out     = Popc(A, target, tensor_name);
+    auto stages  = CreateStages({out});
+    *ret         = CINNValuePack{{CINNValue(Expr(out.get())), CINNValue(stages)}};
+  });
+
+  auto strategy = std::make_shared<framework::OpStrategy>();
+  strategy->AddImpl(popc_compute, framework::GetInjectiveScheduleFunc(output_shapes, target), "strategy.popc.x86", 1);
+  return strategy;
+}
+
+std::vector<framework::shape_t> InferShapeForPopc(const std::vector<framework::shape_t> &inputs_shape,
+                                                  const framework::AttrMapType &attrs) {
+  CHECK(!inputs_shape.empty() && !inputs_shape[0].empty()) << "The input's shape size is 0! Please check again.";
+  std::vector<framework::shape_t> res{inputs_shape[0]};
+  return res;
+}
+
+std::vector<Type> InferDtypeForPopc(const std::vector<Type> &inputs_type, const framework::AttrMapType &attrs) {
+  CHECK(!inputs_type.empty()) << "The input's type size is 0! Please check again.";
+  std::vector<Type> res{inputs_type[0]};
+  return res;
+}
+
+}  // namespace op
+}  // namespace hlir
+}  // namespace cinn
+
+CINN_REGISTER_HELPER(popc_ops) {
+  CINN_REGISTER_OP(popc)
+      .describe("Population count.")
+      .set_num_inputs(1)
+      .set_num_outputs(1)
+      .set_attr<cinn::hlir::framework::StrategyFunction>("CINNStrategy", cinn::hlir::op::StrategyForPopc)
+      .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForPopc))
+      .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForPopc))
+      .set_attr<cinn::hlir::framework::OpPatternKind>("OpPattern", cinn::hlir::framework::OpPatternKind::kElementWise)
+      .set_support_level(4);
+
+  return true;
+}
diff --git a/cinn/hlir/op/contrib/popc.h b/cinn/hlir/op/contrib/popc.h
new file mode 100644
index 0000000000..dafcfae688
--- /dev/null
+++ b/cinn/hlir/op/contrib/popc.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/tensor.h"
+
+namespace cinn {
+namespace hlir {
+namespace op {
+
+ir::Tensor Popc(const ir::Tensor& input, const Target& target, const std::string& output_name = "T_Popc_out");
+
+}  // namespace op
+}  // namespace hlir
+}  // namespace cinn
diff --git a/cinn/hlir/op/contrib/popc_test.cc b/cinn/hlir/op/contrib/popc_test.cc
new file mode 100644
index 0000000000..75c368f1d9
--- /dev/null
+++ b/cinn/hlir/op/contrib/popc_test.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/hlir/op/contrib/popc.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <string>
+#include <vector>
+
+#include "cinn/backends/codegen_c.h"
+#include "cinn/backends/codegen_c_x86.h"
+#include "cinn/backends/codegen_cuda_dev.h"
+#include "cinn/common/context.h"
+#include "cinn/lang/lower.h"
+#include "cinn/lang/placeholder.h"
+#include "cinn/poly/stage.h"
+
+namespace cinn {
+namespace hlir {
+namespace op {
+namespace {
+bool IsCompiledWithCUDA() {
+#if !defined(CINN_WITH_CUDA)
+  return false;
+#else
+  return true;
+#endif
+}
+}  // namespace
+
+TEST(GenerateCode_Cpu, Popc) {
+  common::Context::Global().ResetNameId();
+
+  common::Target target = common::DefaultHostTarget();
+  lang::Placeholder<int32_t> in("in", std::vector<int>{10});
+  ir::Tensor res = Popc(in, target, "test_popc");
+
+  poly::StageMap stages = poly::CreateStages({res});
+  std::vector<ir::LoweredFunc> funcs =
+      lang::LowerVec("TestGenerateCodeCpu_Popc", stages, {res}, {}, {}, nullptr, target, true);
+
+  VLOG(6) << "Expr before CPU codegen:";
+  VLOG(6) << funcs[0]->body;
+
+  ir::Module::Builder builder("Popc_Module", target);
+  for (auto& f : funcs) {
+    builder.AddFunction(f);
+  }
+
+  backends::CodeGenCX86 codegen(target, backends::CodeGenCX86::Feature::AVX512);
+  codegen.SetInlineBuiltinCodes(false);
+  std::string code = codegen.Compile(builder.Build(), backends::CodeGenC::OutputKind::CImpl);
+  VLOG(6) << "Cpu Codegen result:";
+  VLOG(6) << code;
+}
+
+TEST(GenerateCode_Cuda, Popc) {
+  if (!IsCompiledWithCUDA()) {
+    return;
+  }
+  common::Context::Global().ResetNameId();
+
+  common::Target target = common::DefaultNVGPUTarget();
+
+  lang::Placeholder<int32_t> in("in", std::vector<int>{10});
+  ir::Tensor res = Popc(in, target, "test_popc");
+
+  poly::StageMap stages = poly::CreateStages({res});
+  stages[res]->Bind(0, "blockIdx.x");
+  stages[res]->SetBuffer("global");
+
+  std::vector<ir::LoweredFunc> funcs =
+      lang::LowerVec("TestGenerateCodeCuda_Popc", stages, {res}, {}, {}, nullptr, target, true);
+
+  VLOG(6) << "Expr before CUDA codegen:";
+  VLOG(6) << funcs[0]->body;
+
+  ir::Module::Builder builder("Popc_Module", target);
+  for (auto& f : funcs) {
+    builder.AddFunction(f);
+  }
+}
+
+}  // namespace op
+}  // namespace hlir
+}  // namespace cinn
diff --git a/cinn/hlir/op/use_ops.h b/cinn/hlir/op/use_ops.h
index 10681d0fa3..925b18ed0c 100644
--- a/cinn/hlir/op/use_ops.h
+++ b/cinn/hlir/op/use_ops.h
@@ -39,3 +39,4 @@ CINN_USE_REGISTER(repeat_ops)
 CINN_USE_REGISTER(one_hot_ops)
 CINN_USE_REGISTER(lookup_table_ops)
 CINN_USE_REGISTER(clz_ops)
+CINN_USE_REGISTER(popc_ops)
diff --git a/cinn/pybind/frontend.cc b/cinn/pybind/frontend.cc
index 533d1b39a6..07817b0889 100644
--- a/cinn/pybind/frontend.cc
+++ b/cinn/pybind/frontend.cc
@@ -621,7 +621,8 @@ void BindFrontend(pybind11::module *m) {
       .def("arange", &NetBuilder::Arange, py::arg("start"), py::arg("end"), py::arg("step"), py::arg("dtype"))
       .def("gather", &NetBuilder::Gather, py::arg("x"), py::arg("index"), py::arg("axis"))
       .def("gather_nd", &NetBuilder::GatherNd, py::arg("x"), py::arg("index"), py::arg("axes"))
-      .def("clz", &NetBuilder::Clz, py::arg("x"));
+      .def("clz", &NetBuilder::Clz, py::arg("x"))
+      .def("popc", &NetBuilder::Popc, py::arg("x"));
 
   auto computation = py::class_<CinnComputation, std::shared_ptr<CinnComputation>>(*m, "Computation");
   py::class_<CinnComputation::CompileOptions>(computation, "CompileOptions")
diff --git a/cinn/runtime/cpu/host_intrinsics.cc b/cinn/runtime/cpu/host_intrinsics.cc
index 92f1954d9f..6ab8207df4 100644
--- a/cinn/runtime/cpu/host_intrinsics.cc
+++ b/cinn/runtime/cpu/host_intrinsics.cc
@@ -128,12 +128,16 @@ inline int FN_INT32(pow)(int x, int y) {
 
 inline int FN_INT32(clz)(int x) { return __builtin_clz(x); }
 
+inline int FN_INT32(popc)(int x) { return __builtin_popcount(x); }
+
 #undef FN_INT32
 
 #define FN_INT64(func) cinn_host_##func##_int64
 
 inline int64_t FN_INT64(clz)(int64_t x) { return __builtin_clzll(x); }
 
+inline int64_t FN_INT64(popc)(int64_t x) { return __builtin_popcountll(x); }
+
 #undef FN_INT64
 }
 
@@ -190,6 +194,10 @@ CINN_REGISTER_HELPER(host_intrinsics) {
 
   REGISTER_EXTERN_FUNC_1_IN_1_OUT(cinn_host_clz_int64, host_target, int64_t, int64_t);
 
+  REGISTER_EXTERN_FUNC_1_IN_1_OUT(cinn_host_popc_int32, host_target, int, int);
+
+  REGISTER_EXTERN_FUNC_1_IN_1_OUT(cinn_host_popc_int64, host_target, int64_t, int64_t);
+
   REGISTER_EXTERN_FUNC_HELPER(cinn_host_find_int, host_target)
       .SetRetType<int>()
      .AddInputType<cinn_buffer_t *>()
diff --git a/cinn/runtime/cpu/host_intrinsics.h b/cinn/runtime/cpu/host_intrinsics.h
index ef3211b9d9..50c37900e4 100644
--- a/cinn/runtime/cpu/host_intrinsics.h
+++ b/cinn/runtime/cpu/host_intrinsics.h
@@ -51,11 +51,15 @@ inline int FN_INT32(pow)(int x, int y);
 
 inline int FN_INT32(clz)(int x);
 
+inline int FN_INT32(popc)(int x);
+
 #undef FN_INT32
 
 #define FN_INT64(func) cinn_host_##func##_int64
 
 inline int64_t FN_INT64(clz)(int64_t x);
 
+inline int64_t FN_INT64(popc)(int64_t x);
+
 #undef FN_INT64
 }
diff --git a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
index 866f63d903..9c25a44ca3 100644
--- a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
+++ b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
@@ -68,6 +68,7 @@ __device__ inline int FN_INT32(bitwise_or)(int a, int b) { return a | b; }
 __device__ inline int FN_INT32(bitwise_xor)(int a, int b) { return a ^ b; }
 __device__ inline int FN_INT32(bitwise_not)(int a) { return ~a; }
 __device__ inline int FN_INT32(clz)(int a) { return __clz(a); }
+__device__ inline int FN_INT32(popc)(int a) { return __popc(a); }
 
 // *************************************************************** //
 
@@ -75,6 +76,7 @@ __device__ inline int FN_INT32(clz)(int a) { return __clz(a); }
 #define FN_INT64(func) cinn_nvgpu_##func##_int64
 
 __device__ inline long long int FN_INT64(clz)(long long int a) { return __clzll(a); }
+__device__ inline long long int FN_INT64(popc)(long long int a) { return __popcll(a); }
 
 // *************************************************************** //
diff --git a/cinn/runtime/cuda/cuda_intrinsics.cc b/cinn/runtime/cuda/cuda_intrinsics.cc
index 34f1c2b08c..0acf54bde2 100644
--- a/cinn/runtime/cuda/cuda_intrinsics.cc
+++ b/cinn/runtime/cuda/cuda_intrinsics.cc
@@ -91,6 +91,7 @@ CINN_REGISTER_HELPER(cuda_intrinsics) {
 
   REGISTER_EXTERN_FUNC_1_IN_1_INT32(bitwise_not)
   REGISTER_EXTERN_FUNC_1_IN_1_INT32(clz)
+  REGISTER_EXTERN_FUNC_1_IN_1_INT32(popc)
 
 #undef REGISTER_EXTERN_FUNC_1_IN_1_INT32
 
@@ -98,6 +99,7 @@ CINN_REGISTER_HELPER(cuda_intrinsics) {
     REGISTER_EXTERN_SOURCE_FUNC_1_IN_1_OUT(cinn_nvgpu_##func__##_int64, target, int64_t, int64_t);
 
   REGISTER_EXTERN_FUNC_1_IN_1_INT64(clz)
+  REGISTER_EXTERN_FUNC_1_IN_1_INT64(popc)
 
 #undef REGISTER_EXTERN_FUNC_1_IN_1_INT64
diff --git a/python/tests/ops/test_popc_op.py b/python/tests/ops/test_popc_op.py
new file mode 100644
index 0000000000..80a793fdbb
--- /dev/null
+++ b/python/tests/ops/test_popc_op.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 CINN Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest, OpTestTool
+import paddle
+import cinn
+from cinn.frontend import *
+from cinn.common import *
+
+
+@OpTestTool.skip_if(not is_compiled_with_cuda(),
+                    "x86 test will be skipped due to timeout.")
+class TestPopcOp(OpTest):
+    def setUp(self):
+        self.init_case()
+
+    def init_case(self):
+        self.inputs = {
+            # "x": self.random([32, 64], 'int32', low=-2147483648, high=2147483647)
+            "x":
+            np.array([
+                -1591895863, -1770335025, -1290313501, 478042597, 189030958,
+                -935228100, 718518127, -2066013593, -1028229638, -1930307001,
+                -858478166, -282304333
+            ]).astype(np.int32)
+        }
+        self.outputs = {
+            "y":
+            np.array([14, 19, 16, 18, 12, 13, 20, 15, 19, 17, 16,
+                      17]).astype(np.int32)
+        }
+
+    def build_paddle_program(self, target):
+        y = paddle.to_tensor(self.outputs["y"], stop_gradient=False)
+        self.paddle_outputs = [y]
+
+    def build_cinn_program(self, target):
+        builder = NetBuilder("popc")
+        x = builder.create_input(
+            self.nptype2cinntype(self.inputs["x"].dtype),
+            self.inputs["x"].shape, "x")
+        out = builder.popc(x)
+        prog = builder.build()
+        res = self.get_cinn_output(prog, target, [x], [self.inputs["x"]],
+                                   [out])
+        self.cinn_outputs = [res[0]]
+
+    def test_check_results(self):
+        self.check_outputs_and_grads()
+
+
+class TestPopcCase1(TestPopcOp):
+    def init_case(self):
+        self.inputs = {
+            # "x": self.random([48, 36], 'int32', low=-2147483648, high=2147483647)
+            "x":
+            np.array([[
+                -780762106, 2088944770, 1793870564, 995233974, -1566864405,
+                -1550063384
+            ],
+                      [
+                          58189437, -585656506, 1058816786, -1676158651,
+                          -175192886, 2129254990
+                      ]]).astype(np.int32)
+        }
+        self.outputs = {
+            "y":
+            np.array([[13, 12, 16, 14, 18, 17],
+                      [19, 18, 14, 16, 17, 20]]).astype(np.int32)
+        }
+
+
+class TestPopcCase2(TestPopcOp):
+    def init_case(self):
+        self.inputs = {
+            # "x": self.random([4, 3, 5, 8], 'int64', low=-9223372036854775808, high=9223372036854775807)
+            "x":
+            np.array([
+                -2603587548323400654, 5370659515557365091,
+                -2051413160116828951, 9015154622229049624,
+                -8328245342679021727, -8113334794330105534,
+                7187230222985732039, 1835610600500058242
+            ]).astype(np.int64)
+        }
+        self.outputs = {
+            "y": np.array([34, 32, 34, 32, 29, 31, 34, 32]).astype(np.int64)
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
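
Note: the hard-coded "y" arrays in the tests above are the population counts of each element's two's-complement bit pattern, i.e. what __builtin_popcount / __popc return for int32 and the ll variants for int64. A minimal NumPy sketch for reproducing such expected values is below; it is illustrative only and not part of the patch, and ref_popc is a made-up helper name:

import numpy as np

def ref_popc(x: np.ndarray) -> np.ndarray:
    """Count set bits per element of the two's-complement representation."""
    # Reinterpret the signed input as unsigned so negative values contribute
    # the bits of their two's-complement pattern, matching __builtin_popcount
    # (int32) and __builtin_popcountll / __popcll (int64).
    u = x.view(np.uint32 if x.dtype == np.int32 else np.uint64).copy()
    out = np.zeros(x.shape, dtype=x.dtype)
    for _ in range(u.dtype.itemsize * 8):
        out += (u & 1).astype(x.dtype)
        u >>= 1
    return out

# Example: reproduces the first expected value in TestPopcOp.
assert ref_popc(np.array([-1591895863], dtype=np.int32))[0] == 14

Treating signed inputs as their unsigned bit pattern is also why the C++ Popc lowering routes both is_int(32)/is_uint(32) to the same _int32 extern and both 64-bit types to the _int64 extern.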