From 5e01b2af128ef7aa8bbcbda8a500b446d84bae9f Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Mon, 24 Aug 2015 20:23:46 -0600
Subject: [PATCH 1/4] more checks on shape

---
 include/mxnet/base.h           | 56 ++++++++++++++++++++++++++++++++++
 include/mxnet/operator.h       | 12 ++++++--
 src/operator/convolution-inl.h | 36 +++++++++++++++-------
 src/operator/pooling-inl.h     | 19 ++++++++----
 4 files changed, 104 insertions(+), 19 deletions(-)
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 0d3f81ea7605..a7a3a8063a92 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -5,8 +5,10 @@
  */
 #ifndef MXNET_BASE_H_
 #define MXNET_BASE_H_
+
 #include <dmlc/base.h>
 #include <dmlc/type_traits.h>
+#include <dmlc/parameter.h>
 #include <mshadow/tensor.h>
 #include <string>
 
@@ -47,4 +49,58 @@ typedef mshadow::TShape TShape;
 /*! \brief storage container type */
 typedef mshadow::TBlob TBlob;
 }  // namespace mxnet
+
+
+//! \cond Doxygen_Suppress
+namespace dmlc {
+// Add a few patches to support TShape in dmlc/parameter.
+DMLC_DECLARE_TYPE_NAME(mxnet::TShape, "Shape(tuple)");
+
+namespace parameter {
+template<>
+class FieldEntry<mxnet::TShape>
+    : public FieldEntryBase<FieldEntry<mxnet::TShape>, mxnet::TShape> {
+ public:
+  FieldEntry() : enforce_nonzero_(false), expect_ndim_(0) {}
+  // parent class
+  typedef FieldEntryBase<FieldEntry<mxnet::TShape>, mxnet::TShape> Parent;
+  // run the parent check, then the optional ndim / nonzero checks; throws dmlc::ParamError
+  virtual void Check(void *head) const {
+    Parent::Check(head);
+    mxnet::TShape &v = this->Get(head);
+    if (expect_ndim_ != 0 && v.ndim() != expect_ndim_) {
+      std::ostringstream os;
+      os << "value " << v << " for Parameter " << this->key_
+         << " has wrong dimensions, expected dimension=" << expect_ndim_;
+      throw dmlc::ParamError(os.str());
+    }
+    if (enforce_nonzero_) {
+      for (mxnet::index_t i = 0; i < v.ndim(); ++i) {
+        if (v[i] == 0U) {
+          std::ostringstream os;
+          os << "value " << v << " for Parameter " << this->key_
+             << " is invalid, the input shape must be nonzero in all dimensions";
+          throw dmlc::ParamError(os.str());
+        }
+      }
+    }
+  }
+  inline FieldEntry<mxnet::TShape> &enforce_nonzero() {
+    this->enforce_nonzero_ = true;
+    return this->self();
+  }
+  inline FieldEntry<mxnet::TShape> &set_expect_ndim(mshadow::index_t ndim) {
+    expect_ndim_ = ndim;
+    return this->self();
+  }
+
+ private:
+  // whether Check() rejects a shape that has a zero in any dimension
+  bool enforce_nonzero_;
+  // expected number of dimensions; the default 0 means no restriction.
+  mxnet::index_t expect_ndim_;
+};
+}  // namespace parameter
+}  // namespace dmlc
+//! \endcond
 #endif  // MXNET_BASE_H_
diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h
index c6dd60431c47..57c8c6c85098 100644
--- a/include/mxnet/operator.h
+++ b/include/mxnet/operator.h
@@ -89,21 +89,29 @@ class Operator {
   /*!
    * \brief Perform a Backward Operation, write gradient to the in_grad.
    *
+   * \note
    * Convention:
    *   out_grad.size() == OperatorProperty.NumVisibleReturns()
    *   out_data.size() == OperatorProperty.NumReturns()
    * out_data can contain additional invisible returns that remembers the
    * state carried from the Forward pass. For example mask in the dropout.
-   *
    * The gradients are passed from visible returns in this function.
    *
+   * \par
+   * Not all the TBlobs in the arguments will be available
+   * if you override DeclareBackwardDependency of the corresponding OperatorProperty class.
+   * Only the dependencies you declared will be available at their corresponding positions;
+   * the rest of the parameters are simply dummies for which you will get a nullptr.
+   * You will be safe if you use the default DeclareBackwardDependency,
+   * but declaring only what you need gives the engine more chance for optimization.
+   *
    * \param ctx runtime context available to this call
    * \param out_grad the gradient value we get from of the Operator.
    * \param in_data the array of input data.
    * \param out_data the array of output data.
    * \param req request types of the saving operation, can be all types.
    * \param in_grad the array of gradient we need to write to.
-   * \sa OpReqType, OpContext, OperatorProperty
+   * \sa OperatorProperty, OpReqType, OpContext
    */
   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index e96d81023ef5..ec8ad72e2b94 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -33,21 +33,35 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
   uint32_t nstep;
   bool no_bias;
   DMLC_DECLARE_PARAMETER(ConvolutionParam) {
-    int shape[] = {1, 1};
-    DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (y, x)");
-    DMLC_DECLARE_FIELD(stride).describe("convolution stride: (y, x)")
-      .set_default(TShape(shape, shape + 2));
-    shape[0] = shape[1] = 0;
-    DMLC_DECLARE_FIELD(pad).describe("pad for convolution: (y, x)")
-      .set_default(TShape(shape, shape + 2));
-    DMLC_DECLARE_FIELD(nb_filter).describe("convolution filter(channel) number")
-      .set_range(1, 100000);
+    DMLC_DECLARE_FIELD(kernel)
+      .set_expect_ndim(2).enforce_nonzero()
+      .describe("convolution kernel size: (y, x)");
+
+    int stride_shape[] = {1, 1};
+    DMLC_DECLARE_FIELD(stride)
+      .set_expect_ndim(2).enforce_nonzero()
+      .set_default(TShape(stride_shape, stride_shape + 2))
+      .describe("convolution stride: (y, x)");
+
+    int pad_shape[] = {0, 0};  // keep the previous default: no padding
+    DMLC_DECLARE_FIELD(pad)
+      .set_expect_ndim(2)
+      .set_default(TShape(pad_shape, pad_shape + 2))
+      .describe("pad for convolution: (y, x)");
+
+    DMLC_DECLARE_FIELD(nb_filter)
+      .set_lower_bound(1)
+      .describe("convolution filter(channel) number");
+
     DMLC_DECLARE_FIELD(nb_group).set_default(1)
       .describe("number of groups partition");
+
     DMLC_DECLARE_FIELD(nstep)
-      .describe("process n images once").set_default(2).set_range(1, 10000);
+      .set_default(2).set_range(1, 10000)
+      .describe("process n images once");
+
     DMLC_DECLARE_FIELD(no_bias).set_default(false)
-        .describe("Whether to disable bias parameter.");
+      .describe("Whether to disable bias parameter.");
   }
 };
 
diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h
index 359d070cdf11..93f18322fa7a 100644
--- a/src/operator/pooling-inl.h
+++ b/src/operator/pooling-inl.h
@@ -31,18 +31,25 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   int pool_type;
   DMLC_DECLARE_PARAMETER(PoolingParam) {
     // TODO(bing) change to only set lower bound
-    int shape[] = {0, 0};
-    DMLC_DECLARE_FIELD(kernel).describe("pooling kernel size: (y, x)");
+    DMLC_DECLARE_FIELD(kernel)
+      .set_expect_ndim(2).enforce_nonzero()
+      .describe("pooling kernel size: (y, x)");
+
     DMLC_DECLARE_FIELD(pool_type).set_default(kMaxPooling)
       .add_enum("max", kMaxPooling)
       .add_enum("avg", kAvgPooling)
       .add_enum("sum", kSumPooling)
       .describe("Pooling type to be applied.");
-    DMLC_DECLARE_FIELD(pad).set_default(TShape(shape, shape + 2))
-      .describe("pad for pooling: (y, x)");
-    shape[0] = shape[1] = 1;
-    DMLC_DECLARE_FIELD(stride).set_default(TShape(shape, shape + 2))
+
+    int stride_shape[] = {1, 1};
+    DMLC_DECLARE_FIELD(stride).set_default(TShape(stride_shape, stride_shape + 2))
+      .set_expect_ndim(2).enforce_nonzero()
       .describe("stride: for pooling (y, x)");
+
+    int pad_shape[] = {0, 0};
+    DMLC_DECLARE_FIELD(pad).set_default(TShape(pad_shape, pad_shape + 2))
+      .set_expect_ndim(2)
+      .describe("pad for pooling: (y, x)");
   }
 };
 

From a1c02d90dae934e9b93987473f06f6fe69d940f6 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Tue, 25 Aug 2015 14:55:37 -0600
Subject: [PATCH 2/4] chg

---
 src/operator/convolution-inl.h | 52 +++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index e96d81023ef5..fc1817d751a4 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -35,19 +35,19 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
   DMLC_DECLARE_PARAMETER(ConvolutionParam) {
     int shape[] = {1, 1};
     DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (y, x)");
-    DMLC_DECLARE_FIELD(stride).describe("convolution stride: (y, x)")
-      .set_default(TShape(shape, shape + 2));
+    DMLC_DECLARE_FIELD(stride).set_default(TShape(shape, shape + 2))
+      .describe("convolution stride: (y, x)");
     shape[0] = shape[1] = 0;
-    DMLC_DECLARE_FIELD(pad).describe("pad for convolution: (y, x)")
-      .set_default(TShape(shape, shape + 2));
-    DMLC_DECLARE_FIELD(nb_filter).describe("convolution filter(channel) number")
-      .set_range(1, 100000);
+    DMLC_DECLARE_FIELD(pad).set_default(TShape(shape, shape + 2))
+      .describe("pad for convolution: (y, x)");
+    DMLC_DECLARE_FIELD(nb_filter).set_range(1, 100000)
+      .describe("convolution filter(channel) number");
     DMLC_DECLARE_FIELD(nb_group).set_default(1)
       .describe("number of groups partition");
-    DMLC_DECLARE_FIELD(nstep)
-      .describe("process n images once").set_default(2).set_range(1, 10000);
+    DMLC_DECLARE_FIELD(nstep).set_default(2).set_range(1, 10000)
+      .describe("process n images once");
     DMLC_DECLARE_FIELD(no_bias).set_default(false)
-        .describe("Whether to disable bias parameter.");
+      .describe("Whether to disable bias parameter.");
   }
 };
 
@@ -139,22 +139,22 @@ class ConvolutionOp : public Operator {
     const index_t nbatch = data.size(0);
     for (index_t i = 0; i < nbatch; i += param_.nstep) {
       const index_t step = std::min(param_.nstep, nbatch - i);
-      temp_col_.Resize(mshadow::Shape2(shape_colunit_[0], \
+      temp_col_.Resize(mshadow::Shape2(shape_colunit_[0],
                                        shape_colunit_[1] * step));
-      temp_dst_.Resize(mshadow::Shape3(shape_dstunit_[0], \
+      temp_dst_.Resize(mshadow::Shape3(shape_dstunit_[0],
                                        shape_dstunit_[1], shape_dstunit_[2] * step));
       temp_dst_ = reshape(swapaxis<1, 0>(grad.Slice(i, i + step)), temp_dst_.shape_);
       if (param_.pad[0] == 0 && param_.pad[1] == 0) {
         // TODO(bing): dual stride
-        temp_col_ = unpack_patch2col(data.Slice(i, i + step), \
-                                     param_.kernel[0], \
-                                     param_.kernel[1], \
+        temp_col_ = unpack_patch2col(data.Slice(i, i + step),
+                                     param_.kernel[0],
+                                     param_.kernel[1],
                                      param_.stride[0]);
       } else {
         // TODO(bing): dual stride
-        temp_col_ = unpack_patch2col(pad(data.Slice(i, i + step), param_.pad[0], param_.pad[1]), \
-                                     param_.kernel[0], \
-                                     param_.kernel[1], \
+        temp_col_ = unpack_patch2col(pad(data.Slice(i, i + step), param_.pad[0], param_.pad[1]),
+                                     param_.kernel[0],
+                                     param_.kernel[1],
                                      param_.stride[0]);
       }
       const index_t gstride = temp_col_.size(0) / param_.nb_group;
@@ -168,20 +168,20 @@ class ConvolutionOp : public Operator {
           tmpc = dot(wmat[gid].T(), temp_dst_[gid]);
         }
         if (param_.pad[0] == 0 && param_.pad[1] == 0) {
-          gdata.Slice(i, i + step) = pack_col2patch(temp_col_, \
-                                                    data.Slice(i, i + step).shape_, \
-                                                    param_.kernel[0], \
-                                                    param_.kernel[1], \
+          gdata.Slice(i, i + step) = pack_col2patch(temp_col_,
+                                                    data.Slice(i, i + step).shape_,
+                                                    param_.kernel[0],
+                                                    param_.kernel[1],
                                                     param_.stride[0]);
         } else {
           mshadow::Shape<4> pshape = data.Slice(i, i + step).shape_;
           pshape[2] += 2 * param_.pad[0];
           pshape[3] += 2 * param_.pad[1];
-          gdata.Slice(i, i + step) = crop(pack_col2patch(temp_col_, \
-                                                         pshape, \
-                                                         param_.kernel[0], \
-                                                         param_.kernel[1], \
-                                                         param_.stride[0]), \
+          gdata.Slice(i, i + step) = crop(pack_col2patch(temp_col_,
+                                                         pshape,
+                                                         param_.kernel[0],
+                                                         param_.kernel[1],
+                                                         param_.stride[0]),
                                           gdata[i][0].shape_);
         }
       }

From 46844a1a399a87c1c6931deeda98195df52efb05 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Tue, 25 Aug 2015 16:09:07 -0600
Subject: [PATCH 3/4] change conv wmat shape

---
 src/operator/convolution-inl.h | 27 +++++++++++++++++----------
 src/operator/convolution.cc    |  4 ++--
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index fc1817d751a4..ccc0fdec0d9a 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -28,8 +28,8 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
   TShape kernel;
   TShape stride;
   TShape pad;
-  int nb_filter;
-  int nb_group;
+  uint32_t nb_filter;
+  uint32_t nb_group;
   uint32_t nstep;
   bool no_bias;
   DMLC_DECLARE_PARAMETER(ConvolutionParam) {
@@ -71,7 +71,11 @@ class ConvolutionOp : public Operator {
     // TODO(bing): check the BLAS Handle, be careful
     Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 4> data = in_data[kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 3> wmat = in_data[kWeight].get<xpu, 3, real_t>(s);
+    uint32_t ws[] = {param_.nb_group,
+                     param_.nb_filter / param_.nb_group,
+                     data.shape_[1] / param_.nb_group * param_.kernel[0] * param_.kernel[1]};
+    TShape wmat_shape(ws, ws + 3);
+    Tensor<xpu, 3> wmat = in_data[kWeight].get_with_shape<xpu, 3, real_t>(wmat_shape, s);
     Tensor<xpu, 4> out = out_data[kOut].get<xpu, 4, real_t>(s);
     this->InitTemp(data.shape_, out.shape_);
     const index_t nbatch = data.size(0);
@@ -128,13 +132,18 @@ class ConvolutionOp : public Operator {
     size_t expected = param_.no_bias == 0 ? 3 : 2;
     CHECK(in_data.size() == expected && in_grad.size() == expected);
     CHECK_EQ(req.size(), expected);
+    CHECK_EQ(in_data[kWeight].CheckContiguous(), true);
     // get data
     Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 4> data = in_data[kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 3> wmat = in_data[kWeight].get<xpu, 3, real_t>(s);
+    uint32_t ws[] = {param_.nb_group,
+                     param_.nb_filter / param_.nb_group,
+                     data.shape_[1] / param_.nb_group * param_.kernel[0] * param_.kernel[1]};
+    TShape wmat_shape(ws, ws + 3);
+    Tensor<xpu, 3> wmat = in_data[kWeight].get_with_shape<xpu, 3, real_t>(wmat_shape, s);
     Tensor<xpu, 4> grad = out_grad[kOut].get<xpu, 4, real_t>(s);
     Tensor<xpu, 4> gdata = in_grad[kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 3> gwmat = in_grad[kWeight].get<xpu, 3, real_t>(s);
+    Tensor<xpu, 3> gwmat = in_grad[kWeight].get_with_shape<xpu, 3, real_t>(wmat_shape, s);
     this->InitTemp(data.shape_, grad.shape_);
     const index_t nbatch = data.size(0);
     for (index_t i = 0; i < nbatch; i += param_.nstep) {
@@ -251,11 +260,9 @@ class ConvolutionProp : public OperatorProperty {
     if (dshape.ndim() ==  0) return false;
     CHECK_EQ(dshape.ndim(), 4) \
       << "Input data should be 4D in batch-nb_filter-y-x";
-    SHAPE_ASSIGN_CHECK(*in_shape, \
-                       kWeight, \
-                       Shape3(param_.nb_group, \
-                              param_.nb_filter / param_.nb_group, \
-                              dshape[1] / param_.nb_group * param_.kernel[0] * param_.kernel[1]));
+    SHAPE_ASSIGN_CHECK(*in_shape, kWeight,
+                       Shape4(param_.nb_filter, dshape[1] / param_.nb_group,
+                              param_.kernel[0], param_.kernel[1]));  // same size as wmat_shape
     if (!param_.no_bias) {
       SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.nb_filter));
     }
diff --git a/src/operator/convolution.cc b/src/operator/convolution.cc
index e78533d0a985..5589d22bd004 100644
--- a/src/operator/convolution.cc
+++ b/src/operator/convolution.cc
@@ -21,11 +21,11 @@ Operator* ConvolutionProp::CreateOperator(Context ctx) const {
 DMLC_REGISTER_PARAMETER(ConvolutionParam);
 
 MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionProp)
-.describe("Apply convolution to input then add a bias.")
 .add_argument("data", "Symbol", "Input data to the ConvolutionOp.")
 .add_argument("weight", "Symbol", "Weight matrix.")
 .add_argument("bias", "Symbol", "Bias parameter.")
-.add_arguments(ConvolutionParam::__FIELDS__());
+.add_arguments(ConvolutionParam::__FIELDS__())
+.describe("Apply convolution to input then add a bias.");
 
 }  // namespace op
 }  // namespace mxnet

From 39f2dfc77ce9b6d174056ed4e54ce15630e2d829 Mon Sep 17 00:00:00 2001
From: Bing Xu <antinucleon@gmail.com>
Date: Tue, 25 Aug 2015 19:37:27 -0600
Subject: [PATCH 4/4] reshape and flatten

---
 Makefile                                |   8 +-
 src/operator/convolution-inl.h          |   4 +-
 src/operator/flatten-inl.h              | 101 --------------
 src/operator/reshape-inl.h              | 175 ++++++++++++++++++++++++
 src/operator/{flatten.cc => reshape.cc} |  15 +-
 src/operator/{flatten.cu => reshape.cu} |   4 +-
 6 files changed, 193 insertions(+), 114 deletions(-)
 delete mode 100644 src/operator/flatten-inl.h
 create mode 100644 src/operator/reshape-inl.h
 rename src/operator/{flatten.cc => reshape.cc} (51%)
 rename src/operator/{flatten.cu => reshape.cu} (78%)

diff --git a/Makefile b/Makefile
index 2f2b14bee0a7..8ebcfa896d62 100644
--- a/Makefile
+++ b/Makefile
@@ -64,14 +64,14 @@ endif
 #BIN = test/test_threaded_engine test/api_registry_test
 OBJ = narray_function_cpu.o
 # add threaded engine after it is done
-OBJCXX11 = flatten_cpu.o engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o convolution_cpu.o
+OBJCXX11 = reshape_cpu.o engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o convolution_cpu.o
 CUOBJ =
 SLIB = lib/libmxnet.so
 ALIB = lib/libmxnet.a
 LIB_DEP = $(DMLC_CORE)/libdmlc.a
 
 ifeq ($(USE_CUDA), 1)
-	CUOBJ += flatten_gpu.o narray_function_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o softmax_gpu.o convolution_gpu.o
+	CUOBJ += reshape_gpu.o narray_function_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o softmax_gpu.o convolution_gpu.o
 endif
 
 .PHONY: clean all test lint doc
@@ -103,8 +103,8 @@ softmax_cpu.o: src/operator/softmax.cc
 softmax_gpu.o: src/operator/softmax.cu
 convolution_cpu.o: src/operator/convolution.cc
 convolution_gpu.o: src/operator/convolution.cu
-flatten_cpu.o: src/operator/flatten.cc
-flatten_gpu.o: src/operator/flatten.cu
+reshape_cpu.o: src/operator/reshape.cc
+reshape_gpu.o: src/operator/reshape.cu
 
 lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ)
 lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ)
diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index ccc0fdec0d9a..c328f5247f12 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -21,8 +21,8 @@
 namespace mxnet {
 namespace op {
 
-enum FullyConnectedOpInputs {kData, kWeight, kBias};
-enum FullyConnectedOpOutputs {kOut};
+enum ConvolutionOpInputs {kData, kWeight, kBias};
+enum ConvolutionOpOutputs {kOut};
 
 struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
   TShape kernel;
diff --git a/src/operator/flatten-inl.h b/src/operator/flatten-inl.h
deleted file mode 100644
index da4110296909..000000000000
--- a/src/operator/flatten-inl.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*!
- * Copyright (c) 2015 by Contributors
- * \file flatten-inl.h
- * \brief
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_FLATTEN_INL_H_
-#define MXNET_OPERATOR_FLATTEN_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "./operator_common.h"
-
-namespace mxnet {
-namespace op {
-
-enum FlattenOpInputs {kData};
-enum FlattenOpOutputs {kOut};
-
-template<typename xpu>
-class FlattenOp : public Operator {
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(req.size(), 1);
-    CHECK_EQ(out_data.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> data = in_data[kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> out = out_data[kOut].get<xpu, 4, real_t>(s);
-    Assign(out, req[kOut], reshape(data, out.shape_));
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> grad_out = out_grad[kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> grad_in = in_grad[kOut].get<xpu, 4, real_t>(s);
-    Assign(grad_in, req[kData], reshape(grad_out, grad_in.shape_));
-  }
-};  // class FlattenOp
-
-template<typename xpu>
-Operator* CreateOp();
-
-#if DMLC_USE_CXX11
-class FlattenProp : public OperatorProperty {
- public:
-  FlattenProp() {}
-
-  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {}
-
-  virtual std::string TypeString() const {
-    return "Flatten";
-  }
-
-  virtual bool InferShape(std::vector<TShape> *in_shape,
-                          std::vector<TShape> *out_shape) const {
-    CHECK_EQ(in_shape->size(), 1) << "Input: [data]";
-    const TShape &dshape = in_shape->at(kData);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(mshadow::Shape4(dshape[0], 1, 1, dshape[1] * dshape[2] * dshape[3]));
-    return true;
-  }
-
-  virtual OperatorProperty* Copy() const {
-    auto ptr = new FlattenProp();
-    return ptr;
-  }
-
-  virtual std::vector<int> DeclareBackwardDependency(
-      const std::vector<int> &out_grad,
-      const std::vector<int> &in_data,
-      const std::vector<int> &out_data) const {
-    return {out_grad[kOut]};
-  }
-
-  Operator* CreateOperator(Context ctx) const;
-};  // class FlattenProp
-#endif  // DMLC_USE_CXX11
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_FLATTEN_INL_H_
diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h
new file mode 100644
index 000000000000..68918c460678
--- /dev/null
+++ b/src/operator/reshape-inl.h
@@ -0,0 +1,175 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file reshape-inl.h
+ * \brief Reshape operator, plus Flatten expressed as a special case of Reshape
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_RESHAPE_INL_H_
+#define MXNET_OPERATOR_RESHAPE_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "./operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+enum ReshapeOpInputs {kData};
+enum ReshapeOpOutputs {kOut};
+
+struct ReshapeParam : public dmlc::Parameter<ReshapeParam> {
+  TShape target_shape;
+  DMLC_DECLARE_PARAMETER(ReshapeParam) {
+    DMLC_DECLARE_FIELD(target_shape).describe("Target new shape");
+  }
+};
+
+template<typename xpu>
+class ReshapeOp : public Operator {
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(req.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    if (req[kOut] == kNullOp) return;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4> data = in_data[kData].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> out = out_data[kOut].get<xpu, 4, real_t>(s);
+    CHECK_EQ(data.CheckContiguous(), true);
+    CHECK_EQ(out.CheckContiguous(), true);
+    if (data.dptr_ == out.dptr_) return;  // same buffer: nothing to copy
+    CHECK_EQ(data.shape_.Size(), out.shape_.Size());
+    Assign(out, req[kOut], reshape(data, out.shape_));
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(req.size(), 1);
+    if (req[kData] == kNullOp) return;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_grad.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4> grad_out = out_grad[kOut].get<xpu, 4, real_t>(s);  // indexed by output enum
+    Tensor<xpu, 4> grad_in = in_grad[kData].get<xpu, 4, real_t>(s);  // indexed by input enum
+    CHECK_EQ(grad_out.CheckContiguous(), true);
+    CHECK_EQ(grad_in.CheckContiguous(), true);
+    if (grad_out.dptr_ == grad_in.dptr_) return;  // same buffer: nothing to copy
+    CHECK_EQ(grad_out.shape_.Size(), grad_in.shape_.Size());
+    Assign(grad_in, req[kData], reshape(grad_out, grad_in.shape_));
+  }
+};  // class ReshapeOp
+
+template<typename xpu>
+Operator* CreateOp();
+
+#if DMLC_USE_CXX11
+class ReshapeProp : public OperatorProperty {
+ public:
+  ReshapeProp() {}
+
+  explicit ReshapeProp(ReshapeParam param) : param_(param) {}
+
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    param_.Init(kwargs);
+  }
+
+  virtual std::string TypeString() const {
+    return "Reshape";
+  }
+
+  virtual bool InferShape(std::vector<TShape> *in_shape,
+                          std::vector<TShape> *out_shape) const {
+    CHECK_EQ(in_shape->size(), 1) << "Input: [data]";
+    const TShape &dshape = in_shape->at(kData);
+    if (dshape.ndim() == 0) return false;
+    CHECK(param_.target_shape.Size() == dshape.Size())
+      << "Target shape size is different to source. "
+      << "Target: " << param_.target_shape.Size()
+      << "\nSource: " << dshape.Size();
+    out_shape->clear();
+    out_shape->push_back(param_.target_shape);
+    return true;
+  }
+
+  virtual OperatorProperty* Copy() const {
+    auto ptr = new ReshapeProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  virtual std::vector<int> DeclareBackwardDependency(
+      const std::vector<int> &out_grad,
+      const std::vector<int> &in_data,
+      const std::vector<int> &out_data) const {
+    return {out_grad[kOut]};
+  }
+
+  virtual std::vector<std::pair<int, void*> > ForwardInplaceOption(
+      const std::vector<int> &in_data,
+      const std::vector<void*> &out_data) const {
+    return {{in_data[kData], out_data[kOut]}};
+  }
+
+  virtual std::vector<std::pair<int, void*> > BackwardInplaceOption(
+      const std::vector<int> &out_grad,
+      const std::vector<int> &in_data,
+      const std::vector<int> &out_data,
+      const std::vector<void*> &in_grad) const {
+    return {{out_grad[kOut], in_grad[kData]}};
+  }
+
+  Operator* CreateOperator(Context ctx) const;
+
+ private:
+  ReshapeParam param_;
+};  // class ReshapeProp
+
+class FlattenProp : public ReshapeProp {
+ public:
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {}
+
+  virtual std::string TypeString() const {
+    return "Flatten";
+  }
+
+  virtual bool InferShape(std::vector<TShape> *in_shape,
+                          std::vector<TShape> *out_shape) const {
+    CHECK_EQ(in_shape->size(), 1) << "Input: [data]";
+    const TShape &dshape = in_shape->at(kData);
+    if (dshape.ndim() == 0) return false;
+    out_shape->clear();
+    uint32_t target_dim = 1;
+    for (uint32_t i = 1; i < dshape.ndim(); ++i) {
+      target_dim *= dshape[i];
+    }
+    out_shape->push_back(mshadow::Shape4(dshape[0], 1, 1, target_dim));
+    return true;
+  }
+
+  virtual OperatorProperty* Copy() const {
+    auto ptr = new FlattenProp();
+    return ptr;
+  }
+};  // class FlattenProp
+#endif  // DMLC_USE_CXX11
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_RESHAPE_INL_H_
diff --git a/src/operator/flatten.cc b/src/operator/reshape.cc
similarity index 51%
rename from src/operator/flatten.cc
rename to src/operator/reshape.cc
index db156def8ca2..4f68b44e98e6 100644
--- a/src/operator/flatten.cc
+++ b/src/operator/reshape.cc
@@ -5,23 +5,29 @@
  * \author Bing Xu
 */
 
-#include "./flatten-inl.h"
+#include "./reshape-inl.h"
 
 
 namespace mxnet {
 namespace op {
 template<>
 Operator *CreateOp<cpu>() {
-  return new FlattenOp<cpu>();
+  return new ReshapeOp<cpu>();
 }
 
-Operator* FlattenProp::CreateOperator(Context ctx) const {
+Operator* ReshapeProp::CreateOperator(Context ctx) const {
   DO_BIND_DISPATCH(CreateOp);
 }
 
-MXNET_REGISTER_OP_PROPERTY(Flatten, FlattenProp)
+DMLC_REGISTER_PARAMETER(ReshapeParam);
+
+MXNET_REGISTER_OP_PROPERTY(Reshape, ReshapeProp)
-.add_argument("data", "Symbol", "Input data to  flatten.")
+.add_argument("data", "Symbol", "Input data to reshape.")
+.add_arguments(ReshapeParam::__FIELDS__())
-.describe("Flatten 4D input to form batch-1-1-feature format");
+.describe("Reshape input to target shape");
 
+MXNET_REGISTER_OP_PROPERTY(Flatten, FlattenProp)
+.add_argument("data", "Symbol", "Input data to flatten.")
+.describe("Flatten input");
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/flatten.cu b/src/operator/reshape.cu
similarity index 78%
rename from src/operator/flatten.cu
rename to src/operator/reshape.cu
index 5bf9d47c5691..34aa5e1754dd 100644
--- a/src/operator/flatten.cu
+++ b/src/operator/reshape.cu
@@ -5,14 +5,14 @@
  * \author Bing Xu
 */
 
-#include "./flatten-inl.h"
+#include "./reshape-inl.h"
 
 
 namespace mxnet {
 namespace op {
 template<>
   Operator *CreateOp<gpu>() {
-  return new FlattenOp<gpu>();
+  return new ReshapeOp<gpu>();
 }
 
 }  // namespace op