From 451641c77f14c2231e913618c917e6f86be33520 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Mon, 14 Sep 2015 03:13:03 +0800
Subject: [PATCH 01/13] almost add prefetcher

---
 include/mxnet/io.h       |   2 +-
 src/io/iter_batch.h      |   2 +-
 src/io/iter_prefetcher.h | 272 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 274 insertions(+), 2 deletions(-)
 create mode 100644 src/io/iter_prefetcher.h
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 7bb86f4eece3..4e9a9e7e60b1 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -68,7 +68,7 @@ struct DataBatch {
   mshadow::index_t num_batch_padd;
  public:
   /*! \brief content of dense data, if this DataBatch is dense */
-  std::vector<TBlob> data;
+  std::vector<NArray> data;
   /*! \brief extra data to be fed to the network */
   std::string extra_data;
  public:
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
index b45dfd3328e1..d2102439bb26 100644
--- a/src/io/iter_batch.h
+++ b/src/io/iter_batch.h
@@ -1,6 +1,6 @@
 /*!
  *  Copyright (c) 2015 by Contributors
- * \file iter_batch_proc-inl.hpp
+ * \file iter_batch.h
  * \brief definition of preprocessing iterators that takes an iterator and do some preprocessing
  * \author Tianqi Chen, Tianjun Xiao
  */
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
new file mode 100644
index 000000000000..da3f49a2c28c
--- /dev/null
+++ b/src/io/iter_prefetcher.h
@@ -0,0 +1,272 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file iter_prefetcher.h
+ * \brief define a prefetcher that uses a threaded iterator to keep k batches fetched
+ */
+#ifndef MXNET_IO_ITER_PREFETCHER_H_
+#define MXNET_IO_ITER_PREFETCHER_H_
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <dmlc/logging.h>
+#include <mshadow/tensor.h>
+#include <utility>
+#include <string>
+#include <vector>
+
+namespace mxnet {
+namespace io {
+// Batch parameters
+struct BatchParam : public dmlc::Parameter<BatchParam> {
+  /*! \brief batch size */
+  index_t batch_size;
+  /*! \brief input shape */
+  TShape input_shape;
+  /*! \brief label width */
+  index_t label_width;
+  /*! \brief use round robin to handle overflow batch */
+  bool round_batch;
+  /*! \brief skip read */
+  bool test_skipread;
+  /*! \brief silent */
+  bool silent;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(BatchParam) {
+    DMLC_DECLARE_FIELD(batch_size)
+        .describe("Batch size.");
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");
+    DMLC_DECLARE_FIELD(label_width).set_default(1)
+        .describe("Label width.");
+    DMLC_DECLARE_FIELD(round_batch).set_default(true)
+        .describe("Use round robin to handle overflow batch.");
+    DMLC_DECLARE_FIELD(test_skipread).set_default(false)
+        .describe("Skip read for testing.");
+    DMLC_DECLARE_FIELD(silent).set_default(false)
+        .describe("Whether to print batch information.");
+  }
+};
+
+/*! \brief create a batch iterator from single instance iterator */
+class BatchLoader: public IIterator<DataBatch> {
+ public:
+  explicit BatchLoader(IIterator<DataInst> *base): base_(base), num_overflow_(0) {}
+  virtual ~BatchLoader(void) {
+    delete base_;
+    FreeSpaceDense();
+  }
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    // init batch param, it could have similar param with
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    // init base iterator
+    base_->Init(kwargs);
+    data_shape_[1] = param_.input_shape[0];
+    data_shape_[2] = param_.input_shape[1];
+    data_shape_[3] = param_.input_shape[2];
+    data_shape_[0] = param_.batch_size;
+    label_shape_[1] = param_.label_width;
+    label_shape_[0] = param_.batch_size;
+  }
+  virtual void BeforeFirst(void) {
+    if (param_.round_batch == 0 || num_overflow_ == 0) {
+      // otherise, we already called before first
+      base_->BeforeFirst();
+    } else {
+      num_overflow_ = 0;
+    }
+    head_ = 1;
+  }
+  virtual bool Next(void) {
+    out_.num_batch_padd = 0;
+
+    // skip read if in head version
+    if (param_.test_skipread != 0 && head_ == 0)
+        return true;
+    else
+        this->head_ = 0;
+
+    // if overflow from previous round, directly return false, until before first is called
+    if (num_overflow_ != 0) return false;
+    index_t top = 0;
+
+    while (base_->Next()) {
+      const DataInst& d = base_->Value();
+      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
+      out_.inst_index[top] = d.index;
+      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
+
+      if (++ top >= param_.batch_size) {
+        out_.data[0] = TBlob(data);
+        out_.data[1] = TBlob(label);
+        return true;
+      }
+    }
+    if (top != 0) {
+      if (param_.round_batch != 0) {
+        num_overflow_ = 0;
+        base_->BeforeFirst();
+        for (; top < param_.batch_size; ++top, ++num_overflow_) {
+          CHECK(base_->Next()) << "number of input must be bigger than batch size";
+          const DataInst& d = base_->Value();
+          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
+          out_.inst_index[top] = d.index;
+          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
+        }
+        out_.num_batch_padd = num_overflow_;
+      } else {
+        out_.num_batch_padd = param_.batch_size - top;
+      }
+      out_.data[0] = TBlob(data);
+      out_.data[1] = TBlob(label);
+      return true;
+    }
+    return false;
+  }
+  virtual const DataBatch &Value(void) const {
+    CHECK(head_ == 0) << "must call Next to get value";
+    return out_;
+  }
+
+ private:
+  /*! \brief batch parameters */
+  BatchParam param_;
+  /*! \brief base iterator */
+  IIterator<DataInst> *base_;
+  /*! \brief output data */
+  DataBatch out_;
+  /*! \brief on first */
+  int head_;
+  /*! \brief number of overflow instances that readed in round_batch mode */
+  int num_overflow_;
+  /*! \brief label information of the data*/
+  mshadow::Tensor<mshadow::cpu, 2> label;
+  /*! \brief content of dense data, if this DataBatch is dense */
+  mshadow::Tensor<mshadow::cpu, 4> data;
+  /*! \brief data shape */
+  mshadow::Shape<4> data_shape_;
+  /*! \brief data shape */
+  mshadow::Shape<2> label_shape_;
+  // Functions that allocate and free tensor space
+  inline void AllocSpaceDense(bool pad = false) {
+    data = mshadow::NewTensor<mshadow::cpu>(data_shape_, 0.0f, pad);
+    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
+    label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
+    out_.inst_index = new unsigned[param_.batch_size];
+    out_.batch_size = param_.batch_size;
+    out_.data.resize(2);
+  }
+  /*! \brief auxiliary function to free space, if needed, dense only */
+  inline void FreeSpaceDense(void) {
+    if (label.dptr_ != NULL) {
+      delete [] out_.inst_index;
+      mshadow::FreeSpace(&label);
+      mshadow::FreeSpace(&data);
+      label.dptr_ = NULL;
+    }
+  }
+};  // class BatchAdaptIter
+
+    
+    
+// Define prefetcher parameters
+struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
+  /*! \brief number of prefetched batches */
+  int capacity;
+  /*! \brief input shape */
+  TShape input_shape;
+  /*! \brief label width */
+  index_t label_width;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(PrefetcherParam) {
+    DMLC_DECLARE_FIELD(capacity).set_default(1)
+        .describe("Number of prefetched batches");
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");
+    DMLC_DECLARE_FIELD(label_width).set_default(1)
+        .describe("Label width.");
+  }
+};
+  
+// iterator that prefetches batches from a BatchLoader using a background thread
+class PrefetcherIter : public IIterator<DataInst> {
+ public:
+  PrefetcherIter(IIterator<DataInst>* base) : loader_(base){
+  }
+  virtual ~PrefetcherIter(void) {
+    iter_.Destroy();
+  }
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    // init image rec param
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    // use the kwarg to init parser
+    parser_.Init(kwargs);
+    // init thread iter
+    iter_.set_max_capacity(param_.capacity);
+    iter_.Init([this](DataBatch **dptr) {
+        if (*dptr == NULL) {
+          *dptr = new DataBatch();
+          // init NArrays
+          // TODO: currectly use defalt context
+          Context ctx; 
+          *dptr->data.push_back(NArray(TShape(param_.input_shape), ctx, true));
+          *dptr->data.push_back(NArray(TShape(param_.label_shape), ctx, true));
+        }
+        return loader_.LoadNext(*dptr);
+      },
+      [this]() { loader_.BeforeFirst(); });
+  }
+  virtual void BeforeFirst(void) {
+    iter_.BeforeFirst();
+  }
+  virtual bool Next(void) {
+     if (ready_narrays_.size() == param_.capacity) {
+         std::vector<NArray*> old_narrays = ready_narrays_.front();
+         for (size_t i = 0; i < old_narrays.size(); i++) {
+             old_narrays[i]->WaitToWrite();
+         }
+         ready_narrays_.pop();
+         DataIter* old_batch = ready_batches_.front();
+         ready_batches_.pop();
+         iter_->Recycle(&old_batch);
+     }
+     DataBatch* next_batch = NULL;
+     if (!iter_.Next(&next_batch)) return false;
+     out_.data.clear();
+     for (size_t i = 0; i < next_batch->data.size(); i++) {
+         out_.data.push_back(Copy(next_batch->data[i], next_batch->data[i].ctx()));
+     }
+     // push the narrays and batch into the queue
+     ready_batches_.push_back(next_batch);
+     std::vector<NArray*> next_batch_narrays;
+     for (size_t i = 0; i < out_.data.size(); i++) {
+         next_batch_narrays.push_back(&out.data[i]);
+     }
+  }
+  virtual const DataInst &Value(void) const {
+    return out_;
+  }
+
+ private:
+  /*! \brief prefetcher parameters */
+  PrefetcherParam param_;
+  /*! \brief output data */
+  DataBatch out_;
+  /*! \brief queue to hold the NArrays for check whether writable */
+  std::queue<std::vector<NArray*> > ready_narrays_;
+  /*! \brief queue to hold the NArrays for check whether writable */
+  std::queue<DataBatch*> ready_batches_;
+  // internal parser
+  BatchLoader loader_;
+  // backend thread
+  dmlc::ThreadedIter<DataBatch> iter_;
+};
+}  // namespace io
+}  // namespace mxnet

From 26df426fe9d286be1c27bde72aa7fd248483965c Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Mon, 14 Sep 2015 16:29:44 +0800
Subject: [PATCH 02/13] add prefetcher, need merge to compile

---
 include/mxnet/io.h            |   1 +
 src/c_api.cc                  |   4 +-
 src/io/image_augmenter.h      |   1 -
 src/io/io.cc                  |   2 +-
 src/io/iter_batch.h           | 172 ----------------------------------
 src/io/iter_image_recordio.cc |   6 +-
 src/io/iter_mnist.cc          |   8 +-
 src/io/iter_prefetcher.h      | 125 ++++++++++++------------
 8 files changed, 72 insertions(+), 247 deletions(-)
 delete mode 100644 src/io/iter_batch.h

diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 4e9a9e7e60b1..537cee89cc97 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -11,6 +11,7 @@
 #include <string>
 #include <utility>
 #include "./base.h"
+#include "./narray.h"
 
 namespace mxnet {
 /*!
diff --git a/src/c_api.cc b/src/c_api.cc
index 48d83cffc688..8c748a1c4b5a 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -812,13 +812,13 @@ int MXDataIterNext(DataIterHandle handle, int *out) {
 int MXDataIterGetLabel(DataIterHandle handle, NArrayHandle *out) {
   API_BEGIN();
   DataBatch db = static_cast<IIterator<DataBatch>* >(handle)->Value();
-  *out = new NArray(db.data[1], 0);
+  *out = &db.data[1];
   API_END();
 }
 
 int MXDataIterGetData(DataIterHandle handle, NArrayHandle *out) {
   API_BEGIN();
   DataBatch db = static_cast<IIterator<DataBatch>* >(handle)->Value();
-  *out = new NArray(db.data[0], 0);
+  *out = &db.data[0];
   API_END();
 }
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index a4b77f5a41df..5dc00b585d9d 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -2,7 +2,6 @@
  *  Copyright (c) 2015 by Contributors
  * \file image_augmenter_opencv.hpp
  * \brief threaded version of page iterator
- * \author Naiyan Wang, Tianqi Chen, Tianjun Xiao
  */
 #ifndef MXNET_IO_IMAGE_AUGMENTER_H_
 #define MXNET_IO_IMAGE_AUGMENTER_H_
diff --git a/src/io/io.cc b/src/io/io.cc
index 8bfb5dbdd570..8c2b221af525 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -5,7 +5,7 @@
 #include <mxnet/io.h>
 #include <dmlc/registry.h>
 #include "./image_augmenter.h"
-#include "./iter_batch.h"
+#include "./iter_prefetcher.h"
 
 // Registers
 namespace dmlc {
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
deleted file mode 100644
index d2102439bb26..000000000000
--- a/src/io/iter_batch.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/*!
- *  Copyright (c) 2015 by Contributors
- * \file iter_batch.h
- * \brief definition of preprocessing iterators that takes an iterator and do some preprocessing
- * \author Tianqi Chen, Tianjun Xiao
- */
-#ifndef MXNET_IO_ITER_BATCH_H_
-#define MXNET_IO_ITER_BATCH_H_
-
-#include <mxnet/io.h>
-#include <mxnet/base.h>
-#include <dmlc/logging.h>
-#include <mshadow/tensor.h>
-#include <utility>
-#include <string>
-#include <vector>
-
-namespace mxnet {
-namespace io {
-// Batch parameters
-struct BatchParam : public dmlc::Parameter<BatchParam> {
-  /*! \brief label width */
-  index_t batch_size;
-  /*! \brief input shape */
-  TShape input_shape;
-  /*! \brief label width */
-  index_t label_width;
-  /*! \brief use round roubin to handle overflow batch */
-  bool round_batch;
-  /*! \brief skip read */
-  bool test_skipread;
-  /*! \brief silent */
-  bool silent;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(BatchParam) {
-    DMLC_DECLARE_FIELD(batch_size)
-        .describe("Batch size.");
-    index_t input_shape_default[] = {3, 224, 224};
-    DMLC_DECLARE_FIELD(input_shape)
-        .set_default(TShape(input_shape_default, input_shape_default + 3))
-        .set_expect_ndim(3).enforce_nonzero()
-        .describe("Input shape of the neural net");
-    DMLC_DECLARE_FIELD(label_width).set_default(1)
-        .describe("Label width.");
-    DMLC_DECLARE_FIELD(round_batch).set_default(true)
-        .describe("Use round robin to handle overflow batch.");
-    DMLC_DECLARE_FIELD(test_skipread).set_default(false)
-        .describe("Skip read for testing.");
-    DMLC_DECLARE_FIELD(silent).set_default(false)
-        .describe("Whether to print batch information.");
-  }
-};
-
-/*! \brief create a batch iterator from single instance iterator */
-class BatchAdaptIter: public IIterator<DataBatch> {
- public:
-  explicit BatchAdaptIter(IIterator<DataInst> *base): base_(base), num_overflow_(0) {}
-  virtual ~BatchAdaptIter(void) {
-    delete base_;
-    FreeSpaceDense();
-  }
-  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
-    std::vector<std::pair<std::string, std::string> > kwargs_left;
-    // init batch param, it could have similar param with
-    kwargs_left = param_.InitAllowUnknown(kwargs);
-    // init base iterator
-    base_->Init(kwargs);
-    data_shape_[1] = param_.input_shape[0];
-    data_shape_[2] = param_.input_shape[1];
-    data_shape_[3] = param_.input_shape[2];
-    data_shape_[0] = param_.batch_size;
-    AllocSpaceDense(false);
-  }
-  virtual void BeforeFirst(void) {
-    if (param_.round_batch == 0 || num_overflow_ == 0) {
-      // otherise, we already called before first
-      base_->BeforeFirst();
-    } else {
-      num_overflow_ = 0;
-    }
-    head_ = 1;
-  }
-  virtual bool Next(void) {
-    out_.num_batch_padd = 0;
-
-    // skip read if in head version
-    if (param_.test_skipread != 0 && head_ == 0)
-        return true;
-    else
-        this->head_ = 0;
-
-    // if overflow from previous round, directly return false, until before first is called
-    if (num_overflow_ != 0) return false;
-    index_t top = 0;
-
-    while (base_->Next()) {
-      const DataInst& d = base_->Value();
-      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
-      out_.inst_index[top] = d.index;
-      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
-
-      if (++ top >= param_.batch_size) {
-        out_.data[0] = TBlob(data);
-        out_.data[1] = TBlob(label);
-        return true;
-      }
-    }
-    if (top != 0) {
-      if (param_.round_batch != 0) {
-        num_overflow_ = 0;
-        base_->BeforeFirst();
-        for (; top < param_.batch_size; ++top, ++num_overflow_) {
-          CHECK(base_->Next()) << "number of input must be bigger than batch size";
-          const DataInst& d = base_->Value();
-          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
-          out_.inst_index[top] = d.index;
-          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
-        }
-        out_.num_batch_padd = num_overflow_;
-      } else {
-        out_.num_batch_padd = param_.batch_size - top;
-      }
-      out_.data[0] = TBlob(data);
-      out_.data[1] = TBlob(label);
-      return true;
-    }
-    return false;
-  }
-  virtual const DataBatch &Value(void) const {
-    CHECK(head_ == 0) << "must call Next to get value";
-    return out_;
-  }
-
- private:
-  /*! \brief batch parameters */
-  BatchParam param_;
-  /*! \brief base iterator */
-  IIterator<DataInst> *base_;
-  /*! \brief output data */
-  DataBatch out_;
-  /*! \brief on first */
-  int head_;
-  /*! \brief number of overflow instances that readed in round_batch mode */
-  int num_overflow_;
-  /*! \brief label information of the data*/
-  mshadow::Tensor<mshadow::cpu, 2> label;
-  /*! \brief content of dense data, if this DataBatch is dense */
-  mshadow::Tensor<mshadow::cpu, 4> data;
-  /*! \brief data shape */
-  mshadow::Shape<4> data_shape_;
-  // Functions that allocate and free tensor space
-  inline void AllocSpaceDense(bool pad = false) {
-    data = mshadow::NewTensor<mshadow::cpu>(data_shape_, 0.0f, pad);
-    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
-    label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
-    out_.inst_index = new unsigned[param_.batch_size];
-    out_.batch_size = param_.batch_size;
-    out_.data.resize(2);
-  }
-  /*! \brief auxiliary function to free space, if needed, dense only */
-  inline void FreeSpaceDense(void) {
-    if (label.dptr_ != NULL) {
-      delete [] out_.inst_index;
-      mshadow::FreeSpace(&label);
-      mshadow::FreeSpace(&data);
-      label.dptr_ = NULL;
-    }
-  }
-};  // class BatchAdaptIter
-}  // namespace io
-}  // namespace mxnet
-#endif  // MXNET_IO_ITER_BATCH_H_
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 701c28deb4c9..bbcaf4fac19f 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -4,6 +4,8 @@
  * \brief recordio data
 iterator
  */
+#include <mxnet/io.h>
+#include <mxnet/narray.h>
 #include <dmlc/base.h>
 #include <dmlc/io.h>
 #include <dmlc/omp.h>
@@ -412,11 +414,11 @@ class ImageRecordIter : public IIterator<DataInst> {
 };
 DMLC_REGISTER_PARAMETER(ImageRecParserParam);
 DMLC_REGISTER_PARAMETER(ImageRecordParam);
-MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, BatchAdaptIter)
+MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, PrefetcherIter)
     .describe("Create iterator for dataset packed in recordio.")
     .add_arguments(ImageRecordParam::__FIELDS__())
     .add_arguments(ImageRecParserParam::__FIELDS__())
-    .add_arguments(BatchParam::__FIELDS__())
+    .add_arguments(PrefetcherParam::__FIELDS__())
     .add_arguments(ImageAugmentParam::__FIELDS__());
 }  // namespace io
 }  // namespace mxnet
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
index 77ac3a479f75..b60ec6b524ac 100644
--- a/src/io/iter_mnist.cc
+++ b/src/io/iter_mnist.cc
@@ -2,10 +2,10 @@
  * Copyright (c) 2015 by Contributors
  * \file iter_mnist.cc
  * \brief register mnist iterator
- * \author Tianjun Xiao
 */
 #include <mxnet/io.h>
 #include <mxnet/base.h>
+#include <mxnet/narray.h>
 #include <dmlc/io.h>
 #include <dmlc/logging.h>
 #include <dmlc/parameter.h>
@@ -100,10 +100,10 @@ class MNISTIter: public IIterator<DataBatch> {
       batch_data_.dptr_ = img_[loc_].dptr_;
       batch_label_.dptr_ = &labels_[loc_];
       if (param_.flat)
-          out_.data[0] = TBlob(batch_data_.FlatTo2D());
+          out_.data[0] = NArray(TBlob(batch_data_.FlatTo2D()), 0);
       else
-          out_.data[0] = TBlob(batch_data_);
-      out_.data[1] = TBlob(batch_label_);
+          out_.data[0] = NArray(TBlob(batch_data_), 0);
+      out_.data[1] = NArray(TBlob(batch_label_), 0);
       out_.inst_index = &inst_[loc_];
       loc_ += param_.batch_size;
       return true;
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index da3f49a2c28c..16070a703bd0 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -8,11 +8,14 @@
 
 #include <mxnet/io.h>
 #include <mxnet/base.h>
+#include <mxnet/narray.h>
 #include <dmlc/logging.h>
+#include <dmlc/threadediter.h>
 #include <mshadow/tensor.h>
 #include <utility>
 #include <string>
 #include <vector>
+#include <queue>
 
 namespace mxnet {
 namespace io {
@@ -51,27 +54,30 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
 };
 
 /*! \brief create a batch iterator from single instance iterator */
-class BatchLoader: public IIterator<DataBatch> {
+class BatchLoader {
  public:
   explicit BatchLoader(IIterator<DataInst> *base): base_(base), num_overflow_(0) {}
   virtual ~BatchLoader(void) {
     delete base_;
-    FreeSpaceDense();
   }
-  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+  inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     // init batch param, it could have similar param with
     kwargs_left = param_.InitAllowUnknown(kwargs);
     // init base iterator
     base_->Init(kwargs);
-    data_shape_[1] = param_.input_shape[0];
-    data_shape_[2] = param_.input_shape[1];
-    data_shape_[3] = param_.input_shape[2];
-    data_shape_[0] = param_.batch_size;
-    label_shape_[1] = param_.label_width;
-    label_shape_[0] = param_.batch_size;
+    std::vector<size_t> data_shape_vec;
+    data_shape_vec.push_back(param_.batch_size);
+    data_shape_vec.push_back(param_.input_shape[0]);
+    data_shape_vec.push_back(param_.input_shape[1]);
+    data_shape_vec.push_back(param_.input_shape[2]);
+    data_shape_ = TShape(data_shape_vec.begin(), data_shape_vec.end());
+    std::vector<size_t> label_shape_vec;
+    label_shape_vec.push_back(param_.batch_size);
+    label_shape_vec.push_back(param_.label_width);
+    label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end());
   }
-  virtual void BeforeFirst(void) {
+  inline void BeforeFirst(void) {
     if (param_.round_batch == 0 || num_overflow_ == 0) {
       // otherise, we already called before first
       base_->BeforeFirst();
@@ -80,8 +86,8 @@ class BatchLoader: public IIterator<DataBatch> {
     }
     head_ = 1;
   }
-  virtual bool Next(void) {
-    out_.num_batch_padd = 0;
+  inline bool LoadNext(DataBatch* out) {
+    out->num_batch_padd = 0;
 
     // skip read if in head version
     if (param_.test_skipread != 0 && head_ == 0)
@@ -95,13 +101,13 @@ class BatchLoader: public IIterator<DataBatch> {
 
     while (base_->Next()) {
       const DataInst& d = base_->Value();
-      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
-      out_.inst_index[top] = d.index;
-      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
+      out->inst_index[top] = d.index;
+      mshadow::Copy(out->data[1].data().get<mshadow::cpu, 2, float>()[top],
+              d.data[1].get<mshadow::cpu, 1, float>());
+      mshadow::Copy(out->data[0].data().get<mshadow::cpu, 4, float>()[top],
+              d.data[0].get<mshadow::cpu, 3, float>());
 
       if (++ top >= param_.batch_size) {
-        out_.data[0] = TBlob(data);
-        out_.data[1] = TBlob(label);
         return true;
       }
     }
@@ -112,70 +118,42 @@ class BatchLoader: public IIterator<DataBatch> {
         for (; top < param_.batch_size; ++top, ++num_overflow_) {
           CHECK(base_->Next()) << "number of input must be bigger than batch size";
           const DataInst& d = base_->Value();
-          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
-          out_.inst_index[top] = d.index;
-          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
+          out->inst_index[top] = d.index;
+          mshadow::Copy(out->data[1].data().get<mshadow::cpu, 2, float>()[top],
+                  d.data[1].get<mshadow::cpu, 1, float>());
+          mshadow::Copy(out->data[0].data().get<mshadow::cpu, 4, float>()[top],
+                  d.data[0].get<mshadow::cpu, 3, float>());
         }
-        out_.num_batch_padd = num_overflow_;
+        out->num_batch_padd = num_overflow_;
       } else {
-        out_.num_batch_padd = param_.batch_size - top;
+        out->num_batch_padd = param_.batch_size - top;
       }
-      out_.data[0] = TBlob(data);
-      out_.data[1] = TBlob(label);
       return true;
     }
     return false;
   }
-  virtual const DataBatch &Value(void) const {
-    CHECK(head_ == 0) << "must call Next to get value";
-    return out_;
-  }
 
  private:
   /*! \brief batch parameters */
   BatchParam param_;
   /*! \brief base iterator */
   IIterator<DataInst> *base_;
-  /*! \brief output data */
-  DataBatch out_;
   /*! \brief on first */
   int head_;
   /*! \brief number of overflow instances that readed in round_batch mode */
   int num_overflow_;
-  /*! \brief label information of the data*/
-  mshadow::Tensor<mshadow::cpu, 2> label;
-  /*! \brief content of dense data, if this DataBatch is dense */
-  mshadow::Tensor<mshadow::cpu, 4> data;
-  /*! \brief data shape */
-  mshadow::Shape<4> data_shape_;
   /*! \brief data shape */
-  mshadow::Shape<2> label_shape_;
-  // Functions that allocate and free tensor space
-  inline void AllocSpaceDense(bool pad = false) {
-    data = mshadow::NewTensor<mshadow::cpu>(data_shape_, 0.0f, pad);
-    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
-    label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
-    out_.inst_index = new unsigned[param_.batch_size];
-    out_.batch_size = param_.batch_size;
-    out_.data.resize(2);
-  }
-  /*! \brief auxiliary function to free space, if needed, dense only */
-  inline void FreeSpaceDense(void) {
-    if (label.dptr_ != NULL) {
-      delete [] out_.inst_index;
-      mshadow::FreeSpace(&label);
-      mshadow::FreeSpace(&data);
-      label.dptr_ = NULL;
-    }
-  }
-};  // class BatchAdaptIter
-
-    
+  TShape data_shape_;
+  /*! \brief label shape */
+  TShape label_shape_;
+};  // class BatchLoader
     
 // Define prefetcher parameters
 struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
   /*! \brief number of prefetched batches */
-  int capacity;
+  size_t capacity;
+  /*! \brief batch size */
+  index_t batch_size;
   /*! \brief input shape */
   TShape input_shape;
   /*! \brief label width */
@@ -206,18 +184,27 @@ class PrefetcherIter : public IIterator<DataInst> {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     // init image rec param
     kwargs_left = param_.InitAllowUnknown(kwargs);
-    // use the kwarg to init parser
-    parser_.Init(kwargs);
+    // use the kwarg to init batch loader
+    loader_.Init(kwargs);
+    std::vector<size_t> data_shape_vec;
+    data_shape_vec.push_back(param_.batch_size);
+    data_shape_vec.push_back(param_.input_shape[0]);
+    data_shape_vec.push_back(param_.input_shape[1]);
+    data_shape_vec.push_back(param_.input_shape[2]);
+    data_shape_ = TShape(data_shape_vec.begin(),data_shape_vec.end());
+    std::vector<size_t> label_shape_vec;
+    label_shape_vec.push_back(param_.batch_size);
+    label_shape_vec.push_back(param_.label_width);
+    label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end());
     // init thread iter
     iter_.set_max_capacity(param_.capacity);
     iter_.Init([this](DataBatch **dptr) {
         if (*dptr == NULL) {
           *dptr = new DataBatch();
           // init NArrays
-          // TODO: currectly use defalt context
           Context ctx; 
-          *dptr->data.push_back(NArray(TShape(param_.input_shape), ctx, true));
-          *dptr->data.push_back(NArray(TShape(param_.label_shape), ctx, true));
+          (*dptr)->data.push_back(NArray(data_shape_, ctx, true));
+          (*dptr)->data.push_back(NArray(label_shape_, ctx, true));
         }
         return loader_.LoadNext(*dptr);
       },
@@ -240,6 +227,7 @@ class PrefetcherIter : public IIterator<DataInst> {
      DataBatch* next_batch = NULL;
      if (!iter_.Next(&next_batch)) return false;
      out_.data.clear();
+     // copy the batch
      for (size_t i = 0; i < next_batch->data.size(); i++) {
          out_.data.push_back(Copy(next_batch->data[i], next_batch->data[i].ctx()));
      }
@@ -249,6 +237,8 @@ class PrefetcherIter : public IIterator<DataInst> {
      for (size_t i = 0; i < out_.data.size(); i++) {
          next_batch_narrays.push_back(&out.data[i]);
      }
+     ready_narrays_.push_back(next_batch_narrays);
+     return true;
   }
   virtual const DataInst &Value(void) const {
     return out_;
@@ -263,10 +253,15 @@ class PrefetcherIter : public IIterator<DataInst> {
   std::queue<std::vector<NArray*> > ready_narrays_;
   /*! \brief queue to hold the NArrays for check whether writable */
   std::queue<DataBatch*> ready_batches_;
-  // internal parser
+  // internal batch loader
   BatchLoader loader_;
   // backend thread
   dmlc::ThreadedIter<DataBatch> iter_;
+  /*! \brief data shape */
+  TShape data_shape_;
+  /*! \brief label shape */
+  TShape label_shape_;
 };
 }  // namespace io
 }  // namespace mxnet
+#endif  // MXNET_IO_ITER_PREFETCHER_H_

From 297396cb30f1219a88cff139a5f8e688e6c268d2 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Tue, 15 Sep 2015 18:02:37 +0800
Subject: [PATCH 03/13] modify bug in getnarray

---
 example/mnist/mlp_gpu.py | 14 --------------
 python/mxnet/io.py       |  1 -
 python/mxnet/ndarray.py  |  1 -
 src/c_api.cc             |  6 ++----
 4 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/example/mnist/mlp_gpu.py b/example/mnist/mlp_gpu.py
index 3174e563de00..a801476decfe 100644
--- a/example/mnist/mlp_gpu.py
+++ b/example/mnist/mlp_gpu.py
@@ -90,21 +90,7 @@ def test_mlp():
         train_nbatch = 0
         val_nbatch = 0
         for data, label in train_dataiter:
-            print tmp_label.shape
-            print label.asnumpy().shape
-            print 'xx'
-            tt = label.asnumpy()
-            print 'tt'
-            ss = label.asnumpy()
-            print label.handle
-            print label.asnumpy()[0:5]
-            print 'ccc'
-            label = label.asnumpy()
-            print 'aaaa'
-            exit(1)
             label = label.asnumpy().reshape(tmp_label.shape)
-
-            
             tmp_label[:] = label
             inputs["data"][:] = data
             inputs["sm_label"][:] = tmp_label
diff --git a/python/mxnet/io.py b/python/mxnet/io.py
index eefc1da04250..aff8e1c8cb00 100644
--- a/python/mxnet/io.py
+++ b/python/mxnet/io.py
@@ -81,7 +81,6 @@ def getlabel(self):
         """
         hdl = NDArrayHandle()
         check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl)))
-        print 'get', hdl
         return NDArray(hdl)
 
 def _make_io_iterator(handle):
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index fc997a987349..5b9323639298 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -63,7 +63,6 @@ def __init__(self, handle):
         self.handle = handle
 
     def __del__(self):
-        print 'del', self.handle
         check_call(_LIB.MXNDArrayFree(self.handle))
 
     def __add__(self, other):
diff --git a/src/c_api.cc b/src/c_api.cc
index dc950cb144c2..251fd1b3172a 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -331,8 +331,6 @@ int MXNDArrayWaitAll() {
 
 int MXNDArrayFree(NDArrayHandle handle) {
   API_BEGIN();
-  std::cout << handle << std::endl;
-  std::cout << static_cast<NDArray*>(handle)->shape().Size() << std::endl;
   delete static_cast<NDArray*>(handle);
   API_END();
 }
@@ -826,14 +824,14 @@ int MXDataIterNext(DataIterHandle handle, int *out) {
 int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle *out) {
   API_BEGIN();
   DataBatch db = static_cast<IIterator<DataBatch>* >(handle)->Value();
-  *out = &db.data[1];
+  *out = new NDArray(db.data[1].data(), 0);
   API_END();
 }
 
 int MXDataIterGetData(DataIterHandle handle, NDArrayHandle *out) {
   API_BEGIN();
   DataBatch db = static_cast<IIterator<DataBatch>* >(handle)->Value();
-  *out = &db.data[0];
+  *out = new NDArray(db.data[0].data(), 0);
   API_END();
 }
 

From e4e8f9dc305cc18b9307cb3b06a03322c836d761 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Wed, 16 Sep 2015 16:37:02 +0800
Subject: [PATCH 04/13] prefetcher can work but has uncertainty

---
 example/cifar10/cifar10.py       |  8 +++++--
 include/mxnet/io.h               |  7 +++++++
 python/mxnet/io.py               |  4 ++--
 python/mxnet/ndarray.py          |  7 ++++++-
 src/c_api.cc                     |  8 +++++--
 src/io/iter_image_recordio.cc    |  7 +++----
 src/io/iter_mnist.cc             |  9 +++-----
 src/io/iter_prefetcher.h         | 31 +++++++++++----------------
 tests/python/unittest/test_io.py | 36 +++++++++++++++-----------------
 9 files changed, 62 insertions(+), 55 deletions(-)

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index ce8aa2c8823e..02cd5a4b07f0 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -200,17 +200,21 @@ def Update(grad, weight, mom):
         mean_img="data/cifar/cifar_mean.bin",
         rand_crop=True,
         rand_mirror=True,
+        shuffle=False,
         input_shape=(3,28,28),
         batch_size=batch_size,
-        nthread=1)
+        nthread=1,
+        capacity=4)
 test_dataiter = mx.io.ImageRecordIter(
         path_imgrec="data/cifar/test.rec",
         mean_img="data/cifar/cifar_mean.bin",
         rand_crop=False,
         rand_mirror=False,
+        shuffle=False,
         input_shape=(3,28,28),
         batch_size=batch_size,
-        nthread=1)
+        nthread=1,
+        capacity=4)
 
 
 def progress(count, total, epoch, toc):
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 056bcc3f4f74..6b049eef4222 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -11,6 +11,7 @@
 #include <vector>
 #include <string>
 #include <utility>
+#include <queue>
 #include "./base.h"
 #include "./ndarray.h"
 
@@ -79,6 +80,12 @@ struct DataBatch {
     inst_index = NULL;
     batch_size = 0; num_batch_padd = 0;
   }
+  /*! \brief destructor, releases the instance index buffer */
+  ~DataBatch() {
+    // inst_index is allocated with new[] (see PrefetcherIter), so it must
+    // be released with delete[]; delete[] on NULL is a no-op.
+    delete[] inst_index;
+  }
   /*! \brief giving name to the data */
   void Naming(std::vector<std::string> names);
 };  // struct DataBatch
diff --git a/python/mxnet/io.py b/python/mxnet/io.py
index aff8e1c8cb00..62e92bd020d5 100644
--- a/python/mxnet/io.py
+++ b/python/mxnet/io.py
@@ -73,7 +73,7 @@ def getdata(self):
         """
         hdl = NDArrayHandle()
         check_call(_LIB.MXDataIterGetData(self.handle, ctypes.byref(hdl)))
-        return NDArray(hdl)
+        return NDArray(hdl, False)
 
     def getlabel(self):
         """get label from batch
@@ -81,7 +81,7 @@ def getlabel(self):
         """
         hdl = NDArrayHandle()
         check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl)))
-        return NDArray(hdl)
+        return NDArray(hdl, False)
 
 def _make_io_iterator(handle):
     """Create an io iterator by handle."""
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index 5b9323639298..83cb76edb560 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -51,7 +51,7 @@ class NDArray(object):
     NDArray is basic ndarray/Tensor like data structure in mxnet.
     """
     # pylint: disable= no-member
-    def __init__(self, handle):
+    def __init__(self, handle, writable=True):
         """initialize a new NDArray
 
         Parameters
@@ -61,6 +61,7 @@ def __init__(self, handle):
         """
         assert isinstance(handle, NDArrayHandle)
         self.handle = handle
+        self.writable = writable
 
     def __del__(self):
         check_call(_LIB.MXNDArrayFree(self.handle))
@@ -547,6 +548,8 @@ def binary_ndarray_function(lhs, rhs, out=None):
         if out:
             if isinstance(out, NDArray) == False:
                 raise TypeError('out must be NDArray')
+            if out.writable == True:
+                raise TypeError('out must be writable')
         else:
             if not accept_empty_mutate:
                 raise TypeError('argument out is required to call %s' % func_name)
@@ -562,6 +565,8 @@ def unary_ndarray_function(src, out=None):
         if out:
             if isinstance(out, NDArray) == False:
                 raise TypeError('out must be NDArray')
+            if out.writable == True:
+                raise TypeError('out must be writable')
         else:
             if not accept_empty_mutate:
                 raise TypeError('argument out is required to call %s' % func_name)
diff --git a/src/c_api.cc b/src/c_api.cc
index 251fd1b3172a..7251652d5d2f 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -824,14 +824,18 @@ int MXDataIterNext(DataIterHandle handle, int *out) {
 int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle *out) {
   API_BEGIN();
   DataBatch db = static_cast<IIterator<DataBatch>* >(handle)->Value();
-  *out = new NDArray(db.data[1].data(), 0);
+  NDArray* pndarray = new NDArray();
+  *pndarray = db.data[1];
+  *out = pndarray;
   API_END();
 }
 
 int MXDataIterGetData(DataIterHandle handle, NDArrayHandle *out) {
   API_BEGIN();
   DataBatch db = static_cast<IIterator<DataBatch>* >(handle)->Value();
-  *out = new NDArray(db.data[0].data(), 0);
+  NDArray* pndarray = new NDArray();
+  *pndarray = db.data[0];
+  *out = pndarray;
   API_END();
 }
 
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index b73d81aa6eb5..35c37e1fb887 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -300,7 +300,8 @@ class ImageRecordIter : public IIterator<DataInst> {
     // use the kwarg to init parser
     parser_.Init(kwargs);
     // init thread iter
-    iter_.set_max_capacity(4);
+    // TODO: Originally 4
+    iter_.set_max_capacity(1);
     iter_.Init([this](std::vector<InstVector> **dptr) {
         if (*dptr == NULL) {
           *dptr = new std::vector<InstVector>();
@@ -342,7 +343,7 @@ class ImageRecordIter : public IIterator<DataInst> {
           }
         }
         // shuffle instance order if needed
-        if (shuffle_ != 0) {
+        if (param_.shuffle != 0) {
             std::shuffle(inst_order_.begin(), inst_order_.end(), \
                     common::RANDOM_ENGINE(kRandMagic + param_.seed));
         }
@@ -394,8 +395,6 @@ class ImageRecordIter : public IIterator<DataInst> {
   static const int kRandMagic = 111;
   // output instance
   DataInst out_;
-  // whether shuffle data
-  int shuffle_;
   // data ptr
   size_t inst_ptr_;
   // internal instance order
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
index 5222748bb296..36d252e3d5f5 100644
--- a/src/io/iter_mnist.cc
+++ b/src/io/iter_mnist.cc
@@ -101,14 +101,11 @@ class MNISTIter: public IIterator<DataBatch> {
       batch_data_.dptr_ = img_[loc_].dptr_;
       batch_label_.dptr_ = &labels_[loc_];
       if (param_.flat) {
-          NDArray data_ndarray = NDArray(TBlob(batch_data_.FlatTo2D()), 0);
-          out_.data[0] = data_ndarray.Copy(data_ndarray.ctx());
+          out_.data[0] = NDArray(TBlob(batch_data_.FlatTo2D()), 0);
       } else {
-          NDArray data_ndarray = NDArray(TBlob(batch_data_), 0);
-          out_.data[0] = data_ndarray.Copy(data_ndarray.ctx());
+          out_.data[0] = NDArray(TBlob(batch_data_), 0);
       }
-      NDArray label_ndarray = NDArray(TBlob(batch_label_), 0);
-      out_.data[1] = label_ndarray.Copy(label_ndarray.ctx());
+      out_.data[1] = NDArray(TBlob(batch_label_), 0);
       out_.inst_index = &inst_[loc_];
       loc_ += param_.batch_size;
       return true;
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index a5378f28090f..d3089de19d7b 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -106,9 +106,8 @@ class BatchLoader {
               d.data[1].get<mshadow::cpu, 1, float>());
       mshadow::Copy(out->data[0].data().get<mshadow::cpu, 4, float>()[top],
               d.data[0].get<mshadow::cpu, 3, float>());
-
       if (++ top >= param_.batch_size) {
-        return true;
+          return true;
       }
     }
     if (top != 0) {
@@ -152,7 +151,7 @@ class BatchLoader {
 struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
   /*! \brief number of prefetched batches */
   size_t capacity;
-    /*! \brief label width */
+  /*! \brief batch size */
   index_t batch_size;
   /*! \brief input shape */
   TShape input_shape;
@@ -160,6 +159,8 @@ struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
   index_t label_width;
   // declare parameters
   DMLC_DECLARE_PARAMETER(PrefetcherParam) {
+    DMLC_DECLARE_FIELD(batch_size)
+        .describe("Batch size.");
     DMLC_DECLARE_FIELD(capacity).set_default(1)
         .describe("Number of prefetched batches");
     index_t input_shape_default[] = {3, 224, 224};
@@ -202,9 +203,10 @@ class PrefetcherIter : public IIterator<DataBatch> {
         if (*dptr == NULL) {
           *dptr = new DataBatch();
           // init NDArrays
-          Context ctx; 
-          (*dptr)->data.push_back(NDArray(data_shape_, ctx, true));
-          (*dptr)->data.push_back(NDArray(label_shape_, ctx, true));
+          (*dptr)->inst_index = new unsigned[param_.batch_size]; 
+          Context ctx;
+          (*dptr)->data.push_back(NDArray(data_shape_, ctx, false));
+          (*dptr)->data.push_back(NDArray(label_shape_, ctx, false));
         }
         return loader_.LoadNext(*dptr);
       },
@@ -214,12 +216,10 @@ class PrefetcherIter : public IIterator<DataBatch> {
     iter_.BeforeFirst();
   }
   virtual bool Next(void) {
-     if (ready_narrays_.size() == param_.capacity) {
-         std::vector<NDArray*> old_narrays = ready_narrays_.front();
-         for (size_t i = 0; i < old_narrays.size(); i++) {
-             old_narrays[i]->WaitToWrite();
+     if (ready_batches_.size() == param_.capacity) {
+         for (size_t i = 0; i < out_.data.size(); i++) {
+             out_.data[i].WaitToWrite();
          }
-         ready_narrays_.pop();
          DataBatch* old_batch = ready_batches_.front();
          ready_batches_.pop();
          iter_.Recycle(&old_batch);
@@ -229,15 +229,10 @@ class PrefetcherIter : public IIterator<DataBatch> {
      out_.data.clear();
      // copy the batch
      for (size_t i = 0; i < next_batch->data.size(); i++) {
-         out_.data.push_back(next_batch->data[i].Copy(next_batch->data[i].ctx()));
+         out_.data.push_back(next_batch->data[i]);
      }
      // push the narrays and batch into the queue
      ready_batches_.push(next_batch);
-     std::vector<NDArray*> next_batch_narrays;
-     for (size_t i = 0; i < out_.data.size(); i++) {
-         next_batch_narrays.push_back(&out_.data[i]);
-     }
-     ready_narrays_.push(next_batch_narrays);
      return true;
   }
   virtual const DataBatch &Value(void) const {
@@ -249,8 +244,6 @@ class PrefetcherIter : public IIterator<DataBatch> {
   /*! \brief output data */
   DataBatch out_;
   /*! \brief queue to hold the NDArrays for check whether writable */
-  std::queue<std::vector<NDArray*> > ready_narrays_;
-  /*! \brief queue to hold the NDArrays for check whether writable */
   std::queue<DataBatch*> ready_batches_;
   // internal batch loader
   BatchLoader loader_;
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index e606f9254b5a..253c14374b28 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -3,8 +3,10 @@
 import numpy as np
 import os, gzip
 import pickle as pickle
+import time
+import sys
 from common import get_data
-#from PIL import Image
+from PIL import Image
 
 
 def test_MNISTIter():
@@ -69,38 +71,34 @@ def test_ImageRecIter():
         nplabel = label.numpy
         for i in range(nplabel.shape[0]):
             labelcount[int(nplabel[i])] += 1
+'''
 
 def test_Cifar10Rec():
     dataiter = mx.io.ImageRecordIter(
-            path_imgrec="data/cifar/test.rec",
-            mean_img="data/cifar/cifar10_mean.bin",
-            rand_crop=True,
-            rand_mirror=True,
+            path_imgrec="data/cifar/train.rec",
+            mean_img="data/cifar/cifar10_mean_1.bin",
+            rand_crop=False,
+            rand_mirror=False,
+            shuffle=False,
             input_shape=(3,28,28),
             batch_size=100,
-            nthread=1)
+            nthread=1,
+            capacity=1)
     labelcount = [0 for i in range(10)] 
     batchcount = 0
+    
     for data, label in dataiter:
-        npdata = data.numpy
-        print npdata[0,:,:,:]
-        imgdata = np.zeros([28, 28, 3], dtype=np.uint8)
-        imgdata[:,:,0] = npdata[0,2,:,:]
-        imgdata[:,:,1] = npdata[0,1,:,:]
-        imgdata[:,:,2] = npdata[0,0,:,:]
-        img = Image.fromarray(imgdata)
-        imgpath = "data/cifar/test.jpg"
-        img.save(imgpath, format='JPEG')
-        exit(0)
+        npdata = data.asnumpy().flatten().sum()
         print "Batch: ", batchcount
         sys.stdout.flush()
         batchcount += 1
-        nplabel = label.numpy
+        nplabel = label.asnumpy()
         for i in range(nplabel.shape[0]):
             labelcount[int(nplabel[i])] += 1
     for i in range(10):
         assert(labelcount[i] == 1000)
-'''
 
 if __name__ == "__main__":
-    test_MNISTIter()
+    CheckEqual()
+    #test_Cifar10Rec()
+

From c91e0db44eaf1845f6329a29aef9c6f5da6e66d4 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Thu, 17 Sep 2015 09:45:38 +0800
Subject: [PATCH 05/13] make

---
 example/cifar10/cifar10.py       |  4 +--
 python/mxnet/ndarray.py          |  4 +--
 src/c_api.cc                     |  4 +--
 src/io/image_augmenter.h         | 30 ++++------------
 src/io/inst_vector.h             |  4 +++
 src/io/iter_image_recordio.cc    | 61 ++++++++++++++++++++++++--------
 src/io/iter_mnist.cc             | 16 +++++----
 src/io/iter_prefetcher.h         | 10 +++---
 tests/python/unittest/test_io.py | 51 +++++---------------------
 9 files changed, 87 insertions(+), 97 deletions(-)

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index 02cd5a4b07f0..2319c1347ec9 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -203,7 +203,7 @@ def Update(grad, weight, mom):
         shuffle=False,
         input_shape=(3,28,28),
         batch_size=batch_size,
-        nthread=1,
+        nthread=4,
         capacity=4)
 test_dataiter = mx.io.ImageRecordIter(
         path_imgrec="data/cifar/test.rec",
@@ -213,7 +213,7 @@ def Update(grad, weight, mom):
         shuffle=False,
         input_shape=(3,28,28),
         batch_size=batch_size,
-        nthread=1,
+        nthread=4,
         capacity=4)
 
 
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index 83cb76edb560..0e3fa328593c 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -548,7 +548,7 @@ def binary_ndarray_function(lhs, rhs, out=None):
         if out:
             if isinstance(out, NDArray) == False:
                 raise TypeError('out must be NDArray')
-            if out.writable == True:
+            if out.writable == False:
                 raise TypeError('out must be writable')
         else:
             if not accept_empty_mutate:
@@ -565,7 +565,7 @@ def unary_ndarray_function(src, out=None):
         if out:
             if isinstance(out, NDArray) == False:
                 raise TypeError('out must be NDArray')
-            if out.writable == True:
+            if out.writable == False:
                 raise TypeError('out must be writable')
         else:
             if not accept_empty_mutate:
diff --git a/src/c_api.cc b/src/c_api.cc
index 7251652d5d2f..6f1d5130982f 100644
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -823,7 +823,7 @@ int MXDataIterNext(DataIterHandle handle, int *out) {
 
 int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle *out) {
   API_BEGIN();
-  DataBatch db = static_cast<IIterator<DataBatch>* >(handle)->Value();
+  const DataBatch& db = static_cast<IIterator<DataBatch>* >(handle)->Value();
   NDArray* pndarray = new NDArray();
   *pndarray = db.data[1];
   *out = pndarray;
@@ -832,7 +832,7 @@ int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle *out) {
 
 int MXDataIterGetData(DataIterHandle handle, NDArrayHandle *out) {
   API_BEGIN();
-  DataBatch db = static_cast<IIterator<DataBatch>* >(handle)->Value();
+  const DataBatch& db = static_cast<IIterator<DataBatch>* >(handle)->Value();
   NDArray* pndarray = new NDArray();
   *pndarray = db.data[0];
   *out = pndarray;
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index 5dc00b585d9d..31560ca23167 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -121,6 +121,8 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
         .describe("Maximum ratio of contrast variation");
     DMLC_DECLARE_FIELD(max_random_illumination).set_default(0.0f)
         .describe("Maximum value of illumination variation");
+    DMLC_DECLARE_FIELD(silent).set_default(true)
+        .describe("Whether to print augmentor info");
   }
 };
 
@@ -173,6 +175,7 @@ class ImageAugmenter {
    */
   virtual cv::Mat OpencvProcess(const cv::Mat &src,
                           common::RANDOM_ENGINE *prnd) {
+    if (!NeedOpencvProcess()) return src;
     // shear
     float s = NextDouble(prnd) * param_.max_shear_ratio * 2 - param_.max_shear_ratio;
     // rotate
@@ -276,7 +279,8 @@ class ImageAugmenter {
     return tmpres_;
   }
 
-  void TensorProcess(mshadow::TensorContainer<cpu, 3> *p_data,
+  void TensorProcess(mshadow::Tensor<cpu, 3> *p_data,
+                     mshadow::TensorContainer<cpu, 3> *dst_data,
                        common::RANDOM_ENGINE *prnd) {
     // Check Newly Created mean image
     if (meanfile_ready_ == false && param_.mean_img.length() != 0) {
@@ -290,7 +294,7 @@ class ImageAugmenter {
         meanfile_ready_ = true;
       }
     }
-    img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2]));
+    img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2])); 
     if (param_.input_shape[1] == 1) {
       img_ = (*p_data) * param_.scale;
     } else {
@@ -354,27 +358,7 @@ class ImageAugmenter {
         }
       }
     }
-    (*p_data) = img_;
-  }
-
-  virtual void Process(unsigned char *dptr, size_t sz,
-                       mshadow::TensorContainer<cpu, 3> *p_data,
-                       common::RANDOM_ENGINE *prnd) {
-    cv::Mat buf(1, sz, CV_8U, dptr);
-    cv::Mat res = cv::imdecode(buf, 1);
-    if (NeedOpencvProcess())
-        res = this->OpencvProcess(res, prnd);
-    p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
-    for (index_t i = 0; i < p_data->size(1); ++i) {
-      for (index_t j = 0; j < p_data->size(2); ++j) {
-        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
-        (*p_data)[0][i][j] = bgr[2];
-        (*p_data)[1][i][j] = bgr[1];
-        (*p_data)[2][i][j] = bgr[0];
-      }
-    }
-    res.release();
-    this->TensorProcess(p_data, prnd);
+    (*dst_data) = img_;
   }
 
  private:
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index ed560fc2b5da..994911bdcdab 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -72,6 +72,10 @@ class InstVector {
   inline size_t Size(void) const {
     return index_.size();
   }
+  // get index
+  inline unsigned Index(unsigned i) const {
+    return index_[i];
+  }
   // instance
   inline DataInst operator[](size_t i) const {
     DataInst inst;
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 35c37e1fb887..f8b033ed6142 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -194,7 +194,6 @@ inline void ImageRecordIOParser::Init(
     augmenters_[i]->Init(kwargs);
     prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic));
   }
-
   // handling for hadoop
   const char *ps_rank = getenv("PS_RANK");
   if (ps_rank != NULL) {
@@ -227,7 +226,9 @@ ParseNext(std::vector<InstVector> *out_vec) {
   CHECK(source_ != NULL);
   dmlc::InputSplit::Blob chunk;
   if (!source_->NextChunk(&chunk)) return false;
-  out_vec->resize(param_.nthread);
+  // save opencv out
+  std::vector<InstVector> * opencv_out_vec = new std::vector<InstVector>();
+  opencv_out_vec->resize(param_.nthread);
   #pragma omp parallel num_threads(param_.nthread)
   {
     CHECK(omp_get_num_threads() == param_.nthread);
@@ -236,26 +237,59 @@ ParseNext(std::vector<InstVector> *out_vec) {
     ImageRecordIO rec;
     dmlc::InputSplit::Blob blob;
     // image data
-    InstVector &out = (*out_vec)[tid];
-    out.Clear();
+    InstVector &opencv_out = (*opencv_out_vec)[tid];
+    opencv_out.Clear();
     while (reader.NextRecord(&blob)) {
+      // Opencv decode and augments
+      cv::Mat res;
       rec.Load(blob.dptr, blob.size);
-      out.Push(static_cast<unsigned>(rec.image_index()),
+      cv::Mat buf(1, rec.content_size, CV_8U, rec.content);
+      res = cv::imdecode(buf, 1);
+      res = augmenters_[tid]->OpencvProcess(res, prnds_[tid]);
+      opencv_out.Push(static_cast<unsigned>(rec.image_index()),
+               mshadow::Shape3(3, res.rows, res.cols),
+               mshadow::Shape1(param_.label_width));
+      DataInst opencv_inst = opencv_out.Back();
+      mshadow::Tensor<mshadow::cpu, 3> opencv_data = opencv_inst.data[0].get<mshadow::cpu, 3, float>();
+      mshadow::Tensor<mshadow::cpu, 1> opencv_label = opencv_inst.data[1].get<mshadow::cpu, 1, float>();
+      for (int i = 0; i < res.rows; ++i) {
+        for (int j = 0; j < res.cols; ++j) {
+          cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+          opencv_data[0][i][j] = bgr[2];
+          opencv_data[1][i][j] = bgr[1];
+          opencv_data[2][i][j] = bgr[0];
+        }
+      }
+      if (label_map_ != NULL) {
+        mshadow::Copy(opencv_label, label_map_->Find(rec.image_index()));
+      } else {
+        opencv_label[0] = rec.header.label;
+      }
+      res.release();
+    }
+  }
+  // Tensor Op is not thread safe, so call outside of omp
+  out_vec->resize(param_.nthread);
+  for (size_t i = 0; i < opencv_out_vec->size(); i++) {
+    InstVector &out = (*out_vec)[i];
+    InstVector &opencv_out = (*opencv_out_vec)[i];
+    out.Clear();
+    for (size_t j = 0; j < opencv_out.Size(); j++) {
+      out.Push(opencv_out.Index(j),
                mshadow::Shape3(param_.input_shape[0], param_.input_shape[1], param_.input_shape[2]),
                mshadow::Shape1(param_.label_width));
       DataInst inst = out.Back();
-      // turn datainst into tensor
+      DataInst opencv_inst = opencv_out[j]; 
+      mshadow::Tensor<mshadow::cpu, 3> opencv_data = opencv_inst.data[0].get<mshadow::cpu, 3, float>();
+      mshadow::Tensor<mshadow::cpu, 1> opencv_label = opencv_inst.data[1].get<mshadow::cpu, 1, float>();
       mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>();
       mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>();
-      augmenters_[tid]->Process(rec.content, rec.content_size, &img_, prnds_[tid]);
+      augmenters_[i]->TensorProcess(&opencv_data, &img_, prnds_[i]);
       mshadow::Copy(data, img_);
-      if (label_map_ != NULL) {
-        mshadow::Copy(label, label_map_->Find(rec.image_index()));
-      } else {
-        label[0] = rec.header.label;
-      }
+      mshadow::Copy(label, opencv_label);
     }
   }
+  delete opencv_out_vec; 
   return true;
 }
 
@@ -300,8 +334,7 @@ class ImageRecordIter : public IIterator<DataInst> {
     // use the kwarg to init parser
     parser_.Init(kwargs);
     // init thread iter
-    // TODO: Originally 4
-    iter_.set_max_capacity(1);
+    iter_.set_max_capacity(4);
     iter_.Init([this](std::vector<InstVector> **dptr) {
         if (*dptr == NULL) {
           *dptr = new std::vector<InstVector>();
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
index 36d252e3d5f5..60002f5470ed 100644
--- a/src/io/iter_mnist.cc
+++ b/src/io/iter_mnist.cc
@@ -74,8 +74,7 @@ class MNISTIter: public IIterator<DataBatch> {
     } else {
       batch_data_.shape_ = mshadow::Shape4(param_.batch_size, 1, img_.size(1), img_.size(2));
     }
-    out_.inst_index = NULL;
-    out_.data.resize(2);
+    out_.data.clear();
     batch_label_.shape_ = mshadow::Shape2(param_.batch_size, 1);
     batch_label_.stride_ = 1;
     batch_data_.stride_ = batch_data_.size(3);
@@ -98,15 +97,20 @@ class MNISTIter: public IIterator<DataBatch> {
   }
   virtual bool Next(void) {
     if (loc_ + param_.batch_size <= img_.size(0)) {
+      if (out_.data.size() == 2) {
+        for (size_t i = 0; i < out_.data.size(); i++) {
+             out_.data[i].WaitToWrite();
+         }
+      }
       batch_data_.dptr_ = img_[loc_].dptr_;
       batch_label_.dptr_ = &labels_[loc_];
+      out_.data.clear();
       if (param_.flat) {
-          out_.data[0] = NDArray(TBlob(batch_data_.FlatTo2D()), 0);
+          out_.data.push_back(NDArray(TBlob(batch_data_.FlatTo2D()), 0));
       } else {
-          out_.data[0] = NDArray(TBlob(batch_data_), 0);
+          out_.data.push_back(NDArray(TBlob(batch_data_), 0));
       }
-      out_.data[1] = NDArray(TBlob(batch_label_), 0);
-      out_.inst_index = &inst_[loc_];
+      out_.data.push_back(NDArray(TBlob(batch_label_), 0));
       loc_ += param_.batch_size;
       return true;
     } else {
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index d3089de19d7b..a976db1386d6 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -216,13 +216,13 @@ class PrefetcherIter : public IIterator<DataBatch> {
     iter_.BeforeFirst();
   }
   virtual bool Next(void) {
-     if (ready_batches_.size() == param_.capacity) {
-         for (size_t i = 0; i < out_.data.size(); i++) {
-             out_.data[i].WaitToWrite();
-         }
+     if (ready_batches_.size() != 0) {
          DataBatch* old_batch = ready_batches_.front();
-         ready_batches_.pop();
+         for (size_t i = 0; i < old_batch->data.size(); i++) {
+             old_batch->data[i].WaitToWrite();
+         }
          iter_.Recycle(&old_batch);
+         ready_batches_.pop();
      }
      DataBatch* next_batch = NULL;
      if (!iter_.Next(&next_batch)) return false;
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index 253c14374b28..2beff9f366ea 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -18,10 +18,6 @@ def test_MNISTIter():
             image="data/train-images-idx3-ubyte",
             label="data/train-labels-idx1-ubyte",
             batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
-    val_dataiter = mx.io.MNISTIter(
-            image="data/t10k-images-idx3-ubyte",
-            label="data/t10k-labels-idx1-ubyte",
-            batch_size=batch_size, shuffle=0, flat=1, silent=0)
     # test_loop
     nbatch = 60000 / batch_size
     batch_count = 0
@@ -41,49 +37,18 @@ def test_MNISTIter():
     label_1 = train_dataiter.getlabel().asnumpy().flatten()
     assert(sum(label_0 - label_1) == 0)
 
-'''
-def test_ImageRecIter():
-    dataiter = mx.io.ImageRecordIter(
-            path_imgrec="data/val_cxxnet.rec",
-            mean_img="data/smallset/image_net_mean.bin",
-            rand_crop=True,
-            mirror=True,
-            input_shape=(3,227,227),
-            batch_size=100,
-            nthread=1,
-            seed=10)
-    labelcount = [0 for i in range(1000)] 
-    batchcount = 0
-    for data, label in dataiter:
-        npdata = data.numpy
-        print npdata[0,:,:,:]
-        imgdata = np.zeros([227, 227, 3], dtype=np.uint8)
-        imgdata[:,:,0] = npdata[10,2,:,:]
-        imgdata[:,:,1] = npdata[10,1,:,:]
-        imgdata[:,:,2] = npdata[10,0,:,:]
-        img = Image.fromarray(imgdata)
-        imgpath = "data/smallset/test_3.jpg"
-        img.save(imgpath, format='JPEG')
-        exit(0)
-        print batchcount
-        sys.stdout.flush()
-        batchcount += 1
-        nplabel = label.numpy
-        for i in range(nplabel.shape[0]):
-            labelcount[int(nplabel[i])] += 1
-'''
-
 def test_Cifar10Rec():
+    get_data.GetCifar10()
     dataiter = mx.io.ImageRecordIter(
             path_imgrec="data/cifar/train.rec",
-            mean_img="data/cifar/cifar10_mean_1.bin",
+            mean_img="data/cifar/cifar10_mean.bin",
             rand_crop=False,
-            rand_mirror=False,
+            and_mirror=False,
             shuffle=False,
             input_shape=(3,28,28),
             batch_size=100,
-            nthread=1,
-            capacity=1)
+            nthread=4,
+            capacity=6)
     labelcount = [0 for i in range(10)] 
     batchcount = 0
     
@@ -96,9 +61,9 @@ def test_Cifar10Rec():
         for i in range(nplabel.shape[0]):
             labelcount[int(nplabel[i])] += 1
     for i in range(10):
-        assert(labelcount[i] == 1000)
+        assert(labelcount[i] == 5000)
 
 if __name__ == "__main__":
-    CheckEqual()
-    #test_Cifar10Rec()
+    #test_MNISTIter()
+    test_Cifar10Rec()
 

From 1a820351d9ae6b7199e36d2450cf19332f4aaa0f Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Thu, 17 Sep 2015 12:36:19 +0800
Subject: [PATCH 06/13] prefetcher take iter

---
 example/cifar10/cifar10.py       |  4 +-
 example/mnist/mlp_gpu.py         |  2 +
 include/mxnet/io.h               | 22 +++++++++-
 src/io/iter_image_recordio.cc    |  8 ++--
 src/io/iter_mnist.cc             |  8 ++--
 src/io/iter_prefetcher.h         | 70 +++++++++++++++++++-------------
 tests/python/unittest/test_io.py | 11 ++---
 7 files changed, 82 insertions(+), 43 deletions(-)

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index 5fb5d93413be..95d9e69dbb15 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -205,7 +205,7 @@ def Update(grad, weight, mom):
         input_shape=(3,28,28),
         batch_size=batch_size,
         nthread=4,
-        capacity=4)
+        prefetch_capacity=4)
 test_dataiter = mx.io.ImageRecordIter(
         path_imgrec="data/cifar/test.rec",
         mean_img="data/cifar/cifar_mean.bin",
@@ -215,7 +215,7 @@ def Update(grad, weight, mom):
         input_shape=(3,28,28),
         batch_size=batch_size,
         nthread=4,
-        capacity=4)
+        prefetch_capacity=4)
 
 
 def progress(count, total, epoch, toc):
diff --git a/example/mnist/mlp_gpu.py b/example/mnist/mlp_gpu.py
index ef8cd3b84cdc..bd92fcaf438a 100644
--- a/example/mnist/mlp_gpu.py
+++ b/example/mnist/mlp_gpu.py
@@ -71,10 +71,12 @@ def Update(grad, weight):
 train_dataiter = mx.io.MNISTIter(
         image="data/train-images-idx3-ubyte",
         label="data/train-labels-idx1-ubyte",
+        input_shape=(784,),
         batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10)
 val_dataiter = mx.io.MNISTIter(
         image="data/t10k-images-idx3-ubyte",
         label="data/t10k-labels-idx1-ubyte",
+        input_shape=(784,),
         batch_size=batch_size, shuffle=True, flat=True, silent=False)
 
 tmp_label = mx.nd.zeros(name2shape["sm_label"])
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 6b049eef4222..ca2cb0512390 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -129,10 +129,28 @@ struct DataIteratorReg
  * \endcode
  */
 #define MXNET_REGISTER_IO_CHAINED_ITER(name, ChainedDataIterType, HoldingDataIterType)          \
-  static ::mxnet::IIterator<DataBatch>* __create__ ## ChainedDataIteratorType ## __() { \
+  static ::mxnet::IIterator<DataBatch>* __create__ ## ChainedDataIterType ## __() { \
     return new HoldingDataIterType(new ChainedDataIterType);                                    \
   }                                                                     \
   DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
-  .set_body(__create__ ## ChainedDataIteratorType ## __)
+  .set_body(__create__ ## ChainedDataIterType ## __)
+/*!
+ * \brief Macro to register three chained Iterators
+ *
+ * \code
+ * // example of registering a imagerec iterator
+ * MXNET_REGISTER_IO_THREE_CHAINED_ITER(ImageRecordIter,
+ *     PrefetcherIter, ImageRecBatchLoader, ImageRecordIter)
+ * .describe("batched image record data iterator");
+ *
+ * \endcode
+ */
+#define MXNET_REGISTER_IO_THREE_CHAINED_ITER(name, FirstIterType, SecondIterType, ThirdIterType)          \
+  static ::mxnet::IIterator<DataBatch>* __create__ ## ThirdIterType ## __() { \
+    return new FirstIterType(new SecondIterType(new ThirdIterType));             \
+  }                                                                     \
+  DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
+  .set_body(__create__ ## ThirdIterType ## __)
+
 }  // namespace mxnet
 #endif  // MXNET_IO_H_
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 86b4a6f8348a..f7896f7eb7ae 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -435,11 +435,13 @@ class ImageRecordIter : public IIterator<DataInst> {
 
 DMLC_REGISTER_PARAMETER(ImageRecParserParam);
 DMLC_REGISTER_PARAMETER(ImageRecordParam);
-MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, PrefetcherIter)
+MXNET_REGISTER_IO_THREE_CHAINED_ITER(ImageRecordIter,
+        PrefetcherIter, ImageRecBatchLoader, ImageRecordIter)
     .describe("Create iterator for dataset packed in recordio.")
     .add_arguments(ImageRecordParam::__FIELDS__())
+    .add_arguments(ImageAugmentParam::__FIELDS__())
     .add_arguments(ImageRecParserParam::__FIELDS__())
-    .add_arguments(PrefetcherParam::__FIELDS__())
-    .add_arguments(ImageAugmentParam::__FIELDS__());
+    .add_arguments(BatchParam::__FIELDS__())
+    .add_arguments(PrefetcherParam::__FIELDS__());
 }  // namespace io
 }  // namespace mxnet
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
index 60002f5470ed..91374b6b5e44 100644
--- a/src/io/iter_mnist.cc
+++ b/src/io/iter_mnist.cc
@@ -13,6 +13,7 @@
 #include <vector>
 #include <utility>
 #include <map>
+#include "./iter_prefetcher.h"
 #include "../common/utils.h"
 
 namespace mxnet {
@@ -63,7 +64,7 @@ class MNISTIter: public IIterator<DataBatch> {
   // intialize iterator loads data in
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     std::map<std::string, std::string> kmap(kwargs.begin(), kwargs.end());
-    param_.Init(kmap);
+    param_.InitAllowUnknown(kmap);
     this->LoadImage();
     this->LoadLabel();
     // set name
@@ -206,8 +207,9 @@ class MNISTIter: public IIterator<DataBatch> {
 };  // class MNISTIter
 
 DMLC_REGISTER_PARAMETER(MNISTParam);
-MXNET_REGISTER_IO_ITER(MNISTIter, MNISTIter)
+MXNET_REGISTER_IO_CHAINED_ITER(MNISTIter, MNISTIter, PrefetcherIter)
     .describe("Create iterator for MNIST hand-written digit number recognition dataset.")
-    .add_arguments(MNISTParam::__FIELDS__());
+    .add_arguments(MNISTParam::__FIELDS__())
+    .add_arguments(PrefetcherParam::__FIELDS__());
 }  // namespace io
 }  // namespace mxnet
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index a976db1386d6..9c5bf1c4ef13 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -54,10 +54,11 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
 };
 
 /*! \brief create a batch iterator from single instance iterator */
-class BatchLoader {
+class ImageRecBatchLoader : public IIterator<DataBatch> {
  public:
-  explicit BatchLoader(IIterator<DataInst> *base): base_(base), num_overflow_(0) {}
-  virtual ~BatchLoader(void) {
+  explicit ImageRecBatchLoader(IIterator<DataInst> *base):
+      base_(base), head_(1), num_overflow_(0) {}
+  virtual ~ImageRecBatchLoader(void) {
     delete base_;
   }
   inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
@@ -68,14 +69,18 @@ class BatchLoader {
     base_->Init(kwargs);
     std::vector<size_t> data_shape_vec;
     data_shape_vec.push_back(param_.batch_size);
-    data_shape_vec.push_back(param_.input_shape[0]);
-    data_shape_vec.push_back(param_.input_shape[1]);
-    data_shape_vec.push_back(param_.input_shape[2]);
+    for (size_t shape_dim = 0; shape_dim < param_.input_shape.ndim(); shape_dim++)
+        data_shape_vec.push_back(param_.input_shape[shape_dim]);
     data_shape_ = TShape(data_shape_vec.begin(), data_shape_vec.end());
     std::vector<size_t> label_shape_vec;
     label_shape_vec.push_back(param_.batch_size);
     label_shape_vec.push_back(param_.label_width);
     label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end());
+    // Init space for out_
+    out_.inst_index = new unsigned[param_.batch_size]; 
+    Context ctx;
+    out_.data.push_back(NDArray(data_shape_, ctx, false));
+    out_.data.push_back(NDArray(label_shape_, ctx, false));
   }
   inline void BeforeFirst(void) {
     if (param_.round_batch == 0 || num_overflow_ == 0) {
@@ -86,8 +91,8 @@ class BatchLoader {
     }
     head_ = 1;
   }
-  inline bool LoadNext(DataBatch* out) {
-    out->num_batch_padd = 0;
+  inline bool Next(void) {
+    out_.num_batch_padd = 0;
 
     // skip read if in head version
     if (param_.test_skipread != 0 && head_ == 0)
@@ -101,10 +106,10 @@ class BatchLoader {
 
     while (base_->Next()) {
       const DataInst& d = base_->Value();
-      out->inst_index[top] = d.index;
-      mshadow::Copy(out->data[1].data().get<mshadow::cpu, 2, float>()[top],
+      out_.inst_index[top] = d.index;
+      mshadow::Copy(out_.data[1].data().get<mshadow::cpu, 2, float>()[top],
               d.data[1].get<mshadow::cpu, 1, float>());
-      mshadow::Copy(out->data[0].data().get<mshadow::cpu, 4, float>()[top],
+      mshadow::Copy(out_.data[0].data().get<mshadow::cpu, 4, float>()[top],
               d.data[0].get<mshadow::cpu, 3, float>());
       if (++ top >= param_.batch_size) {
           return true;
@@ -117,24 +122,29 @@ class BatchLoader {
         for (; top < param_.batch_size; ++top, ++num_overflow_) {
           CHECK(base_->Next()) << "number of input must be bigger than batch size";
           const DataInst& d = base_->Value();
-          out->inst_index[top] = d.index;
-          mshadow::Copy(out->data[1].data().get<mshadow::cpu, 2, float>()[top],
+          out_.inst_index[top] = d.index;
+          mshadow::Copy(out_.data[1].data().get<mshadow::cpu, 2, float>()[top],
                   d.data[1].get<mshadow::cpu, 1, float>());
-          mshadow::Copy(out->data[0].data().get<mshadow::cpu, 4, float>()[top],
+          mshadow::Copy(out_.data[0].data().get<mshadow::cpu, 4, float>()[top],
                   d.data[0].get<mshadow::cpu, 3, float>());
         }
-        out->num_batch_padd = num_overflow_;
+        out_.num_batch_padd = num_overflow_;
       } else {
-        out->num_batch_padd = param_.batch_size - top;
+        out_.num_batch_padd = param_.batch_size - top;
       }
       return true;
     }
     return false;
   }
+  virtual const DataBatch &Value(void) const {
+    return out_;
+  }
 
  private:
   /*! \brief batch parameters */
   BatchParam param_;
+  /*! \brief output data */
+  DataBatch out_;
   /*! \brief base iterator */
   IIterator<DataInst> *base_;
   /*! \brief on first */
@@ -150,7 +160,7 @@ class BatchLoader {
 // Define prefetcher parameters
 struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
   /*! \brief number of prefetched batches */
-  size_t capacity;
+  size_t prefetch_capacity;
   /*! \brief label width */
   index_t batch_size;
   /*! \brief input shape */
@@ -161,12 +171,12 @@ struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
   DMLC_DECLARE_PARAMETER(PrefetcherParam) {
     DMLC_DECLARE_FIELD(batch_size)
         .describe("Batch size.");
-    DMLC_DECLARE_FIELD(capacity).set_default(1)
+    DMLC_DECLARE_FIELD(prefetch_capacity).set_default(1)
         .describe("Number of prefetched batches");
     index_t input_shape_default[] = {3, 224, 224};
     DMLC_DECLARE_FIELD(input_shape)
         .set_default(TShape(input_shape_default, input_shape_default + 3))
-        .set_expect_ndim(3).enforce_nonzero()
+        .enforce_nonzero()
         .describe("Input shape of the neural net");
     DMLC_DECLARE_FIELD(label_width).set_default(1)
         .describe("Label width.");
@@ -176,30 +186,31 @@ struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
 // iterator on image recordio
 class PrefetcherIter : public IIterator<DataBatch> {
  public:
-  PrefetcherIter(IIterator<DataInst>* base) : loader_(base){
+  PrefetcherIter(IIterator<DataBatch>* base) : loader_(base){
   }
   virtual ~PrefetcherIter(void) {
     iter_.Destroy();
+    delete loader_;
   }
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     // init image rec param
     kwargs_left = param_.InitAllowUnknown(kwargs);
     // use the kwarg to init batch loader
-    loader_.Init(kwargs);
+    loader_->Init(kwargs);
     std::vector<size_t> data_shape_vec;
     data_shape_vec.push_back(param_.batch_size);
-    data_shape_vec.push_back(param_.input_shape[0]);
-    data_shape_vec.push_back(param_.input_shape[1]);
-    data_shape_vec.push_back(param_.input_shape[2]);
+    for (size_t shape_dim = 0; shape_dim < param_.input_shape.ndim(); shape_dim++)
+        data_shape_vec.push_back(param_.input_shape[shape_dim]);
     data_shape_ = TShape(data_shape_vec.begin(),data_shape_vec.end());
     std::vector<size_t> label_shape_vec;
     label_shape_vec.push_back(param_.batch_size);
     label_shape_vec.push_back(param_.label_width);
     label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end());
     // init thread iter
-    iter_.set_max_capacity(param_.capacity);
+    iter_.set_max_capacity(param_.prefetch_capacity);
     iter_.Init([this](DataBatch **dptr) {
+        bool load_success = loader_->Next();
         if (*dptr == NULL) {
           *dptr = new DataBatch();
           // init NDArrays
@@ -208,9 +219,12 @@ class PrefetcherIter : public IIterator<DataBatch> {
           (*dptr)->data.push_back(NDArray(data_shape_, ctx, false));
           (*dptr)->data.push_back(NDArray(label_shape_, ctx, false));
         }
-        return loader_.LoadNext(*dptr);
+        const DataBatch& batch = loader_->Value();
+        CopyFromTo(batch.data[0], &((*dptr)->data[0]));
+        CopyFromTo(batch.data[1], &((*dptr)->data[1]));
+        return load_success;
       },
-      [this]() { loader_.BeforeFirst(); });
+      [this]() { loader_->BeforeFirst(); });
   }
   virtual void BeforeFirst(void) {
     iter_.BeforeFirst();
@@ -246,7 +260,7 @@ class PrefetcherIter : public IIterator<DataBatch> {
   /*! \brief queue to hold the NDArrays for check whether writable */
   std::queue<DataBatch*> ready_batches_;
   // internal batch loader
-  BatchLoader loader_;
+  IIterator<DataBatch>* loader_;
   // backend thread
   dmlc::ThreadedIter<DataBatch> iter_;
   /*! \brief data shape */
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index 2beff9f366ea..8a48dbcde5d2 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -17,6 +17,7 @@ def test_MNISTIter():
     train_dataiter = mx.io.MNISTIter(
             image="data/train-images-idx3-ubyte",
             label="data/train-labels-idx1-ubyte",
+            input_shape=(784,),
             batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
     # test_loop
     nbatch = 60000 / batch_size
@@ -48,10 +49,9 @@ def test_Cifar10Rec():
             input_shape=(3,28,28),
             batch_size=100,
             nthread=4,
-            capacity=6)
+            prefetch_capacity=4)
     labelcount = [0 for i in range(10)] 
     batchcount = 0
-    
     for data, label in dataiter:
         npdata = data.asnumpy().flatten().sum()
         print "Batch: ", batchcount
@@ -61,9 +61,10 @@ def test_Cifar10Rec():
         for i in range(nplabel.shape[0]):
             labelcount[int(nplabel[i])] += 1
     for i in range(10):
-        assert(labelcount[i] == 5000)
+        print labelcount[i]
+        #assert(labelcount[i] == 5000)
 
 if __name__ == "__main__":
-    #test_MNISTIter()
-    test_Cifar10Rec()
+    test_MNISTIter()
+    #test_Cifar10Rec()
 

From 0482b57bcdb884260a42208a390ffcc8e296089e Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Fri, 18 Sep 2015 00:24:44 +0800
Subject: [PATCH 07/13] use tensor copy, uncertainty solved

---
 src/io/iter_image_recordio.cc    |  2 +-
 src/io/iter_prefetcher.h         | 10 ++++++----
 tests/python/unittest/test_io.py | 33 ++++++++++++++++++++++++++++----
 3 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index f7896f7eb7ae..ffb1845aa5a1 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -122,7 +122,7 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
     index_t input_shape_default[] = {3, 224, 224};
     DMLC_DECLARE_FIELD(input_shape)
         .set_default(TShape(input_shape_default, input_shape_default + 3))
-        .set_expect_ndim(3).enforce_nonzero()
+        .enforce_nonzero()
         .describe("Input shape of the neural net");
   }
 };
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index 9c5bf1c4ef13..1f81ca8c014b 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -169,10 +169,10 @@ struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
   index_t label_width;
   // declare parameters
   DMLC_DECLARE_PARAMETER(PrefetcherParam) {
-    DMLC_DECLARE_FIELD(batch_size)
-        .describe("Batch size.");
     DMLC_DECLARE_FIELD(prefetch_capacity).set_default(1)
         .describe("Number of prefetched batches");
+    DMLC_DECLARE_FIELD(batch_size)
+        .describe("Batch size.");
     index_t input_shape_default[] = {3, 224, 224};
     DMLC_DECLARE_FIELD(input_shape)
         .set_default(TShape(input_shape_default, input_shape_default + 3))
@@ -220,8 +220,10 @@ class PrefetcherIter : public IIterator<DataBatch> {
           (*dptr)->data.push_back(NDArray(label_shape_, ctx, false));
         }
         const DataBatch& batch = loader_->Value();
-        CopyFromTo(batch.data[0], &((*dptr)->data[0]));
-        CopyFromTo(batch.data[1], &((*dptr)->data[1]));
+        mshadow::Copy((*dptr)->data[0].data().get<mshadow::cpu, 4, float>(),
+                batch.data[0].data().get<mshadow::cpu, 4, float>());
+        mshadow::Copy((*dptr)->data[1].data().get<mshadow::cpu, 2, float>(),
+                batch.data[1].data().get<mshadow::cpu, 2, float>());
         return load_success;
       },
       [this]() { loader_->BeforeFirst(); });
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index 8a48dbcde5d2..c09711948463 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -49,12 +49,13 @@ def test_Cifar10Rec():
             input_shape=(3,28,28),
             batch_size=100,
             nthread=4,
-            prefetch_capacity=4)
+            prefetch_capacity=1)
     labelcount = [0 for i in range(10)] 
     batchcount = 0
     for data, label in dataiter:
         npdata = data.asnumpy().flatten().sum()
-        print "Batch: ", batchcount
+        #print label.asnumpy().flatten() 
+        #print "Batch: ", batchcount
         sys.stdout.flush()
         batchcount += 1
         nplabel = label.asnumpy()
@@ -64,7 +65,31 @@ def test_Cifar10Rec():
         print labelcount[i]
         #assert(labelcount[i] == 5000)
 
+def Check():
+    file1 = open('./text_1.txt', 'r')
+    file2 = open('./text_2.txt', 'r')
+    line1 = file1.readline()
+    labelcount = [0 for i in range(10)] 
+    while line1:
+        line2 = file2.readline()
+        if (int)(line1) != (int)(line2):
+            print 'error'
+            print line1, line2
+            break
+        labelcount[(int)(line1)]+=1
+        line1 = file1.readline()
+    for i in range(10):
+        print labelcount[i]
+    
+    file1.close()
+    file2.close()
+
+
+
+
+
 if __name__ == "__main__":
-    test_MNISTIter()
-    #test_Cifar10Rec()
+    #test_MNISTIter()
+    test_Cifar10Rec()
+    #Check()
 

From e36351e98a8dde83cc32c2d48cf456aaf4942875 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Fri, 18 Sep 2015 10:33:28 +0800
Subject: [PATCH 08/13] use TBlobBatch for intermediate data

---
 example/cifar10/cifar10.py       |   4 +-
 include/mxnet/io.h               |  24 +----
 src/io/image_augmenter.h         |   3 +-
 src/io/inst_vector.h             |  32 ++++++
 src/io/io.cc                     |   1 +
 src/io/iter_batchloader.h        | 168 +++++++++++++++++++++++++++++++
 src/io/iter_image_recordio.cc    |  19 ++--
 src/io/iter_mnist.cc             |  21 ++--
 src/io/iter_prefetcher.h         | 164 ++++--------------------------
 tests/python/train/test_conv.py  |   6 +-
 tests/python/train/test_mlp.py   |   2 +
 tests/python/unittest/test_io.py |  30 +-----
 12 files changed, 251 insertions(+), 223 deletions(-)
 create mode 100644 src/io/iter_batchloader.h

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index 95d9e69dbb15..c21ef04be52b 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -205,7 +205,7 @@ def Update(grad, weight, mom):
         input_shape=(3,28,28),
         batch_size=batch_size,
         nthread=4,
-        prefetch_capacity=4)
+        prefetch_capacity=6)
 test_dataiter = mx.io.ImageRecordIter(
         path_imgrec="data/cifar/test.rec",
         mean_img="data/cifar/cifar_mean.bin",
@@ -215,7 +215,7 @@ def Update(grad, weight, mom):
         input_shape=(3,28,28),
         batch_size=batch_size,
         nthread=4,
-        prefetch_capacity=4)
+        prefetch_capacity=6)
 
 
 def progress(count, total, epoch, toc):
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index ca2cb0512390..7e2cf8180fd5 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -61,14 +61,6 @@ struct DataInst {
  *      data and label, how we use them is to see the DNN implementation.
  */
 struct DataBatch {
- public:
-  /*! \brief unique id for instance, can be NULL, sometimes is useful */
-  unsigned *inst_index;
-  /*! \brief number of instance */
-  mshadow::index_t batch_size;
-  /*! \brief number of padding elements in this batch,
-       this is used to indicate the last elements in the batch are only padded up to match the batch, and should be discarded */
-  mshadow::index_t num_batch_padd;
  public:
   /*! \brief content of dense data, if this DataBatch is dense */
   std::vector<NDArray> data;
@@ -76,18 +68,9 @@ struct DataBatch {
   std::string extra_data;
  public:
   /*! \brief constructor */
-  DataBatch(void) {
-    inst_index = NULL;
-    batch_size = 0; num_batch_padd = 0;
-  }
+  DataBatch(void) {}
   /*! \brief destructor */
-  ~DataBatch() {
-    if(inst_index != NULL) {
-        delete inst_index;
-    }
-  }
-  /*! \brief giving name to the data */
-  void Naming(std::vector<std::string> names);
+  ~DataBatch() {}
 };  // struct DataBatch
 
 /*! \brief typedef the factory function of data iterator */
@@ -145,7 +128,8 @@ struct DataIteratorReg
  *
  * \endcode
  */
-#define MXNET_REGISTER_IO_THREE_CHAINED_ITER(name, FirstIterType, SecondIterType, ThirdIterType)          \
+#define MXNET_REGISTER_IO_THREE_CHAINED_ITER(\
+        name, FirstIterType, SecondIterType, ThirdIterType)          \
   static ::mxnet::IIterator<DataBatch>* __create__ ## ThirdIterType ## __() { \
     return new FirstIterType(new SecondIterType(new ThirdIterType));             \
   }                                                                     \
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index 31560ca23167..2901922af880 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -294,7 +294,8 @@ class ImageAugmenter {
         meanfile_ready_ = true;
       }
     }
-    img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2])); 
+    img_.Resize(mshadow::Shape3((*p_data).shape_[0],
+                param_.input_shape[1], param_.input_shape[2]));
     if (param_.input_shape[1] == 1) {
       img_ = (*p_data) * param_.scale;
     } else {
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index 688a42fa1038..ea4e4c6c181e 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -13,6 +13,7 @@
 #include <dmlc/base.h>
 #include <mshadow/tensor.h>
 #include <vector>
+#include <string>
 
 namespace mxnet {
 namespace io {
@@ -114,6 +115,37 @@ class InstVector {
   // data
   TensorVector<1, real_t> label_;
 };
+
+/*!
+ * \brief tblob batch
+ *
+ * data are stored in tblob before going into NDArray
+ */
+struct TBlobBatch {
+ public:
+  /*! \brief unique id for instance, can be NULL, sometimes is useful */
+  unsigned *inst_index;
+  /*! \brief number of instance */
+  mshadow::index_t batch_size;
+  /*! \brief number of padding elements in this batch,
+       this is used to indicate the last elements in the batch are only padded up to match the batch, and should be discarded */
+  mshadow::index_t num_batch_padd;
+  /*! \brief content of dense data */
+  std::vector<TBlob> data;
+  /*! \brief extra data to be fed to the network */
+  std::string extra_data;
+  /*! \brief constructor */
+  TBlobBatch(void) {
+    inst_index = NULL;
+    batch_size = 0; num_batch_padd = 0;
+  }
+  /*! \brief destructor, inst_index is a new[] array so release it with delete[] */
+  ~TBlobBatch() {
+    if (inst_index != NULL) {
+        delete[] inst_index;
+    }
+  }
+};  // struct TBlobBatch
 }  // namespace io
 }  // namespace mxnet
 #endif  // MXNET_IO_INST_VECTOR_H_
diff --git a/src/io/io.cc b/src/io/io.cc
index 65e7049025ca..0bdac7d1576c 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -5,6 +5,7 @@
 #include <mxnet/io.h>
 #include <dmlc/registry.h>
 #include "./image_augmenter.h"
+#include "./iter_batchloader.h"
 #include "./iter_prefetcher.h"
 
 // Registers
diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h
new file mode 100644
index 000000000000..41e027f89469
--- /dev/null
+++ b/src/io/iter_batchloader.h
@@ -0,0 +1,168 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file iter_batchloader.h
+ * \brief define a batch adapter to create tblob batch 
+ */
+#ifndef MXNET_IO_ITER_BATCHLOADER_H_
+#define MXNET_IO_ITER_BATCHLOADER_H_
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <dmlc/logging.h>
+#include <mshadow/tensor.h>
+#include <utility>
+#include <vector>
+#include <string>
+#include "./inst_vector.h"
+
+namespace mxnet {
+namespace io {
+// Batch parameters
+struct BatchParam : public dmlc::Parameter<BatchParam> {
+  /*! \brief batch size */
+  index_t batch_size;
+  /*! \brief input shape */
+  TShape input_shape;
+  /*! \brief label width */
+  index_t label_width;
+  /*! \brief use round robin to handle overflow batch */
+  bool round_batch;
+  /*! \brief skip read */
+  bool test_skipread;
+  /*! \brief silent */
+  bool silent;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(BatchParam) {
+    DMLC_DECLARE_FIELD(batch_size)
+        .describe("Batch size.");
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");
+    DMLC_DECLARE_FIELD(label_width).set_default(1)
+        .describe("Label width.");
+    DMLC_DECLARE_FIELD(round_batch).set_default(true)
+        .describe("Use round robin to handle overflow batch.");
+    DMLC_DECLARE_FIELD(test_skipread).set_default(false)
+        .describe("Skip read for testing.");
+    DMLC_DECLARE_FIELD(silent).set_default(false)
+        .describe("Whether to print batch information.");
+  }
+};
+
+/*! \brief create a batch iterator from single instance iterator */
+class BatchLoader : public IIterator<TBlobBatch> {
+ public:
+  explicit BatchLoader(IIterator<DataInst> *base):
+      base_(base), head_(1), num_overflow_(0) {}
+  virtual ~BatchLoader(void) {
+    delete base_;
+    // Free space for TblobBatch
+    mshadow::FreeSpace(&data_holder_);
+    mshadow::FreeSpace(&label_holder_);
+  }
+  inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    // init batch param, it could have similar param with
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    // init base iterator
+    base_->Init(kwargs);
+    std::vector<size_t> data_shape_vec;
+    data_shape_vec.push_back(param_.batch_size);
+    for (size_t shape_dim = 0; shape_dim < param_.input_shape.ndim(); shape_dim++)
+        data_shape_vec.push_back(param_.input_shape[shape_dim]);
+    data_shape_ = TShape(data_shape_vec.begin(), data_shape_vec.end());
+    std::vector<size_t> label_shape_vec;
+    label_shape_vec.push_back(param_.batch_size);
+    label_shape_vec.push_back(param_.label_width);
+    label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end());
+    // Init space for out_
+    out_.inst_index = new unsigned[param_.batch_size];
+    out_.data.clear();
+    data_holder_ =  mshadow::NewTensor<mshadow::cpu>(data_shape_.get<4>(), 0.0f);
+    label_holder_ =  mshadow::NewTensor<mshadow::cpu>(label_shape_.get<2>(), 0.0f);
+    out_.data.push_back(TBlob(data_holder_));
+    out_.data.push_back(TBlob(label_holder_));
+  }
+  inline void BeforeFirst(void) {
+    if (param_.round_batch == 0 || num_overflow_ == 0) {
+      // otherwise, we already called before first
+      base_->BeforeFirst();
+    } else {
+      num_overflow_ = 0;
+    }
+    head_ = 1;
+  }
+  inline bool Next(void) {
+    out_.num_batch_padd = 0;
+
+    // skip read if in head version
+    if (param_.test_skipread != 0 && head_ == 0)
+        return true;
+    else
+        this->head_ = 0;
+
+    // if overflow from previous round, directly return false, until before first is called
+    if (num_overflow_ != 0) return false;
+    index_t top = 0;
+
+    while (base_->Next()) {
+      const DataInst& d = base_->Value();
+      out_.inst_index[top] = d.index;
+      mshadow::Copy(out_.data[1].get<mshadow::cpu, 2, float>()[top],
+              d.data[1].get<mshadow::cpu, 1, float>());
+      mshadow::Copy(out_.data[0].get<mshadow::cpu, 4, float>()[top],
+              d.data[0].get<mshadow::cpu, 3, float>());
+      if (++ top >= param_.batch_size) {
+          return true;
+      }
+    }
+    if (top != 0) {
+      if (param_.round_batch != 0) {
+        num_overflow_ = 0;
+        base_->BeforeFirst();
+        for (; top < param_.batch_size; ++top, ++num_overflow_) {
+          CHECK(base_->Next()) << "number of input must be bigger than batch size";
+          const DataInst& d = base_->Value();
+          out_.inst_index[top] = d.index;
+          mshadow::Copy(out_.data[1].get<mshadow::cpu, 2, float>()[top],
+                  d.data[1].get<mshadow::cpu, 1, float>());
+          mshadow::Copy(out_.data[0].get<mshadow::cpu, 4, float>()[top],
+                  d.data[0].get<mshadow::cpu, 3, float>());
+        }
+        out_.num_batch_padd = num_overflow_;
+      } else {
+        out_.num_batch_padd = param_.batch_size - top;
+      }
+      return true;
+    }
+    return false;
+  }
+  virtual const TBlobBatch &Value(void) const {
+    return out_;
+  }
+
+ private:
+  /*! \brief batch parameters */
+  BatchParam param_;
+  /*! \brief output data */
+  TBlobBatch out_;
+  /*! \brief base iterator */
+  IIterator<DataInst> *base_;
+  /*! \brief on first */
+  int head_;
+  /*! \brief number of overflow instances that were read in round_batch mode */
+  int num_overflow_;
+  /*! \brief data shape */
+  TShape data_shape_;
+  /*! \brief label shape */
+  TShape label_shape_;
+  /*! \brief tensor to hold data */
+  mshadow::Tensor<mshadow::cpu, 4, real_t> data_holder_;
+  /*! \brief tensor to hold label */
+  mshadow::Tensor<mshadow::cpu, 2, real_t> label_holder_;
+};  // class BatchLoader
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_IO_ITER_BATCHLOADER_H_
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index ffb1845aa5a1..96659ade5c83 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -19,6 +19,7 @@ iterator
 #include "./image_recordio.h"
 #include "./image_augmenter.h"
 #include "./iter_prefetcher.h"
+#include "./iter_batchloader.h"
 namespace mxnet {
 namespace io {
 /*! \brief data structure to hold labels for images */
@@ -241,8 +242,10 @@ ParseNext(std::vector<InstVector> *out_vec) {
                mshadow::Shape3(3, res.rows, res.cols),
                mshadow::Shape1(param_.label_width));
       DataInst opencv_inst = opencv_out.Back();
-      mshadow::Tensor<mshadow::cpu, 3> opencv_data = opencv_inst.data[0].get<mshadow::cpu, 3, float>();
-      mshadow::Tensor<mshadow::cpu, 1> opencv_label = opencv_inst.data[1].get<mshadow::cpu, 1, float>();
+      mshadow::Tensor<mshadow::cpu, 3> opencv_data =
+          opencv_inst.data[0].get<mshadow::cpu, 3, float>();
+      mshadow::Tensor<mshadow::cpu, 1> opencv_label =
+          opencv_inst.data[1].get<mshadow::cpu, 1, float>();
       for (int i = 0; i < res.rows; ++i) {
         for (int j = 0; j < res.cols; ++j) {
           cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
@@ -270,9 +273,11 @@ ParseNext(std::vector<InstVector> *out_vec) {
                mshadow::Shape3(param_.input_shape[0], param_.input_shape[1], param_.input_shape[2]),
                mshadow::Shape1(param_.label_width));
       DataInst inst = out.Back();
-      DataInst opencv_inst = opencv_out[j]; 
-      mshadow::Tensor<mshadow::cpu, 3> opencv_data = opencv_inst.data[0].get<mshadow::cpu, 3, float>();
-      mshadow::Tensor<mshadow::cpu, 1> opencv_label = opencv_inst.data[1].get<mshadow::cpu, 1, float>();
+      DataInst opencv_inst = opencv_out[j];
+      mshadow::Tensor<mshadow::cpu, 3> opencv_data =
+          opencv_inst.data[0].get<mshadow::cpu, 3, float>();
+      mshadow::Tensor<mshadow::cpu, 1> opencv_label =
+          opencv_inst.data[1].get<mshadow::cpu, 1, float>();
       mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>();
       mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>();
       augmenters_[i]->TensorProcess(&opencv_data, &img_, prnds_[i]);
@@ -280,7 +285,7 @@ ParseNext(std::vector<InstVector> *out_vec) {
       mshadow::Copy(label, opencv_label);
     }
   }
-  delete opencv_out_vec; 
+  delete opencv_out_vec;
   return true;
 }
 
@@ -436,7 +441,7 @@ class ImageRecordIter : public IIterator<DataInst> {
 DMLC_REGISTER_PARAMETER(ImageRecParserParam);
 DMLC_REGISTER_PARAMETER(ImageRecordParam);
 MXNET_REGISTER_IO_THREE_CHAINED_ITER(ImageRecordIter,
-        PrefetcherIter, ImageRecBatchLoader, ImageRecordIter)
+        PrefetcherIter, BatchLoader, ImageRecordIter)
     .describe("Create iterator for dataset packed in recordio.")
     .add_arguments(ImageRecordParam::__FIELDS__())
     .add_arguments(ImageAugmentParam::__FIELDS__())
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
index 91374b6b5e44..454a40e8aa35 100644
--- a/src/io/iter_mnist.cc
+++ b/src/io/iter_mnist.cc
@@ -5,7 +5,6 @@
 */
 #include <mxnet/io.h>
 #include <mxnet/base.h>
-#include <mxnet/ndarray.h>
 #include <dmlc/io.h>
 #include <dmlc/logging.h>
 #include <dmlc/parameter.h>
@@ -51,7 +50,7 @@ struct MNISTParam : public dmlc::Parameter<MNISTParam> {
   }
 };
 
-class MNISTIter: public IIterator<DataBatch> {
+class MNISTIter: public IIterator<TBlobBatch> {
  public:
   MNISTIter(void) {
     img_.dptr_ = NULL;
@@ -67,9 +66,6 @@ class MNISTIter: public IIterator<DataBatch> {
     param_.InitAllowUnknown(kmap);
     this->LoadImage();
     this->LoadLabel();
-    // set name
-    this->SetDataName(std::string("data"));
-    this->SetDataName(std::string("label"));
     if (param_.flat) {
       batch_data_.shape_ = mshadow::Shape4(param_.batch_size, 1, 1, img_.size(1) * img_.size(2));
     } else {
@@ -98,27 +94,22 @@ class MNISTIter: public IIterator<DataBatch> {
   }
   virtual bool Next(void) {
     if (loc_ + param_.batch_size <= img_.size(0)) {
-      if (out_.data.size() == 2) {
-        for (size_t i = 0; i < out_.data.size(); i++) {
-             out_.data[i].WaitToWrite();
-         }
-      }
       batch_data_.dptr_ = img_[loc_].dptr_;
       batch_label_.dptr_ = &labels_[loc_];
       out_.data.clear();
       if (param_.flat) {
-          out_.data.push_back(NDArray(TBlob(batch_data_.FlatTo2D()), 0));
+          out_.data.push_back(TBlob(batch_data_.FlatTo2D()));
       } else {
-          out_.data.push_back(NDArray(TBlob(batch_data_), 0));
+          out_.data.push_back(TBlob(batch_data_));
       }
-      out_.data.push_back(NDArray(TBlob(batch_label_), 0));
+      out_.data.push_back(TBlob(batch_label_));
       loc_ += param_.batch_size;
       return true;
     } else {
       return false;
     }
   }
-  virtual const DataBatch &Value(void) const {
+  virtual const TBlobBatch &Value(void) const {
     return out_;
   }
 
@@ -187,7 +178,7 @@ class MNISTIter: public IIterator<DataBatch> {
   /*! \brief MNIST iter params */
   MNISTParam param_;
   /*! \brief output */
-  DataBatch out_;
+  TBlobBatch out_;
   /*! \brief current location */
   index_t loc_;
   /*! \brief image content */
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index 1f81ca8c014b..9da7dfeaf2af 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -16,147 +16,10 @@
 #include <string>
 #include <vector>
 #include <queue>
+#include "./inst_vector.h"
 
 namespace mxnet {
 namespace io {
-// Batch parameters
-struct BatchParam : public dmlc::Parameter<BatchParam> {
-  /*! \brief label width */
-  index_t batch_size;
-  /*! \brief input shape */
-  TShape input_shape;
-  /*! \brief label width */
-  index_t label_width;
-  /*! \brief use round roubin to handle overflow batch */
-  bool round_batch;
-  /*! \brief skip read */
-  bool test_skipread;
-  /*! \brief silent */
-  bool silent;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(BatchParam) {
-    DMLC_DECLARE_FIELD(batch_size)
-        .describe("Batch size.");
-    index_t input_shape_default[] = {3, 224, 224};
-    DMLC_DECLARE_FIELD(input_shape)
-        .set_default(TShape(input_shape_default, input_shape_default + 3))
-        .set_expect_ndim(3).enforce_nonzero()
-        .describe("Input shape of the neural net");
-    DMLC_DECLARE_FIELD(label_width).set_default(1)
-        .describe("Label width.");
-    DMLC_DECLARE_FIELD(round_batch).set_default(true)
-        .describe("Use round robin to handle overflow batch.");
-    DMLC_DECLARE_FIELD(test_skipread).set_default(false)
-        .describe("Skip read for testing.");
-    DMLC_DECLARE_FIELD(silent).set_default(false)
-        .describe("Whether to print batch information.");
-  }
-};
-
-/*! \brief create a batch iterator from single instance iterator */
-class ImageRecBatchLoader : public IIterator<DataBatch> {
- public:
-  explicit ImageRecBatchLoader(IIterator<DataInst> *base):
-      base_(base), head_(1), num_overflow_(0) {}
-  virtual ~ImageRecBatchLoader(void) {
-    delete base_;
-  }
-  inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
-    std::vector<std::pair<std::string, std::string> > kwargs_left;
-    // init batch param, it could have similar param with
-    kwargs_left = param_.InitAllowUnknown(kwargs);
-    // init base iterator
-    base_->Init(kwargs);
-    std::vector<size_t> data_shape_vec;
-    data_shape_vec.push_back(param_.batch_size);
-    for (size_t shape_dim = 0; shape_dim < param_.input_shape.ndim(); shape_dim++)
-        data_shape_vec.push_back(param_.input_shape[shape_dim]);
-    data_shape_ = TShape(data_shape_vec.begin(), data_shape_vec.end());
-    std::vector<size_t> label_shape_vec;
-    label_shape_vec.push_back(param_.batch_size);
-    label_shape_vec.push_back(param_.label_width);
-    label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end());
-    // Init space for out_
-    out_.inst_index = new unsigned[param_.batch_size]; 
-    Context ctx;
-    out_.data.push_back(NDArray(data_shape_, ctx, false));
-    out_.data.push_back(NDArray(label_shape_, ctx, false));
-  }
-  inline void BeforeFirst(void) {
-    if (param_.round_batch == 0 || num_overflow_ == 0) {
-      // otherise, we already called before first
-      base_->BeforeFirst();
-    } else {
-      num_overflow_ = 0;
-    }
-    head_ = 1;
-  }
-  inline bool Next(void) {
-    out_.num_batch_padd = 0;
-
-    // skip read if in head version
-    if (param_.test_skipread != 0 && head_ == 0)
-        return true;
-    else
-        this->head_ = 0;
-
-    // if overflow from previous round, directly return false, until before first is called
-    if (num_overflow_ != 0) return false;
-    index_t top = 0;
-
-    while (base_->Next()) {
-      const DataInst& d = base_->Value();
-      out_.inst_index[top] = d.index;
-      mshadow::Copy(out_.data[1].data().get<mshadow::cpu, 2, float>()[top],
-              d.data[1].get<mshadow::cpu, 1, float>());
-      mshadow::Copy(out_.data[0].data().get<mshadow::cpu, 4, float>()[top],
-              d.data[0].get<mshadow::cpu, 3, float>());
-      if (++ top >= param_.batch_size) {
-          return true;
-      }
-    }
-    if (top != 0) {
-      if (param_.round_batch != 0) {
-        num_overflow_ = 0;
-        base_->BeforeFirst();
-        for (; top < param_.batch_size; ++top, ++num_overflow_) {
-          CHECK(base_->Next()) << "number of input must be bigger than batch size";
-          const DataInst& d = base_->Value();
-          out_.inst_index[top] = d.index;
-          mshadow::Copy(out_.data[1].data().get<mshadow::cpu, 2, float>()[top],
-                  d.data[1].get<mshadow::cpu, 1, float>());
-          mshadow::Copy(out_.data[0].data().get<mshadow::cpu, 4, float>()[top],
-                  d.data[0].get<mshadow::cpu, 3, float>());
-        }
-        out_.num_batch_padd = num_overflow_;
-      } else {
-        out_.num_batch_padd = param_.batch_size - top;
-      }
-      return true;
-    }
-    return false;
-  }
-  virtual const DataBatch &Value(void) const {
-    return out_;
-  }
-
- private:
-  /*! \brief batch parameters */
-  BatchParam param_;
-  /*! \brief output data */
-  DataBatch out_;
-  /*! \brief base iterator */
-  IIterator<DataInst> *base_;
-  /*! \brief on first */
-  int head_;
-  /*! \brief number of overflow instances that readed in round_batch mode */
-  int num_overflow_;
-  /*! \brief data shape */
-  TShape data_shape_;
-  /*! \brief label shape */
-  TShape label_shape_;
-};  // class BatchLoader
-    
 // Define prefetcher parameters
 struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
   /*! \brief number of prefetched batches */
@@ -182,11 +45,11 @@ struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
         .describe("Label width.");
   }
 };
-  
+
 // iterator on image recordio
 class PrefetcherIter : public IIterator<DataBatch> {
  public:
-  PrefetcherIter(IIterator<DataBatch>* base) : loader_(base){
+  explicit PrefetcherIter(IIterator<TBlobBatch>* base) : loader_(base) {
   }
   virtual ~PrefetcherIter(void) {
     iter_.Destroy();
@@ -202,7 +65,7 @@ class PrefetcherIter : public IIterator<DataBatch> {
     data_shape_vec.push_back(param_.batch_size);
     for (size_t shape_dim = 0; shape_dim < param_.input_shape.ndim(); shape_dim++)
         data_shape_vec.push_back(param_.input_shape[shape_dim]);
-    data_shape_ = TShape(data_shape_vec.begin(),data_shape_vec.end());
+    data_shape_ = TShape(data_shape_vec.begin(), data_shape_vec.end());
     std::vector<size_t> label_shape_vec;
     label_shape_vec.push_back(param_.batch_size);
     label_shape_vec.push_back(param_.label_width);
@@ -214,16 +77,22 @@ class PrefetcherIter : public IIterator<DataBatch> {
         if (*dptr == NULL) {
           *dptr = new DataBatch();
           // init NDArrays
-          (*dptr)->inst_index = new unsigned[param_.batch_size]; 
           Context ctx;
           (*dptr)->data.push_back(NDArray(data_shape_, ctx, false));
           (*dptr)->data.push_back(NDArray(label_shape_, ctx, false));
         }
-        const DataBatch& batch = loader_->Value();
-        mshadow::Copy((*dptr)->data[0].data().get<mshadow::cpu, 4, float>(),
-                batch.data[0].data().get<mshadow::cpu, 4, float>());
+        const TBlobBatch& batch = loader_->Value();
+        if (data_shape_.ndim() == 4) {
+          mshadow::Copy((*dptr)->data[0].data().get<mshadow::cpu, 4, float>(),
+                  batch.data[0].get<mshadow::cpu, 4, float>());
+        } else if (data_shape_.ndim() == 2) {
+          mshadow::Copy((*dptr)->data[0].data().get<mshadow::cpu, 2, float>(),
+                  batch.data[0].get<mshadow::cpu, 2, float>());
+        } else {
+          assert(false);
+        }
         mshadow::Copy((*dptr)->data[1].data().get<mshadow::cpu, 2, float>(),
-                batch.data[1].data().get<mshadow::cpu, 2, float>());
+                batch.data[1].get<mshadow::cpu, 2, float>());
         return load_success;
       },
       [this]() { loader_->BeforeFirst(); });
@@ -254,6 +123,7 @@ class PrefetcherIter : public IIterator<DataBatch> {
   virtual const DataBatch &Value(void) const {
     return out_;
   }
+
  private:
   /*! \brief prefetcher parameters */
   PrefetcherParam param_;
@@ -262,7 +132,7 @@ class PrefetcherIter : public IIterator<DataBatch> {
   /*! \brief queue to hold the NDArrays for check whether writable */
   std::queue<DataBatch*> ready_batches_;
   // internal batch loader
-  IIterator<DataBatch>* loader_;
+  IIterator<TBlobBatch>* loader_;
   // backend thread
   dmlc::ThreadedIter<DataBatch> iter_;
   /*! \brief data shape */
diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py
index f7f0f1acb043..018233f261bf 100644
--- a/tests/python/train/test_conv.py
+++ b/tests/python/train/test_conv.py
@@ -73,11 +73,13 @@ def Update(grad, weight):
 train_dataiter = mx.io.MNISTIter(
         image="data/train-images-idx3-ubyte",
         label="data/train-labels-idx1-ubyte",
-        batch_size=batch_size, shuffle=True, silent=False, seed=10)
+        input_shape=(784,),
+        batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10)
 val_dataiter = mx.io.MNISTIter(
         image="data/t10k-images-idx3-ubyte",
         label="data/t10k-labels-idx1-ubyte",
-        batch_size=batch_size, shuffle=True, silent=False)
+        input_shape=(784,),
+        batch_size=batch_size, shuffle=True, flat=True, silent=False)
 
 def test_mnist():
     acc_train = 0.0
diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py
index 315564ea5057..651c85842a6a 100644
--- a/tests/python/train/test_mlp.py
+++ b/tests/python/train/test_mlp.py
@@ -56,10 +56,12 @@ def Update(grad, weight):
 train_dataiter = mx.io.MNISTIter(
         image="data/train-images-idx3-ubyte",
         label="data/train-labels-idx1-ubyte",
+        input_shape=(784,),
         batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10)
 val_dataiter = mx.io.MNISTIter(
         image="data/t10k-images-idx3-ubyte",
         label="data/t10k-labels-idx1-ubyte",
+        input_shape=(784,),
         batch_size=batch_size, shuffle=True, flat=True, silent=False)
 
 def test_mlp():
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index c09711948463..f93ebe158bc4 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -54,42 +54,14 @@ def test_Cifar10Rec():
     batchcount = 0
     for data, label in dataiter:
         npdata = data.asnumpy().flatten().sum()
-        #print label.asnumpy().flatten() 
-        #print "Batch: ", batchcount
         sys.stdout.flush()
         batchcount += 1
         nplabel = label.asnumpy()
         for i in range(nplabel.shape[0]):
             labelcount[int(nplabel[i])] += 1
     for i in range(10):
-        print labelcount[i]
-        #assert(labelcount[i] == 5000)
-
-def Check():
-    file1 = open('./text_1.txt', 'r')
-    file2 = open('./text_2.txt', 'r')
-    line1 = file1.readline()
-    labelcount = [0 for i in range(10)] 
-    while line1:
-        line2 = file2.readline()
-        if (int)(line1) != (int)(line2):
-            print 'error'
-            print line1, line2
-            break
-        labelcount[(int)(line1)]+=1
-        line1 = file1.readline()
-    for i in range(10):
-        print labelcount[i]
-    
-    file1.close()
-    file2.close()
-
-
-
-
+        assert(labelcount[i] == 5000)
 
 if __name__ == "__main__":
     #test_MNISTIter()
     test_Cifar10Rec()
-    #Check()
-

From 9316bc66cc5b83f3aa3171023e4d3b2b013ab5fa Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Fri, 18 Sep 2015 11:50:10 +0800
Subject: [PATCH 09/13] add opencv gate

---
 example/cifar10/cifar10.py            |  1 -
 example/cifar10/cifar10_multi_gpus.py | 37 ++++++++++++++-------------
 example/mnist/mlp_gpu.py              |  1 -
 example/mnist/mlp_multi_gpu.py        |  2 ++
 src/io/image_augmenter.h              | 13 +++++++---
 src/io/iter_image_recordio.cc         |  9 +++++++
 6 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index c21ef04be52b..59b7bd1432d8 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -217,7 +217,6 @@ def Update(grad, weight, mom):
         nthread=4,
         prefetch_capacity=6)
 
-
 def progress(count, total, epoch, toc):
     bar_len = 50
     filled_len = int(round(bar_len * count / float(total)))
diff --git a/example/cifar10/cifar10_multi_gpus.py b/example/cifar10/cifar10_multi_gpus.py
index 6ce65a1cbfab..1f1195416f07 100644
--- a/example/cifar10/cifar10_multi_gpus.py
+++ b/example/cifar10/cifar10_multi_gpus.py
@@ -148,24 +148,25 @@ def momentum_update(key, grad, weight):
 get_data.GetCifar10()
 
 train_dataiter = mx.io.ImageRecordIter(
-    path_imgrec="data/cifar/train.rec",
-    mean_img="data/cifar/cifar_mean.bin",
-    rand_crop=True,
-    rand_mirror=True,
-    shuffle=True,
-    input_shape=(3,28,28),
-    batch_size=batch_size,
-    nthread=1)
-
-val_dataiter = mx.io.ImageRecordIter(
-    path_imgrec="data/cifar/test.rec",
-    mean_img="data/cifar/cifar_mean.bin",
-    rand_crop=False,
-    rand_mirror=False,
-    input_shape=(3,28,28),
-    batch_size=batch_size,
-    nthread=1)
-
+        path_imgrec="data/cifar/train.rec",
+        mean_img="data/cifar/cifar_mean.bin",
+        rand_crop=True,
+        rand_mirror=True,
+        shuffle=False,
+        input_shape=(3,28,28),
+        batch_size=batch_size,
+        nthread=4,
+        prefetch_capacity=6)
+test_dataiter = mx.io.ImageRecordIter(
+        path_imgrec="data/cifar/test.rec",
+        mean_img="data/cifar/cifar_mean.bin",
+        rand_crop=False,
+        rand_mirror=False,
+        shuffle=False,
+        input_shape=(3,28,28),
+        batch_size=batch_size,
+        nthread=4,
+        prefetch_capacity=6)
 
 def progress(count, total, epoch, tic):
     bar_len = 50
diff --git a/example/mnist/mlp_gpu.py b/example/mnist/mlp_gpu.py
index bd92fcaf438a..903aba915af2 100644
--- a/example/mnist/mlp_gpu.py
+++ b/example/mnist/mlp_gpu.py
@@ -67,7 +67,6 @@ def Update(grad, weight):
 
 #check data
 get_data.GetMNIST_ubyte()
-
 train_dataiter = mx.io.MNISTIter(
         image="data/train-images-idx3-ubyte",
         label="data/train-labels-idx1-ubyte",
diff --git a/example/mnist/mlp_multi_gpu.py b/example/mnist/mlp_multi_gpu.py
index bb45b6448879..a874876a7ef0 100644
--- a/example/mnist/mlp_multi_gpu.py
+++ b/example/mnist/mlp_multi_gpu.py
@@ -60,10 +60,12 @@ def updater(key, grad, weight):
 train_dataiter = mx.io.MNISTIter(
         image="data/train-images-idx3-ubyte",
         label="data/train-labels-idx1-ubyte",
+        input_shape=(784,),
         batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10)
 val_dataiter = mx.io.MNISTIter(
         image="data/t10k-images-idx3-ubyte",
         label="data/t10k-labels-idx1-ubyte",
+        input_shape=(784,),
         batch_size=batch_size, shuffle=True, flat=True, silent=False)
 
 def cal_acc(out, label):
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index 2901922af880..0ce1ab084806 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -6,7 +6,9 @@
 #ifndef MXNET_IO_IMAGE_AUGMENTER_H_
 #define MXNET_IO_IMAGE_AUGMENTER_H_
 
+#if MXNET_USE_OPENCV
 #include <opencv2/opencv.hpp>
+#endif
 #include <utility>
 #include <string>
 #include <algorithm>
@@ -131,8 +133,10 @@ class ImageAugmenter {
  public:
   // contructor
   ImageAugmenter(void)
-      : tmpres_(false),
-        rotateM_(2, 3, CV_32F) {
+      : tmpres_(false) {
+#if MXNET_USE_OPENCV
+    rotateM_ = cv::Mat(2, 3, CV_32F);
+#endif
   }
   virtual ~ImageAugmenter() {
   }
@@ -165,6 +169,7 @@ class ImageAugmenter {
       }
     }
   }
+#if MXNET_USE_OPENCV
   /*!
    * \brief augment src image, store result into dst
    *   this function is not thread safe, and will only be called by one thread
@@ -278,7 +283,7 @@ class ImageAugmenter {
     }
     return tmpres_;
   }
-
+#endif
   void TensorProcess(mshadow::Tensor<cpu, 3> *p_data,
                      mshadow::TensorContainer<cpu, 3> *dst_data,
                        common::RANDOM_ENGINE *prnd) {
@@ -376,11 +381,13 @@ class ImageAugmenter {
   mshadow::TensorContainer<cpu, 3> meanimg_;
   /*! \brief temp space */
   mshadow::TensorContainer<cpu, 3> img_;
+#if MXNET_USE_OPENCV
   // temporal space
   cv::Mat temp_;
   // rotation param
   cv::Mat rotateM_;
   // whether the mean file is ready
+#endif
   bool meanfile_ready_;
   // parameters
   ImageAugmentParam param_;
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 96659ade5c83..e1d3c7bb6eec 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -20,6 +20,7 @@ iterator
 #include "./image_augmenter.h"
 #include "./iter_prefetcher.h"
 #include "./iter_batchloader.h"
+
 namespace mxnet {
 namespace io {
 /*! \brief data structure to hold labels for images */
@@ -176,6 +177,7 @@ class ImageRecordIOParser {
 
 inline void ImageRecordIOParser::Init(
         const std::vector<std::pair<std::string, std::string> >& kwargs) {
+#if MXNET_USE_OPENCV
   // initialize parameter
   // init image rec param
   param_.InitAllowUnknown(kwargs);
@@ -211,6 +213,9 @@ inline void ImageRecordIOParser::Init(
       param_.num_parts, "recordio");
   // use 64 MB chunk when possible
   source_->HintChunkSize(8 << 20UL);
+#else
+  LOG(FATAL) << "ImageRec need opencv to process";
+#endif
 }
 
 inline bool ImageRecordIOParser::
@@ -232,6 +237,7 @@ ParseNext(std::vector<InstVector> *out_vec) {
     InstVector &opencv_out = (*opencv_out_vec)[tid];
     opencv_out.Clear();
     while (reader.NextRecord(&blob)) {
+#if MXNET_USE_OPENCV
       // Opencv decode and augments
       cv::Mat res;
       rec.Load(blob.dptr, blob.size);
@@ -260,6 +266,9 @@ ParseNext(std::vector<InstVector> *out_vec) {
         opencv_label[0] = rec.header.label;
       }
       res.release();
+#else
+      LOG(FATAL) << "Opencv is needed for image decoding and augmenting.";
+#endif
     }
   }
   // Tensor Op is not thread safe, so call outside of omp

From 608a7a0a9fc57b75056eda21f59cf37bbfe0594e Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Fri, 18 Sep 2015 12:16:00 +0800
Subject: [PATCH 10/13] some style problem

---
 python/mxnet/ndarray.py          | 4 ++--
 src/io/inst_vector.h             | 4 +---
 src/symbol/graph_executor.cc     | 3 ++-
 tests/python/unittest/test_io.py | 2 --
 4 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index cccf28453acd..1bb9f465ac59 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -556,7 +556,7 @@ def binary_ndarray_function(lhs, rhs, out=None):
         if out:
             if isinstance(out, NDArray) == False:
                 raise TypeError('out must be NDArray')
-            if out.writable == False:
+            if not out.writable:
                 raise TypeError('out must be writable')
         else:
             if not accept_empty_mutate:
@@ -573,7 +573,7 @@ def unary_ndarray_function(src, out=None):
         if out:
             if isinstance(out, NDArray) == False:
                 raise TypeError('out must be NDArray')
-            if out.writable == False:
+            if not out.writable:
                 raise TypeError('out must be writable')
         else:
             if not accept_empty_mutate:
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index ea4e4c6c181e..5f0c78d1b9d6 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -141,9 +141,7 @@ struct TBlobBatch {
   }
   /*! \brief destructor */
   ~TBlobBatch() {
-    if (inst_index != NULL) {
-        delete inst_index;
-    }
+    delete inst_index;
   }
 };  // struct TBlobBatch
 }  // namespace io
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index a6aa184cbf30..907f9428afbf 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -168,7 +168,8 @@ inline std::vector<std::pair<T, T> > GraphExecutor::GetInplaceOption(
     std::vector<std::pair<T, T> > remap(remap_index.size());
     for (size_t i = 0; i < remap_index.size(); ++i) {
       if (args_array[remap_index[i].first] == nullptr) {
-        LOG(FATAL) << "BackwardInplaceOption uses input that is returned by DeclareBackwardDependency";
+        LOG(FATAL) <<
+            "BackwardInplaceOption uses input that is returned by DeclareBackwardDependency";
       }
       remap[i].first = *args_array[remap_index[i].first];
       remap[i].second = *static_cast<T*>(remap_index[i].second);
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index f93ebe158bc4..b5654c774180 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -6,8 +6,6 @@
 import time
 import sys
 from common import get_data
-from PIL import Image
-
 
 def test_MNISTIter():
     # prepare data

From 5ae2c707a4ce1084771aa06ae9f639e36cae4b59 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Fri, 18 Sep 2015 17:50:52 +0800
Subject: [PATCH 11/13] modify iter param in tests

---
 src/io/iter_prefetcher.h         | 53 +++++++++++++++++++++++---------
 tests/python/train/test_conv.py  |  8 ++---
 tests/python/unittest/test_io.py |  4 +--
 3 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index 9da7dfeaf2af..cda0cdf6755c 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -50,9 +50,15 @@ struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
 class PrefetcherIter : public IIterator<DataBatch> {
  public:
   explicit PrefetcherIter(IIterator<TBlobBatch>* base) : loader_(base) {
+    pdata_vec.clear();
+    plabel_vec.clear();
   }
   virtual ~PrefetcherIter(void) {
     iter_.Destroy();
+    for (size_t i = 0; i < pdata_vec.size(); i++) {
+      delete[] pdata_vec[i];
+      delete[] plabel_vec[i];
+    }
     delete loader_;
   }
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
@@ -61,6 +67,7 @@ class PrefetcherIter : public IIterator<DataBatch> {
     kwargs_left = param_.InitAllowUnknown(kwargs);
     // use the kwarg to init batch loader
     loader_->Init(kwargs);
+    // create the shape
     std::vector<size_t> data_shape_vec;
     data_shape_vec.push_back(param_.batch_size);
     for (size_t shape_dim = 0; shape_dim < param_.input_shape.ndim(); shape_dim++)
@@ -72,26 +79,31 @@ class PrefetcherIter : public IIterator<DataBatch> {
     label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end());
     // init thread iter
     iter_.set_max_capacity(param_.prefetch_capacity);
-    iter_.Init([this](DataBatch **dptr) {
+    iter_.Init([this](TBlobBatch **dptr) {
         bool load_success = loader_->Next();
+        if (load_success == false)
+          return false;
         if (*dptr == NULL) {
-          *dptr = new DataBatch();
-          // init NDArrays
-          Context ctx;
-          (*dptr)->data.push_back(NDArray(data_shape_, ctx, false));
-          (*dptr)->data.push_back(NDArray(label_shape_, ctx, false));
+          *dptr = new TBlobBatch();
+          // allocate the data/label buffers; keep the pointers so the destructor can free them
+          real_t* pdata = new real_t[data_shape_.Size()];
+          pdata_vec.push_back(pdata);
+          real_t* plabel = new real_t[label_shape_.Size()];
+          plabel_vec.push_back(plabel);
+          (*dptr)->data.push_back(TBlob(pdata, data_shape_, mshadow::cpu::kDevMask));
+          (*dptr)->data.push_back(TBlob(plabel, label_shape_, mshadow::cpu::kDevMask));
         }
         const TBlobBatch& batch = loader_->Value();
         if (data_shape_.ndim() == 4) {
-          mshadow::Copy((*dptr)->data[0].data().get<mshadow::cpu, 4, float>(),
+          mshadow::Copy((*dptr)->data[0].get<mshadow::cpu, 4, float>(),
                   batch.data[0].get<mshadow::cpu, 4, float>());
         } else if (data_shape_.ndim() == 2) {
-          mshadow::Copy((*dptr)->data[0].data().get<mshadow::cpu, 2, float>(),
+          mshadow::Copy((*dptr)->data[0].get<mshadow::cpu, 2, float>(),
                   batch.data[0].get<mshadow::cpu, 2, float>());
         } else {
           assert(false);
         }
-        mshadow::Copy((*dptr)->data[1].data().get<mshadow::cpu, 2, float>(),
+        mshadow::Copy((*dptr)->data[1].get<mshadow::cpu, 2, float>(),
                 batch.data[1].get<mshadow::cpu, 2, float>());
         return load_success;
       },
@@ -102,19 +114,22 @@ class PrefetcherIter : public IIterator<DataBatch> {
   }
   virtual bool Next(void) {
      if (ready_batches_.size() != 0) {
-         DataBatch* old_batch = ready_batches_.front();
+         TBlobBatch* old_batch = ready_batches_.front();
          for (size_t i = 0; i < old_batch->data.size(); i++) {
-             old_batch->data[i].WaitToWrite();
+             NDArray old_ndarray = ready_ndarrays_.front();
+             old_ndarray.WaitToWrite();
+             ready_ndarrays_.pop();
          }
          iter_.Recycle(&old_batch);
          ready_batches_.pop();
      }
-     DataBatch* next_batch = NULL;
+     TBlobBatch* next_batch = NULL;
      if (!iter_.Next(&next_batch)) return false;
      out_.data.clear();
      // copy the batch
      for (size_t i = 0; i < next_batch->data.size(); i++) {
-         out_.data.push_back(next_batch->data[i]);
+         out_.data.push_back(NDArray(next_batch->data[i], mshadow::cpu::kDevMask));
+         ready_ndarrays_.push(out_.data[i]);
      }
      // push the narrays and batch into the queue
      ready_batches_.push(next_batch);
@@ -129,16 +144,24 @@ class PrefetcherIter : public IIterator<DataBatch> {
   PrefetcherParam param_;
   /*! \brief output data */
   DataBatch out_;
+  /*! \brief batch holder */
+  TBlobBatch out_holder_;
   /*! \brief queue to hold the NDArrays for check whether writable */
-  std::queue<DataBatch*> ready_batches_;
+  std::queue<TBlobBatch*> ready_batches_;
+  /*! \brief NDArrays handed out by Next(); WaitToWrite on them before recycling the batch */
+  std::queue<NDArray> ready_ndarrays_;
   // internal batch loader
   IIterator<TBlobBatch>* loader_;
   // backend thread
-  dmlc::ThreadedIter<DataBatch> iter_;
+  dmlc::ThreadedIter<TBlobBatch> iter_;
   /*! \brief data shape */
   TShape data_shape_;
   /*! \brief label shape */
   TShape label_shape_;
+  /*! \brief pointers to the buffers allocated for data, released in the destructor */
+  std::vector<real_t*> pdata_vec;
+  /*! \brief pointers to the buffers allocated for label, released in the destructor */
+  std::vector<real_t*> plabel_vec;
 };
 }  // namespace io
 }  // namespace mxnet
diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py
index 018233f261bf..a27222c1227a 100644
--- a/tests/python/train/test_conv.py
+++ b/tests/python/train/test_conv.py
@@ -73,13 +73,13 @@ def Update(grad, weight):
 train_dataiter = mx.io.MNISTIter(
         image="data/train-images-idx3-ubyte",
         label="data/train-labels-idx1-ubyte",
-        input_shape=(784,),
-        batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10)
+        input_shape=(1, 28, 28),
+        batch_size=batch_size, shuffle=True, flat=False, silent=False, seed=10)
 val_dataiter = mx.io.MNISTIter(
         image="data/t10k-images-idx3-ubyte",
         label="data/t10k-labels-idx1-ubyte",
-        input_shape=(784,),
-        batch_size=batch_size, shuffle=True, flat=True, silent=False)
+        input_shape=(1, 28, 28),
+        batch_size=batch_size, shuffle=True, flat=False, silent=False)
 
 def test_mnist():
     acc_train = 0.0
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index b5654c774180..23d2afc18c03 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -61,5 +61,5 @@ def test_Cifar10Rec():
         assert(labelcount[i] == 5000)
 
 if __name__ == "__main__":
-    #test_MNISTIter()
-    test_Cifar10Rec()
+    test_MNISTIter()
+    #test_Cifar10Rec()

From 6c4a04bf032d6012a70beaf0ff3509ebce02886f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 17 Sep 2015 20:58:16 -0700
Subject: [PATCH 12/13] minor fix

---
 Makefile | 2 +-
 mshadow  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 5b325f635171..604cb3894b84 100644
--- a/Makefile
+++ b/Makefile
@@ -32,7 +32,7 @@ ifeq ($(DEBUG),0)
 else
 	CFLAGS += -g -O0
 endif
-CFLAGS += -I./mshadow/ -I./dmlc-core/include -fPIC -Iinclude $(MSHADOW_CFLAGS)
+CFLAGS += -I./mshadow/ -I./dmlc-core/include -fPIC -Iinclude $(MSHADOW_CFLAGS) $(DMLC_CFLAGS)
 LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS)
 NVCCFLAGS = --use_fast_math -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 ROOTDIR = $(CURDIR)
diff --git a/mshadow b/mshadow
index a38bb6929f63..2a7cdc9f5081 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit a38bb6929f63671e4ae0919dbbfc8642f43197f2
+Subproject commit 2a7cdc9f50817e739801c2a31161e0fedd9b13eb

From d5d6177b961b75c54f67d8b8747ebab9438092c3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 17 Sep 2015 21:56:16 -0700
Subject: [PATCH 13/13] Update graph_executor.cc

---
 src/symbol/graph_executor.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index 907f9428afbf..3ac19fa4469e 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -168,8 +168,7 @@ inline std::vector<std::pair<T, T> > GraphExecutor::GetInplaceOption(
     std::vector<std::pair<T, T> > remap(remap_index.size());
     for (size_t i = 0; i < remap_index.size(); ++i) {
       if (args_array[remap_index[i].first] == nullptr) {
-        LOG(FATAL) <<
-            "BackwardInplaceOption uses input that is returned by DeclareBackwardDependency";
+        LOG(FATAL) << "BackwardInplaceOption not consistent with DeclareBackwardDependency";
       }
       remap[i].first = *args_array[remap_index[i].first];
       remap[i].second = *static_cast<T*>(remap_index[i].second);