diff --git a/Makefile b/Makefile index 08f81a220bcd..604cb3894b84 100644 --- a/Makefile +++ b/Makefile @@ -12,12 +12,7 @@ ifndef DMLC_CORE DMLC_CORE = dmlc-core endif - -ifneq ($(USE_OPENMP_ITER), 1) - export NO_OPENMP = 1 -endif - -ifneq ($(USE_OPENMP_ITER), 1) +ifneq ($(USE_OPENMP), 1) export NO_OPENMP = 1 endif diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py index 81ba9e5591f7..59b7bd1432d8 100644 --- a/example/cifar10/cifar10.py +++ b/example/cifar10/cifar10.py @@ -201,18 +201,21 @@ def Update(grad, weight, mom): mean_img="data/cifar/cifar_mean.bin", rand_crop=True, rand_mirror=True, + shuffle=False, input_shape=(3,28,28), batch_size=batch_size, - nthread=1) + nthread=4, + prefetch_capacity=6) test_dataiter = mx.io.ImageRecordIter( path_imgrec="data/cifar/test.rec", mean_img="data/cifar/cifar_mean.bin", rand_crop=False, rand_mirror=False, + shuffle=False, input_shape=(3,28,28), batch_size=batch_size, - nthread=1) - + nthread=4, + prefetch_capacity=6) def progress(count, total, epoch, toc): bar_len = 50 diff --git a/example/cifar10/cifar10_multi_gpus.py b/example/cifar10/cifar10_multi_gpus.py index 6ce65a1cbfab..1f1195416f07 100644 --- a/example/cifar10/cifar10_multi_gpus.py +++ b/example/cifar10/cifar10_multi_gpus.py @@ -148,24 +148,25 @@ def momentum_update(key, grad, weight): get_data.GetCifar10() train_dataiter = mx.io.ImageRecordIter( - path_imgrec="data/cifar/train.rec", - mean_img="data/cifar/cifar_mean.bin", - rand_crop=True, - rand_mirror=True, - shuffle=True, - input_shape=(3,28,28), - batch_size=batch_size, - nthread=1) - -val_dataiter = mx.io.ImageRecordIter( - path_imgrec="data/cifar/test.rec", - mean_img="data/cifar/cifar_mean.bin", - rand_crop=False, - rand_mirror=False, - input_shape=(3,28,28), - batch_size=batch_size, - nthread=1) - + path_imgrec="data/cifar/train.rec", + mean_img="data/cifar/cifar_mean.bin", + rand_crop=True, + rand_mirror=True, + shuffle=False, + input_shape=(3,28,28), + batch_size=batch_size, + nthread=4, 
+ prefetch_capacity=6) +test_dataiter = mx.io.ImageRecordIter( + path_imgrec="data/cifar/test.rec", + mean_img="data/cifar/cifar_mean.bin", + rand_crop=False, + rand_mirror=False, + shuffle=False, + input_shape=(3,28,28), + batch_size=batch_size, + nthread=4, + prefetch_capacity=6) def progress(count, total, epoch, tic): bar_len = 50 diff --git a/example/mnist/mlp_gpu.py b/example/mnist/mlp_gpu.py index ef8cd3b84cdc..903aba915af2 100644 --- a/example/mnist/mlp_gpu.py +++ b/example/mnist/mlp_gpu.py @@ -67,14 +67,15 @@ def Update(grad, weight): #check data get_data.GetMNIST_ubyte() - train_dataiter = mx.io.MNISTIter( image="data/train-images-idx3-ubyte", label="data/train-labels-idx1-ubyte", + input_shape=(784,), batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10) val_dataiter = mx.io.MNISTIter( image="data/t10k-images-idx3-ubyte", label="data/t10k-labels-idx1-ubyte", + input_shape=(784,), batch_size=batch_size, shuffle=True, flat=True, silent=False) tmp_label = mx.nd.zeros(name2shape["sm_label"]) diff --git a/example/mnist/mlp_multi_gpu.py b/example/mnist/mlp_multi_gpu.py index bb45b6448879..a874876a7ef0 100644 --- a/example/mnist/mlp_multi_gpu.py +++ b/example/mnist/mlp_multi_gpu.py @@ -60,10 +60,12 @@ def updater(key, grad, weight): train_dataiter = mx.io.MNISTIter( image="data/train-images-idx3-ubyte", label="data/train-labels-idx1-ubyte", + input_shape=(784,), batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10) val_dataiter = mx.io.MNISTIter( image="data/t10k-images-idx3-ubyte", label="data/t10k-labels-idx1-ubyte", + input_shape=(784,), batch_size=batch_size, shuffle=True, flat=True, silent=False) def cal_acc(out, label): diff --git a/include/mxnet/io.h b/include/mxnet/io.h index 43dd5fad92d1..7e2cf8180fd5 100644 --- a/include/mxnet/io.h +++ b/include/mxnet/io.h @@ -11,7 +11,9 @@ #include #include #include +#include #include "./base.h" +#include "./ndarray.h" namespace mxnet { /*! 
@@ -59,27 +61,16 @@ struct DataInst { * data and label, how we use them is to see the DNN implementation. */ struct DataBatch { - public: - /*! \brief unique id for instance, can be NULL, sometimes is useful */ - unsigned *inst_index; - /*! \brief number of instance */ - mshadow::index_t batch_size; - /*! \brief number of padding elements in this batch, - this is used to indicate the last elements in the batch are only padded up to match the batch, and should be discarded */ - mshadow::index_t num_batch_padd; public: /*! \brief content of dense data, if this DataBatch is dense */ - std::vector data; + std::vector data; /*! \brief extra data to be fed to the network */ std::string extra_data; public: /*! \brief constructor */ - DataBatch(void) { - inst_index = NULL; - batch_size = 0; num_batch_padd = 0; - } - /*! \brief giving name to the data */ - void Naming(std::vector names); + DataBatch(void) {} + /*! \brief destructor */ + ~DataBatch() {} }; // struct DataBatch /*! \brief typedef the factory function of data iterator */ @@ -121,10 +112,29 @@ struct DataIteratorReg * \endcode */ #define MXNET_REGISTER_IO_CHAINED_ITER(name, ChainedDataIterType, HoldingDataIterType) \ - static ::mxnet::IIterator* __create__ ## ChainedDataIteratorType ## __() { \ + static ::mxnet::IIterator* __create__ ## ChainedDataIterType ## __() { \ return new HoldingDataIterType(new ChainedDataIterType); \ } \ DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \ - .set_body(__create__ ## ChainedDataIteratorType ## __) + .set_body(__create__ ## ChainedDataIterType ## __) +/*! 
+ * \brief Macro to register three chained Iterators + * + * \code + * // example of registering a imagerec iterator + * MXNET_REGISTER_IO_CHAINED_ITERATOR(ImageRecordIter, + * ImageRecordIter, ImageRecBatchLoader, Prefetcher) + * .describe("batched image record data iterator"); + * + * \endcode + */ +#define MXNET_REGISTER_IO_THREE_CHAINED_ITER(\ + name, FirstIterType, SecondIterType, ThirdIterType) \ + static ::mxnet::IIterator* __create__ ## ThirdIterType ## __() { \ + return new FirstIterType(new SecondIterType(new ThirdIterType)); \ + } \ + DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \ + .set_body(__create__ ## ThirdIterType ## __) + } // namespace mxnet #endif // MXNET_IO_H_ diff --git a/python/mxnet/io.py b/python/mxnet/io.py index aff8e1c8cb00..62e92bd020d5 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -73,7 +73,7 @@ def getdata(self): """ hdl = NDArrayHandle() check_call(_LIB.MXDataIterGetData(self.handle, ctypes.byref(hdl))) - return NDArray(hdl) + return NDArray(hdl, False) def getlabel(self): """get label from batch @@ -81,7 +81,7 @@ def getlabel(self): """ hdl = NDArrayHandle() check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl))) - return NDArray(hdl) + return NDArray(hdl, False) def _make_io_iterator(handle): """Create an io iterator by handle.""" diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py index e5d1b5a903a2..1bb9f465ac59 100644 --- a/python/mxnet/ndarray.py +++ b/python/mxnet/ndarray.py @@ -51,7 +51,7 @@ class NDArray(object): NDArray is basic ndarray/Tensor like data structure in mxnet. 
""" # pylint: disable= no-member - def __init__(self, handle): + def __init__(self, handle, writable=True): """initialize a new NDArray Parameters @@ -61,6 +61,7 @@ def __init__(self, handle): """ assert isinstance(handle, NDArrayHandle) self.handle = handle + self.writable = writable def __del__(self): check_call(_LIB.MXNDArrayFree(self.handle)) @@ -555,6 +556,8 @@ def binary_ndarray_function(lhs, rhs, out=None): if out: if isinstance(out, NDArray) == False: raise TypeError('out must be NDArray') + if not out.writable: + raise TypeError('out must be writable') else: if not accept_empty_mutate: raise TypeError('argument out is required to call %s' % func_name) @@ -570,6 +573,8 @@ def unary_ndarray_function(src, out=None): if out: if isinstance(out, NDArray) == False: raise TypeError('out must be NDArray') + if not out.writable: + raise TypeError('out must be writable') else: if not accept_empty_mutate: raise TypeError('argument out is required to call %s' % func_name) diff --git a/src/c_api.cc b/src/c_api.cc index f1622c905d5b..86f7704b5871 100644 --- a/src/c_api.cc +++ b/src/c_api.cc @@ -872,15 +872,19 @@ int MXDataIterNext(DataIterHandle handle, int *out) { int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle *out) { API_BEGIN(); - DataBatch db = static_cast* >(handle)->Value(); - *out = new NDArray(db.data[1], 0); + const DataBatch& db = static_cast* >(handle)->Value(); + NDArray* pndarray = new NDArray(); + *pndarray = db.data[1]; + *out = pndarray; API_END(); } int MXDataIterGetData(DataIterHandle handle, NDArrayHandle *out) { API_BEGIN(); - DataBatch db = static_cast* >(handle)->Value(); - *out = new NDArray(db.data[0], 0); + const DataBatch& db = static_cast* >(handle)->Value(); + NDArray* pndarray = new NDArray(); + *pndarray = db.data[0]; + *out = pndarray; API_END(); } diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h index a4b77f5a41df..0ce1ab084806 100644 --- a/src/io/image_augmenter.h +++ b/src/io/image_augmenter.h @@ -2,12 +2,13 
@@ * Copyright (c) 2015 by Contributors * \file image_augmenter_opencv.hpp * \brief threaded version of page iterator - * \author Naiyan Wang, Tianqi Chen, Tianjun Xiao */ #ifndef MXNET_IO_IMAGE_AUGMENTER_H_ #define MXNET_IO_IMAGE_AUGMENTER_H_ +#if MXNET_USE_OPENCV #include +#endif #include #include #include @@ -122,6 +123,8 @@ struct ImageAugmentParam : public dmlc::Parameter { .describe("Maximum ratio of contrast variation"); DMLC_DECLARE_FIELD(max_random_illumination).set_default(0.0f) .describe("Maximum value of illumination variation"); + DMLC_DECLARE_FIELD(silent).set_default(true) + .describe("Whether to print augmentor info"); } }; @@ -130,8 +133,10 @@ class ImageAugmenter { public: // contructor ImageAugmenter(void) - : tmpres_(false), - rotateM_(2, 3, CV_32F) { + : tmpres_(false) { +#if MXNET_USE_OPENCV + rotateM_ = cv::Mat(2, 3, CV_32F); +#endif } virtual ~ImageAugmenter() { } @@ -164,6 +169,7 @@ class ImageAugmenter { } } } +#if MXNET_USE_OPENCV /*! * \brief augment src image, store result into dst * this function is not thread safe, and will only be called by one thread @@ -174,6 +180,7 @@ class ImageAugmenter { */ virtual cv::Mat OpencvProcess(const cv::Mat &src, common::RANDOM_ENGINE *prnd) { + if (!NeedOpencvProcess()) return src; // shear float s = NextDouble(prnd) * param_.max_shear_ratio * 2 - param_.max_shear_ratio; // rotate @@ -276,8 +283,9 @@ class ImageAugmenter { } return tmpres_; } - - void TensorProcess(mshadow::TensorContainer *p_data, +#endif + void TensorProcess(mshadow::Tensor *p_data, + mshadow::TensorContainer *dst_data, common::RANDOM_ENGINE *prnd) { // Check Newly Created mean image if (meanfile_ready_ == false && param_.mean_img.length() != 0) { @@ -291,7 +299,8 @@ class ImageAugmenter { meanfile_ready_ = true; } } - img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2])); + img_.Resize(mshadow::Shape3((*p_data).shape_[0], + param_.input_shape[1], param_.input_shape[2])); if 
(param_.input_shape[1] == 1) { img_ = (*p_data) * param_.scale; } else { @@ -355,27 +364,7 @@ class ImageAugmenter { } } } - (*p_data) = img_; - } - - virtual void Process(unsigned char *dptr, size_t sz, - mshadow::TensorContainer *p_data, - common::RANDOM_ENGINE *prnd) { - cv::Mat buf(1, sz, CV_8U, dptr); - cv::Mat res = cv::imdecode(buf, 1); - if (NeedOpencvProcess()) - res = this->OpencvProcess(res, prnd); - p_data->Resize(mshadow::Shape3(3, res.rows, res.cols)); - for (index_t i = 0; i < p_data->size(1); ++i) { - for (index_t j = 0; j < p_data->size(2); ++j) { - cv::Vec3b bgr = res.at(i, j); - (*p_data)[0][i][j] = bgr[2]; - (*p_data)[1][i][j] = bgr[1]; - (*p_data)[2][i][j] = bgr[0]; - } - } - res.release(); - this->TensorProcess(p_data, prnd); + (*dst_data) = img_; } private: @@ -392,11 +381,13 @@ class ImageAugmenter { mshadow::TensorContainer meanimg_; /*! \brief temp space */ mshadow::TensorContainer img_; +#if MXNET_USE_OPENCV // temporal space cv::Mat temp_; // rotation param cv::Mat rotateM_; // whether the mean file is ready +#endif bool meanfile_ready_; // parameters ImageAugmentParam param_; diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h index f2f86751e698..5f0c78d1b9d6 100644 --- a/src/io/inst_vector.h +++ b/src/io/inst_vector.h @@ -13,6 +13,7 @@ #include #include #include +#include namespace mxnet { namespace io { @@ -72,6 +73,11 @@ class InstVector { inline size_t Size(void) const { return index_.size(); } + // get index + inline unsigned Index(unsigned i) const { + return index_[i]; + } + // instance /* \brief get the i-th (label, example) pair */ inline DataInst operator[](size_t i) const { DataInst inst; @@ -109,6 +115,35 @@ class InstVector { // data TensorVector<1, real_t> label_; }; + +/*! + * \brief tblob batch + * + * data are stored in tblob before going into NDArray + */ +struct TBlobBatch { + public: + /*! \brief unique id for instance, can be NULL, sometimes is useful */ + unsigned *inst_index; + /*! 
\brief number of instance */ + mshadow::index_t batch_size; + /*! \brief number of padding elements in this batch, + this is used to indicate the last elements in the batch are only padded up to match the batch, and should be discarded */ + mshadow::index_t num_batch_padd; + /*! \brief content of dense data */ + std::vector data; + /*! \brief extra data to be fed to the network */ + std::string extra_data; + /*! \brief constructor */ + TBlobBatch(void) { + inst_index = NULL; + batch_size = 0; num_batch_padd = 0; + } + /*! \brief destructor */ + ~TBlobBatch() { + delete inst_index; + } +}; // struct TBlobBatch } // namespace io } // namespace mxnet #endif // MXNET_IO_INST_VECTOR_H_ diff --git a/src/io/io.cc b/src/io/io.cc index 8bfb5dbdd570..0bdac7d1576c 100644 --- a/src/io/io.cc +++ b/src/io/io.cc @@ -5,7 +5,8 @@ #include #include #include "./image_augmenter.h" -#include "./iter_batch.h" +#include "./iter_batchloader.h" +#include "./iter_prefetcher.h" // Registers namespace dmlc { @@ -16,6 +17,7 @@ namespace mxnet { namespace io { // Register parameters in header files DMLC_REGISTER_PARAMETER(BatchParam); +DMLC_REGISTER_PARAMETER(PrefetcherParam); DMLC_REGISTER_PARAMETER(ImageAugmentParam); } // namespace io } // namespace mxnet diff --git a/src/io/iter_batch.h b/src/io/iter_batchloader.h similarity index 58% rename from src/io/iter_batch.h rename to src/io/iter_batchloader.h index b45dfd3328e1..41e027f89469 100644 --- a/src/io/iter_batch.h +++ b/src/io/iter_batchloader.h @@ -1,19 +1,19 @@ /*! 
* Copyright (c) 2015 by Contributors - * \file iter_batch_proc-inl.hpp - * \brief definition of preprocessing iterators that takes an iterator and do some preprocessing - * \author Tianqi Chen, Tianjun Xiao + * \file iter_batchloader.h + * \brief define a batch adapter to create tblob batch */ -#ifndef MXNET_IO_ITER_BATCH_H_ -#define MXNET_IO_ITER_BATCH_H_ +#ifndef MXNET_IO_ITER_BATCHLOADER_H_ +#define MXNET_IO_ITER_BATCHLOADER_H_ #include #include #include #include #include -#include #include +#include +#include "./inst_vector.h" namespace mxnet { namespace io { @@ -52,26 +52,40 @@ struct BatchParam : public dmlc::Parameter { }; /*! \brief create a batch iterator from single instance iterator */ -class BatchAdaptIter: public IIterator { +class BatchLoader : public IIterator { public: - explicit BatchAdaptIter(IIterator *base): base_(base), num_overflow_(0) {} - virtual ~BatchAdaptIter(void) { + explicit BatchLoader(IIterator *base): + base_(base), head_(1), num_overflow_(0) {} + virtual ~BatchLoader(void) { delete base_; - FreeSpaceDense(); + // Free space for TblobBatch + mshadow::FreeSpace(&data_holder_); + mshadow::FreeSpace(&label_holder_); } - virtual void Init(const std::vector >& kwargs) { + inline void Init(const std::vector >& kwargs) { std::vector > kwargs_left; // init batch param, it could have similar param with kwargs_left = param_.InitAllowUnknown(kwargs); // init base iterator base_->Init(kwargs); - data_shape_[1] = param_.input_shape[0]; - data_shape_[2] = param_.input_shape[1]; - data_shape_[3] = param_.input_shape[2]; - data_shape_[0] = param_.batch_size; - AllocSpaceDense(false); + std::vector data_shape_vec; + data_shape_vec.push_back(param_.batch_size); + for (size_t shape_dim = 0; shape_dim < param_.input_shape.ndim(); shape_dim++) + data_shape_vec.push_back(param_.input_shape[shape_dim]); + data_shape_ = TShape(data_shape_vec.begin(), data_shape_vec.end()); + std::vector label_shape_vec; + label_shape_vec.push_back(param_.batch_size); + 
label_shape_vec.push_back(param_.label_width); + label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end()); + // Init space for out_ + out_.inst_index = new unsigned[param_.batch_size]; + out_.data.clear(); + data_holder_ = mshadow::NewTensor(data_shape_.get<4>(), 0.0f); + label_holder_ = mshadow::NewTensor(label_shape_.get<2>(), 0.0f); + out_.data.push_back(TBlob(data_holder_)); + out_.data.push_back(TBlob(label_holder_)); } - virtual void BeforeFirst(void) { + inline void BeforeFirst(void) { if (param_.round_batch == 0 || num_overflow_ == 0) { // otherise, we already called before first base_->BeforeFirst(); @@ -80,7 +94,7 @@ class BatchAdaptIter: public IIterator { } head_ = 1; } - virtual bool Next(void) { + inline bool Next(void) { out_.num_batch_padd = 0; // skip read if in head version @@ -95,14 +109,13 @@ class BatchAdaptIter: public IIterator { while (base_->Next()) { const DataInst& d = base_->Value(); - mshadow::Copy(label[top], d.data[1].get()); out_.inst_index[top] = d.index; - mshadow::Copy(data[top], d.data[0].get()); - + mshadow::Copy(out_.data[1].get()[top], + d.data[1].get()); + mshadow::Copy(out_.data[0].get()[top], + d.data[0].get()); if (++ top >= param_.batch_size) { - out_.data[0] = TBlob(data); - out_.data[1] = TBlob(label); - return true; + return true; } } if (top != 0) { @@ -112,61 +125,44 @@ class BatchAdaptIter: public IIterator { for (; top < param_.batch_size; ++top, ++num_overflow_) { CHECK(base_->Next()) << "number of input must be bigger than batch size"; const DataInst& d = base_->Value(); - mshadow::Copy(label[top], d.data[1].get()); out_.inst_index[top] = d.index; - mshadow::Copy(data[top], d.data[0].get()); + mshadow::Copy(out_.data[1].get()[top], + d.data[1].get()); + mshadow::Copy(out_.data[0].get()[top], + d.data[0].get()); } out_.num_batch_padd = num_overflow_; } else { out_.num_batch_padd = param_.batch_size - top; } - out_.data[0] = TBlob(data); - out_.data[1] = TBlob(label); return true; } return false; } - 
virtual const DataBatch &Value(void) const { - CHECK(head_ == 0) << "must call Next to get value"; + virtual const TBlobBatch &Value(void) const { return out_; } private: /*! \brief batch parameters */ BatchParam param_; + /*! \brief output data */ + TBlobBatch out_; /*! \brief base iterator */ IIterator *base_; - /*! \brief output data */ - DataBatch out_; /*! \brief on first */ int head_; /*! \brief number of overflow instances that readed in round_batch mode */ int num_overflow_; - /*! \brief label information of the data*/ - mshadow::Tensor label; - /*! \brief content of dense data, if this DataBatch is dense */ - mshadow::Tensor data; /*! \brief data shape */ - mshadow::Shape<4> data_shape_; - // Functions that allocate and free tensor space - inline void AllocSpaceDense(bool pad = false) { - data = mshadow::NewTensor(data_shape_, 0.0f, pad); - mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width); - label = mshadow::NewTensor(lshape, 0.0f, pad); - out_.inst_index = new unsigned[param_.batch_size]; - out_.batch_size = param_.batch_size; - out_.data.resize(2); - } - /*! \brief auxiliary function to free space, if needed, dense only */ - inline void FreeSpaceDense(void) { - if (label.dptr_ != NULL) { - delete [] out_.inst_index; - mshadow::FreeSpace(&label); - mshadow::FreeSpace(&data); - label.dptr_ = NULL; - } - } -}; // class BatchAdaptIter + TShape data_shape_; + /*! \brief label shape */ + TShape label_shape_; + /*! \brief tensor to hold data */ + mshadow::Tensor data_holder_; + /*! 
\brief tensor to hold label */ + mshadow::Tensor label_holder_; +}; // class BatchLoader } // namespace io } // namespace mxnet -#endif // MXNET_IO_ITER_BATCH_H_ +#endif // MXNET_IO_ITER_BATCHLOADER_H_ diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc index 6f77bb6aac57..e1d3c7bb6eec 100644 --- a/src/io/iter_image_recordio.cc +++ b/src/io/iter_image_recordio.cc @@ -4,6 +4,7 @@ * \brief recordio data iterator */ +#include #include #include #include @@ -17,7 +18,9 @@ iterator #include "./inst_vector.h" #include "./image_recordio.h" #include "./image_augmenter.h" -#include "./iter_batch.h" +#include "./iter_prefetcher.h" +#include "./iter_batchloader.h" + namespace mxnet { namespace io { /*! \brief data structure to hold labels for images */ @@ -121,7 +124,7 @@ struct ImageRecParserParam : public dmlc::Parameter { index_t input_shape_default[] = {3, 224, 224}; DMLC_DECLARE_FIELD(input_shape) .set_default(TShape(input_shape_default, input_shape_default + 3)) - .set_expect_ndim(3).enforce_nonzero() + .enforce_nonzero() .describe("Input shape of the neural net"); } }; @@ -174,6 +177,7 @@ class ImageRecordIOParser { inline void ImageRecordIOParser::Init( const std::vector >& kwargs) { +#if MXNET_USE_OPENCV // initialize parameter // init image rec param param_.InitAllowUnknown(kwargs); @@ -195,7 +199,6 @@ inline void ImageRecordIOParser::Init( augmenters_[i]->Init(kwargs); prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic)); } - if (param_.path_imglist.length() != 0) { label_map_ = new ImageLabelMap(param_.path_imglist.c_str(), param_.label_width, param_.silent != 0); @@ -210,6 +213,9 @@ inline void ImageRecordIOParser::Init( param_.num_parts, "recordio"); // use 64 MB chunk when possible source_->HintChunkSize(8 << 20UL); +#else + LOG(FATAL) << "ImageRec need opencv to process"; +#endif } inline bool ImageRecordIOParser:: @@ -217,7 +223,9 @@ ParseNext(std::vector *out_vec) { CHECK(source_ != NULL); dmlc::InputSplit::Blob chunk; if 
(!source_->NextChunk(&chunk)) return false; - out_vec->resize(param_.nthread); + // save opencv out + std::vector * opencv_out_vec = new std::vector(); + opencv_out_vec->resize(param_.nthread); #pragma omp parallel num_threads(param_.nthread) { CHECK(omp_get_num_threads() == param_.nthread); @@ -226,26 +234,67 @@ ParseNext(std::vector *out_vec) { ImageRecordIO rec; dmlc::InputSplit::Blob blob; // image data - InstVector &out = (*out_vec)[tid]; - out.Clear(); + InstVector &opencv_out = (*opencv_out_vec)[tid]; + opencv_out.Clear(); while (reader.NextRecord(&blob)) { +#if MXNET_USE_OPENCV + // Opencv decode and augments + cv::Mat res; rec.Load(blob.dptr, blob.size); - out.Push(static_cast(rec.image_index()), + cv::Mat buf(1, rec.content_size, CV_8U, rec.content); + res = cv::imdecode(buf, 1); + res = augmenters_[tid]->OpencvProcess(res, prnds_[tid]); + opencv_out.Push(static_cast(rec.image_index()), + mshadow::Shape3(3, res.rows, res.cols), + mshadow::Shape1(param_.label_width)); + DataInst opencv_inst = opencv_out.Back(); + mshadow::Tensor opencv_data = + opencv_inst.data[0].get(); + mshadow::Tensor opencv_label = + opencv_inst.data[1].get(); + for (int i = 0; i < res.rows; ++i) { + for (int j = 0; j < res.cols; ++j) { + cv::Vec3b bgr = res.at(i, j); + opencv_data[0][i][j] = bgr[2]; + opencv_data[1][i][j] = bgr[1]; + opencv_data[2][i][j] = bgr[0]; + } + } + if (label_map_ != NULL) { + mshadow::Copy(opencv_label, label_map_->Find(rec.image_index())); + } else { + opencv_label[0] = rec.header.label; + } + res.release(); +#else + LOG(FATAL) << "Opencv is needed for image decoding and augmenting."; +#endif + } + } + // Tensor Op is not thread safe, so call outside of omp + out_vec->resize(param_.nthread); + for (size_t i = 0; i < opencv_out_vec->size(); i++) { + InstVector &out = (*out_vec)[i]; + InstVector &opencv_out = (*opencv_out_vec)[i]; + out.Clear(); + for (size_t j = 0; j < opencv_out.Size(); j++) { + out.Push(opencv_out.Index(j), 
mshadow::Shape3(param_.input_shape[0], param_.input_shape[1], param_.input_shape[2]), mshadow::Shape1(param_.label_width)); DataInst inst = out.Back(); - // turn datainst into tensor + DataInst opencv_inst = opencv_out[j]; + mshadow::Tensor opencv_data = + opencv_inst.data[0].get(); + mshadow::Tensor opencv_label = + opencv_inst.data[1].get(); mshadow::Tensor data = inst.data[0].get(); mshadow::Tensor label = inst.data[1].get(); - augmenters_[tid]->Process(rec.content, rec.content_size, &img_, prnds_[tid]); + augmenters_[i]->TensorProcess(&opencv_data, &img_, prnds_[i]); mshadow::Copy(data, img_); - if (label_map_ != NULL) { - mshadow::Copy(label, label_map_->Find(rec.image_index())); - } else { - label[0] = rec.header.label; - } + mshadow::Copy(label, opencv_label); } } + delete opencv_out_vec; return true; } @@ -306,7 +355,6 @@ class ImageRecordIter : public IIterator { } } inst_ptr_ = 0; - shuffle_ = param_.shuffle; } virtual void BeforeFirst(void) { iter_.BeforeFirst(); @@ -331,7 +379,7 @@ class ImageRecordIter : public IIterator { } } // shuffle instance order if needed - if (shuffle_ != 0) { + if (param_.shuffle != 0) { std::shuffle(inst_order_.begin(), inst_order_.end(), \ common::RANDOM_ENGINE(kRandMagic + param_.seed)); } @@ -383,8 +431,6 @@ class ImageRecordIter : public IIterator { static const int kRandMagic = 111; // output instance DataInst out_; - // whether shuffle data - int shuffle_; // data ptr size_t inst_ptr_; // internal instance order @@ -403,11 +449,13 @@ class ImageRecordIter : public IIterator { DMLC_REGISTER_PARAMETER(ImageRecParserParam); DMLC_REGISTER_PARAMETER(ImageRecordParam); -MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, BatchAdaptIter) +MXNET_REGISTER_IO_THREE_CHAINED_ITER(ImageRecordIter, + PrefetcherIter, BatchLoader, ImageRecordIter) .describe("Create iterator for dataset packed in recordio.") .add_arguments(ImageRecordParam::__FIELDS__()) + .add_arguments(ImageAugmentParam::__FIELDS__()) 
.add_arguments(ImageRecParserParam::__FIELDS__()) .add_arguments(BatchParam::__FIELDS__()) - .add_arguments(ImageAugmentParam::__FIELDS__()); + .add_arguments(PrefetcherParam::__FIELDS__()); } // namespace io } // namespace mxnet diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc index 77ac3a479f75..454a40e8aa35 100644 --- a/src/io/iter_mnist.cc +++ b/src/io/iter_mnist.cc @@ -2,7 +2,6 @@ * Copyright (c) 2015 by Contributors * \file iter_mnist.cc * \brief register mnist iterator - * \author Tianjun Xiao */ #include #include @@ -13,6 +12,7 @@ #include #include #include +#include "./iter_prefetcher.h" #include "../common/utils.h" namespace mxnet { @@ -50,7 +50,7 @@ struct MNISTParam : public dmlc::Parameter { } }; -class MNISTIter: public IIterator { +class MNISTIter: public IIterator { public: MNISTIter(void) { img_.dptr_ = NULL; @@ -63,18 +63,15 @@ class MNISTIter: public IIterator { // intialize iterator loads data in virtual void Init(const std::vector >& kwargs) { std::map kmap(kwargs.begin(), kwargs.end()); - param_.Init(kmap); + param_.InitAllowUnknown(kmap); this->LoadImage(); this->LoadLabel(); - // set name - this->SetDataName(std::string("data")); - this->SetDataName(std::string("label")); if (param_.flat) { batch_data_.shape_ = mshadow::Shape4(param_.batch_size, 1, 1, img_.size(1) * img_.size(2)); } else { batch_data_.shape_ = mshadow::Shape4(param_.batch_size, 1, img_.size(1), img_.size(2)); } - out_.inst_index = NULL; + out_.data.clear(); batch_label_.shape_ = mshadow::Shape2(param_.batch_size, 1); batch_label_.stride_ = 1; batch_data_.stride_ = batch_data_.size(3); @@ -99,19 +96,20 @@ class MNISTIter: public IIterator { if (loc_ + param_.batch_size <= img_.size(0)) { batch_data_.dptr_ = img_[loc_].dptr_; batch_label_.dptr_ = &labels_[loc_]; - if (param_.flat) - out_.data[0] = TBlob(batch_data_.FlatTo2D()); - else - out_.data[0] = TBlob(batch_data_); - out_.data[1] = TBlob(batch_label_); - out_.inst_index = &inst_[loc_]; + out_.data.clear(); + if 
(param_.flat) { + out_.data.push_back(TBlob(batch_data_.FlatTo2D())); + } else { + out_.data.push_back(TBlob(batch_data_)); + } + out_.data.push_back(TBlob(batch_label_)); loc_ += param_.batch_size; return true; } else { return false; } } - virtual const DataBatch &Value(void) const { + virtual const TBlobBatch &Value(void) const { return out_; } @@ -180,7 +178,7 @@ class MNISTIter: public IIterator { /*! \brief MNIST iter params */ MNISTParam param_; /*! \brief output */ - DataBatch out_; + TBlobBatch out_; /*! \brief current location */ index_t loc_; /*! \brief image content */ @@ -200,8 +198,9 @@ class MNISTIter: public IIterator { }; // class MNISTIter DMLC_REGISTER_PARAMETER(MNISTParam); -MXNET_REGISTER_IO_ITER(MNISTIter, MNISTIter) +MXNET_REGISTER_IO_CHAINED_ITER(MNISTIter, MNISTIter, PrefetcherIter) .describe("Create iterator for MNIST hand-written digit number recognition dataset.") - .add_arguments(MNISTParam::__FIELDS__()); + .add_arguments(MNISTParam::__FIELDS__()) + .add_arguments(PrefetcherParam::__FIELDS__()); } // namespace io } // namespace mxnet diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h new file mode 100644 index 000000000000..cda0cdf6755c --- /dev/null +++ b/src/io/iter_prefetcher.h @@ -0,0 +1,168 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file iter_prefetcher.h + * \brief define a prefetcher using threaditer to keep k batch fetched + */ +#ifndef MXNET_IO_ITER_PREFETCHER_H_ +#define MXNET_IO_ITER_PREFETCHER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "./inst_vector.h" + +namespace mxnet { +namespace io { +// Define prefetcher parameters +struct PrefetcherParam : public dmlc::Parameter { + /*! \brief number of prefetched batches */ + size_t prefetch_capacity; + /*! \brief label width */ + index_t batch_size; + /*! \brief input shape */ + TShape input_shape; + /*! 
\brief label width */ + index_t label_width; + // declare parameters + DMLC_DECLARE_PARAMETER(PrefetcherParam) { + DMLC_DECLARE_FIELD(prefetch_capacity).set_default(1) + .describe("Number of prefetched batches"); + DMLC_DECLARE_FIELD(batch_size) + .describe("Batch size."); + index_t input_shape_default[] = {3, 224, 224}; + DMLC_DECLARE_FIELD(input_shape) + .set_default(TShape(input_shape_default, input_shape_default + 3)) + .enforce_nonzero() + .describe("Input shape of the neural net"); + DMLC_DECLARE_FIELD(label_width).set_default(1) + .describe("Label width."); + } +}; + +// iterator on image recordio +class PrefetcherIter : public IIterator { + public: + explicit PrefetcherIter(IIterator* base) : loader_(base) { + pdata_vec.clear(); + plabel_vec.clear(); + } + virtual ~PrefetcherIter(void) { + iter_.Destroy(); + for (size_t i = 0; i < pdata_vec.size(); i++) { + delete[] pdata_vec[i]; + delete[] plabel_vec[i]; + } + delete loader_; + } + virtual void Init(const std::vector >& kwargs) { + std::vector > kwargs_left; + // init image rec param + kwargs_left = param_.InitAllowUnknown(kwargs); + // use the kwarg to init batch loader + loader_->Init(kwargs); + // create the shape + std::vector data_shape_vec; + data_shape_vec.push_back(param_.batch_size); + for (size_t shape_dim = 0; shape_dim < param_.input_shape.ndim(); shape_dim++) + data_shape_vec.push_back(param_.input_shape[shape_dim]); + data_shape_ = TShape(data_shape_vec.begin(), data_shape_vec.end()); + std::vector label_shape_vec; + label_shape_vec.push_back(param_.batch_size); + label_shape_vec.push_back(param_.label_width); + label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end()); + // init thread iter + iter_.set_max_capacity(param_.prefetch_capacity); + iter_.Init([this](TBlobBatch **dptr) { + bool load_success = loader_->Next(); + if (load_success == false) + return false; + if (*dptr == NULL) { + *dptr = new TBlobBatch(); + // create the spaces and record the pointers + real_t* pdata = 
new real_t[data_shape_.Size()]; + pdata_vec.push_back(pdata); + real_t* plabel = new real_t[label_shape_.Size()]; + plabel_vec.push_back(plabel); + (*dptr)->data.push_back(TBlob(pdata, data_shape_, mshadow::cpu::kDevMask)); + (*dptr)->data.push_back(TBlob(plabel, label_shape_, mshadow::cpu::kDevMask)); + } + const TBlobBatch& batch = loader_->Value(); + if (data_shape_.ndim() == 4) { + mshadow::Copy((*dptr)->data[0].get(), + batch.data[0].get()); + } else if (data_shape_.ndim() == 2) { + mshadow::Copy((*dptr)->data[0].get(), + batch.data[0].get()); + } else { + assert(false); + } + mshadow::Copy((*dptr)->data[1].get(), + batch.data[1].get()); + return load_success; + }, + [this]() { loader_->BeforeFirst(); }); + } + virtual void BeforeFirst(void) { + iter_.BeforeFirst(); + } + virtual bool Next(void) { + if (ready_batches_.size() != 0) { + TBlobBatch* old_batch = ready_batches_.front(); + for (size_t i = 0; i < old_batch->data.size(); i++) { + NDArray old_ndarray = ready_ndarrays_.front(); + old_ndarray.WaitToWrite(); + ready_ndarrays_.pop(); + } + iter_.Recycle(&old_batch); + ready_batches_.pop(); + } + TBlobBatch* next_batch = NULL; + if (!iter_.Next(&next_batch)) return false; + out_.data.clear(); + // copy the batch + for (size_t i = 0; i < next_batch->data.size(); i++) { + out_.data.push_back(NDArray(next_batch->data[i], mshadow::cpu::kDevMask)); + ready_ndarrays_.push(out_.data[i]); + } + // push the narrays and batch into the queue + ready_batches_.push(next_batch); + return true; + } + virtual const DataBatch &Value(void) const { + return out_; + } + + private: + /*! \brief prefetcher parameters */ + PrefetcherParam param_; + /*! \brief output data */ + DataBatch out_; + /*! \brief batch holder */ + TBlobBatch out_holder_; + /*! \brief queue to hold the NDArrays for check whether writable */ + std::queue ready_batches_; + /*! 
\brief ndarrays to wait to write */ + std::queue<NDArray> ready_ndarrays_; + // internal batch loader + IIterator<TBlobBatch>* loader_; + // backend thread + dmlc::ThreadedIter<TBlobBatch> iter_; + /*! \brief data shape */ + TShape data_shape_; + /*! \brief label shape */ + TShape label_shape_; + /*! \brief log the pointers of the space created for data*/ + std::vector<real_t*> pdata_vec; + /*! \brief log the pointers of the space created for label*/ + std::vector<real_t*> plabel_vec; +}; +} // namespace io +} // namespace mxnet +#endif // MXNET_IO_ITER_PREFETCHER_H_ diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py index f7f0f1acb043..a27222c1227a 100644 --- a/tests/python/train/test_conv.py +++ b/tests/python/train/test_conv.py @@ -73,11 +73,13 @@ def Update(grad, weight): train_dataiter = mx.io.MNISTIter( image="data/train-images-idx3-ubyte", label="data/train-labels-idx1-ubyte", - batch_size=batch_size, shuffle=True, silent=False, seed=10) + input_shape=(1, 28, 28), + batch_size=batch_size, shuffle=True, flat=False, silent=False, seed=10) val_dataiter = mx.io.MNISTIter( image="data/t10k-images-idx3-ubyte", label="data/t10k-labels-idx1-ubyte", - batch_size=batch_size, shuffle=True, silent=False) + input_shape=(1, 28, 28), + batch_size=batch_size, shuffle=True, flat=False, silent=False) def test_mnist(): acc_train = 0.0 diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py index 315564ea5057..651c85842a6a 100644 --- a/tests/python/train/test_mlp.py +++ b/tests/python/train/test_mlp.py @@ -56,10 +56,12 @@ def Update(grad, weight): train_dataiter = mx.io.MNISTIter( image="data/train-images-idx3-ubyte", label="data/train-labels-idx1-ubyte", + input_shape=(784,), batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10) val_dataiter = mx.io.MNISTIter( image="data/t10k-images-idx3-ubyte", label="data/t10k-labels-idx1-ubyte", + input_shape=(784,), batch_size=batch_size, shuffle=True, flat=True, silent=False) def test_mlp(): diff --git 
a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py index e606f9254b5a..23d2afc18c03 100644 --- a/tests/python/unittest/test_io.py +++ b/tests/python/unittest/test_io.py @@ -3,9 +3,9 @@ import numpy as np import os, gzip import pickle as pickle +import time +import sys from common import get_data -#from PIL import Image - def test_MNISTIter(): # prepare data @@ -15,11 +15,8 @@ def test_MNISTIter(): train_dataiter = mx.io.MNISTIter( image="data/train-images-idx3-ubyte", label="data/train-labels-idx1-ubyte", + input_shape=(784,), batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10) - val_dataiter = mx.io.MNISTIter( - image="data/t10k-images-idx3-ubyte", - label="data/t10k-labels-idx1-ubyte", - batch_size=batch_size, shuffle=0, flat=1, silent=0) # test_loop nbatch = 60000 / batch_size batch_count = 0 @@ -39,68 +36,30 @@ def test_MNISTIter(): label_1 = train_dataiter.getlabel().asnumpy().flatten() assert(sum(label_0 - label_1) == 0) -''' -def test_ImageRecIter(): - dataiter = mx.io.ImageRecordIter( - path_imgrec="data/val_cxxnet.rec", - mean_img="data/smallset/image_net_mean.bin", - rand_crop=True, - mirror=True, - input_shape=(3,227,227), - batch_size=100, - nthread=1, - seed=10) - labelcount = [0 for i in range(1000)] - batchcount = 0 - for data, label in dataiter: - npdata = data.numpy - print npdata[0,:,:,:] - imgdata = np.zeros([227, 227, 3], dtype=np.uint8) - imgdata[:,:,0] = npdata[10,2,:,:] - imgdata[:,:,1] = npdata[10,1,:,:] - imgdata[:,:,2] = npdata[10,0,:,:] - img = Image.fromarray(imgdata) - imgpath = "data/smallset/test_3.jpg" - img.save(imgpath, format='JPEG') - exit(0) - print batchcount - sys.stdout.flush() - batchcount += 1 - nplabel = label.numpy - for i in range(nplabel.shape[0]): - labelcount[int(nplabel[i])] += 1 - def test_Cifar10Rec(): + get_data.GetCifar10() dataiter = mx.io.ImageRecordIter( - path_imgrec="data/cifar/test.rec", + path_imgrec="data/cifar/train.rec", mean_img="data/cifar/cifar10_mean.bin", - 
rand_crop=True, - rand_mirror=True, + rand_crop=False, + rand_mirror=False, + shuffle=False, input_shape=(3,28,28), batch_size=100, - nthread=1) + nthread=4, + prefetch_capacity=1) labelcount = [0 for i in range(10)] batchcount = 0 for data, label in dataiter: - npdata = data.numpy - print npdata[0,:,:,:] - imgdata = np.zeros([28, 28, 3], dtype=np.uint8) - imgdata[:,:,0] = npdata[0,2,:,:] - imgdata[:,:,1] = npdata[0,1,:,:] - imgdata[:,:,2] = npdata[0,0,:,:] - img = Image.fromarray(imgdata) - imgpath = "data/cifar/test.jpg" - img.save(imgpath, format='JPEG') - exit(0) - print "Batch: ", batchcount + npdata = data.asnumpy().flatten().sum() sys.stdout.flush() batchcount += 1 - nplabel = label.numpy + nplabel = label.asnumpy() for i in range(nplabel.shape[0]): labelcount[int(nplabel[i])] += 1 for i in range(10): - assert(labelcount[i] == 1000) + assert(labelcount[i] == 5000) -''' if __name__ == "__main__": test_MNISTIter() #test_Cifar10Rec()