namespace caffe {

+ template <typename Dtype>
+ __global__ void Slice(const int nthreads, const Dtype* in_data,
+     const bool forward, const int num_slices, const int slice_size,
+     const int bottom_slice_axis, const int top_slice_axis,
+     const int offset_slice_axis, Dtype* out_data) {
+   // One thread per element of the current top blob: bottom_index is the
+   // matching element in the bottom blob, shifted by offset_slice_axis along
+   // the slice axis. forward copies bottom -> top, otherwise top -> bottom.
+   CUDA_KERNEL_LOOP(index, nthreads) {
+     const int total_slice_size = slice_size * top_slice_axis;
+     const int slice_num = index / total_slice_size;
+     const int slice_index = index % total_slice_size;
+     const int bottom_index = slice_index +
+         (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
+     if (forward) {
+       out_data[index] = in_data[bottom_index];
+     } else {
+       out_data[bottom_index] = in_data[index];
+     }
+   }
+ }
+
template <typename Dtype>
void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  int offset_slice_axis = 0;
  const Dtype* bottom_data = bottom[0]->gpu_data();
  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+   const bool kForward = true;
  for (int i = 0; i < top.size(); ++i) {
    Dtype* top_data = top[i]->mutable_gpu_data();
    const int top_slice_axis = top[i]->shape(slice_axis_);
-     for (int n = 0; n < num_slices_; ++n) {
-       const int top_offset = n * top_slice_axis * slice_size_;
-       const int bottom_offset =
-           (n * bottom_slice_axis + offset_slice_axis) * slice_size_;
-       caffe_copy(top_slice_axis * slice_size_,
-           bottom_data + bottom_offset, top_data + top_offset);
-     }
+     const int top_slice_size = top_slice_axis * slice_size_;
+     const int nthreads = top_slice_size * num_slices_;
+     Slice<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+         <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
+         nthreads, bottom_data, kForward, num_slices_, slice_size_,
+         bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data);
    offset_slice_axis += top_slice_axis;
  }
}
@@ -33,16 +52,16 @@ void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
  int offset_slice_axis = 0;
  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+   const bool kForward = false;
  for (int i = 0; i < top.size(); ++i) {
    const Dtype* top_diff = top[i]->gpu_diff();
    const int top_slice_axis = top[i]->shape(slice_axis_);
-     for (int n = 0; n < num_slices_; ++n) {
-       const int top_offset = n * top_slice_axis * slice_size_;
-       const int bottom_offset =
-           (n * bottom_slice_axis + offset_slice_axis) * slice_size_;
-       caffe_copy(top_slice_axis * slice_size_,
-           top_diff + top_offset, bottom_diff + bottom_offset);
-     }
+     const int top_slice_size = top_slice_axis * slice_size_;
+     const int nthreads = top_slice_size * num_slices_;
+     Slice<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+         <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
+         nthreads, top_diff, kForward, num_slices_, slice_size_,
+         bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff);
    offset_slice_axis += top_slice_axis;
  }
}
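
For reference, the index arithmetic the new Slice kernel relies on can be checked on the host: each thread handles one element of the current top blob and reads (or writes) the bottom element at slice_index + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size. Below is a minimal CPU sketch of the same forward mapping; cpu_slice_forward and the toy shape (2, 4, 3) sliced along axis 1 are illustrative assumptions, not part of this commit.

#include <cassert>
#include <vector>

// Host-side sketch of the index mapping used by the Slice kernel (illustrative only).
void cpu_slice_forward(const std::vector<float>& bottom, std::vector<float>& top,
    int num_slices, int slice_size, int bottom_slice_axis,
    int top_slice_axis, int offset_slice_axis) {
  const int total_slice_size = slice_size * top_slice_axis;
  const int nthreads = total_slice_size * num_slices;
  for (int index = 0; index < nthreads; ++index) {   // one "thread" per top element
    const int slice_num = index / total_slice_size;  // position along the pre-slice axes
    const int slice_index = index % total_slice_size;
    const int bottom_index = slice_index +
        (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
    top[index] = bottom[bottom_index];
  }
}

int main() {
  // Toy case: bottom of shape (2, 4, 3) sliced along axis 1 into tops with
  // channel counts 1 and 3, so num_slices = 2, slice_size = 3, bottom_slice_axis = 4.
  std::vector<float> bottom(2 * 4 * 3);
  for (size_t i = 0; i < bottom.size(); ++i) bottom[i] = static_cast<float>(i);
  std::vector<float> top0(2 * 1 * 3), top1(2 * 3 * 3);
  cpu_slice_forward(bottom, top0, 2, 3, 4, 1, 0);  // first top: channel [0, 1)
  cpu_slice_forward(bottom, top1, 2, 3, 4, 3, 1);  // second top: channels [1, 4)
  assert(top0[0] == bottom[0]);   // (n=0, c=0, x=0) maps straight through
  assert(top1[0] == bottom[3]);   // (n=0, c=1, x=0) is bottom element 3
  assert(top0[3] == bottom[12]);  // (n=1, c=0, x=0) starts the second outer slice
  return 0;
}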