BVLC · jeffdonahue · Aug 26, 2014 · Aug 25, 2014 · Aug 25, 2014
diff --git a/examples/cifar10/cifar10_full_solver.prototxt b/examples/cifar10/cifar10_full_solver.prototxt
@@ -23,9 +23,4 @@ max_iter: 60000
 snapshot: 10000
 snapshot_prefix: "cifar10_full"
 # solver mode: CPU or GPU
-# Note: there seems to be a bug with CPU computation in the pooling layers,
-# and changing to solver_mode: CPU may result in NaNs on this example.
-# If you want to train a variant of this architecture on the
-# CPU, try changing the pooling regions from WITHIN_CHANNEL to ACROSS_CHANNELS
-# in both cifar_full_train.prototxt and cifar_full_test.prototxt.
 solver_mode: GPU
diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
@@ -115,6 +115,8 @@ class EltwiseLayer : public Layer<Dtype> {
 
   EltwiseParameter_EltwiseOp op_;
   vector<Dtype> coeffs_;
+
+  bool stable_prod_grad_;  // set in LayerSetUp from eltwise_param
 };
 
 /* FlattenLayer

diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
@@ -35,6 +35,7 @@ void EltwiseLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       coeffs_[i] = this->layer_param().eltwise_param().coeff(i);
     }
   }
+  stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad();
 }
 
 template <typename Dtype>
@@ -73,7 +74,21 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       Dtype* bottom_diff = (*bottom)[i]->mutable_cpu_diff();
       switch (op_) {
       case EltwiseParameter_EltwiseOp_PROD:
-        caffe_div(count, top_data, bottom_data, bottom_diff);
+        if (stable_prod_grad_) {  // combine the data of every bottom except i
+          bool initialized = false;  // true once bottom_diff has been seeded
+          for (int j = 0; j < bottom->size(); ++j) {
+            if (i == j) { continue; }  // leave out this bottom's own data
+            if (!initialized) {
+              caffe_copy(count, (*bottom)[j]->cpu_data(), bottom_diff);
+              initialized = true;
+            } else {
+              caffe_mul(count, (*bottom)[j]->cpu_data(), bottom_diff,
+                        bottom_diff);
+            }
+          }
+        } else {  // old path: single caffe_div of top_data by bottom_data
+          caffe_div(count, top_data, bottom_data, bottom_diff);
+        }
         caffe_mul(count, bottom_diff, top_diff, bottom_diff);
         break;
       case EltwiseParameter_EltwiseOp_SUM:

diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu
@@ -43,7 +43,21 @@ void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       Dtype* bottom_diff = (*bottom)[i]->mutable_gpu_diff();
       switch (op_) {
       case EltwiseParameter_EltwiseOp_PROD:
-        caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
+        if (stable_prod_grad_) {  // combine the data of every bottom except i
+          bool initialized = false;  // true once bottom_diff has been seeded
+          for (int j = 0; j < bottom->size(); ++j) {
+            if (i == j) { continue; }  // leave out this bottom's own data
+            if (!initialized) {
+              caffe_copy(count, (*bottom)[j]->gpu_data(), bottom_diff);
+              initialized = true;
+            } else {
+              caffe_gpu_mul(count, (*bottom)[j]->gpu_data(), bottom_diff,
+                            bottom_diff);
+            }
+          }
+        } else {  // old path: single caffe_gpu_div of top_data by bottom_data
+          caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
+        }
         caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
         break;
       case EltwiseParameter_EltwiseOp_SUM:

diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
@@ -420,6 +420,10 @@ message EltwiseParameter {
   }
   optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
   repeated float coeff = 2; // blob-wise coefficient for SUM operation
+
+  // Whether to use an asymptotically slower (for >2 inputs) but numerically
+  // stabler gradient computation for the PROD operation. (No effect for SUM.)
+  optional bool stable_prod_grad = 3 [default = true];
 }
 
 // Message that stores parameters used by ThresholdLayer

diff --git a/src/caffe/test/test_eltwise_layer.cpp b/src/caffe/test/test_eltwise_layer.cpp
@@ -124,11 +124,24 @@ TYPED_TEST(EltwiseLayerTest, TestSumCoeff) {
   }
 }
 
-TYPED_TEST(EltwiseLayerTest, TestProdGradient) {
+TYPED_TEST(EltwiseLayerTest, TestStableProdGradient) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
   EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param();
   eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD);
+  eltwise_param->set_stable_prod_grad(true);  // check the stable PROD path
+  EltwiseLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientEltwise(&layer, &(this->blob_bottom_vec_),
+      &(this->blob_top_vec_));
+}
+
+TYPED_TEST(EltwiseLayerTest, TestUnstableProdGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param();
+  eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD);
+  eltwise_param->set_stable_prod_grad(false);
   EltwiseLayer<Dtype> layer(layer_param);
   GradientChecker<Dtype> checker(1e-2, 1e-3);
   checker.CheckGradientEltwise(&layer, &(this->blob_bottom_vec_),