BVLC · jeffdonahue · Jan 26, 2014 · Jan 25, 2014 · Jan 25, 2014 · Jan 25, 2014
diff --git a/python/caffe/detection/__init__.py b/python/caffe/detection/__init__.py
diff --git a/python/caffe/imagenet/power_wrapper.py → python/caffe/detection/detector.py b/python/caffe/imagenet/power_wrapper.py → python/caffe/detection/detector.py
@@ -1,6 +1,6 @@
 """
-Classify a number of images at once, optionally using the selective
-search window proposal method.
+Do windowed detection by classifying a number of images/crops at once,
+optionally using the selective search window proposal method.
 
 This implementation follows
   Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik.
@@ -12,7 +12,9 @@
   https://github.com/sergeyk/selective_search_ijcv_with_python
 
 TODO:
-- [ ] batch up image filenames as well: don't want to load all of them into memory
+- batch up image filenames as well: don't want to load all of them into memory
+- refactor into class (without globals)
+- update demo notebook with new options
 """
 import numpy as np
 import os
@@ -25,19 +27,18 @@
 import selective_search_ijcv_with_python as selective_search
 import caffe
 
-IMAGE_DIM = 256
-CROPPED_DIM = 227
-IMAGE_CENTER = int((IMAGE_DIM - CROPPED_DIM) / 2)
+NET = None
 
-CROP_MODES = ['center_only', 'corners', 'selective_search']
+IMAGE_DIM = None
+CROPPED_DIM = None
+IMAGE_CENTER = None
+
+IMAGE_MEAN = None
+CROPPED_IMAGE_MEAN = None
 
-# Load the imagenet mean file
-IMAGENET_MEAN = np.load(
-    os.path.join(os.path.dirname(__file__), 'ilsvrc_2012_mean.npy'))
-CROPPED_IMAGENET_MEAN = IMAGENET_MEAN[
-  IMAGE_CENTER:IMAGE_CENTER + CROPPED_DIM,
-  IMAGE_CENTER:IMAGE_CENTER + CROPPED_DIM, :]
+NUM_OUTPUT = None
 
+CROP_MODES = ['center_only', 'corners', 'selective_search']
 
 def load_image(filename):
   """
@@ -70,20 +71,16 @@ def format_image(image, window=None, cropped_size=False):
   """
   # Crop a subimage if window is provided.
   if window is not None:
-    image = image[
-      window[0]:window[2],
-      window[1]:window[3]
-    ]
+    image = image[window[0]:window[2], window[1]:window[3]]
 
-
-  # Resize to ImageNet size, convert to BGR, subtract mean.
+  # Resize to input size, convert to BGR, subtract mean.
   image = image[:, :, ::-1]
   if cropped_size:
     image = skimage.transform.resize(image, (CROPPED_DIM, CROPPED_DIM)) * 255
-    image -= CROPPED_IMAGENET_MEAN
+    image -= CROPPED_IMAGE_MEAN
   else:
     image = skimage.transform.resize(image, (IMAGE_DIM, IMAGE_DIM)) * 255
-    image -= IMAGENET_MEAN
+    image -= IMAGE_MEAN
 
   image = image.swapaxes(1, 2).swapaxes(0, 1)
   return image
@@ -239,19 +236,14 @@ def assemble_batches(image_fnames, crop_mode='center_only', batch_size=10):
   return df_batches
 
 
-def compute_feats(images_df, layer='imagenet'):
-  if layer == 'imagenet':
-    num_output = 1000
-  else:
-    raise ValueError("Unknown layer requested: {}".format(layer))
-
+def compute_feats(images_df):
   num = images_df.shape[0]
   input_blobs = [np.ascontiguousarray(
     np.concatenate(images_df['image'].values), dtype='float32')]
-  output_blobs = [np.empty((num, num_output, 1, 1), dtype=np.float32)]
+  output_blobs = [np.empty((num, NUM_OUTPUT, 1, 1), dtype=np.float32)]
   print(input_blobs[0].shape, output_blobs[0].shape)
 
-  caffenet.Forward(input_blobs, output_blobs)
+  NET.Forward(input_blobs, output_blobs)
   feats = [output_blobs[0][i].flatten() for i in range(len(output_blobs[0]))]
 
   # Add the features and delete the images.
@@ -260,55 +252,84 @@ def compute_feats(images_df, layer='imagenet'):
   return images_df
 
 
+def config(model_def, pretrained_model, gpu, image_dim, image_mean_file):
+  """Load the network and set the module-level input/output globals."""
+  global IMAGE_DIM, CROPPED_DIM, IMAGE_CENTER, IMAGE_MEAN, CROPPED_IMAGE_MEAN
+  global NET, NUM_OUTPUT
+
+  # Initialize network by loading model definition and weights.
+  t = time.time()
+  print("Loading Caffe model.")
+  NET = caffe.CaffeNet(model_def, pretrained_model)
+  NET.set_phase_test()
+  if gpu:
+    NET.set_mode_gpu()
+  print("Caffe model loaded in {:.3f} s".format(time.time() - t))
+
+  # Configure for input/output data. image_dim may be a string flag value,
+  # so coerce it to int before doing arithmetic with it.
+  IMAGE_DIM = int(image_dim)
+  CROPPED_DIM = NET.blobs()[0].width
+  IMAGE_CENTER = int((IMAGE_DIM - CROPPED_DIM) / 2)
+
+  # Load the data set mean file and take its center crop.
+  IMAGE_MEAN = np.load(image_mean_file)
+  crop = slice(IMAGE_CENTER, IMAGE_CENTER + CROPPED_DIM)
+  CROPPED_IMAGE_MEAN = IMAGE_MEAN[crop, crop, :]
+
+  NUM_OUTPUT = NET.blobs()[-1].channels  # number of output classes
+
+
 if __name__ == "__main__":
-  ## Parse cmdline options
+  # Parse cmdline options
   gflags.DEFINE_string(
-    "model_def", "", "The model definition file.")
+    "model_def", "", "Model definition file.")
   gflags.DEFINE_string(
-    "pretrained_model", "", "The pretrained model.")
+    "pretrained_model", "", "Pretrained model weights file.")
   gflags.DEFINE_boolean(
-    "gpu", False, "use gpu for computation")
+    "gpu", False, "Switch for gpu computation.")
   gflags.DEFINE_string(
     "crop_mode", "center_only", "Crop mode, from {}".format(CROP_MODES))
   gflags.DEFINE_string(
-    "images_file", "", "File that contains image filenames.")
+    "images_file", "", "Image filenames file.")
   gflags.DEFINE_string(
     "batch_size", 10, "Number of image crops to let through in one go")
   gflags.DEFINE_string(
-    "output", "", "The output DataFrame HDF5 filename.")
+    "output_file", "", "Output DataFrame HDF5 filename.")
+  gflags.DEFINE_integer(
+    "images_dim", 256, "Canonical dimension of (square) images.")
   gflags.DEFINE_string(
-    "layer", "imagenet", "Layer to output.")
+    "images_mean_file",
+    os.path.join(os.path.dirname(__file__), '../imagenet/ilsvrc_2012_mean.npy'),
+    "Data set image mean (numpy array).")
   FLAGS = gflags.FLAGS
   FLAGS(sys.argv)
 
-  ## Load list of image filenames and assemble into batches.
+
+  # Configure network, input, output
+  config(FLAGS.model_def, FLAGS.pretrained_model, FLAGS.gpu, FLAGS.images_dim,
+         FLAGS.images_mean_file)
+
+  # Load list of image filenames and assemble into batches.
   t = time.time()
   print('Assembling batches...')
   with open(FLAGS.images_file) as f:
     image_fnames = [_.strip() for _ in f.readlines()]
-  image_batches = assemble_batches(
-    image_fnames, FLAGS.crop_mode, FLAGS.batch_size)
-  print('{} batches assembled in {:.3f} s'.format(
-    len(image_batches), time.time() - t))
-
-  # Initialize network by loading model definition and weights.
-  t = time.time()
-  print("Loading Caffe model.")
-  caffenet = caffe.CaffeNet(FLAGS.model_def, FLAGS.pretrained_model)
-  caffenet.set_phase_test()
-  if FLAGS.gpu:
-    caffenet.set_mode_gpu()
-  print("Caffe model loaded in {:.3f} s".format(time.time() - t))
+  image_batches = assemble_batches(image_fnames, FLAGS.crop_mode,
+                                   FLAGS.batch_size)
+  print('{} batches assembled in {:.3f} s'.format(len(image_batches),
+                                                  time.time() - t))
 
   # Process the batches.
   t = time.time()
-  print 'Processing {} files in {} batches'.format(
-    len(image_fnames), len(image_batches))
+  print 'Processing {} files in {} batches'.format(len(image_fnames),
+                                                   len(image_batches))
   dfs_with_feats = []
   for i in range(len(image_batches)):
     if i % 10 == 0:
-      print('Batch {}/{}, elapsed time: {:.3f} s'.format(
-        i, len(image_batches), time.time() - t))
+      print('Batch {}/{}, elapsed time: {:.3f} s'.format(i,
+                                                         len(image_batches),
+                                                         time.time() - t))
     dfs_with_feats.append(compute_feats(image_batches[i]))
 
   # Concatenate, dropping the padding rows.
@@ -317,8 +338,8 @@ def compute_feats(images_df, layer='imagenet'):
 
   # Write out the results.
   t = time.time()
-  df.to_hdf(FLAGS.output, 'df', mode='w')
+  df.to_hdf(FLAGS.output_file, 'df', mode='w')
   print("Done. Saving to {} took {:.3f} s.".format(
-    FLAGS.output, time.time() - t))
+    FLAGS.output_file, time.time() - t))
 
   sys.exit()
diff --git a/...affe/imagenet/selective_search_demo.ipynb → ...ffe/detection/selective_search_demo.ipynb b/...affe/imagenet/selective_search_demo.ipynb → ...ffe/detection/selective_search_demo.ipynb
@@ -19,14 +19,14 @@
       "\n",
       "    wget http://farm1.static.flickr.com/220/512450093_7717fb8ce8.jpg\n",
       "    echo `pwd`/\"512450093_7717fb8ce8.jpg\" > image_cat.txt\n",
-      "    python power_wrapper.py --images_file=image_cat.txt --crop_mode=selective_search --model_def=<path to imagenet_deploy.prototxt> --pretrained_model=<path to alexnet_train_iter_470000> --output=selective_cat.h5\n",
+      "    python detector.py --images_file=image_cat.txt --crop_mode=selective_search --model_def=<path to imagenet_deploy.prototxt> --pretrained_model=<path to alexnet_train_iter_470000> --output_file=selective_cat.h5\n",
       "    \n",
       "    \n",
       "Running this outputs an HDF5 file with the filenames, selected windows, and their ImageNet scores.\n",
       "Of course, we only ran on one image, so the filenames will all be the same.\n",
       "\n",
-      "In general, `power_wrapper` is most efficient when running on a lot of images: it first extracts window proposals for all of them, then batches the windows for efficient GPU processing, and then outputs the results.\n",
-      "Simply list an image per line in the `images_file`, and `power_wrapper` will process all of them."
+      "In general, `detector` is most efficient when running on a lot of images: it first extracts window proposals for all of them, then batches the windows for efficient GPU processing, and then outputs the results.\n",
+      "Simply list an image per line in the `images_file`, and `detector` will process all of them."
      ]
     },
     {

diff --git a/python/caffe/imagenet/wrapper.py b/python/caffe/imagenet/wrapper.py
@@ -2,15 +2,12 @@
 from disk, using the imagenet classifier.
 """
 
-from google.protobuf import text_format
-import gzip
 import numpy as np
 import os
 from skimage import io
 from skimage import transform
 
 import caffe
-from caffe.proto import caffe_pb2
 
 IMAGE_DIM = 256
 CROPPED_DIM = 227
@@ -20,8 +17,9 @@
     os.path.join(os.path.dirname(__file__), 'ilsvrc_2012_mean.npy'))
 
 
-def oversample(image, center_only = False):
-  """Oversamples an image. Currently the indices are hard coded to the
+def oversample(image, center_only=False):
+  """
+  Oversamples an image. Currently the indices are hard coded to the
   4 corners and the center of the image, as well as their flipped ones,
   a total of 10 images.
 
@@ -31,7 +29,7 @@ def oversample(image, center_only = False):
   Output:
       images: the output of size (10 x 3 x 227 x 227)
   """
-  image = image.swapaxes(1,2).swapaxes(0,1)
+  image = image.swapaxes(1, 2).swapaxes(0, 1)
   indices = [0, IMAGE_DIM - CROPPED_DIM]
   center = int(indices[1] / 2)
   if center_only:
@@ -46,19 +44,19 @@ def oversample(image, center_only = False):
       for j in indices:
         images[curr] = image[:, i:i + CROPPED_DIM, j:j + CROPPED_DIM]
         curr += 1
-    images[4] = image[
-        :, center:center + CROPPED_DIM,center:center + CROPPED_DIM]
+    images[4] = image[:, center:center + CROPPED_DIM,
+                      center:center + CROPPED_DIM]
     # flipped version
     images[5:] = images[:5, :, :, ::-1]
     return images
 
 
-def prepare_image(filename, center_only = False):
+def prepare_image(filename, center_only=False):
   img = io.imread(filename)
   if img.ndim == 2:
     img = np.tile(img[:, :, np.newaxis], (1, 1, 3))
   elif img.shape[2] == 4:
-    img = img[:,:,:3]
+    img = img[:, :, :3]
   # Resize and convert to BGR
   img_reshape = (transform.resize(img, (IMAGE_DIM,IMAGE_DIM)) * 255)[:, :, ::-1]
   # subtract mean
@@ -67,11 +65,12 @@ def prepare_image(filename, center_only = False):
 
 
 class ImageNetClassifier(object):
-  """The ImageNetClassifier is a wrapper class to perform easier deployment
+  """
+  The ImageNetClassifier is a wrapper class to perform easier deployment
   of models trained on imagenet.
   """
-  def __init__(self, model_def_file, pretrained_model, center_only = False,
-      num_output=1000):
+  def __init__(self, model_def_file, pretrained_model, center_only=False,
+               num_output=1000):
     if center_only:
       num = 1
     else:
@@ -87,7 +86,9 @@ def predict(self, filename):
 
 
 def main(argv):
-  """The main function will carry out classification."""
+  """
+  The main function will carry out classification.
+  """
   import gflags
   import glob
   import time
@@ -99,15 +100,18 @@ def main(argv):
   gflags.DEFINE_boolean("gpu", True, "use gpu for computation")
   FLAGS = gflags.FLAGS
   FLAGS(argv)
+
   net = ImageNetClassifier(FLAGS.model_def, FLAGS.pretrained_model)
+
   if FLAGS.gpu:
     print 'Use gpu.'
     net.caffenet.set_mode_gpu()
+
   files = glob.glob(os.path.join(FLAGS.root, "*." + FLAGS.ext))
   files.sort()
   print 'A total of %d files' % len(files)
   output = np.empty((len(files), net._output_blobs[0].shape[1]),
-      dtype=np.float32)
+                    dtype=np.float32)
   start = time.time()
   for i, f in enumerate(files):
     output[i] = net.predict(f)