# Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Image pre-processing utilities. """ import tensorflow as tf IMAGE_DEPTH = 3 # color images import tensorflow as tf # _R_MEAN = 123.68 # _G_MEAN = 116.78 # _B_MEAN = 103.94 # _CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN] _CHANNEL_MEANS = [0.0, 0.0, 0.0] # The lower bound for the smallest side of the image for aspect-preserving # resizing. For example, if an image is 500 x 1000, it will be resized to # _RESIZE_MIN x (_RESIZE_MIN * 2). _RESIZE_MIN = 128 def _decode_crop_and_flip(image_buffer, bbox, num_channels): """Crops the given image to a random part of the image, and randomly flips. We use the fused decode_and_crop op, which performs better than the two ops used separately in series, but note that this requires that the image be passed in as an un-decoded string Tensor. Args: image_buffer: scalar string Tensor representing the raw JPEG image buffer. bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged as [ymin, xmin, ymax, xmax]. num_channels: Integer depth of the image buffer for decoding. Returns: 3-D tensor with cropped image. """ # A large fraction of image datasets contain a human-annotated bounding box # delineating the region of the image containing the object of interest. We # choose to create a new bounding box for the object which is a randomly # distorted version of the human-annotated bounding box that obeys an # allowed range of aspect ratios, sizes and overlap with the human-annotated # bounding box. If no box is supplied, then we assume the bounding box is # the entire image. sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( tf.image.extract_jpeg_shape(image_buffer), bounding_boxes=bbox, min_object_covered=0.1, aspect_ratio_range=[0.75, 1.33], area_range=[0.05, 1.0], max_attempts=100, use_image_if_no_bounding_boxes=True) bbox_begin, bbox_size, _ = sample_distorted_bounding_box # Reassemble the bounding box in the format the crop op requires. offset_y, offset_x, _ = tf.unstack(bbox_begin) target_height, target_width, _ = tf.unstack(bbox_size) crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) # Use the fused decode and crop op here, which is faster than each in series. cropped = tf.image.decode_and_crop_jpeg( image_buffer, crop_window, channels=num_channels) # Flip to add a little more random distortion in. cropped = tf.image.random_flip_left_right(cropped) return cropped def _central_crop(image, crop_height, crop_width): """Performs central crops of the given image list. Args: image: a 3-D image tensor crop_height: the height of the image following the crop. crop_width: the width of the image following the crop. Returns: 3-D tensor with cropped image. """ shape = tf.shape(input=image) height, width = shape[0], shape[1] amount_to_be_cropped_h = (height - crop_height) crop_top = amount_to_be_cropped_h // 2 amount_to_be_cropped_w = (width - crop_width) crop_left = amount_to_be_cropped_w // 2 return tf.slice( image, [crop_top, crop_left, 0], [crop_height, crop_width, -1]) def _mean_image_subtraction(image, means, num_channels): """Subtracts the given means from each image channel. For example: means = [123.68, 116.779, 103.939] image = _mean_image_subtraction(image, means) Note that the rank of `image` must be known. Args: image: a tensor of size [height, width, C]. means: a C-vector of values to subtract from each channel. num_channels: number of color channels in the image that will be distorted. Returns: the centered image. Raises: ValueError: If the rank of `image` is unknown, if `image` has a rank other than three or if the number of channels in `image` doesn't match the number of values in `means`. """ if image.get_shape().ndims != 3: raise ValueError('Input must be of size [height, width, C>0]') if len(means) != num_channels: raise ValueError('len(means) must match the number of channels') # We have a 1-D tensor of means; convert to 3-D. means = tf.expand_dims(tf.expand_dims(means, 0), 0) return image - means def _smallest_size_at_least(height, width, resize_min): """Computes new shape with the smallest side equal to `smallest_side`. Computes new shape with the smallest side equal to `smallest_side` while preserving the original aspect ratio. Args: height: an int32 scalar tensor indicating the current height. width: an int32 scalar tensor indicating the current width. resize_min: A python integer or scalar `Tensor` indicating the size of the smallest side after resize. Returns: new_height: an int32 scalar tensor indicating the new height. new_width: an int32 scalar tensor indicating the new width. """ resize_min = tf.cast(resize_min, tf.float32) # Convert to floats to make subsequent calculations go smoothly. height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32) smaller_dim = tf.minimum(height, width) scale_ratio = resize_min / smaller_dim # Convert back to ints to make heights and widths that TF ops will accept. new_height = tf.cast(tf.ceil(height * scale_ratio), tf.int32) new_width = tf.cast(tf.ceil(width * scale_ratio), tf.int32) return new_height, new_width def _aspect_preserving_resize(image, resize_min): """Resize images preserving the original aspect ratio. Args: image: A 3-D image `Tensor`. resize_min: A python integer or scalar `Tensor` indicating the size of the smallest side after resize. Returns: resized_image: A 3-D tensor containing the resized image. """ shape = tf.shape(input=image) height, width = shape[0], shape[1] new_height, new_width = _smallest_size_at_least(height, width, resize_min) return _resize_image(image, new_height, new_width) def _resize_image(image, height, width): """Simple wrapper around tf.resize_images. This is primarily to make sure we use the same `ResizeMethod` and other details each time. Args: image: A 3-D image `Tensor`. height: The target height for the resized image. width: The target width for the resized image. Returns: resized_image: A 3-D tensor containing the resized image. The first two dimensions have the shape [height, width]. """ return tf.image.resize_images( image, [height, width], method=tf.image.ResizeMethod.BILINEAR, align_corners=False) def preprocess_image(image_buffer, bbox, output_height, output_width, num_channels, is_training=False): """Preprocesses the given image. Preprocessing includes decoding, cropping, and resizing for both training and eval images. Training preprocessing, however, introduces some random distortion of the image to improve accuracy. Args: image_buffer: scalar string Tensor representing the raw JPEG image buffer. bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged as [ymin, xmin, ymax, xmax]. output_height: The height of the image after preprocessing. output_width: The width of the image after preprocessing. num_channels: Integer depth of the image buffer for decoding. is_training: `True` if we're preprocessing the image for training and `False` otherwise. Returns: A preprocessed image. """ if is_training: # For training, we want to randomize some of the distortions. image = _decode_crop_and_flip(image_buffer, bbox, num_channels) image = _resize_image(image, output_height, output_width) else: # For validation, we want to decode, resize, then just crop the middle. image = tf.image.decode_jpeg(image_buffer, channels=num_channels) image = _aspect_preserving_resize(image, _RESIZE_MIN) print(image) image = _central_crop(image, output_height, output_width) image.set_shape([output_height, output_width, num_channels]) return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels) def parse_example_proto(example_serialized): """Parses an Example proto containing a training example of an image. The output of the build_image_data.py image preprocessing script is a dataset containing serialized Example protocol buffers. Each Example proto contains the following fields: image/height: 462 image/width: 581 image/colorspace: 'RGB' image/channels: 3 image/class/label: 615 image/class/synset: 'n03623198' image/class/text: 'knee pad' image/object/bbox/xmin: 0.1 image/object/bbox/xmax: 0.9 image/object/bbox/ymin: 0.2 image/object/bbox/ymax: 0.6 image/object/bbox/label: 615 image/format: 'JPEG' image/filename: 'ILSVRC2012_val_00041207.JPEG' image/encoded: Args: example_serialized: scalar Tensor tf.string containing a serialized Example protocol buffer. Returns: image_buffer: Tensor tf.string containing the contents of a JPEG file. label: Tensor tf.int32 containing the label. bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged as [ymin, xmin, ymax, xmax]. text: Tensor tf.string containing the human-readable label. """ # Dense features in Example proto. feature_map = { 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, default_value=''), 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, default_value=-1), 'image/class/text': tf.FixedLenFeature([], dtype=tf.string, default_value=''), } sparse_float32 = tf.VarLenFeature(dtype=tf.float32) # Sparse features in Example proto. feature_map.update( {k: sparse_float32 for k in ['image/object/bbox/xmin', 'image/object/bbox/ymin', 'image/object/bbox/xmax', 'image/object/bbox/ymax']}) features = tf.parse_single_example(example_serialized, feature_map) label = tf.cast(features['image/class/label'], dtype=tf.int32) xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) # Note that we impose an ordering of (y, x) just to make life difficult. bbox = tf.concat([ymin, xmin, ymax, xmax], 0) # Force the variable number of bounding boxes into the shape # [1, num_boxes, coords]. bbox = tf.expand_dims(bbox, 0) bbox = tf.transpose(bbox, [0, 2, 1]) return features['image/encoded'], label, bbox, features['image/class/text'] class ImagenetPreprocessor: def __init__(self, image_size, dtype, train): self.image_size = image_size self.dtype = dtype self.train = train def preprocess(self, image_buffer, bbox): # pylint: disable=g-import-not-at-top image = preprocess_image(image_buffer, bbox, self.image_size, self.image_size, IMAGE_DEPTH, is_training=self.train) return tf.cast(image, self.dtype) def parse_and_preprocess(self, value): image_buffer, label_index, bbox, _ = parse_example_proto(value) image = self.preprocess(image_buffer, bbox) image = tf.reshape(image, [self.image_size, self.image_size, IMAGE_DEPTH]) return label_index, image