chores: refactor for the new ai research, add linter, gh action, etc (#27)

2025-11-23 08:00:26 -05:00 · 2025-08-13 21:49:46 +08:00 · 2025-08-13 21:49:46 +08:00 · d5467e559f
commit d5467e559f
parent fb4ab80dc3
40 changed files with 5177 additions and 2476 deletions
--- a/EBMs/imagenet_preprocessing.py
+++ b/EBMs/imagenet_preprocessing.py
@ -13,14 +13,11 @@
 # limitations under the License.
 # ==============================================================================

-"""Image pre-processing utilities.
-"""
+"""Image pre-processing utilities."""
 import tensorflow as tf

+IMAGE_DEPTH = 3  # color images

-IMAGE_DEPTH = 3 # color images
-
-import tensorflow as tf

 # _R_MEAN = 123.68
 # _G_MEAN = 116.78
@ -35,303 +32,318 @@ _RESIZE_MIN = 128


 def _decode_crop_and_flip(image_buffer, bbox, num_channels):
-  """Crops the given image to a random part of the image, and randomly flips.
+    """Crops the given image to a random part of the image, and randomly flips.

-  We use the fused decode_and_crop op, which performs better than the two ops
-  used separately in series, but note that this requires that the image be
-  passed in as an un-decoded string Tensor.
+    We use the fused decode_and_crop op, which performs better than the two ops
+    used separately in series, but note that this requires that the image be
+    passed in as an un-decoded string Tensor.

-  Args:
-    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
-    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
-      where each coordinate is [0, 1) and the coordinates are arranged as
-      [ymin, xmin, ymax, xmax].
-    num_channels: Integer depth of the image buffer for decoding.
+    Args:
+      image_buffer: scalar string Tensor representing the raw JPEG image buffer.
+      bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+        where each coordinate is [0, 1) and the coordinates are arranged as
+        [ymin, xmin, ymax, xmax].
+      num_channels: Integer depth of the image buffer for decoding.

-  Returns:
-    3-D tensor with cropped image.
+    Returns:
+      3-D tensor with cropped image.

-  """
-  # A large fraction of image datasets contain a human-annotated bounding box
-  # delineating the region of the image containing the object of interest.  We
-  # choose to create a new bounding box for the object which is a randomly
-  # distorted version of the human-annotated bounding box that obeys an
-  # allowed range of aspect ratios, sizes and overlap with the human-annotated
-  # bounding box. If no box is supplied, then we assume the bounding box is
-  # the entire image.
-  sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
-      tf.image.extract_jpeg_shape(image_buffer),
-      bounding_boxes=bbox,
-      min_object_covered=0.1,
-      aspect_ratio_range=[0.75, 1.33],
-      area_range=[0.05, 1.0],
-      max_attempts=100,
-      use_image_if_no_bounding_boxes=True)
-  bbox_begin, bbox_size, _ = sample_distorted_bounding_box
+    """
+    # A large fraction of image datasets contain a human-annotated bounding box
+    # delineating the region of the image containing the object of interest.  We
+    # choose to create a new bounding box for the object which is a randomly
+    # distorted version of the human-annotated bounding box that obeys an
+    # allowed range of aspect ratios, sizes and overlap with the human-annotated
+    # bounding box. If no box is supplied, then we assume the bounding box is
+    # the entire image.
+    sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
+        tf.image.extract_jpeg_shape(image_buffer),
+        bounding_boxes=bbox,
+        min_object_covered=0.1,
+        aspect_ratio_range=[0.75, 1.33],
+        area_range=[0.05, 1.0],
+        max_attempts=100,
+        use_image_if_no_bounding_boxes=True,
+    )
+    bbox_begin, bbox_size, _ = sample_distorted_bounding_box

-  # Reassemble the bounding box in the format the crop op requires.
-  offset_y, offset_x, _ = tf.unstack(bbox_begin)
-  target_height, target_width, _ = tf.unstack(bbox_size)
-  crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
+    # Reassemble the bounding box in the format the crop op requires.
+    offset_y, offset_x, _ = tf.unstack(bbox_begin)
+    target_height, target_width, _ = tf.unstack(bbox_size)
+    crop_window = tf.stack([offset_y, offset_x, target_height, target_width])

-  # Use the fused decode and crop op here, which is faster than each in series.
-  cropped = tf.image.decode_and_crop_jpeg(
-      image_buffer, crop_window, channels=num_channels)
+    # Use the fused decode and crop op here, which is faster than each in
+    # series.
+    cropped = tf.image.decode_and_crop_jpeg(
+        image_buffer, crop_window, channels=num_channels
+    )

-  # Flip to add a little more random distortion in.
-  cropped = tf.image.random_flip_left_right(cropped)
-  return cropped
+    # Flip to add a little more random distortion in.
+    cropped = tf.image.random_flip_left_right(cropped)
+    return cropped


 def _central_crop(image, crop_height, crop_width):
-  """Performs central crops of the given image list.
+    """Performs a central crop of the given image.

-  Args:
-    image: a 3-D image tensor
-    crop_height: the height of the image following the crop.
-    crop_width: the width of the image following the crop.
+    Args:
+      image: a 3-D image tensor
+      crop_height: the height of the image following the crop.
+      crop_width: the width of the image following the crop.

-  Returns:
-    3-D tensor with cropped image.
-  """
-  shape = tf.shape(input=image)
-  height, width = shape[0], shape[1]
+    Returns:
+      3-D tensor with cropped image.
+    """
+    shape = tf.shape(input=image)
+    height, width = shape[0], shape[1]

-  amount_to_be_cropped_h = (height - crop_height)
-  crop_top = amount_to_be_cropped_h // 2
-  amount_to_be_cropped_w = (width - crop_width)
-  crop_left = amount_to_be_cropped_w // 2
-  return tf.slice(
-      image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
+    amount_to_be_cropped_h = height - crop_height
+    crop_top = amount_to_be_cropped_h // 2
+    amount_to_be_cropped_w = width - crop_width
+    crop_left = amount_to_be_cropped_w // 2
+    return tf.slice(image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])


 def _mean_image_subtraction(image, means, num_channels):
-  """Subtracts the given means from each image channel.
+    """Subtracts the given means from each image channel.

-  For example:
-    means = [123.68, 116.779, 103.939]
-    image = _mean_image_subtraction(image, means)
+    For example:
+      means = [123.68, 116.779, 103.939]
+      image = _mean_image_subtraction(image, means, num_channels=3)

-  Note that the rank of `image` must be known.
+    Note that the rank of `image` must be known.

-  Args:
-    image: a tensor of size [height, width, C].
-    means: a C-vector of values to subtract from each channel.
-    num_channels: number of color channels in the image that will be distorted.
+    Args:
+      image: a tensor of size [height, width, C].
+      means: a C-vector of values to subtract from each channel.
+      num_channels: number of color channels in the image; must equal len(means).

-  Returns:
-    the centered image.
+    Returns:
+      the centered image.

-  Raises:
-    ValueError: If the rank of `image` is unknown, if `image` has a rank other
-      than three or if the number of channels in `image` doesn't match the
-      number of values in `means`.
-  """
-  if image.get_shape().ndims != 3:
-    raise ValueError('Input must be of size [height, width, C>0]')
+    Raises:
+      ValueError: If the rank of `image` is unknown, if `image` has a rank other
+        than three, or if `num_channels` doesn't match the number of values
+        in `means`.
+    """
+    if image.get_shape().ndims != 3:
+        raise ValueError("Input must be of size [height, width, C>0]")

-  if len(means) != num_channels:
-    raise ValueError('len(means) must match the number of channels')
+    if len(means) != num_channels:
+        raise ValueError("len(means) must match the number of channels")

-  # We have a 1-D tensor of means; convert to 3-D.
-  means = tf.expand_dims(tf.expand_dims(means, 0), 0)
+    # We have a 1-D tensor of means; convert to 3-D.
+    means = tf.expand_dims(tf.expand_dims(means, 0), 0)

-  return image - means
+    return image - means


 def _smallest_size_at_least(height, width, resize_min):
-  """Computes new shape with the smallest side equal to `smallest_side`.
+    """Computes new shape with the smallest side equal to `resize_min`.

-  Computes new shape with the smallest side equal to `smallest_side` while
-  preserving the original aspect ratio.
+    Computes new shape with the smallest side equal to `resize_min` while
+    preserving the original aspect ratio (both sides are rounded up to integers).

-  Args:
-    height: an int32 scalar tensor indicating the current height.
-    width: an int32 scalar tensor indicating the current width.
-    resize_min: A python integer or scalar `Tensor` indicating the size of
-      the smallest side after resize.
+    Args:
+      height: an int32 scalar tensor indicating the current height.
+      width: an int32 scalar tensor indicating the current width.
+      resize_min: A python integer or scalar `Tensor` indicating the size of
+        the smallest side after resize.

-  Returns:
-    new_height: an int32 scalar tensor indicating the new height.
-    new_width: an int32 scalar tensor indicating the new width.
-  """
-  resize_min = tf.cast(resize_min, tf.float32)
+    Returns:
+      new_height: an int32 scalar tensor indicating the new height.
+      new_width: an int32 scalar tensor indicating the new width.
+    """
+    resize_min = tf.cast(resize_min, tf.float32)

-  # Convert to floats to make subsequent calculations go smoothly.
-  height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
+    # Convert to floats to make subsequent calculations go smoothly.
+    height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)

-  smaller_dim = tf.minimum(height, width)
-  scale_ratio = resize_min / smaller_dim
+    smaller_dim = tf.minimum(height, width)
+    scale_ratio = resize_min / smaller_dim

-  # Convert back to ints to make heights and widths that TF ops will accept.
-  new_height = tf.cast(tf.ceil(height * scale_ratio), tf.int32)
-  new_width = tf.cast(tf.ceil(width * scale_ratio), tf.int32)
+    # Convert back to ints to make heights and widths that TF ops will accept.
+    new_height = tf.cast(tf.ceil(height * scale_ratio), tf.int32)
+    new_width = tf.cast(tf.ceil(width * scale_ratio), tf.int32)

-  return new_height, new_width
+    return new_height, new_width


 def _aspect_preserving_resize(image, resize_min):
-  """Resize images preserving the original aspect ratio.
+    """Resize images preserving the original aspect ratio.

-  Args:
-    image: A 3-D image `Tensor`.
-    resize_min: A python integer or scalar `Tensor` indicating the size of
-      the smallest side after resize.
+    Args:
+      image: A 3-D image `Tensor`.
+      resize_min: A python integer or scalar `Tensor` indicating the size of
+        the smallest side after resize.

-  Returns:
-    resized_image: A 3-D tensor containing the resized image.
-  """
-  shape = tf.shape(input=image)
-  height, width = shape[0], shape[1]
+    Returns:
+      resized_image: A 3-D tensor containing the resized image.
+    """
+    shape = tf.shape(input=image)
+    height, width = shape[0], shape[1]

-  new_height, new_width = _smallest_size_at_least(height, width, resize_min)
+    new_height, new_width = _smallest_size_at_least(height, width, resize_min)

-  return _resize_image(image, new_height, new_width)
+    return _resize_image(image, new_height, new_width)


 def _resize_image(image, height, width):
-  """Simple wrapper around tf.resize_images.
+    """Simple wrapper around tf.image.resize_images.

-  This is primarily to make sure we use the same `ResizeMethod` and other
-  details each time.
+    This is primarily to make sure we use the same `ResizeMethod` and other
+    details each time.

-  Args:
-    image: A 3-D image `Tensor`.
-    height: The target height for the resized image.
-    width: The target width for the resized image.
+    Args:
+      image: A 3-D image `Tensor`.
+      height: The target height for the resized image.
+      width: The target width for the resized image.

-  Returns:
-    resized_image: A 3-D tensor containing the resized image. The first two
-      dimensions have the shape [height, width].
-  """
-  return tf.image.resize_images(
-      image, [height, width], method=tf.image.ResizeMethod.BILINEAR,
-      align_corners=False)
+    Returns:
+      resized_image: A 3-D tensor containing the resized image. The first two
+        dimensions have the shape [height, width].
+    """
+    return tf.image.resize_images(
+        image,
+        [height, width],
+        method=tf.image.ResizeMethod.BILINEAR,
+        align_corners=False,
+    )


-def preprocess_image(image_buffer, bbox, output_height, output_width,
-                     num_channels, is_training=False):
-  """Preprocesses the given image.
+def preprocess_image(
+    image_buffer, bbox, output_height, output_width, num_channels, is_training=False
+):
+    """Preprocesses the given image.

-  Preprocessing includes decoding, cropping, and resizing for both training
-  and eval images. Training preprocessing, however, introduces some random
-  distortion of the image to improve accuracy.
+    Preprocessing includes decoding, cropping, and resizing for both training
+    and eval images. Training preprocessing, however, introduces some random
+    distortion of the image to improve accuracy.

-  Args:
-    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
-    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
-      where each coordinate is [0, 1) and the coordinates are arranged as
-      [ymin, xmin, ymax, xmax].
-    output_height: The height of the image after preprocessing.
-    output_width: The width of the image after preprocessing.
-    num_channels: Integer depth of the image buffer for decoding.
-    is_training: `True` if we're preprocessing the image for training and
-      `False` otherwise.
+    Args:
+      image_buffer: scalar string Tensor representing the raw JPEG image buffer.
+      bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+        where each coordinate is [0, 1) and the coordinates are arranged as
+        [ymin, xmin, ymax, xmax].
+      output_height: The height of the image after preprocessing.
+      output_width: The width of the image after preprocessing.
+      num_channels: Integer depth of the image buffer for decoding.
+      is_training: `True` if we're preprocessing the image for training and
+        `False` otherwise.

-  Returns:
-    A preprocessed image.
-  """
-  if is_training:
-    # For training, we want to randomize some of the distortions.
-    image = _decode_crop_and_flip(image_buffer, bbox, num_channels)
-    image = _resize_image(image, output_height, output_width)
-  else:
-    # For validation, we want to decode, resize, then just crop the middle.
-    image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
-    image = _aspect_preserving_resize(image, _RESIZE_MIN)
-    print(image)
-    image = _central_crop(image, output_height, output_width)
+    Returns:
+      A preprocessed image.
+    """
+    if is_training:
+        # For training, we want to randomize some of the distortions.
+        image = _decode_crop_and_flip(image_buffer, bbox, num_channels)
+        image = _resize_image(image, output_height, output_width)
+    else:
+        # For validation, we want to decode, resize, then just crop the middle.
+        image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
+        image = _aspect_preserving_resize(image, _RESIZE_MIN)
+        print(image)
+        image = _central_crop(image, output_height, output_width)

-  image.set_shape([output_height, output_width, num_channels])
+    image.set_shape([output_height, output_width, num_channels])

-  return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
+    return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)


 def parse_example_proto(example_serialized):
-  """Parses an Example proto containing a training example of an image.
+    """Parses an Example proto containing a training example of an image.

-  The output of the build_image_data.py image preprocessing script is a dataset
-  containing serialized Example protocol buffers. Each Example proto contains
-  the following fields:
+    The output of the build_image_data.py image preprocessing script is a dataset
+    containing serialized Example protocol buffers. Each Example proto contains
+    the following fields:

-    image/height: 462
-    image/width: 581
-    image/colorspace: 'RGB'
-    image/channels: 3
-    image/class/label: 615
-    image/class/synset: 'n03623198'
-    image/class/text: 'knee pad'
-    image/object/bbox/xmin: 0.1
-    image/object/bbox/xmax: 0.9
-    image/object/bbox/ymin: 0.2
-    image/object/bbox/ymax: 0.6
-    image/object/bbox/label: 615
-    image/format: 'JPEG'
-    image/filename: 'ILSVRC2012_val_00041207.JPEG'
-    image/encoded: <JPEG encoded string>
+      image/height: 462
+      image/width: 581
+      image/colorspace: 'RGB'
+      image/channels: 3
+      image/class/label: 615
+      image/class/synset: 'n03623198'
+      image/class/text: 'knee pad'
+      image/object/bbox/xmin: 0.1
+      image/object/bbox/xmax: 0.9
+      image/object/bbox/ymin: 0.2
+      image/object/bbox/ymax: 0.6
+      image/object/bbox/label: 615
+      image/format: 'JPEG'
+      image/filename: 'ILSVRC2012_val_00041207.JPEG'
+      image/encoded: <JPEG encoded string>

-  Args:
-    example_serialized: scalar Tensor tf.string containing a serialized
-      Example protocol buffer.
+    Args:
+      example_serialized: scalar Tensor tf.string containing a serialized
+        Example protocol buffer.

-  Returns:
-    image_buffer: Tensor tf.string containing the contents of a JPEG file.
-    label: Tensor tf.int32 containing the label.
-    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
-      where each coordinate is [0, 1) and the coordinates are arranged as
-      [ymin, xmin, ymax, xmax].
-    text: Tensor tf.string containing the human-readable label.
-  """
-  # Dense features in Example proto.
-  feature_map = {
-      'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
-                                          default_value=''),
-      'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
-                                              default_value=-1),
-      'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
-                                             default_value=''),
-  }
-  sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
-  # Sparse features in Example proto.
-  feature_map.update(
-      {k: sparse_float32 for k in ['image/object/bbox/xmin',
-                                   'image/object/bbox/ymin',
-                                   'image/object/bbox/xmax',
-                                   'image/object/bbox/ymax']})
+    Returns:
+      image_buffer: Tensor tf.string containing the contents of a JPEG file.
+      label: Tensor tf.int32 containing the label.
+      bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+        where each coordinate is [0, 1) and the coordinates are arranged as
+        [ymin, xmin, ymax, xmax].
+      text: Tensor tf.string containing the human-readable label.
+    """
+    # Dense features in Example proto.
+    feature_map = {
+        "image/encoded": tf.FixedLenFeature([], dtype=tf.string, default_value=""),
+        "image/class/label": tf.FixedLenFeature([1], dtype=tf.int64, default_value=-1),
+        "image/class/text": tf.FixedLenFeature([], dtype=tf.string, default_value=""),
+    }
+    sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
+    # Sparse features in Example proto.
+    feature_map.update(
+        {
+            k: sparse_float32
+            for k in [
+                "image/object/bbox/xmin",
+                "image/object/bbox/ymin",
+                "image/object/bbox/xmax",
+                "image/object/bbox/ymax",
+            ]
+        }
+    )

-  features = tf.parse_single_example(example_serialized, feature_map)
-  label = tf.cast(features['image/class/label'], dtype=tf.int32)
+    features = tf.parse_single_example(example_serialized, feature_map)
+    label = tf.cast(features["image/class/label"], dtype=tf.int32)

-  xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
-  ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
-  xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
-  ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
+    xmin = tf.expand_dims(features["image/object/bbox/xmin"].values, 0)
+    ymin = tf.expand_dims(features["image/object/bbox/ymin"].values, 0)
+    xmax = tf.expand_dims(features["image/object/bbox/xmax"].values, 0)
+    ymax = tf.expand_dims(features["image/object/bbox/ymax"].values, 0)

-  # Note that we impose an ordering of (y, x) just to make life difficult.
-  bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
+    # Note that the coordinates are ordered (y, x): [ymin, xmin, ymax, xmax].
+    bbox = tf.concat([ymin, xmin, ymax, xmax], 0)

-  # Force the variable number of bounding boxes into the shape
-  # [1, num_boxes, coords].
-  bbox = tf.expand_dims(bbox, 0)
-  bbox = tf.transpose(bbox, [0, 2, 1])
+    # Force the variable number of bounding boxes into the shape
+    # [1, num_boxes, coords].
+    bbox = tf.expand_dims(bbox, 0)
+    bbox = tf.transpose(bbox, [0, 2, 1])

-  return features['image/encoded'], label, bbox, features['image/class/text']
+    return features["image/encoded"], label, bbox, features["image/class/text"]


 class ImagenetPreprocessor:
-  def __init__(self, image_size, dtype, train):
-    self.image_size = image_size
-    self.dtype = dtype
-    self.train = train
+    def __init__(self, image_size, dtype, train):
+        self.image_size = image_size
+        self.dtype = dtype
+        self.train = train

-  def preprocess(self, image_buffer, bbox):
-    # pylint: disable=g-import-not-at-top
-    image = preprocess_image(image_buffer, bbox, self.image_size, self.image_size, IMAGE_DEPTH, is_training=self.train)
-    return tf.cast(image, self.dtype)
-
-  def parse_and_preprocess(self, value):
-    image_buffer, label_index, bbox, _ = parse_example_proto(value)
-    image = self.preprocess(image_buffer, bbox)
-    image = tf.reshape(image, [self.image_size, self.image_size, IMAGE_DEPTH])
-    return label_index, image
+    def preprocess(self, image_buffer, bbox):
+        # pylint: disable=g-import-not-at-top
+        image = preprocess_image(
+            image_buffer,
+            bbox,
+            self.image_size,
+            self.image_size,
+            IMAGE_DEPTH,
+            is_training=self.train,
+        )
+        return tf.cast(image, self.dtype)

+    def parse_and_preprocess(self, value):
+        image_buffer, label_index, bbox, _ = parse_example_proto(value)
+        image = self.preprocess(image_buffer, bbox)
+        image = tf.reshape(image, [self.image_size, self.image_size, IMAGE_DEPTH])
+        return label_index, image