core

Some custom Torchvision transforms.
img_path = './images/call-hand-gesture.png'

# Open the associated image file as a RGB image
sample_img = Image.open(img_path).convert('RGB')

# Print the dimensions of the image
print(f"Image Dims: {sample_img.size}")

# Show the image
sample_img
Image Dims: (384, 512)


source

ResizeMax

 ResizeMax (max_sz:int=256)

A PyTorch Transform class that resizes an image such that the maximum dimension is equal to a specified size while maintaining the aspect ratio.

Type Default Details
max_sz int 256 The maximum size for any dimension (height or width) of the image.
target_sz = 384
print(f"Source image: {sample_img.size}")

# Create a `ResizeMax` object
resize_max = ResizeMax(max_sz=target_sz)

# Convert the sample image to a tensor
img_tensor = transforms.PILToTensor()(sample_img)[None]
print(f"Image tensor: {img_tensor.shape}")

# Resize the tensor
resized_tensor = resize_max(img_tensor)
print(f"Resized tensor: {resized_tensor.shape}")

# Display the updated image
tensor_to_pil(resized_tensor)
Source image: (384, 512)
Image tensor: torch.Size([1, 3, 512, 384])
Resized tensor: torch.Size([1, 3, 384, 288])


source

PadSquare

 PadSquare (padding_mode:str='constant', fill:tuple=(123, 117, 104),
            shift:bool=True)

PadSquare is a PyTorch Transform class used to pad images to make them square. Depending on the configuration, padding can be applied equally on both sides, or can be randomly split between the two sides.

Type Default Details
padding_mode str constant The method to use for padding. Default is ‘constant’.
fill tuple (123, 117, 104) The RGB values to use for padding if padding_mode is ‘constant’.
shift bool True If True, padding is randomly split between the two sides. If False, padding is equally applied.
print(f"Resized tensor: {resized_tensor.shape}")

# Create a `PadSquare` object
pad_square = PadSquare(shift=True)

# Pad the tensor
padded_tensor = pad_square(resized_tensor)
print(f"Padded tensor: {padded_tensor.shape}")

# Display the updated image
stack_imgs([tensor_to_pil(pad_square(resized_tensor)) for i in range(3)])
Resized tensor: torch.Size([3, 384, 288])
Padded tensor: torch.Size([3, 384, 384])


source

CustomTrivialAugmentWide

 CustomTrivialAugmentWide (num_magnitude_bins:int=31, interpolation:torchv
                           ision.transforms.functional.InterpolationMode=<
                           InterpolationMode.NEAREST: 'nearest'>,
                           fill:Optional[List[float]]=None, op_meta:Option
                           al[Dict[str,Tuple[torch.Tensor,bool]]]=None)

This class extends the TrivialAugmentWide class provided by PyTorch’s transforms module. TrivialAugmentWide is an augmentation policy that randomly applies a single augmentation to each image.

num_bins = 31

custom_augmentation_space = {
    # Identity operation doesn't change the image
    "Identity": (torch.tensor(0.0), False),
            
    # Distort the image along the x or y axis, respectively.
    "ShearX": (torch.linspace(0.0, 0.25, num_bins), True),
    "ShearY": (torch.linspace(0.0, 0.25, num_bins), True),

    # Move the image along the x or y axis, respectively.
    "TranslateX": (torch.linspace(0.0, 32.0, num_bins), True),
    "TranslateY": (torch.linspace(0.0, 32.0, num_bins), True),

    # Rotate operation: rotates the image.
    "Rotate": (torch.linspace(0.0, 45.0, num_bins), True),

    # Adjust brightness, color, contrast, and sharpness, respectively.
    "Brightness": (torch.linspace(0.0, 0.75, num_bins), True),
    "Color": (torch.linspace(0.0, 0.99, num_bins), True),
    "Contrast": (torch.linspace(0.0, 0.99, num_bins), True),
    "Sharpness": (torch.linspace(0.0, 0.99, num_bins), True),

    # Reduce the number of bits used to express the color in each channel of the image.
    "Posterize": (8 - (torch.arange(num_bins) / ((num_bins - 1) / 6)).round().int(), False),

    # Invert all pixel values above a threshold.
    "Solarize": (torch.linspace(255.0, 0.0, num_bins), False),

    # Maximize the image contrast by setting the darkest color to black and the lightest to white.
    "AutoContrast": (torch.tensor(0.0), False),

    # Equalize the image histogram to improve its contrast.
    "Equalize": (torch.tensor(0.0), False),
}
# Create a `CustomTrivialAugmentWide` object
trivial_aug = CustomTrivialAugmentWide(op_meta=custom_augmentation_space)

# Augment the tensor
aug_tensor = trivial_aug(resized_tensor)
print(f"Augmented tensor: {aug_tensor.shape}")

# Display the updated image
stack_imgs([tensor_to_pil(trivial_aug(resized_tensor)) for i in range(3)])
Augmented tensor: torch.Size([3, 384, 288])


source

CustomRandomIoUCrop

 CustomRandomIoUCrop (min_scale:float=0.3, max_scale:float=1.0,
                      min_aspect_ratio:float=0.5,
                      max_aspect_ratio:float=2.0,
                      sampler_options:Optional[List[float]]=[0.0, 0.1,
                      0.3, 0.5, 0.7, 0.9, 1.0], trials:int=40,
                      jitter_factor:float=0.0)

A customized Random IoU crop transformation that inherits from torchvision’s RandomIoUCrop transform.

Type Default Details
min_scale float 0.3 Minimum factors to scale the input size.
max_scale float 1.0 Maximum factors to scale the input size.
min_aspect_ratio float 0.5 Minimum aspect ratio for the cropped image or video.
max_aspect_ratio float 2.0 Maximum aspect ratio for the cropped image or video.
sampler_options Optional [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] List of minimal IoU (Jaccard) overlap between all the boxes and a cropped image or video.
trials int 40 Number of trials to find a crop for a given value of minimal IoU (Jaccard) overlap.
jitter_factor float 0.0 Value to jitter the center coordinates for the crop area.
class_names = ['call']
# Prepare bounding box targets
targets = {
    'boxes': BoundingBoxes([[ 91.8727, 146.4079, 188.0844, 252.7894]], 
                         format=BoundingBoxFormat.XYXY, 
                         canvas_size=sample_img.size[::-1]),
    'labels': torch.Tensor([class_names.index(label) for label in ['call']])
}
# Annotate the sample image with labels and bounding boxes
annotated_tensor = draw_bounding_boxes(
    image=transforms.PILToTensor()(sample_img), 
    boxes=targets['boxes'], 
    labels=[class_names[int(label.item())] for label in targets['labels']],
)

display(tensor_to_pil(annotated_tensor))

# Create a CustomRandomIoUCrop object
iou_crop = CustomRandomIoUCrop(min_scale=0.3, 
                                  max_scale=1.0, 
                                  min_aspect_ratio=0.5, 
                                  max_aspect_ratio=2.0, 
                                  sampler_options=[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                                  trials=400, 
                                  jitter_factor=0.25)

# Crop the image
cropped_img, cropped_targets = iou_crop(sample_img, targets)
sanitized_img, sanitized_targets = transforms.SanitizeBoundingBoxes()(cropped_img, cropped_targets)

# Annotate the augmented image with updated labels and bounding boxes
annotated_tensor = draw_bounding_boxes(
    image=transforms.PILToTensor()(sanitized_img), 
    boxes=sanitized_targets['boxes'], 
    labels=[class_names[int(label.item())] for label in sanitized_targets['labels']],
)

tensor_to_pil(annotated_tensor)


source

RandomPatchCopy

 RandomPatchCopy (pct:float=0.2, min_num:int=0, max_num:int=4,
                  iou_thresh:float=0.25)

A torchvision V2 transform that copies data from a randomly selected rectangular patch to another randomly selected rectangular region of an image tensor multiple times.

Type Default Details
pct float 0.2 The percentage of the tensor’s size to be used as the side length of the square regions.
min_num int 0 The minimum number of times to apply the rand_square_copy function.
max_num int 4 The maximum number of times to apply the rand_square_copy function.
iou_thresh float 0.25 The IoU threshold for bounding box suppression.
# Create a RandomPatchCopy object
rand_patch_copy_tfm = RandomPatchCopy(pct=0.3, min_num=1, max_num=4)

# Feed sample image and targets through the image transform
augmented_img, augmented_targets = rand_patch_copy_tfm(sample_img, targets)
# Remove degenerate/invalid bounding boxes and their corresponding labels and masks.
sanitized_img, sanitized_targets = transforms.SanitizeBoundingBoxes()(augmented_img, augmented_targets)

# Annotate the augmented image with updated labels and bounding boxes
annotated_tensor = draw_bounding_boxes(
    image=transforms.PILToTensor()(sanitized_img), 
    boxes=sanitized_targets['boxes'], 
    labels=[class_names[int(label.item())] for label in sanitized_targets['labels']], 
)

# Display the augmented image
tensor_to_pil(annotated_tensor)


source

RandomPixelCopy

 RandomPixelCopy (min_pct=0.0025, max_pct:float=0.1)

A torchvision V2 transform that copies data from a randomly selected set of pixels to another randomly selected set of pixels of an image tensor.

Type Default Details
min_pct float 0.0025 The minimum percentage of the tensor’s pixels to be copied.
max_pct float 0.1 The maximum percentage of the tensor’s pixels to be copied.
# Create a RandomPixelCopy object
rand_pixel_copy_tfm = RandomPixelCopy(max_pct=0.05)

# Feed sample image and targets through the image transform
augmented_img, augmented_targets = rand_pixel_copy_tfm(sample_img, targets)

# Annotate the augmented image with updated labels and bounding boxes
annotated_tensor = draw_bounding_boxes(
    image=transforms.PILToTensor()(augmented_img), 
    boxes=augmented_targets['boxes'], 
    labels=[class_names[int(label.item())] for label in augmented_targets['labels']], 
)

# Display the augmented image
transforms.ToPILImage()(annotated_tensor)


source

CustomRandomAugment

 CustomRandomAugment (num_bins:int=31, shear:int=18, translate:float=0.25,
                      degrees:float=70.0, brightness:float=0.75,
                      hue:float=0.4, saturation:float=0.99,
                      contrast:float=0.75, sharpness:tuple=(0.0, 1.99),
                      posterize:tuple=(2.0, 8.0), solarize:tuple=(1.0,
                      255.0), auto_contrast:bool=True, equalize:bool=True)

A PyTorch Module for applying a random augmentation from a predefined set of augmentations to an image and its associated targets (e.g., bounding boxes, masks). This class is designed to work with torchvision transforms and supports augmentation of both images and their corresponding target data.

Type Default Details
num_bins int 31 The number of discrete levels for certain augmentations.
shear int 18 Maximum shear angle for the RandomAffine transform.
translate float 0.25 Maximum translation as a fraction of image dimensions for RandomAffine.
degrees float 70.0 Range of degrees for random rotations.
brightness float 0.75 Brightness adjustment factor.
hue float 0.4 Hue adjustment range.
saturation float 0.99 Saturation adjustment factor.
contrast float 0.75 Contrast adjustment factor.
sharpness tuple (0.0, 1.99) Range for sharpness factor adjustments.
posterize tuple (2.0, 8.0) Range for bits in posterization.
solarize tuple (1.0, 255.0) Threshold range for solarization.
auto_contrast bool True
equalize bool True
# Create a CustomRandomAugment object
random_aug_tfm = CustomRandomAugment()

# Augment the image
augmented_img, augmented_targets = random_aug_tfm(sample_img, targets)
sanitized_img, sanitized_targets = transforms.SanitizeBoundingBoxes()(augmented_img, augmented_targets)

# Annotate the augmented image with updated labels and bounding boxes
annotated_tensor = draw_bounding_boxes(
    image=transforms.PILToTensor()(sanitized_img), 
    boxes=sanitized_targets['boxes'], 
    labels=[class_names[int(label.item())] for label in sanitized_targets['labels']],
)

tensor_to_pil(annotated_tensor)