# Source code for tf_crnn.data_handler

#!/usr/bin/env python
__author__ = 'solivr'
__license__ = "GPL"

import tensorflow as tf
from tensorflow_addons.image.transform_ops import rotate, transform
from .config import Params, CONST
from typing import Tuple, Union, List
import collections

@tf.function
def random_rotation(img: tf.Tensor,
                    max_rotation: float=0.1,
                    crop: bool=True,
                    minimum_width: int=0) -> tf.Tensor:  # adapted from SeguinBe
    """
    Rotates an image by a random angle and, optionally, crops the rotated result to an axis-aligned
    rectangle computed from the rotation angle.

    The angle is drawn uniformly in ``[-max_rotation, max_rotation]``. When ``crop`` is set, the original
    (un-rotated) image is returned instead of the crop if the crop bounds are empty or if the cropped
    width is not strictly greater than ``minimum_width``.

    :param img: image tensor (it is sliced on three axes below, so presumably [H, W, C] -- confirm with caller)
    :param max_rotation: maximum angle to rotate (radians)
    :param crop: boolean to crop or not the image after rotation
    :param minimum_width: minimum width of image after data augmentation
    :return: the rotated (and possibly cropped) image, or ``img`` itself when the crop is rejected
    """
    with tf.name_scope('RandomRotation'):
        angle = tf.random.uniform([], -max_rotation, max_rotation, name='pick_random_angle')

        # `rotate` is applied on a batch of one image, the batch axis is removed right after
        batched = tf.expand_dims(img, axis=0)
        rotated = tf.squeeze(rotate(batched, angle, interpolation='BILINEAR'), axis=0)
        output = rotated

        if crop:
            abs_angle = tf.abs(angle)
            sin_a = tf.sin(abs_angle)
            cos_a = tf.cos(abs_angle)

            spatial_shape = tf.shape(rotated)[:2]
            h, w = spatial_shape[0], spatial_shape[1]

            # Long / short sides of the image, whatever its orientation
            long_side = tf.cast(tf.maximum(h, w), tf.float32)
            short_side = tf.cast(tf.minimum(h, w), tf.float32)

            # Sides of the cropped rectangle
            new_long = (long_side * cos_a - short_side * sin_a) / tf.cos(2 * abs_angle)
            new_short = (short_side - sin_a * new_long) / cos_a

            # Map (long, short) back to (height, width)
            new_h, new_w = tf.cond(h > w,
                                   lambda: (new_long, new_short),
                                   lambda: (new_short, new_long))
            new_h = tf.cast(new_h, tf.int32)
            new_w = tf.cast(new_w, tf.int32)

            # Centered bounding box : same margin on both sides of each axis
            top = tf.cast(tf.math.ceil((h - new_h) / 2), tf.int32)
            left = tf.cast(tf.math.ceil((w - new_w) / 2), tf.int32)
            bottom = h - top
            right = w - left

            def _central_crop():
                return rotated[top:bottom, left:right, :]

            # Only slice when the bounds describe a non-empty region, otherwise fall back to the input
            cropped = tf.cond(tf.logical_and(top < bottom, left < right),
                              true_fn=_central_crop,
                              false_fn=lambda: img,
                              name='check_slices_indices')

            # If the crop is too narrow (or removed the entire image), keep the original image
            output = tf.cond(tf.less_equal(tf.shape(cropped)[1], minimum_width),
                             true_fn=lambda: img,
                             false_fn=lambda: cropped,
                             name='check_size_crop')

        return output
# def random_padding(image: tf.Tensor, max_pad_w: int=5, max_pad_h: int=10) -> tf.Tensor: # """ # Given an image will pad its border adding a random number of rows and columns # # :param image: image to pad # :param max_pad_w: maximum padding in width # :param max_pad_h: maximum padding in height # :return: a padded image # """ # # TODO specify image shape in doc # # w_pad = list(np.random.randint(0, max_pad_w, size=[2])) # h_pad = list(np.random.randint(0, max_pad_h, size=[2])) # paddings = [h_pad, w_pad, [0, 0]] # # return tf.pad(image, paddings, mode='REFLECT', name='random_padding')
@tf.function
def augment_data(image: tf.Tensor,
                 max_rotation: float=0.1,
                 minimum_width: int=0) -> tf.Tensor:
    """
    Data augmentation on an image (brightness, contrast, rotation and, for 3-channel images,
    hue and saturation).

    :param image: Tensor
    :param max_rotation: float, maximum permitted rotation (in radians)
    :param minimum_width: minimum width of image after data augmentation
    :return: Tensor
    """
    with tf.name_scope('DataAugmentation'):

        # Random padding
        # image = random_padding(image)

        # TODO : add random scaling
        image = tf.image.random_brightness(image, max_delta=0.1)
        image = tf.image.random_contrast(image, 0.5, 1.5)
        image = random_rotation(image, max_rotation, crop=True, minimum_width=minimum_width)

        # Hue / saturation ops only accept images whose last dimension is exactly 3.
        # Comparing with `==` (instead of `>= 3`) also skips these ops when the static
        # channel dimension is unknown (None) rather than raising a TypeError.
        if image.shape[-1] == 3:
            image = tf.image.random_hue(image, 0.2)
            image = tf.image.random_saturation(image, 0.5, 1.5)

        return image
@tf.function
def get_resized_width(image: tf.Tensor,
                      target_height: int,
                      increment: int):
    """
    Computes the width an image should have once resized to ``target_height`` while keeping its
    aspect ratio. The image itself is not modified.

    :param image: image whose shape is used (axis 0 is read as height, axis 1 as width)
    :param target_height: height of the resized image
    :param increment: reduction factor due to pooling between input width and output width,
        this makes sure that the final width will be a multiple of increment
    :return: tuple (new width, width / height ratio of the input image). When the rounded width
        is not strictly positive, ``(target_height, 1.0)`` is returned instead.
    """
    shape = tf.shape(image)
    image_ratio = tf.divide(shape[1], shape[0], name='ratio')

    # Round to the closest multiple of `increment`
    n_increments = tf.round((image_ratio * target_height) / increment)
    new_width = tf.cast(n_increments * increment, tf.int32)

    if tf.math.less_equal(new_width, 0):
        # Degenerate width : fall back to a square image
        return target_height, tf.constant(1.0, dtype=tf.float64)
    else:
        return new_width, image_ratio
@tf.function
def padding_inputs_width(image: tf.Tensor,
                         target_shape: Tuple[int, int],
                         increment: int) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Given an input image, will pad it to return a target_shape size padded image.
    The image is first (virtually) resized to the target height, which gives a width ``new_w``.
    There are 3 cases:

         - ``new_w`` >= target width : simple resizing to the target shape
         - 2 * ``new_w`` >= target width : resize, then mirror-pad the image on the right
         - 2 * ``new_w`` < target width : resize, then replicate the image and crop to the target width

    Any other combination falls back to the simple resizing.

    :param image: Tensor of shape [H,W,C]
    :param target_shape: final shape after padding [H, W]
    :param increment: reduction factor due to pooling between input width and output width,
        this makes sure that the final width will be a multiple of increment
    :return: (image padded, output width)
    """

    target_ratio = target_shape[1]/target_shape[0]
    target_w = target_shape[1]

    # Compute ratio to keep the same ratio in new image and get the size of padding
    # necessary to have the final desired shape
    new_h = target_shape[0]
    new_w, ratio = get_resized_width(image, new_h, increment)

    # Definitions for cases
    def pad_fn():
        with tf.name_scope('mirror_padding'):
            pad = tf.subtract(target_w, new_w)

            img_resized = tf.image.resize(image, [new_h, new_w])

            # Padding to have the desired width
            paddings = [[0, 0], [0, pad], [0, 0]]
            pad_image = tf.pad(img_resized, paddings, mode='SYMMETRIC', name=None)

            # Set manually the shape
            pad_image.set_shape([target_shape[0], target_shape[1], img_resized.get_shape()[2]])

            return pad_image, (new_h, new_w)

    def replicate_fn():
        with tf.name_scope('replication_padding'):
            img_resized = tf.image.resize(image, [new_h, new_w])

            # If one symmetry is not enough to have a full width
            # Count number of replications needed
            n_replication = tf.cast(tf.math.ceil(target_shape[1]/new_w), tf.int32)
            img_replicated = tf.tile(img_resized, tf.stack([1, n_replication, 1]))
            pad_image = tf.image.crop_to_bounding_box(image=img_replicated, offset_height=0, offset_width=0,
                                                      target_height=target_shape[0], target_width=target_shape[1])

            # Set manually the shape
            pad_image.set_shape([target_shape[0], target_shape[1], img_resized.get_shape()[2]])

            return pad_image, (new_h, new_w)

    def simple_resize():
        with tf.name_scope('simple_resize'):
            img_resized = tf.image.resize(image, target_shape)

            img_resized.set_shape([target_shape[0], target_shape[1], img_resized.get_shape()[2]])

            return img_resized, tuple(target_shape)

    # NOTE: the "half width" tests compare 2 * new_w with target_w instead of new_w with
    # floor(target_w / 2). SYMMETRIC padding cannot add more columns than the image has, so
    # mirror padding is only valid when target_w - new_w <= new_w. With the floored half, an odd
    # target_w and new_w == (target_w - 1) / 2 selected mirror padding with pad == new_w + 1.

    # case 1 : new_w >= target_w
    if tf.logical_and(tf.greater_equal(ratio, target_ratio),
                      tf.greater_equal(new_w, target_w)):
        pad_image, (new_h, new_w) = simple_resize()
    # case 2 : 2 * new_w >= target_w & new_w < target_w & ratio < target_ratio
    elif tf.logical_and(tf.less(ratio, target_ratio),
                        tf.logical_and(tf.greater_equal(2 * new_w, target_w),
                                       tf.less(new_w, target_w))):
        pad_image, (new_h, new_w) = pad_fn()
    # case 3 : 2 * new_w < target_w & new_w < target_w & ratio < target_ratio
    elif tf.logical_and(tf.less(ratio, target_ratio),
                        tf.logical_and(tf.less(new_w, target_w),
                                       tf.less(2 * new_w, target_w))):
        pad_image, (new_h, new_w) = replicate_fn()
    else:
        pad_image, (new_h, new_w) = simple_resize()

    return pad_image, new_w
# def apply_slant(image: np.ndarray, alpha: np.ndarray) -> (np.ndarray, np.ndarray): # alpha = alpha[0] # # def _find_background_color(image: np.ndarray) -> int: # """ # Given a grayscale image, finds the background color value # :param image: grayscale image # :return: background color value (int) # """ # # Otsu's thresholding after Gaussian filtering # blur = cv2.GaussianBlur(image[:, :, 0].astype(np.uint8), (5, 5), 0) # thresh_value, thresholded_image = cv2.threshold(blur.astype(np.uint8), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) # # # Find which is the background (0 or 255). Supposing that the background color occurrence is higher # # than the writing color # counts, bin_edges = np.histogram(thresholded_image, bins=2) # background_color = int(np.median(image[thresholded_image == 255 * np.argmax(counts)])) # # return background_color # # shape_image = image.shape # shift = max(-alpha * shape_image[0], 0) # output_size = (int(shape_image[1] + np.ceil(abs(alpha * shape_image[0]))), int(shape_image[0])) # # warpM = np.array([[1, alpha, shift], [0, 1, 0]]) # # # Find color of background in order to replicate it in the borders # border_value = _find_background_color(image) # # image_warp = cv2.warpAffine(image, np.array(warpM), output_size, borderValue=border_value) # # return image_warp, np.array(output_size)
def dataset_generator(csv_filename: Union[List[str], str],
                      params: Params,
                      use_labels: bool=True,
                      batch_size: int=64,
                      data_augmentation: bool=False,
                      num_epochs: int=None,
                      shuffle: bool=True):
    """
    Generates the dataset for the experiment.

    The pipeline reads the csv records, loads the images, optionally augments them (slant, then
    brightness / contrast / rotation / hue / saturation), normalizes and pads them, formats the
    label codes (when ``use_labels``), then shuffles, repeats, batches and prefetches.

    :param csv_filename: Path to csv file containing the data
    :param params: parameters of the experiment (``Params``)
    :param use_labels: boolean to indicate dataset generation during training / evaluation (true)
        or prediction (false)
    :param batch_size: size of the generated batches
    :param data_augmentation: whether to use data augmentation strategies or not
    :param num_epochs: number of epochs to repeat the dataset generation
    :param shuffle: whether to shuffle the data
    :return: the batched and prefetched dataset
    """
    # NOTE(review): several ``tf.data`` / ``tf.io`` calls were missing from the text this block was
    # recovered from. Every reconstructed call is flagged with "NOTE(review)" below and must be
    # confirmed against the upstream module.
    do_padding = True

    if use_labels:
        column_defaults = [['None'], ['None'], tf.int32]
        column_names = ['paths', 'label_codes', 'label_seq_length']
        label_name = 'label_codes'
    else:
        column_defaults = [['None']]
        column_names = ['paths']
        label_name = None

    num_parallel_reads = 1

    # ----- from data.experimental.make_csv_dataset
    def filename_to_dataset(filename):
        # NOTE(review): constructor name and first argument reconstructed
        dataset = tf.data.experimental.CsvDataset(filename,
                                                  record_defaults=column_defaults,
                                                  field_delim=params.csv_delimiter,
                                                  header=False)
        return dataset

    def map_fn(*columns):
        """Organizes columns into a features dictionary.
        Args:
          *columns: list of `Tensor`s corresponding to one csv record.
        Returns:
          An OrderedDict of feature names to values for that particular record. If
          label_name is provided, extracts the label feature to be returned as the
          second element of the tuple.
        """
        features = collections.OrderedDict(zip(column_names, columns))
        if label_name is not None:
            label = features.pop(label_name)
            return features, label

        return features

    # NOTE(review): dataset source reconstructed
    dataset = tf.data.Dataset.from_tensor_slices(csv_filename)
    # Read files sequentially (if num_parallel_reads=1) or in parallel
    # NOTE(review): the last argument of `interleave` and the `map(map_fn)` call are reconstructed
    dataset = dataset.interleave(filename_to_dataset, cycle_length=num_parallel_reads,
                                 num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.map(map_fn)
    # -----

    def _load_image(features: dict, labels=None):
        path = features['paths']
        # NOTE(review): file reading and decoder call names reconstructed
        image_content = tf.io.read_file(path)
        image = tf.io.decode_jpeg(image_content, channels=params.input_channels,
                                  try_recover_truncated=True, name='image_decoding_op')

        if use_labels:
            return {'input_images': image,
                    'label_seq_length': features['label_seq_length']}, labels
        else:
            return {'input_images': image,
                    'filename_images': path}

    def _apply_slant(features: dict, labels=None):
        image = features['input_images']
        height_image = tf.cast(tf.shape(image)[0], dtype=tf.float32)

        with tf.name_scope('add_slant'):
            alpha = tf.random.uniform([],
                                      -params.data_augmentation_max_slant,
                                      params.data_augmentation_max_slant,
                                      name='pick_random_slant_angle')

            shiftx = tf.math.maximum(tf.math.multiply(-alpha, height_image), 0)

            # Pad in order not to lose image info when transformation is applied
            x_pad = 0
            y_pad = tf.math.round(tf.math.ceil(tf.math.abs(tf.math.multiply(alpha, height_image))))
            y_pad = tf.cast(y_pad, dtype=tf.int32)
            paddings = [[x_pad, x_pad], [y_pad, 0], [0, 0]]
            transform_matrix = [1, alpha, shiftx, 0, 1, 0, 0, 0]

            # Apply transformation to image
            image_pad = tf.pad(image, paddings)
            image_transformed = transform(image_pad, transform_matrix, interpolation='BILINEAR')

            # Apply transformation to mask. The mask will be used to retrieve the pixels that have been filled
            # with zero during transformation and update their value with background value
            # TODO : Would be better to have some kind of binarization (i.e Otsu) and get the mean background value
            background_pixel_value = 255
            empty = background_pixel_value * tf.ones(tf.shape(image))
            empty_pad = tf.pad(empty, paddings)
            empty_transformed = tf.subtract(
                tf.cast(background_pixel_value, dtype=tf.int32),
                tf.cast(transform(empty_pad, transform_matrix, interpolation='NEAREST'), dtype=tf.int32)
            )

            # Update additional zeros values with background_pixel_value and cast result to uint8
            image = tf.add(tf.cast(image_transformed, dtype=tf.int32), empty_transformed)
            image = tf.cast(image, tf.uint8)

        features['input_images'] = image
        # Parenthesized on purpose : without the parentheses the conditional expression only applies
        # to the second tuple element and `(features, features)` is returned when labels are not used
        return (features, labels) if use_labels else features

    def _data_augment_fn(features: dict, labels=None):
        image = features['input_images']
        image = augment_data(image, params.data_augmentation_max_rotation,
                             minimum_width=params.max_chars_per_string)

        features.update({'input_images': image})
        return (features, labels) if use_labels else features

    def _pad_image_or_resize(features: dict, labels=None):
        image = features['input_images']
        if do_padding:
            with tf.name_scope('padding'):
                image, img_width = padding_inputs_width(image, target_shape=params.input_shape,
                                                        increment=params.downscale_factor)  # todo this needs to be updated
        # Resize
        else:
            image = tf.image.resize(image, size=params.input_shape)
            img_width = tf.shape(image)[1]

        input_seq_length = tf.cast(tf.floor(tf.divide(img_width, params.downscale_factor)), tf.int32)
        if use_labels:
            # The sequence fed to the network must be at least as long as the label sequence
            assert_op = tf.debugging.assert_greater_equal(input_seq_length,
                                                          features['label_seq_length'])
            with tf.control_dependencies([assert_op]):
                return {'input_images': image,
                        'label_seq_length': features['label_seq_length'],
                        'input_seq_length': input_seq_length}, labels
        else:
            return {'input_images': image,
                    'input_seq_length': input_seq_length,
                    'filename_images': features['filename_images']}

    def _normalize_image(features: dict, labels=None):
        image = tf.cast(features['input_images'], tf.float32)
        image = tf.image.per_image_standardization(image)

        features['input_images'] = image
        return (features, labels) if use_labels else features

    def _format_label_codes(features: dict, string_label_codes):
        # Label codes come as one space-separated string of integers
        splits = tf.strings.split([string_label_codes], sep=' ')
        label_codes = tf.squeeze(tf.strings.to_number(splits, out_type=tf.int32), axis=0)

        features.update({'label_codes': label_codes})

        # The label codes now live in the features, the second element is only a placeholder
        return features, [0]

    # NOTE(review): AUTOTUNE value reconstructed
    num_parallel_calls = tf.data.experimental.AUTOTUNE

    #  1. load image 2. data augmentation 3. padding
    # NOTE(review): the mapped function of each `dataset.map` call below is reconstructed; the
    # relative order of `_normalize_image` and `_pad_image_or_resize` must be confirmed
    dataset = dataset.map(_load_image, num_parallel_calls=num_parallel_calls)
    # this causes problems when using the same cache for training, validation and prediction data...
    # dataset = dataset.cache(filename=os.path.join(params.output_model_dir, ''))
    if data_augmentation and params.data_augmentation_max_slant != 0:
        dataset = dataset.map(_apply_slant, num_parallel_calls=num_parallel_calls)
    if data_augmentation:
        dataset = dataset.map(_data_augment_fn, num_parallel_calls=num_parallel_calls)
    dataset = dataset.map(_normalize_image, num_parallel_calls=num_parallel_calls)
    dataset = dataset.map(_pad_image_or_resize, num_parallel_calls=num_parallel_calls)
    dataset = dataset.map(_format_label_codes, num_parallel_calls=num_parallel_calls) if use_labels else dataset
    dataset = dataset.shuffle(10 * batch_size, reshuffle_each_iteration=False) if shuffle else dataset
    dataset = dataset.repeat(num_epochs) if num_epochs is not None else dataset

    # NOTE(review): prefetch buffer size reconstructed
    return dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
# def dataset_prediction(image_filenames: Union[List[str], str]=None, # csv_filename: str=None, # params: Params=None, # batch_size: int=64): # # assert params, 'params cannot be None' # assert image_filenames or csv_filename, 'You need to feed an input (image_filenames or csv_filename)' # # do_padding = True # # def _load_image(path): # image_content = # image =, channels=params.input_channels, # try_recover_truncated=True, name='image_decoding_op') # # return {'input_images': image} # # def _normalize_image(features: dict): # image = tf.cast(features['input_images'], tf.float32) # image = tf.image.per_image_standardization(image) # # features['input_images'] = image # return features # # def _pad_image_or_resize(features: dict): # image = features['input_images'] # if do_padding: # with tf.name_scope('padding'): # image, img_width = padding_inputs_width(image, target_shape=params.input_shape, # increment=CONST.DIMENSION_REDUCTION_W_POOLING) # # Resize # else: # image = tf.image.resize(image, size=params.input_shape) # img_width = tf.shape(image)[1] # # input_seq_length = tf.cast(tf.floor(tf.math.divide(img_width, params.n_pool)), tf.int32) # # return {'input_images': image, # 'input_seq_length': input_seq_length} # if image_filenames is not None: # dataset = # elif csv_filename is not None: # column_defaults = [['None']] # dataset =, # record_defaults=column_defaults, # field_delim=params.csv_delimiter, # header=False) # # dataset = # dataset =, # dataset =, # dataset =, # # return dataset.batch(batch_size).prefetch(