in 07_training/serverlessml/flowers/ingest/tfrecords.py [0:0]
def create_preproc_dataset(pattern, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS):
    """
    Does interleaving, parallel calls, and prefetching; batching is left to the caller.
    Caching is not a good idea on large datasets.
    """
    preproc = _Preprocessor(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)
    # Shuffle the matched shard filenames so they are read in a different order each run.
    files = [filename for filename
             in tf.random.shuffle(tf.io.gfile.glob(pattern))]
    if len(files) > 1:
        print("Interleaving the reading of {} files.".format(len(files)))
        # Split the shards into two halves and interleave their records,
        # so the two halves are read in parallel.
        def _create_half_ds(x):
            if x == 0:
                half = files[:(len(files)//2)]
            else:
                half = files[(len(files)//2):]
            return tf.data.TFRecordDataset(half,
                                           compression_type='GZIP')
        trainds = tf.data.Dataset.range(2).interleave(
            _create_half_ds, num_parallel_calls=AUTOTUNE)
    else:
        print("WARNING! Only {} files match {}".format(len(files), pattern))
        trainds = tf.data.TFRecordDataset(files,
                                          compression_type='GZIP')

    def _preproc_img_label(img, label):
        return (preproc.preprocess(img), label)

    # Parse and preprocess records in parallel, shuffle examples with a small
    # buffer, and prefetch so the accelerator is not starved for input.
    trainds = (trainds
               .map(preproc.read_from_tfr, num_parallel_calls=AUTOTUNE)
               .map(_preproc_img_label, num_parallel_calls=AUTOTUNE)
               .shuffle(200)
               .prefetch(AUTOTUNE)
               )
    return trainds
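
# --- Usage sketch (illustrative, not part of the module) ---
# A minimal sketch of how the returned dataset might be consumed, assuming
# AUTOTUNE and _Preprocessor are defined elsewhere in tfrecords.py; the GCS
# pattern, image size, and batch size below are hypothetical placeholders.
#
#   train_ds = create_preproc_dataset(
#       'gs://BUCKET/flowers/tfrecords/train-*',   # hypothetical shard pattern
#       IMG_HEIGHT=224, IMG_WIDTH=224, IMG_CHANNELS=3
#   ).batch(32)    # batching is applied by the caller, not inside this function
#   model.fit(train_ds, ...)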