Ungraded Lab: Quantization and Pruning

In this lab, you will get some hands-on practice with the mobile optimization techniques discussed in the lectures. These techniques reduce model size and latency, which makes the resulting models ideal for edge and IoT devices. You will start by training a Keras model, then compare its model size and accuracy after going through these techniques:

- converting to TensorFlow Lite (TF Lite) format
- post-training quantization
- quantization aware training
- pruning

Let's begin!

Imports

Let's first import a few common libraries that you'll be using throughout the notebook.
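Here is a minimal sketch of the imports this notebook typically relies on (the lab's actual import cell may differ slightly):

```python
# Common imports used throughout this notebook (illustrative sketch).
import os
import tempfile
import zipfile

import numpy as np
import tensorflow as tf
from tensorflow import keras
```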

Utilities and constants

Next, let's define a few string constants and utility functions to make our code easier to maintain.

Download and Prepare the Dataset

You will be using the MNIST dataset which is hosted in Keras Datasets. Some of the helper functions in this notebook are made to work with this dataset, so if you decide to switch to a different dataset, make sure to check whether those helpers need to be modified (e.g. the input shape expected by the Flatten layer in your model).
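For reference, loading and normalizing MNIST typically looks like this (variable names here are placeholders, not necessarily the ones used in the lab):

```python
# Load the MNIST dataset from Keras Datasets.
(train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()

# Normalize the pixel values from [0, 255] to [0, 1].
train_images = train_images / 255.0
test_images = test_images / 255.0
```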

Baseline Model

You will first build and train a Keras model. This will be the baseline against which you will compare the mobile-optimized versions later on. It is just a shallow CNN with a softmax output to classify a given MNIST digit. You can review the model_builder() function in the utilities at the top of this notebook, but the model summary is also printed below to show the architecture.
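The model_builder() utility is defined in the Utilities section; a sketch of a comparable shallow CNN (the exact layer sizes here are assumptions) looks like this:

```python
def model_builder():
    '''Builds a shallow CNN for MNIST digit classification (illustrative sketch).'''
    model = keras.Sequential([
        keras.layers.InputLayer(input_shape=(28, 28)),
        keras.layers.Reshape(target_shape=(28, 28, 1)),
        keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'),
        keras.layers.MaxPooling2D(pool_size=(2, 2)),
        keras.layers.Flatten(),
        keras.layers.Dense(10, activation='softmax')
    ])
    return model

baseline_model = model_builder()
baseline_model.summary()
```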

You will also save the weights so you can reinitialize the other models later in the same way. This is not needed in real projects, but for this demo notebook it is useful to start from the same initial state so you can compare the effects of the optimizations.

You can then compile and train the model. In practice, it's best to shuffle the training set, but for this demo shuffling is set to False so the results are reproducible. One epoch below will reach around 91% accuracy.
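A sketch of the compile-and-train step, assuming standard hyperparameters (optimizer, loss, and epoch count are assumptions):

```python
# Compile and train the baseline model. shuffle=False keeps results reproducible.
baseline_model.compile(optimizer='adam',
                       loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])

baseline_model.fit(train_images, train_labels,
                   epochs=1, shuffle=False)
```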

Let's save the accuracy of the model against the test set so you can compare later.

Next, you will save the Keras model as a file and record its size as well.
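Saving the model and measuring its file size might look like this (the filename is a hypothetical placeholder):

```python
# Save the Keras model to an H5 file and record its size on disk.
BASELINE_MODEL_FILE = 'baseline_model.h5'  # hypothetical filename
baseline_model.save(BASELINE_MODEL_FILE, include_optimizer=False)

baseline_size_bytes = os.path.getsize(BASELINE_MODEL_FILE)
print(f'Baseline model size: {baseline_size_bytes} bytes')
```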

Convert the model to TF Lite format

Next, you will convert the model to TensorFlow Lite (TF Lite) format. This format is designed to make TensorFlow models more efficient and lightweight when running on mobile, embedded, and IoT devices.

You can convert a Keras model with TF Lite's TFLiteConverter class, and we've incorporated it in the short helper function below. Notice that there is a quantize flag which you can use to quantize the model.
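A sketch of such a helper, assuming it takes the Keras model, an output filename, and an optional quantize flag:

```python
def convert_tflite(model, filename, quantize=False):
    '''Converts a Keras model to TF Lite format and writes it to disk (sketch).'''
    converter = tf.lite.TFLiteConverter.from_keras_model(model)

    # Post-training quantization is enabled by setting the converter optimizations.
    if quantize:
        converter.optimizations = [tf.lite.Optimize.DEFAULT]

    tflite_model = converter.convert()

    with open(filename, 'wb') as f:
        f.write(tflite_model)
```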

You will use the helper function to convert the Keras model then get its size and accuracy. Take note that this is not yet quantized.

You will notice that there is already a slight decrease in model size when converting to .tflite format.

The accuracy will also be nearly identical when converting between formats. You can set up a TF Lite model for inference using its Interpreter class. This is shown in the evaluate_tflite_model() helper function provided in the Utilities section earlier.
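The evaluate_tflite_model() helper uses the Interpreter roughly like this (a sketch; the lab's version may differ in details such as the argument names):

```python
def evaluate_tflite_model(filename, images, labels):
    '''Runs a TF Lite model over a test set and returns its accuracy (sketch).'''
    interpreter = tf.lite.Interpreter(model_path=filename)
    interpreter.allocate_tensors()

    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']

    correct = 0
    for image, label in zip(images, labels):
        # Add a batch dimension and make sure the dtype matches the model input.
        input_tensor = np.expand_dims(image, axis=0).astype(np.float32)
        interpreter.set_tensor(input_index, input_tensor)
        interpreter.invoke()
        prediction = np.argmax(interpreter.get_tensor(output_index)[0])
        correct += int(prediction == label)

    return correct / len(images)
```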

Note: If you see a RuntimeError: There is at least 1 reference to internal data in the interpreter in the form of a numpy array or slice, please try re-running the cell.

Post-Training Quantization

Now that you have the baseline metrics, you can observe the effects of quantization. As mentioned in the lectures, this process involves converting floating point representations into integers to reduce model size and achieve faster computation.

As shown in the convert_tflite() helper function earlier, you can easily do post-training quantization with the TF Lite API. You just need to set the converter's optimizations and assign an Optimize enum value.

You will set the quantize flag to do that and get the metrics again.
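Using the (assumed) convert_tflite() and evaluate_tflite_model() helpers sketched above, that is just a matter of passing quantize=True:

```python
# Convert with post-training quantization, then measure size and accuracy.
QUANTIZED_TFLITE_FILE = 'quantized_model.tflite'  # hypothetical filename
convert_tflite(baseline_model, QUANTIZED_TFLITE_FILE, quantize=True)

quantized_accuracy = evaluate_tflite_model(QUANTIZED_TFLITE_FILE, test_images, test_labels)
print(f'Quantized TF Lite accuracy: {quantized_accuracy:.4f}')
```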

You should see around a 4X reduction in model size in the quantized version. This comes from converting the 32-bit float representations into 8-bit integers.

As mentioned in the lecture, you can expect the accuracy not to be exactly the same after quantizing the model. Most of the time it will decrease, but in some cases it can even increase. Either way, the change can be attributed to the loss of precision when you remove the extra bits from the float values.

Quantization Aware Training

When post-training quantization results in a loss of accuracy that is unacceptable for your application, you can consider doing quantization aware training before quantizing the model. This simulates the loss of precision by inserting fake quant nodes in the model during training. That way, your model will learn to adapt to the loss of precision and produce more accurate predictions.

The TensorFlow Model Optimization Toolkit provides a quantize_model() method to do this quickly, and you will see that below. But first, let's install the toolkit into the notebook environment.
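Installing the toolkit in a notebook environment is typically a one-liner:

```python
# Install the TensorFlow Model Optimization Toolkit (run inside the notebook).
!pip install -q tensorflow-model-optimization
```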

You will build the baseline model again but this time, you will pass it into the quantize_model() method to indicate quantization aware training.
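A sketch of wrapping the model for quantization aware training, assuming the same model_builder() utility as before:

```python
import tensorflow_model_optimization as tfmot

# Build a fresh model and wrap it for quantization aware training.
qat_model = tfmot.quantization.keras.quantize_model(model_builder())

# The wrapped model must be (re)compiled before training.
qat_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
qat_model.summary()
```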

Take note that if you decide to pass in a model that is already trained, you should recompile it before you continue training.

You may have noticed a slight difference in the model summary above compared to the baseline model summary in the earlier sections. The total params count increased as expected because of the nodes added by the quantize_model() method.

With that, you can now train the model. You will notice that the accuracy is a bit lower because the model is simulating the loss of precision. You would need to train longer to achieve the same training accuracy as the earlier run. For this exercise though, we will keep it to 1 epoch.

You can then get the accuracy of the Keras model before and after quantizing the model. The accuracy is expected to be nearly identical because the model is trained to counter the effects of quantization.

Pruning

Let's now move on to another technique for reducing model size: pruning. This process involves zeroing out insignificant (i.e. low magnitude) weights. The intuition is that these weights do not contribute much to the predictions, so you can remove them and get nearly the same results. Making the weights sparse helps compress the model more efficiently, and you will see that in this section.

The TensorFlow Model Optimization Toolkit again has a convenience method for this. The prune_low_magnitude() method puts wrappers in a Keras model so it can be pruned during training. You will pass in the baseline model that you already trained earlier. You will notice that the model summary shows increased params because of the wrapper layers added by the pruning method.

You can set how the pruning is done during training. Below, you will use PolynomialDecay to indicate how the sparsity ramps up with each step. Another option available in the library is ConstantSparsity.
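A sketch of setting up pruning with a PolynomialDecay schedule (the sparsity targets and step counts below are assumptions, not the lab's exact values):

```python
import tensorflow_model_optimization as tfmot

# Hypothetical schedule: ramp sparsity from 50% to 80% over the training steps.
pruning_params = {
    'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(
        initial_sparsity=0.50,
        final_sparsity=0.80,
        begin_step=0,
        end_step=1000)
}

# Wrap the trained baseline model with pruning wrappers.
pruned_model = tfmot.sparsity.keras.prune_low_magnitude(baseline_model, **pruning_params)

pruned_model.compile(optimizer='adam',
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])
pruned_model.summary()
```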

You can also peek at the weights of one of the layers in your model. After pruning, you will notice that many of these will be zeroed out.

With that, you can now start re-training the model. Take note that the UpdatePruningStep() callback is required.
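Retraining with the required callback might look like this (the epoch count is an assumption):

```python
# The UpdatePruningStep callback keeps the pruning schedule in sync with training.
callbacks = [tfmot.sparsity.keras.UpdatePruningStep()]

pruned_model.fit(train_images, train_labels,
                 epochs=1, shuffle=False,
                 callbacks=callbacks)
```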

Now see what the weights in the same layer look like after pruning.

After pruning, you can remove the wrapper layers to have the same layers and params as the baseline model. You can do that with the strip_pruning() method as shown below. You will do this so you can save the model and also export to TF Lite format just like in the previous sections.
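Stripping the pruning wrappers is a single call (the variable name here is a placeholder):

```python
# Remove the pruning wrappers so the model has the same layers as the baseline.
stripped_pruned_model = tfmot.sparsity.keras.strip_pruning(pruned_model)
stripped_pruned_model.summary()
```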

You will see the same model weights, but the indices are different because the wrappers were removed.

You will notice below that the pruned model will have the same file size as the baseline model when saved as H5. This is to be expected. The improvement will be noticeable when you compress the model, as shown in the cells after this.

You will use the get_gzipped_model_size() helper function in the Utilities to compress the models and get their resulting file sizes. You will notice that the pruned model is about 3 times smaller. This is because of the sparse weights generated by the pruning process. The zeros can be compressed much more efficiently than the low magnitude weights before pruning.
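The helper is defined in the Utilities section of the notebook; a sketch of one common way to write it:

```python
def get_gzipped_model_size(filename):
    '''Zips a saved model file and returns the compressed size in bytes (sketch).'''
    _, zipped_file = tempfile.mkstemp('.zip')
    with zipfile.ZipFile(zipped_file, 'w', compression=zipfile.ZIP_DEFLATED) as f:
        f.write(filename)
    return os.path.getsize(zipped_file)
```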

You can make the model even more lightweight by quantizing the pruned model. This achieves around 10X reduction in compressed model size as compared to the baseline.
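Combining the two techniques, you can quantize the stripped, pruned model with the same (assumed) helpers sketched earlier:

```python
# Quantize the pruned model to stack the size reductions from both techniques.
PRUNED_QUANTIZED_TFLITE_FILE = 'pruned_quantized_model.tflite'  # hypothetical filename
convert_tflite(stripped_pruned_model, PRUNED_QUANTIZED_TFLITE_FILE, quantize=True)

print('Compressed size:', get_gzipped_model_size(PRUNED_QUANTIZED_TFLITE_FILE), 'bytes')
```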

As expected, the TF Lite model's accuracy will also be close to the Keras model.

Wrap Up

In this notebook, you practiced several techniques for optimizing your models for mobile and embedded applications. You used quantization to reduce floating point representations to integers, then used pruning to make the weights sparse for efficient model compression. These make your models lightweight for efficient transport and storage without sacrificing model accuracy. Try these techniques on your own models and see what performance you get. For more information, here are a few other resources:

Congratulations and enjoy the rest of the course!