Ungraded Lab: Knowledge Distillation


Welcome! During this ungraded lab you will apply a model compression technique known as knowledge distillation, in which a student model "learns" from a more complex model known as the teacher. In particular you will:

  1. Define a Distiller class with the custom logic for the distillation process.
  2. Train the teacher model, a CNN that implements regularization via dropout.
  3. Train a student model (a smaller version of the teacher without regularization) by using knowledge distillation.
  4. Train another student model, called student_scratch, from scratch without distillation.
  5. Compare the three models.

This notebook is based on this official Keras tutorial.

If you want a more theoretical approach to this topic, be sure to check out the paper by Hinton et al. (2015).

Let's get started!

Imports
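A minimal set of imports that covers the rest of this lab could look like the following (it is assumed here that the dataset is fetched with tensorflow_datasets):

```python
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_datasets as tfds  # assumed source for the Cats vs Dogs dataset
from tensorflow import keras
```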

Prepare the data

For this lab you will use the Cats vs Dogs dataset, which is composed of many images of cats and dogs alongside their respective labels.

Begin by downloading the data:
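One way to do this is through TensorFlow Datasets. The 80/10/10 train/validation/test split shown below is an illustrative assumption:

```python
# Download the Cats vs Dogs dataset. It only ships a "train" split,
# so train/validation/test partitions are carved out of it here (assumed 80/10/10).
(train_ds, val_ds, test_ds), ds_info = tfds.load(
    "cats_vs_dogs",
    split=["train[:80%]", "train[80%:90%]", "train[90%:]"],
    as_supervised=True,  # yield (image, label) pairs
    with_info=True,
)
```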

Preprocess the data for training by normalizing pixel values, reshaping them and creating batches of data:
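A sketch of this step; the target image size and batch size are illustrative assumptions:

```python
IMG_SIZE = 224   # assumed target resolution
BATCH_SIZE = 32  # assumed batch size

def preprocess(image, label):
    # Resize every image to a fixed shape and scale pixel values to [0, 1].
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    image = tf.cast(image, tf.float32) / 255.0
    return image, label

train_batches = train_ds.map(preprocess).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_batches = val_ds.map(preprocess).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_batches = test_ds.map(preprocess).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
```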

Code the custom Distiller model

To implement the distillation process you will create a custom Keras model which you will name Distiller. To do this you need to override some of the vanilla methods of a keras.Model to include the custom logic for knowledge distillation, namely the methods in charge of compiling, training and evaluating the model: compile, train_step and test_step (see the sketch below).

To learn more about customizing models check out the official docs.
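A sketch of the Distiller class, closely following the official Keras tutorial this notebook is based on (the exact code in the lab may differ in small details):

```python
class Distiller(keras.Model):
    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(self, optimizer, metrics, student_loss_fn,
                distillation_loss_fn, alpha=0.1, temperature=3):
        # student_loss_fn compares student predictions against the true labels,
        # distillation_loss_fn compares the softened student and teacher distributions.
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        x, y = data

        # The teacher only provides soft targets; its weights are never updated.
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            student_predictions = self.student(x, training=True)

            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = self.distillation_loss_fn(
                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf.nn.softmax(student_predictions / self.temperature, axis=1),
            )
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute and apply gradients for the student's weights only.
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in compile().
        self.compiled_metrics.update_state(y, student_predictions)
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        x, y = data

        # Evaluation only looks at the student and the ground-truth labels.
        y_prediction = self.student(x, training=False)
        student_loss = self.student_loss_fn(y, y_prediction)

        self.compiled_metrics.update_state(y, y_prediction)
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results
```

Notice that the loss in train_step is a weighted sum: alpha weighs the regular supervised loss while (1 - alpha) weighs the distillation loss computed on the temperature-softened probability distributions.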

Teacher and student models

For the models you will use a standard CNN architecture that implements regularization via some dropout layers (in the case of the teacher), but it could be any Keras model.

Define the create_model functions to create models with the desired architecture using Keras' Sequential Model.
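A sketch of what these two functions could look like. The specific layer sizes are illustrative assumptions; the key points are that the teacher is deeper and uses dropout, and that neither model ends with a softmax activation because the Distiller works with raw logits:

```python
def create_big_model():
    # Teacher: a deeper CNN with dropout layers for regularization.
    return keras.Sequential([
        keras.layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)),
        keras.layers.Conv2D(64, 3, activation="relu", padding="same"),
        keras.layers.MaxPooling2D(),
        keras.layers.Conv2D(128, 3, activation="relu", padding="same"),
        keras.layers.MaxPooling2D(),
        keras.layers.Dropout(0.5),
        keras.layers.Conv2D(256, 3, activation="relu", padding="same"),
        keras.layers.MaxPooling2D(),
        keras.layers.Dropout(0.5),
        keras.layers.GlobalAveragePooling2D(),
        keras.layers.Dense(2),  # raw logits, no softmax
    ], name="teacher")


def create_small_model():
    # Student: fewer layers and no dropout (no regularization).
    return keras.Sequential([
        keras.layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)),
        keras.layers.Conv2D(32, 3, activation="relu", padding="same"),
        keras.layers.MaxPooling2D(),
        keras.layers.Conv2D(64, 3, activation="relu", padding="same"),
        keras.layers.MaxPooling2D(),
        keras.layers.GlobalAveragePooling2D(),
        keras.layers.Dense(2),  # raw logits, no softmax
    ], name="student")
```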

Notice that create_small_model returns a simplified version (fewer layers and no regularization) of the model that create_big_model returns:

There are two important things to notice:

  - Neither model applies a softmax activation in its last layer, since the raw logits are required for the distillation loss.
  - The dropout layers are present only in the teacher, so the student carries no regularization of its own.

Remember that the student model can be thought of as a simplified (or compressed) version of the teacher model.

Check the actual difference in number of trainable parameters (weights and biases) between both models:
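Assuming the functions sketched above, count_params gives a quick comparison:

```python
# Instantiate both models and count their trainable parameters (weights and biases).
teacher = create_big_model()
student = create_small_model()

teacher_params = teacher.count_params()
student_params = student.count_params()

print(f"Teacher parameters: {teacher_params:,}")
print(f"Student parameters: {student_params:,}")
print(f"The teacher has roughly {teacher_params / student_params:.1f}x more parameters.")
```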

Train the teacher

In knowledge distillation it is assumed that the teacher has already been trained, so the natural first step is to train the teacher. You will do so for a total of 8 epochs:
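A sketch of this step; the choice of Adam as the optimizer is an assumption:

```python
# Compile the teacher with a loss that expects raw logits, then train for 8 epochs.
teacher.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

teacher_history = teacher.fit(train_batches, epochs=8, validation_data=val_batches)
```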

Train a student from scratch for reference

In order to assess the effectiveness of the distillation process, train a model that is equivalent to the student but without doing knowledge distillation. Notice that the training is done for only 5 epochs:
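A sketch of this step, reusing create_small_model so this baseline has exactly the same architecture as the student:

```python
# A second copy of the student architecture, trained with plain supervision only.
student_scratch = create_small_model()

student_scratch.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

scratch_history = student_scratch.fit(
    train_batches, epochs=5, validation_data=val_batches
)
```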

Knowledge Distillation

To perform the knowledge distillation process you will use the custom model you previously coded. To do so, begin by creating an instance of the Distiller class and passing in the student and teacher models. Then compile it with the appropriate parameters and train it!
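A sketch of this step; the values used for alpha and temperature are illustrative assumptions:

```python
# Wrap the trained teacher and the fresh student in the custom Distiller.
distiller = Distiller(student=student, teacher=teacher)

distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,       # weight of the hard-label loss (assumed value)
    temperature=10,  # softens the probability distributions (assumed value)
)

distiller_history = distiller.fit(train_batches, epochs=5, validation_data=val_batches)
```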

The two student models are trained for only 5 epochs, unlike the teacher, which was trained for 8. This is done to showcase that knowledge distillation allows for quicker training times, as the student learns from an already trained model.

Comparing the models

To compare the models you can check the sparse_categorical_accuracy of each one on the test set:
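Assuming the variables defined in the previous steps, the comparison boils down to calling evaluate on each model:

```python
# Evaluate the three models on the test set.
print("Teacher:                ", teacher.evaluate(test_batches, verbose=0))
print("Student (distilled):    ", distiller.evaluate(test_batches, verbose=0))
print("Student (from scratch): ", student_scratch.evaluate(test_batches, verbose=0))
```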

The teacher model yields a higher accuracy than the two student models. This is expected since it was trained for more epochs while using a bigger architecture.

Notice that the student without distillation was outperformed by the student with knowledge distillation.

Since you saved the training history of each model you can create a plot for a better comparison of the two student models.
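A sketch of such a plot; the val_sparse_categorical_accuracy key assumes the metric name used when compiling the models above:

```python
# Compare the validation accuracy of the two student models across epochs.
acc_key = "val_sparse_categorical_accuracy"
epochs = range(1, len(scratch_history.history[acc_key]) + 1)

plt.plot(epochs, distiller_history.history[acc_key], label="Student with distillation")
plt.plot(epochs, scratch_history.history[acc_key], label="Student from scratch")
plt.xlabel("Epoch")
plt.ylabel("Validation accuracy")
plt.title("Student models comparison")
plt.legend()
plt.show()
```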

This plot is very interesting because it shows that the distilled version outperformed the one trained from scratch in almost all of the epochs when evaluated on the validation set. At the same time, the student without distillation reaches a higher training accuracy, which is a sign that it is overfitting more than the distilled model. This hints that the distilled model was able to benefit from the regularization that the teacher implemented! Pretty cool, right?


Congratulations on finishing this ungraded lab! Now you should have a clearer understanding of what knowledge distillation is and how it can be implemented using TensorFlow and Keras.

This process is widely used for model compression and has proven to perform really well. In fact, you might have heard about DistilBERT, which is a smaller, faster, cheaper and lighter version of BERT.

Keep it up!