# Import the packages

# Utilities
import os
import logging

# For visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

# For modelling
import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras import layers, models

# Set TF logger to only print errors (dismiss warnings)
logging.getLogger("tensorflow").setLevel(logging.ERROR)


# Create the local data directory. exist_ok=True replaces the separate
# isdir() check, so there is no window between "check" and "create" and
# re-running the cell is a no-op.
os.makedirs("/tmp/data", exist_ok=True)


!gsutil cp gs://cloud-training-demos/feat_eng/data/taxi*.csv /tmp/data

Copying gs://cloud-training-demos/feat_eng/data/taxi-test.csv...
Copying gs://cloud-training-demos/feat_eng/data/taxi-train.csv...
Copying gs://cloud-training-demos/feat_eng/data/taxi-valid.csv...
/ [3 files][  5.3 MiB/  5.3 MiB]                                                
Operation completed over 3 objects/5.3 MiB.


!ls -l /tmp/data/*.csv

-rw-r--r-- 1 root root 1113292 Sep  5 01:43 /tmp/data/taxi-test.csv
-rw-r--r-- 1 root root 3551735 Sep  5 01:43 /tmp/data/taxi-train.csv
-rw-r--r-- 1 root root  888648 Sep  5 01:43 /tmp/data/taxi-valid.csv


# Preview the first rows of the training split to check the column layout
pd.read_csv('/tmp/data/taxi-train.csv').head()


# Name of the column the model is trained to predict
LABEL_COLUMN = 'fare_amount'

# Numerical feature columns.
# Note: a separate STRING_COLS list would be needed for text data,
# but in this case all features are numerical.
NUMERIC_COLS = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count', 'hourofday', 'dayofweek',
]


def features_and_labels(row_data):
    """Split one parsed row (or batch of rows) into features and label.

    Args:
        row_data: Mapping from column name to value; must contain
            ``LABEL_COLUMN``.

    Returns:
        A ``(features, label)`` tuple. ``features`` is a shallow copy of
        ``row_data`` without the label entry, and ``label`` is the value
        that was stored under ``LABEL_COLUMN``.

    Raises:
        KeyError: If ``LABEL_COLUMN`` is not present in ``row_data``.
    """
    # Pop from a copy so the caller's mapping is left untouched.
    features = row_data.copy()
    label = features.pop(LABEL_COLUMN)
    return features, label


# A utility method to create a tf.data dataset from a CSV file
def load_dataset(pattern, batch_size=1, mode='eval'):
    """Create a ``tf.data`` dataset of ``(features, label)`` batches from CSV.

    Args:
        pattern: File path or glob pattern of the CSV file(s) to read.
        batch_size: Number of rows per batch.
        mode: ``'train'`` additionally shuffles, repeats and prefetches the
            batches; any other value returns the mapped dataset unchanged.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(features, label)`` pairs
        as produced by ``features_and_labels``.
    """
    dataset = tf.data.experimental.make_csv_dataset(pattern, batch_size)

    dataset = dataset.map(features_and_labels)  # features, label
    if mode == 'train':
        # repeat() makes the training dataset loop indefinitely, so the
        # caller has to bound each epoch (e.g. with steps_per_epoch).
        dataset = dataset.shuffle(1000).repeat()
        # Overlap input preparation with training and let tf.data choose the
        # buffer size. A literal 1 is a fixed buffer of one batch, not
        # AUTOTUNE.
        dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset


def build_dnn_model():
    """Build and compile the baseline DNN regressor on the raw columns.

    Returns:
        A compiled Keras model with one scalar float32 input per entry of
        ``NUMERIC_COLS``, two ReLU hidden layers (32 and 8 units) and a
        single linear output named ``fare``.
    """
    # One scalar Keras input per numeric column, keyed by column name
    inputs = {}
    for colname in NUMERIC_COLS:
        inputs[colname] = layers.Input(name=colname, shape=(), dtype='float32')

    # Matching numeric feature columns; DenseFeatures is built from these
    # and then called on the dictionary of Input layers
    numeric_features = [fc.numeric_column(colname) for colname in NUMERIC_COLS]
    net = layers.DenseFeatures(numeric_features)(inputs)

    # Hidden stack: 32 units, then 8 units, both ReLU
    for units, layer_name in ((32, 'h1'), (8, 'h2')):
        net = layers.Dense(units, activation='relu', name=layer_name)(net)

    # Linear output because this is a regression problem
    fare = layers.Dense(1, activation='linear', name='fare')(net)

    model = models.Model(inputs, fare)

    # Mean squared error is the training loss; RMSE and MSE are reported
    rmse = tf.keras.metrics.RootMeanSquaredError(name='rmse')
    model.compile(optimizer='adam', loss='mse', metrics=[rmse, 'mse'])

    return model


# Build and compile the baseline model
model = build_dnn_model()

# Render the layer graph (left to right) to an image file
tf.keras.utils.plot_model(
    model,
    to_file='dnn_model.png',
    show_shapes=False,
    rankdir='LR',
)


# Training hyper-parameters
NUM_EPOCHS = 20
TRAIN_BATCH_SIZE = 32

# Row counts of the training and validation splits
NUM_TRAIN_EXAMPLES = pd.read_csv('/tmp/data/taxi-train.csv').shape[0]
NUM_EVAL_EXAMPLES = pd.read_csv('/tmp/data/taxi-valid.csv').shape[0]

print(f"training split has {NUM_TRAIN_EXAMPLES} examples\n")
print(f"evaluation split has {NUM_EVAL_EXAMPLES} examples\n")

training split has 59620 examples

evaluation split has 14905 examples


# Training dataset: batches of TRAIN_BATCH_SIZE rows in 'train' mode
# (shuffled, repeated and prefetched by load_dataset)
trainds = load_dataset('/tmp/data/taxi-train*', TRAIN_BATCH_SIZE, 'train')

# Evaluation dataset: batches of 1000 rows, capped at the number of whole
# batches that fit in the validation split
evalds = load_dataset('/tmp/data/taxi-valid*', 1000, 'eval').take(NUM_EVAL_EXAMPLES//1000)

# steps_per_epoch must be given because the training dataset is infinite
# (load_dataset applies repeat() in 'train' mode); this value is the number
# of whole training batches in the split
steps_per_epoch = NUM_TRAIN_EXAMPLES // TRAIN_BATCH_SIZE

# Train the model and keep the per-epoch metrics in `history`
history = model.fit(trainds,
                    validation_data=evalds,
                    epochs=NUM_EPOCHS,
                    steps_per_epoch=steps_per_epoch)

Epoch 1/20
1863/1863 [==============================] - 7s 3ms/step - loss: 109.0923 - rmse: 10.4447 - mse: 109.0923 - val_loss: 100.2218 - val_rmse: 10.0111 - val_mse: 100.2218
Epoch 2/20
1863/1863 [==============================] - 4s 2ms/step - loss: 102.6410 - rmse: 10.1312 - mse: 102.6410 - val_loss: 99.8895 - val_rmse: 9.9945 - val_mse: 99.8895
Epoch 3/20
1863/1863 [==============================] - 5s 3ms/step - loss: 102.2085 - rmse: 10.1098 - mse: 102.2085 - val_loss: 99.1205 - val_rmse: 9.9559 - val_mse: 99.1205
Epoch 4/20
1863/1863 [==============================] - 4s 2ms/step - loss: 101.4055 - rmse: 10.0700 - mse: 101.4055 - val_loss: 99.1794 - val_rmse: 9.9589 - val_mse: 99.1794
Epoch 5/20
1863/1863 [==============================] - 5s 3ms/step - loss: 102.6442 - rmse: 10.1313 - mse: 102.6442 - val_loss: 101.4165 - val_rmse: 10.0706 - val_mse: 101.4165
Epoch 6/20
1863/1863 [==============================] - 5s 2ms/step - loss: 102.4330 - rmse: 10.1209 - mse: 102.4330 - val_loss: 99.7380 - val_rmse: 9.9869 - val_mse: 99.7380
Epoch 7/20
1863/1863 [==============================] - 4s 2ms/step - loss: 103.2879 - rmse: 10.1631 - mse: 103.2879 - val_loss: 100.5883 - val_rmse: 10.0294 - val_mse: 100.5883
Epoch 8/20
1863/1863 [==============================] - 4s 2ms/step - loss: 103.3413 - rmse: 10.1657 - mse: 103.3413 - val_loss: 101.9372 - val_rmse: 10.0964 - val_mse: 101.9372
Epoch 9/20
1863/1863 [==============================] - 5s 3ms/step - loss: 101.0157 - rmse: 10.0507 - mse: 101.0157 - val_loss: 100.5887 - val_rmse: 10.0294 - val_mse: 100.5887
Epoch 10/20
1863/1863 [==============================] - 5s 2ms/step - loss: 103.2267 - rmse: 10.1601 - mse: 103.2267 - val_loss: 101.2342 - val_rmse: 10.0615 - val_mse: 101.2342
Epoch 11/20
1863/1863 [==============================] - 4s 2ms/step - loss: 104.4978 - rmse: 10.2224 - mse: 104.4978 - val_loss: 99.8487 - val_rmse: 9.9924 - val_mse: 99.8487
Epoch 12/20
1863/1863 [==============================] - 5s 3ms/step - loss: 100.5703 - rmse: 10.0285 - mse: 100.5703 - val_loss: 100.2647 - val_rmse: 10.0132 - val_mse: 100.2647
Epoch 13/20
1863/1863 [==============================] - 4s 2ms/step - loss: 102.0809 - rmse: 10.1035 - mse: 102.0809 - val_loss: 99.1877 - val_rmse: 9.9593 - val_mse: 99.1877
Epoch 14/20
1863/1863 [==============================] - 5s 2ms/step - loss: 99.4760 - rmse: 9.9738 - mse: 99.4760 - val_loss: 97.9041 - val_rmse: 9.8947 - val_mse: 97.9041
Epoch 15/20
1863/1863 [==============================] - 5s 3ms/step - loss: 103.5187 - rmse: 10.1744 - mse: 103.5187 - val_loss: 100.0512 - val_rmse: 10.0026 - val_mse: 100.0512
Epoch 16/20
1863/1863 [==============================] - 5s 2ms/step - loss: 103.8224 - rmse: 10.1893 - mse: 103.8224 - val_loss: 101.7308 - val_rmse: 10.0862 - val_mse: 101.7308
Epoch 17/20
1863/1863 [==============================] - 5s 3ms/step - loss: 101.6058 - rmse: 10.0800 - mse: 101.6058 - val_loss: 99.1334 - val_rmse: 9.9566 - val_mse: 99.1334
Epoch 18/20
1863/1863 [==============================] - 5s 3ms/step - loss: 100.6145 - rmse: 10.0307 - mse: 100.6145 - val_loss: 99.7942 - val_rmse: 9.9897 - val_mse: 99.7942
Epoch 19/20
1863/1863 [==============================] - 5s 3ms/step - loss: 101.7698 - rmse: 10.0881 - mse: 101.7698 - val_loss: 100.5735 - val_rmse: 10.0286 - val_mse: 100.5735
Epoch 20/20
1863/1863 [==============================] - 5s 2ms/step - loss: 102.7730 - rmse: 10.1377 - mse: 102.7730 - val_loss: 100.3174 - val_rmse: 10.0159 - val_mse: 100.3174


# Function for plotting metrics for a given history
def plot_curves(history, metrics):
    """Plot training and validation curves, one panel per metric.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch values (e.g. the return value of ``model.fit``). It
            must hold both ``key`` and ``val_{key}`` for every requested
            metric.
        metrics: Iterable of metric names, e.g. ``['loss', 'mse']``.

    Raises:
        KeyError: If a requested metric, or its ``val_`` counterpart, is
            missing from ``history.history``.
    """
    metrics = list(metrics)

    # One panel per metric on a single row. Two metrics give the same
    # 1x2 grid on a 10x5-inch figure as before; more metrics widen the
    # figure instead of overflowing a fixed grid.
    ncols = max(len(metrics), 1)
    fig = plt.figure(figsize=(5 * ncols, 5))

    for idx, key in enumerate(metrics, start=1):
        # Draw on the explicit axes rather than pyplot's implicit
        # "current axes" state.
        ax = fig.add_subplot(1, ncols, idx)
        ax.plot(history.history[key])
        ax.plot(history.history[f'val_{key}'])
        ax.set_title(f'model {key}')
        ax.set_ylabel(key)
        ax.set_xlabel('epoch')
        ax.legend(['train', 'validation'], loc='upper left')


# Plot training vs. validation curves for the loss and the MSE metric
# recorded in `history`
plot_curves(history, ['loss', 'mse'])


# Define a taxi ride (a single data point): one length-1 tensor per feature
taxi_ride = {
    'pickup_longitude': tf.convert_to_tensor([-73.982683]),
    'pickup_latitude': tf.convert_to_tensor([40.742104]),
    'dropoff_longitude': tf.convert_to_tensor([-73.983766]),
    'dropoff_latitude': tf.convert_to_tensor([40.755174]),
    'passenger_count': tf.convert_to_tensor([3.0]),
    'hourofday': tf.convert_to_tensor([3.0]),
    'dayofweek': tf.convert_to_tensor([3.0]),
}

# Use the model to predict
prediction = model.predict(taxi_ride, steps=1)

# Print prediction. The result is an array holding a single value, so pull
# that value out with .item(); calling float() on an array with ndim > 0 is
# deprecated in recent NumPy releases.
print(f"the model predicted a fare total of {prediction.item():.2f} USD for the ride.")

the model predicted a fare total of 12.34 USD for the ride.


# Keep only the four coordinate columns as model inputs; this drops
# passenger_count, hourofday and dayofweek from the earlier list
NUMERIC_COLS = ['pickup_longitude', 'pickup_latitude',
                'dropoff_longitude', 'dropoff_latitude']


def scale_longitude(lon_column):
    """Linearly rescale longitude so that -78 maps to 0 and -70 maps to 1.

    Args:
        lon_column: Longitude value(s); anything supporting ``+`` and ``/``
            with Python numbers.

    Returns:
        ``(lon_column + 78) / 8``.
    """
    shifted = lon_column + 78
    return shifted / 8.0


def scale_latitude(lat_column):
    """Linearly rescale latitude so that 37 maps to 0 and 45 maps to 1.

    Args:
        lat_column: Latitude value(s); anything supporting ``-`` and ``/``
            with Python numbers.

    Returns:
        ``(lat_column - 37) / 8``.
    """
    shifted = lat_column - 37
    return shifted / 8.0


def euclidean(params):
    """Return the straight-line distance between two (lon, lat) points.

    Args:
        params: Sequence of four values ordered as
            ``(lon1, lat1, lon2, lat2)``.

    Returns:
        ``sqrt((lon2 - lon1)**2 + (lat2 - lat1)**2)`` computed with
        ``tf.sqrt``.
    """
    start_lon, start_lat, end_lon, end_lat = params
    delta_lon = end_lon - start_lon
    delta_lat = end_lat - start_lat
    squared_distance = delta_lon * delta_lon + delta_lat * delta_lat
    return tf.sqrt(squared_distance)


def transform(inputs, numeric_cols):
    """Apply the feature-engineering Lambda layers to the model inputs.

    Args:
        inputs: Dict mapping column name to Keras input. It must contain
            the pickup/dropoff longitude and latitude columns.
        numeric_cols: Column names to expose as numeric feature columns.

    Returns:
        A ``(transformed, feature_columns)`` tuple. ``transformed`` is a
        copy of ``inputs`` with the four coordinate entries replaced by
        their scaled versions plus a new ``'euclidean'`` entry.
        ``feature_columns`` maps every name in ``numeric_cols``, plus
        ``'euclidean'``, to a numeric feature column.
    """
    # Scaler to apply to each coordinate column: longitude is mapped from
    # [-78, -70] to [0, 1], latitude from [37, 45] to [0, 1]
    scalers = (
        (scale_longitude, ('pickup_longitude', 'dropoff_longitude')),
        (scale_latitude, ('pickup_latitude', 'dropoff_latitude')),
    )

    # Work on a copy so the caller's dict of inputs is left untouched
    transformed = inputs.copy()
    for scaler, colnames in scalers:
        for colname in colnames:
            scaling_layer = layers.Lambda(scaler, name=f"scale_{colname}")
            transformed[colname] = scaling_layer(inputs[colname])

    # The distance feature is computed from the raw (unscaled) inputs
    endpoints = [
        inputs[colname]
        for colname in ('pickup_longitude', 'pickup_latitude',
                        'dropoff_longitude', 'dropoff_latitude')
    ]
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')(endpoints)

    # One numeric feature column per requested column, plus the new feature
    feature_columns = {
        colname: fc.numeric_column(colname) for colname in numeric_cols
    }
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    return transformed, feature_columns


def build_dnn_model():
    """Build and compile the DNN regressor on the engineered features.

    Returns:
        A compiled Keras model with one scalar float32 input per entry of
        ``NUMERIC_COLS``. The inputs go through ``transform`` before
        reaching two ReLU hidden layers (32 and 8 units) and a single
        linear output named ``fare``.
    """
    # One scalar float32 input per raw numeric column
    inputs = {}
    for colname in NUMERIC_COLS:
        inputs[colname] = layers.Input(name=colname, shape=(), dtype='float32')

    # Engineered tensors and the feature columns that describe them
    transformed, feature_columns = transform(inputs, numeric_cols=NUMERIC_COLS)

    # DenseFeatures is built from the feature columns and then called on
    # the dictionary of transformed tensors
    dense_features = layers.DenseFeatures(list(feature_columns.values()))
    net = dense_features(transformed)

    # Hidden stack: 32 units, then 8 units, both ReLU
    for units, layer_name in ((32, 'h1'), (8, 'h2')):
        net = layers.Dense(units, activation='relu', name=layer_name)(net)

    # Linear output because this is a regression problem
    fare = layers.Dense(1, activation='linear', name='fare')(net)

    model = models.Model(inputs, fare)

    # Mean squared error is the training loss; RMSE and MSE are reported
    rmse = tf.keras.metrics.RootMeanSquaredError(name='rmse')
    model.compile(optimizer='adam', loss='mse', metrics=[rmse, 'mse'])

    return model


# Build and compile the model that uses the engineered features
model = build_dnn_model()


# Render the layer graph (left to right) to an image file
tf.keras.utils.plot_model(
    model,
    to_file='dnn_model_engineered.png',
    show_shapes=False,
    rankdir='LR',
)


# Retrain with the engineered features, reusing the datasets built earlier.
# The batches still carry passenger_count, hourofday and dayofweek; Keras
# ignores keys that match no model input (see the UserWarning in the output).
history = model.fit(trainds,
                    validation_data=evalds,
                    epochs=NUM_EPOCHS,
                    steps_per_epoch=steps_per_epoch)

Epoch 1/20

/usr/local/lib/python3.7/dist-packages/keras/engine/functional.py:559: UserWarning: Input dict contained keys ['passenger_count', 'hourofday', 'dayofweek'] which did not match any model input. They will be ignored by the model.
  inputs = self._flatten_to_reference_inputs(inputs)

1863/1863 [==============================] - 6s 3ms/step - loss: 94.7372 - rmse: 9.7333 - mse: 94.7372 - val_loss: 62.3450 - val_rmse: 7.8959 - val_mse: 62.3450
Epoch 2/20
1863/1863 [==============================] - 5s 3ms/step - loss: 54.9703 - rmse: 7.4142 - mse: 54.9703 - val_loss: 45.1118 - val_rmse: 6.7165 - val_mse: 45.1118
Epoch 3/20
1863/1863 [==============================] - 5s 3ms/step - loss: 51.5169 - rmse: 7.1775 - mse: 51.5169 - val_loss: 41.9052 - val_rmse: 6.4734 - val_mse: 41.9052
Epoch 4/20
1863/1863 [==============================] - 5s 3ms/step - loss: 45.7830 - rmse: 6.7663 - mse: 45.7830 - val_loss: 41.6317 - val_rmse: 6.4523 - val_mse: 41.6317
Epoch 5/20
1863/1863 [==============================] - 5s 2ms/step - loss: 43.2021 - rmse: 6.5728 - mse: 43.2021 - val_loss: 43.6724 - val_rmse: 6.6085 - val_mse: 43.6724
Epoch 6/20
1863/1863 [==============================] - 4s 2ms/step - loss: 48.3511 - rmse: 6.9535 - mse: 48.3511 - val_loss: 39.5802 - val_rmse: 6.2913 - val_mse: 39.5802
Epoch 7/20
1863/1863 [==============================] - 5s 2ms/step - loss: 46.7670 - rmse: 6.8386 - mse: 46.7670 - val_loss: 42.2710 - val_rmse: 6.5016 - val_mse: 42.2710
Epoch 8/20
1863/1863 [==============================] - 5s 2ms/step - loss: 42.2775 - rmse: 6.5021 - mse: 42.2775 - val_loss: 42.3331 - val_rmse: 6.5064 - val_mse: 42.3331
Epoch 9/20
1863/1863 [==============================] - 5s 3ms/step - loss: 46.9126 - rmse: 6.8493 - mse: 46.9126 - val_loss: 40.6242 - val_rmse: 6.3737 - val_mse: 40.6242
Epoch 10/20
1863/1863 [==============================] - 5s 2ms/step - loss: 44.4140 - rmse: 6.6644 - mse: 44.4140 - val_loss: 42.0246 - val_rmse: 6.4826 - val_mse: 42.0246
Epoch 11/20
1863/1863 [==============================] - 5s 2ms/step - loss: 40.2017 - rmse: 6.3405 - mse: 40.2017 - val_loss: 40.3275 - val_rmse: 6.3504 - val_mse: 40.3275
Epoch 12/20
1863/1863 [==============================] - 5s 2ms/step - loss: 36.9943 - rmse: 6.0823 - mse: 36.9943 - val_loss: 40.0612 - val_rmse: 6.3294 - val_mse: 40.0612
Epoch 13/20
1863/1863 [==============================] - 5s 3ms/step - loss: 37.5775 - rmse: 6.1300 - mse: 37.5775 - val_loss: 38.2148 - val_rmse: 6.1818 - val_mse: 38.2148
Epoch 14/20
1863/1863 [==============================] - 5s 3ms/step - loss: 34.2113 - rmse: 5.8490 - mse: 34.2113 - val_loss: 36.0945 - val_rmse: 6.0079 - val_mse: 36.0945
Epoch 15/20
1863/1863 [==============================] - 5s 3ms/step - loss: 32.9829 - rmse: 5.7431 - mse: 32.9829 - val_loss: 36.4390 - val_rmse: 6.0365 - val_mse: 36.4390
Epoch 16/20
1863/1863 [==============================] - 5s 2ms/step - loss: 30.1575 - rmse: 5.4916 - mse: 30.1575 - val_loss: 35.7973 - val_rmse: 5.9831 - val_mse: 35.7973
Epoch 17/20
1863/1863 [==============================] - 5s 2ms/step - loss: 29.1699 - rmse: 5.4009 - mse: 29.1699 - val_loss: 35.7032 - val_rmse: 5.9752 - val_mse: 35.7032
Epoch 18/20
1863/1863 [==============================] - 4s 2ms/step - loss: 24.9900 - rmse: 4.9990 - mse: 24.9900 - val_loss: 34.1986 - val_rmse: 5.8480 - val_mse: 34.1986
Epoch 19/20
1863/1863 [==============================] - 4s 2ms/step - loss: 24.5526 - rmse: 4.9551 - mse: 24.5526 - val_loss: 34.5461 - val_rmse: 5.8776 - val_mse: 34.5461
Epoch 20/20
1863/1863 [==============================] - 5s 3ms/step - loss: 25.1756 - rmse: 5.0175 - mse: 25.1756 - val_loss: 32.6955 - val_rmse: 5.7180 - val_mse: 32.6955


# Plot training vs. validation curves for the loss and the MSE metric
# of the retrained model
plot_curves(history, ['loss', 'mse'])


# Use the retrained model to predict the fare of the same taxi ride
prediction = model.predict(taxi_ride, steps=1)

# Print prediction. The result is an array holding a single value, so pull
# that value out with .item(); calling float() on an array with ndim > 0 is
# deprecated in recent NumPy releases.
print(f"the model predicted a fare total of {prediction.item():.2f} USD for the ride.")

the model predicted a fare total of 6.71 USD for the ride.

	fare_amount	passenger_count	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	hourofday	dayofweek
0	8.1	1	-73.973731	40.791910	-73.962737	40.767318	14	4
1	4.5	2	-73.986495	40.739278	-73.986083	40.730933	10	6
2	2.9	1	-73.956043	40.772026	-73.956245	40.773934	22	3
3	7.0	1	-74.006557	40.705797	-73.980017	40.713617	6	3
4	6.5	1	-73.986443	40.741612	-73.990215	40.746467	10	2

Ungraded lab: Manual Feature Engineering¶

Imports¶

Load taxifare dataset¶

Inspect the data¶

Create an input pipeline¶

Create a DNN Model in Keras¶

Train the model¶

Visualize training curves¶

Improve Model Performance Using Feature Engineering¶

Applying transformations¶

Update the model¶