Ungraded Lab: Building ML Pipelines with Kubeflow

In this lab, you will have some hands-on practice with Kubeflow Pipelines. As mentioned in the lectures, modern ML engineering is moving towards pipeline automation for rapid iteration and experiment tracking. This is especially useful in production deployments where models need to be frequently retrained to catch trends in newer data.

Kubeflow Pipelines is one component of the Kubeflow suite of tools for machine learning workflows. It is deployed on top of a Kubernetes cluster and provides an infrastructure for orchestrating ML pipelines and monitoring the inputs and outputs of each component. You will use this tool on Google Cloud Platform in the first assignment this week, and this lab will help prepare you for that by exploring its features on a local deployment. In particular, you will set up a local Kubeflow Pipelines deployment, build pipeline components with the Python SDK, and compile, upload, and run a pipeline from the UI.

Let's begin!

Setup

You will need these tools installed on your local machine to complete the exercises:

  1. Docker - platform for building and running containerized applications. You should already have this installed from the previous ungraded labs. If not, you can see the instructions here. If you are using Docker Desktop (Mac or Windows), you may need to increase the resource limits to start Kubeflow Pipelines later. You can click on the Docker icon in your task bar, choose Preferences, and adjust the CPUs to 4, Storage to 50GB, and the memory to at least 4GB (8GB recommended). Just make sure you are not maxing out any of these limits (i.e. the slider should ideally be at the midpoint or less) since that can make your machine slow or unresponsive. If you're constrained on resources, don't worry. You can still use this notebook as a reference since we'll show the expected outputs at each step. The important thing is to become familiar with Kubeflow Pipelines before you get more hands-on in the assignment.

  2. kubectl - tool for running commands on Kubernetes clusters. This should also be installed from the previous labs. If not, please see the instructions here.

  3. kind - a tool for running local Kubernetes clusters using Docker. Please follow the instructions here to install kind and create a local cluster. (NOTE: This lab currently does not support Kubernetes v1.22 and above. Please check the default Kubernetes image used by the kind version you are about to download here. If it is using v1.22 or higher, consider downloading an older version or using the --image flag when creating the cluster (e.g. kind create cluster --image=kindest/node:v1.19.1). After creating the cluster, you can check the Kubernetes version with the command kubectl version. This lab was tested using kind v0.9 running Kubernetes v1.19.1.)

  4. Kubeflow Pipelines (KFP) - a platform for building and deploying portable, scalable machine learning (ML) workflows based on Docker containers. Once you've created a local cluster using kind, you can deploy Kubeflow Pipelines with these commands. (NOTE: This lab was tested using KFP v1.7.0).

export PIPELINE_VERSION=1.7.0
kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/cluster-scoped-resources?ref=$PIPELINE_VERSION&timeout=300"
kubectl wait --for condition=established --timeout=300s crd/applications.app.k8s.io
kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/env/platform-agnostic-pns?ref=$PIPELINE_VERSION&timeout=300"

You can enter the commands above one line at a time. These will set up all the deployments and spin up the pods for the entire application, all of which will be found in the kubeflow namespace. After sending the last command, it will take some time (around 30 minutes) for all the deployments to be ready. You can run the command kubectl get deploy -n kubeflow a few times to check the status. You should see all deployments with the READY status before you proceed to the next section.

NAME                              READY   UP-TO-DATE   AVAILABLE   AGE
cache-deployer-deployment         1/1     1            1           21h
cache-server                      1/1     1            1           21h
metadata-envoy-deployment         1/1     1            1           21h
metadata-grpc-deployment          1/1     1            1           21h
metadata-writer                   1/1     1            1           21h
minio                             1/1     1            1           21h
ml-pipeline                       1/1     1            1           21h
ml-pipeline-persistenceagent      1/1     1            1           21h
ml-pipeline-scheduledworkflow     1/1     1            1           21h
ml-pipeline-ui                    1/1     1            1           21h
ml-pipeline-viewer-crd            1/1     1            1           21h
ml-pipeline-visualizationserver   1/1     1            1           21h
mysql                             1/1     1            1           21h
workflow-controller               1/1     1            1           21h

When everything is ready, you can run the following command to access the ml-pipeline-ui service.

kubectl port-forward -n kubeflow svc/ml-pipeline-ui 8080:80

The terminal should respond with something like this:

Forwarding from 127.0.0.1:8080 -> 3000
Forwarding from [::1]:8080 -> 3000

You can then open your browser and go to http://localhost:8080 to see the user interface.

kfp ui

Operationalizing your ML Pipelines

As you know, generating a trained model involves executing a sequence of steps. Here is a high level overview of what these steps might look like:

highlevel.jpg

Think back to the very first model you ever built; more likely than not, your code followed a similar flow. In essence, building an ML pipeline mainly involves implementing these steps, but you will need to optimize your operations to deliver value to your team. Platforms such as Kubeflow help you build ML pipelines that are automated, reproducible, and easily monitored. You will see this as you build your pipeline in the sections below.

Pipeline components

The main building blocks of your ML pipeline are referred to as components. In the context of Kubeflow, these are containerized applications that run a specific task in the pipeline. Moreover, these components generate and consume artifacts from other components. For example, a download task will generate a dataset artifact and this will be consumed by a data splitting task. If you go back to the simple pipeline image above and describe it using tasks and artifacts, it will look something like this:

img/simple_dag.jpg

This relationship between tasks and their artifacts is what constitutes a pipeline, and it forms a directed acyclic graph (DAG).

Kubeflow Pipelines lets you create components either by building the component specification directly or through Python functions. For this lab, you will use the latter since it is more intuitive and allows for quick iteration. As you gain more experience, you can explore building the component specification directly, especially if you want to use languages other than Python.

You will begin by installing the Kubeflow Pipelines SDK. Remember to restart the runtime to load the newly installed modules in Colab.
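In a Colab cell, the install is a one-liner (shown here without a version pin; the lab notebook may pin a specific kfp release):

# Install the Kubeflow Pipelines SDK, then restart the runtime so the new modules are picked up
!pip install kfp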

Note: Please do not proceed to the next steps without restarting the Runtime after installing kfp. You can do that by either pressing the Restart Runtime button at the end of the cell output above, or going to the Runtime button at the Colab toolbar above and selecting Restart Runtime.

Now you will import the modules you will be using to construct the Kubeflow pipeline. You will learn more about what these are for in the next sections.
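If you are recreating the notebook yourself, the imports look roughly like this (a sketch based on the KFP v2 SDK syntax used throughout this lab; your notebook may import a few additional names):

# Kubeflow Pipelines SDK (v2 DSL)
import kfp
from kfp.v2 import dsl, compiler
from kfp.v2.dsl import component, Input, Output, Dataset, Model, Metrics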

In this lab, you will build a pipeline to train a multi-output model on the Energy Efficiency dataset from the UCI Machine Learning Repository. It uses the building features (e.g. wall area, roof area) as inputs and has two outputs: Cooling Load and Heating Load. You will follow the five-task graph above with some slight differences in the generated artifacts.

You will now build the component to load your data into the pipeline. The code is shown below and we will discuss the syntax in more detail after running it.

When building a component, it's good to determine first its inputs and outputs.
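Here is a sketch of what the data-loading component can look like. The url and output_csv parameters, the pandas/openpyxl packages, and the download_data name all come from the discussion in this section, but the output component filename and the CSV conversion details are illustrative and may differ from the lab's exact code.

from kfp.v2.dsl import component, Output, Dataset  # as imported above

@component(
    packages_to_install=['pandas', 'openpyxl'],
    output_component_file='download_data_component.yaml'
)
def download_data(url: str, output_csv: Output[Dataset]):
    """Downloads the Excel file from the given URL and saves it as a CSV artifact."""
    import pandas as pd

    # openpyxl is needed to parse the .xlsx file
    df = pd.read_excel(url)

    # write to the path Kubeflow allocates for the output artifact
    df.to_csv(output_csv.path, index=False)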

The inputs and outputs are declared as parameters in the function definition. As you can see in the code, we defined a url parameter with a str type and an output_csv parameter with an Output[Dataset] type.

Lastly, you'll need to use the component decorator to specify that this is a Kubeflow Pipeline component. The documentation shows several parameters you can set, and two of them are used in the code above. As the name suggests, the packages_to_install argument declares any extra packages outside the base image that are needed to run your code. As of this writing, the default base image is python:3.7, so you'll need pandas and openpyxl to load the Excel file.

The output_component_file argument names an output file that will contain the specification for your newly built component. You should see it in the Colab file explorer once you've run the cell above. You'll see your code there along with other settings that pertain to your component. You can use this file when building other pipelines if necessary, so you don't have to rewrite your code in a notebook for your next project as long as you have this YAML file. You can also pass it to your team members or use it on another machine. Kubeflow also hosts other reusable modules in their repo here. For example, if you want a file downloader component in one of your projects, you can load the component from that repo using the load_component_from_url function as shown below. The YAML file of that component should tell you the inputs and outputs so you can use it accordingly.

web_downloader_op = kfp.components.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/web/Download/component-sdk-v2.yaml')

Next, you will build the next component in the pipeline. Like in the previous step, you should design it first with inputs and outputs in mind. You know that the input of this component will come from the artifact generated by the download_data() function above. To declare input artifacts, you can annotate your parameter with the Input[Dataset] data type as shown below. For the outputs, you want to have two: train and test datasets. You can see the implementation below:
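A sketch of such a splitting component is shown here. The parameter names (input_csv, train_csv, test_csv), the 80/20 split, and the use of scikit-learn are illustrative assumptions, not necessarily the lab's exact code.

from kfp.v2.dsl import component, Input, Output, Dataset  # as imported above

@component(
    packages_to_install=['pandas', 'scikit-learn'],
    output_component_file='split_data_component.yaml'
)
def split_data(input_csv: Input[Dataset], train_csv: Output[Dataset], test_csv: Output[Dataset]):
    """Splits the downloaded dataset into train and test CSV artifacts."""
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # consume the artifact produced by the download_data() component
    df = pd.read_csv(input_csv.path)

    # hold out 20% of the rows for testing
    train, test = train_test_split(df, test_size=0.2)

    train.to_csv(train_csv.path, index=False)
    test.to_csv(test_csv.path, index=False)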

Building and Running a Pipeline

Now that you have at least two components, you can try building a pipeline just to quickly see how it works. The code is shown below. Basically, you just define a function with the sequence of steps then use the dsl.pipeline decorator. Notice in the last line (i.e. split_data_task) that to get a particular artifact from a previous step, you will need to use the outputs dictionary and use the parameter name as the key.
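For reference, a sketch of such a pipeline function is shown here. The pipeline name, the my_pipeline function name, and the url pipeline parameter are illustrative; split_data_task and the outputs['output_csv'] lookup follow the naming discussed above.

from kfp.v2 import dsl  # uses the download_data and split_data components defined earlier

@dsl.pipeline(
    name='download-and-split-pipeline',
)
def my_pipeline(url: str):
    # each call to a component creates a task in the DAG
    download_data_task = download_data(url=url)

    # fetch the upstream artifact via the outputs dictionary, keyed by the parameter name
    split_data_task = split_data(input_csv=download_data_task.outputs['output_csv'])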

To generate your pipeline specification file, you need to compile your pipeline function using the Compiler class as shown below.
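A minimal sketch of that step, assuming the kfp.v2 compiler imported earlier (the exact compiler class and arguments can vary between SDK releases):

from kfp.v2 import compiler  # already imported above

# compile the pipeline function into a specification file that you can upload to the KFP UI
compiler.Compiler().compile(
    pipeline_func=my_pipeline,
    package_path='pipeline.yaml'
)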

After running the cell, you'll see a pipeline.yaml file in the Colab file explorer. Please download that because it will be needed in the next step.

You can run a pipeline programmatically or from the UI. For this exercise, you will do it from the UI and you will see how it is done programmatically in the Qwiklabs later this week.

Please go back to the Kubeflow Pipelines UI and click Upload Pipelines from the Pipelines page.

upload.png

Next, select Upload a file and choose the pipeline.yaml you downloaded earlier then click Create. This will open a screen showing your simple DAG (just two tasks).

dag_kfp.png

Click Create Run, then scroll to the bottom to input the URL of the Excel file: https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx . Then click Start.

url.png

Select the topmost entry in the Runs page and you should see the progress of your run. You can click on the download-data box to see more details about that particular task (i.e. the URL input and the container logs). After it turns green, you should also see the output artifact and you can download it if you want by clicking the minio link.

progress.png

Eventually, both tasks will turn green indicating that the run completed successfully. Nicely done!

Generate the rest of the components

Now that you've seen a sample workflow, you can build the rest of the components for preprocessing, model training, and model evaluation. The functions will be longer because the task is more complex. Nonetheless, it follows the same principles as before such as declaring inputs and outputs, and specifying the additional packages.

In the eval_model() function, you'll notice the use of log_metric() to record the results. You'll see these in the Visualizations tab of that task after it has completed.
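For reference, the evaluation component's use of the Metrics artifact can look something like the sketch below. The parameter names, the Y1/Y2 target columns, and the TensorFlow/Keras calls are illustrative assumptions; the lab's actual implementation may differ.

from kfp.v2.dsl import component, Input, Output, Dataset, Model, Metrics  # as imported above

@component(
    packages_to_install=['pandas', 'tensorflow'],
    output_component_file='eval_model_component.yaml'
)
def eval_model(test_data: Input[Dataset], model: Input[Model], metrics: Output[Metrics]):
    """Evaluates the trained model on the test split and logs the results."""
    import pandas as pd
    import tensorflow as tf

    # load the test split and the trained model produced by the upstream tasks
    df = pd.read_csv(test_data.path)
    keras_model = tf.keras.models.load_model(model.path)

    # separate the building features from the two regression targets
    x_test = df.drop(['Y1', 'Y2'], axis=1)
    y_test = df[['Y1', 'Y2']]

    # evaluate and record each result; these appear in the task's Visualizations tab
    results = keras_model.evaluate(x_test, y_test, return_dict=True)
    for name, value in results.items():
        metrics.log_metric(name, float(value))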

Build and run the complete pipeline

You can then build and run the entire pipeline as you did earlier. It will take around 20 minutes for all the tasks to complete and you can see the Logs tab of each task to see how it's going. For instance, you can see there the model training epochs as you normally see in a notebook environment.

After you've uploaded and run the entire pipeline, you should see all green boxes and the training metrics in the Visualizations tab of the eval-model task.

./img/complete_pipeline.png

Tear Down

If you're done experimenting with the software and want to free up resources, you can execute the commands below to delete Kubeflow Pipelines from your system:

export PIPELINE_VERSION=1.7.0
kubectl delete -k "github.com/kubeflow/pipelines/manifests/kustomize/env/platform-agnostic-pns?ref=$PIPELINE_VERSION"
kubectl delete -k "github.com/kubeflow/pipelines/manifests/kustomize/cluster-scoped-resources?ref=$PIPELINE_VERSION"

You can delete the cluster for kind with the following:

kind delete cluster

Wrap Up

This lab demonstrated how you can use Kubeflow Pipelines to build and orchestrate your ML workflows. Having automated, shareable, and modular pipelines is a very useful feature in production deployments so you and your team can monitor and maintain your system more effectively. In the first Qwiklabs this week, you will use Kubeflow Pipelines as part of the Google Cloud AI Platform. You'll see more features implemented there such as integration with Tensorboard and more output visualizations from each component. If you want to know more, you can start with the Kubeflow Pipelines documentation and start conversations in Discourse.

Great job and on to the next part of the course!