Ungraded Lab: Iterative Schema with TFX and ML Metadata

In this notebook, you will review how to update an inferred schema and save the result to the metadata store used by TFX. As mentioned before, TFX components read information from this database before running executions, so if you curate a schema, you need to save it as an artifact in the metadata store. You will see how that is done in the following exercise.

Afterwards, you will also practice accessing the TFX metadata store and see how you can track the lineage of an artifact.

Setup

Imports

Define paths

For familiarity, you will again be using the Census Income dataset from the previous weeks' ungraded labs. You will use the same paths to your raw data and pipeline files as shown below.

Data Pipeline

Each TFX component you use accepts and generates artifacts, which are instances of the different artifact types TFX has configured in the metadata store. The properties of these instances are shown neatly in a table in the outputs of context.run(). TFX does all of this for you, so you only need to inspect the output of each component to know which property of the artifact you can pass on to the next component (e.g. the outputs['examples'] of ExampleGen can be passed to StatisticsGen).

Since you've already used this dataset before, we will just quickly go over ExampleGen, StatisticsGen, and SchemaGen. The new concepts will be discussed after those components.

Create the Interactive Context

ExampleGen

StatisticsGen

SchemaGen

Curating the Schema

Now that you have the inferred schema, you can proceed to revising it to be more robust. For instance, you can restrict the age as you did in Week 1. First, you have to load the Schema protocol buffer from the metadata store. You can do this by getting the schema URI from the output of SchemaGen and then using TFDV's load_schema_text() method.

With that, you can now make changes to the schema as before. For the purpose of this exercise, you will only modify the age domain but feel free to add more if you want.

Schema Environments

By default, your schema expects all of the features declared above, including the label. However, when the model is served for inference, it will receive datasets that do not have the label, because that is the feature the model is trying to predict. You need to configure the pipeline so it does not raise an alarm when this kind of dataset is received.

You can do that with schema environments. First, you will need to declare training and serving environments, then configure the serving environment to not expect the label. See how it is implemented below.

You can now freeze the curated schema and save it to a local directory.

ImportSchemaGen

Now that the schema has been saved, you need to create an artifact in the metadata store that will point to it. TFX provides the ImportSchemaGen component for importing a curated schema into ML Metadata. You simply need to specify the path of the revised schema file.

If you pass in the component output to context.show(), then you should see the schema.

ExampleValidator

You can then use this new artifact as input to the other components of the pipeline. See how it is used as the schema argument in ExampleValidator below.

Practice with ML Metadata

At this point, take some time to explore the contents of the metadata store saved by your component runs. This will let you practice tracking artifacts and how they relate to each other by looking at artifacts, executions, and events. This skill lets you recover related artifacts even without seeing the code of the training run; all you need is access to the metadata store.

See how the input artifact IDs to an instance of ExampleAnomalies are tracked in the following cells. Since you are working through this notebook, you already know that it uses the output of StatisticsGen for this run as well as the curated schema you imported. However, once you have hundreds of training runs and parameter iterations, it becomes hard to track which is which. That's where the metadata store is useful: since it records information about a specific pipeline run, you can trace the inputs and outputs of a particular artifact.

You will start by setting the connection config to the metadata store.

Next, let's see what artifact types are available in the metadata store.

If you get the artifacts of type Schema, you will see that there are two entries: one is the inferred schema and the other is the curated schema you imported. At the end of this exercise, you can verify that the curated schema is the one used for the ExampleValidator run we will be investigating.

Let's retrieve the first ExampleAnomalies instance, which is the output of ExampleValidator.

You will use the artifact ID to get events related to it. Let's just get the first instance.

As expected, the event type will be an OUTPUT, because this is the output of the ExampleValidator component. Since we want the inputs, we can trace them through the execution ID.

The artifacts marked as INPUT above represent the statistics and schema inputs. We can extract their IDs programmatically like this. You will see that you get the artifact ID of the curated schema you printed out earlier.

Congratulations! You have now completed this notebook on iterative schemas and seen how a curated schema can be used in a TFX pipeline. You were also able to track an artifact's lineage by looking at the artifacts, events, and executions in the metadata store. These skills will come in handy in this week's assignment!