import tensorflow as tf

from tfx import v1 as tfx

from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from google.protobuf.json_format import MessageToDict

import os
import pprint
pp = pprint.PrettyPrinter()


# location of the pipeline metadata store
_pipeline_root = './pipeline/'

# directory of the raw data files
_data_root = './data/census_data'

# path to the raw training data
_data_filepath = os.path.join(_data_root, 'adult.data')


# preview the first few rows of the CSV file
!head {_data_filepath}

age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K
38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K
53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners, Husband, Black, Male, 0, 0, 40, United-States, <=50K
28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K
37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K
49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K
52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K
31, Private, 45781, Masters, 14, Never-married, Prof-specialty, Not-in-family, White, Female, 14084, 0, 50, United-States, >50K


# Initialize the InteractiveContext with a local sqlite file.
# If you leave `_pipeline_root` blank, then the db will be created in a temporary directory.
# You can safely ignore the warning about the missing config file.
context = InteractiveContext(pipeline_root=_pipeline_root)

WARNING:absl:InteractiveContext metadata_connection_config not provided: using SQLite ML Metadata database at ./pipeline/metadata.sqlite.


# Instantiate ExampleGen with the input CSV dataset
example_gen = tfx.components.CsvExampleGen(input_base=_data_root)


# Execute the component
context.run(example_gen)

WARNING:root:Make sure that locally built Python SDK docker image has Python 3.8 interpreter.


# get the artifact object
artifact = example_gen.outputs['examples'].get()[0]

# print split names and uri
print(f'split names: {artifact.split_names}')
print(f'artifact uri: {artifact.uri}')

split names: ["train", "eval"]
artifact uri: ./pipeline/CsvExampleGen/examples/1


# Get the URI of the output artifact representing the training examples
train_uri = os.path.join(artifact.uri, 'Split-train')

# See the contents of the `train` folder
!ls {train_uri}

data_tfrecord-00000-of-00001.gz


# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")


# Define a helper function to get individual examples
def get_records(dataset, num_records):
    '''Extracts records from the given dataset.
    Args:
        dataset (TFRecordDataset): dataset saved by ExampleGen
        num_records (int): number of records to preview
    '''
    
    # initialize an empty list
    records = []
    
    # Use the `take()` method to specify how many records to get
    for tfrecord in dataset.take(num_records):
        
        # Get the numpy property of the tensor
        serialized_example = tfrecord.numpy()
        
        # Initialize a `tf.train.Example()` to read the serialized data
        example = tf.train.Example()
        
        # Read the example data (output is a protocol buffer message)
        example.ParseFromString(serialized_example)
        
        # convert the protocol bufffer message to a Python dictionary
        example_dict = (MessageToDict(example))
        
        # append to the records list
        records.append(example_dict)
        
    return records


# Get 3 records from the dataset
sample_records = get_records(dataset, 3)

# Print the output
pp.pprint(sample_records)

[{'features': {'feature': {'age': {'int64List': {'value': ['39']}},
                           'capital-gain': {'int64List': {'value': ['2174']}},
                           'capital-loss': {'int64List': {'value': ['0']}},
                           'education': {'bytesList': {'value': ['IEJhY2hlbG9ycw==']}},
                           'education-num': {'int64List': {'value': ['13']}},
                           'fnlwgt': {'int64List': {'value': ['77516']}},
                           'hours-per-week': {'int64List': {'value': ['40']}},
                           'label': {'bytesList': {'value': ['IDw9NTBL']}},
                           'marital-status': {'bytesList': {'value': ['IE5ldmVyLW1hcnJpZWQ=']}},
                           'native-country': {'bytesList': {'value': ['IFVuaXRlZC1TdGF0ZXM=']}},
                           'occupation': {'bytesList': {'value': ['IEFkbS1jbGVyaWNhbA==']}},
                           'race': {'bytesList': {'value': ['IFdoaXRl']}},
                           'relationship': {'bytesList': {'value': ['IE5vdC1pbi1mYW1pbHk=']}},
                           'sex': {'bytesList': {'value': ['IE1hbGU=']}},
                           'workclass': {'bytesList': {'value': ['IFN0YXRlLWdvdg==']}}}}},
 {'features': {'feature': {'age': {'int64List': {'value': ['50']}},
                           'capital-gain': {'int64List': {'value': ['0']}},
                           'capital-loss': {'int64List': {'value': ['0']}},
                           'education': {'bytesList': {'value': ['IEJhY2hlbG9ycw==']}},
                           'education-num': {'int64List': {'value': ['13']}},
                           'fnlwgt': {'int64List': {'value': ['83311']}},
                           'hours-per-week': {'int64List': {'value': ['13']}},
                           'label': {'bytesList': {'value': ['IDw9NTBL']}},
                           'marital-status': {'bytesList': {'value': ['IE1hcnJpZWQtY2l2LXNwb3VzZQ==']}},
                           'native-country': {'bytesList': {'value': ['IFVuaXRlZC1TdGF0ZXM=']}},
                           'occupation': {'bytesList': {'value': ['IEV4ZWMtbWFuYWdlcmlhbA==']}},
                           'race': {'bytesList': {'value': ['IFdoaXRl']}},
                           'relationship': {'bytesList': {'value': ['IEh1c2JhbmQ=']}},
                           'sex': {'bytesList': {'value': ['IE1hbGU=']}},
                           'workclass': {'bytesList': {'value': ['IFNlbGYtZW1wLW5vdC1pbmM=']}}}}},
 {'features': {'feature': {'age': {'int64List': {'value': ['38']}},
                           'capital-gain': {'int64List': {'value': ['0']}},
                           'capital-loss': {'int64List': {'value': ['0']}},
                           'education': {'bytesList': {'value': ['IEhTLWdyYWQ=']}},
                           'education-num': {'int64List': {'value': ['9']}},
                           'fnlwgt': {'int64List': {'value': ['215646']}},
                           'hours-per-week': {'int64List': {'value': ['40']}},
                           'label': {'bytesList': {'value': ['IDw9NTBL']}},
                           'marital-status': {'bytesList': {'value': ['IERpdm9yY2Vk']}},
                           'native-country': {'bytesList': {'value': ['IFVuaXRlZC1TdGF0ZXM=']}},
                           'occupation': {'bytesList': {'value': ['IEhhbmRsZXJzLWNsZWFuZXJz']}},
                           'race': {'bytesList': {'value': ['IFdoaXRl']}},
                           'relationship': {'bytesList': {'value': ['IE5vdC1pbi1mYW1pbHk=']}},
                           'sex': {'bytesList': {'value': ['IE1hbGU=']}},
                           'workclass': {'bytesList': {'value': ['IFByaXZhdGU=']}}}}}]


# Instantiate StatisticsGen with the ExampleGen ingested dataset
statistics_gen = tfx.components.StatisticsGen(
    examples=example_gen.outputs['examples'])

# Execute the component
context.run(statistics_gen)

WARNING:root:Make sure that locally built Python SDK docker image has Python 3.8 interpreter.


# Show the output statistics
context.show(statistics_gen.outputs['statistics'])


# Instantiate SchemaGen with the StatisticsGen ingested dataset
schema_gen = tfx.components.SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    )

# Run the component
context.run(schema_gen)


# Visualize the schema
context.show(schema_gen.outputs['schema'])


# Instantiate ExampleValidator with the StatisticsGen and SchemaGen ingested data
example_validator = tfx.components.ExampleValidator(
    statistics=statistics_gen.outputs['statistics'],
    schema=schema_gen.outputs['schema'])

# Run the component.
context.run(example_validator)


# Visualize the results
context.show(example_validator.outputs['anomalies'])


# Set the constants module filename
_census_constants_module_file = 'census_constants.py'


%%writefile {_census_constants_module_file}

# Features with string data types that will be converted to indices
CATEGORICAL_FEATURE_KEYS = [
    'education', 'marital-status', 'occupation', 'race', 'relationship', 'workclass', 'sex', 'native-country'
]

# Numerical features that are marked as continuous
NUMERIC_FEATURE_KEYS = ['fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Feature that can be grouped into buckets
BUCKET_FEATURE_KEYS = ['age']

# Number of buckets used by tf.transform for encoding each bucket feature.
FEATURE_BUCKET_COUNT = {'age': 4}

# Feature that the model will predict
LABEL_KEY = 'label'

# Utility function for renaming the feature
def transformed_name(key):
    return key + '_xf'

Writing census_constants.py


# Set the transform module filename
_census_transform_module_file = 'census_transform.py'


%%writefile {_census_transform_module_file}

import tensorflow as tf
import tensorflow_transform as tft

import census_constants

# Unpack the contents of the constants module
_NUMERIC_FEATURE_KEYS = census_constants.NUMERIC_FEATURE_KEYS
_CATEGORICAL_FEATURE_KEYS = census_constants.CATEGORICAL_FEATURE_KEYS
_BUCKET_FEATURE_KEYS = census_constants.BUCKET_FEATURE_KEYS
_FEATURE_BUCKET_COUNT = census_constants.FEATURE_BUCKET_COUNT
_LABEL_KEY = census_constants.LABEL_KEY
_transformed_name = census_constants.transformed_name


# Define the transformations
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.
    Args:
        inputs: map from feature keys to raw not-yet-transformed features.
    Returns:
        Map from string feature key to transformed feature operations.
    """
    outputs = {}

    # Scale these features to the range [0,1]
    for key in _NUMERIC_FEATURE_KEYS:
        outputs[_transformed_name(key)] = tft.scale_to_0_1(
            inputs[key])
    
    # Bucketize these features
    for key in _BUCKET_FEATURE_KEYS:
        outputs[_transformed_name(key)] = tft.bucketize(
            inputs[key], _FEATURE_BUCKET_COUNT[key])

    # Convert strings to indices in a vocabulary
    for key in _CATEGORICAL_FEATURE_KEYS:
        outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(inputs[key])

    # Convert the label strings to an index
    outputs[_transformed_name(_LABEL_KEY)] = tft.compute_and_apply_vocabulary(inputs[_LABEL_KEY])

    return outputs

Writing census_transform.py


# Ignore TF warning messages
tf.get_logger().setLevel('ERROR')

# Instantiate the Transform component
transform = tfx.components.Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file=os.path.abspath(_census_transform_module_file))

# Run the component
context.run(transform)

WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Tuple[Dict[str, Union[NoneType, _Dataset]], Union[Dict[str, Dict[str, PCollection]], NoneType], int] instead.
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_1/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_2/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_3/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_4/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_5/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_6/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_7/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_8/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_1/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_2/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_3/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_4/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_5/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_6/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_7/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:absl:Tables initialized inside a tf.function  will be re-initialized on every invocation of the function. This  re-initialization can have significant impact on performance. Consider lifting  them out of the graph context using  `tf.init_scope`.: compute_and_apply_vocabulary_8/apply_vocab/text_file_init/InitializeTableFromTextFileV2
WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Tuple[Dict[str, Union[NoneType, _Dataset]], Union[Dict[str, Dict[str, PCollection]], NoneType], int] instead.
WARNING:root:Make sure that locally built Python SDK docker image has Python 3.8 interpreter.


# Get the uri of the transform graph
transform_graph_uri = transform.outputs['transform_graph'].get()[0].uri

# List the subdirectories under the uri
os.listdir(transform_graph_uri)

['metadata', 'transformed_metadata', 'transform_fn']


# Get the URI of the output artifact representing the transformed examples
train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, 'Split-train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
transformed_dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")


# Get 3 records from the dataset
sample_records_xf = get_records(transformed_dataset, 3)

# Print the output
pp.pprint(sample_records_xf)

[{'features': {'feature': {'age_xf': {'int64List': {'value': ['2']}},
                           'capital-gain_xf': {'floatList': {'value': [0.021740217]}},
                           'capital-loss_xf': {'floatList': {'value': [0.0]}},
                           'education-num_xf': {'floatList': {'value': [0.8]}},
                           'education_xf': {'int64List': {'value': ['2']}},
                           'fnlwgt_xf': {'floatList': {'value': [0.044301897]}},
                           'hours-per-week_xf': {'floatList': {'value': [0.39795917]}},
                           'label_xf': {'int64List': {'value': ['0']}},
                           'marital-status_xf': {'int64List': {'value': ['1']}},
                           'native-country_xf': {'int64List': {'value': ['0']}},
                           'occupation_xf': {'int64List': {'value': ['3']}},
                           'race_xf': {'int64List': {'value': ['0']}},
                           'relationship_xf': {'int64List': {'value': ['1']}},
                           'sex_xf': {'int64List': {'value': ['0']}},
                           'workclass_xf': {'int64List': {'value': ['4']}}}}},
 {'features': {'feature': {'age_xf': {'int64List': {'value': ['3']}},
                           'capital-gain_xf': {'floatList': {'value': [0.0]}},
                           'capital-loss_xf': {'floatList': {'value': [0.0]}},
                           'education-num_xf': {'floatList': {'value': [0.8]}},
                           'education_xf': {'int64List': {'value': ['2']}},
                           'fnlwgt_xf': {'floatList': {'value': [0.048237596]}},
                           'hours-per-week_xf': {'floatList': {'value': [0.12244898]}},
                           'label_xf': {'int64List': {'value': ['0']}},
                           'marital-status_xf': {'int64List': {'value': ['0']}},
                           'native-country_xf': {'int64List': {'value': ['0']}},
                           'occupation_xf': {'int64List': {'value': ['0']}},
                           'race_xf': {'int64List': {'value': ['0']}},
                           'relationship_xf': {'int64List': {'value': ['0']}},
                           'sex_xf': {'int64List': {'value': ['0']}},
                           'workclass_xf': {'int64List': {'value': ['1']}}}}},
 {'features': {'feature': {'age_xf': {'int64List': {'value': ['2']}},
                           'capital-gain_xf': {'floatList': {'value': [0.0]}},
                           'capital-loss_xf': {'floatList': {'value': [0.0]}},
                           'education-num_xf': {'floatList': {'value': [0.53333336]}},
                           'education_xf': {'int64List': {'value': ['0']}},
                           'fnlwgt_xf': {'floatList': {'value': [0.13811344]}},
                           'hours-per-week_xf': {'floatList': {'value': [0.39795917]}},
                           'label_xf': {'int64List': {'value': ['0']}},
                           'marital-status_xf': {'int64List': {'value': ['2']}},
                           'native-country_xf': {'int64List': {'value': ['0']}},
                           'occupation_xf': {'int64List': {'value': ['9']}},
                           'race_xf': {'int64List': {'value': ['0']}},
                           'relationship_xf': {'int64List': {'value': ['1']}},
                           'sex_xf': {'int64List': {'value': ['0']}},
                           'workclass_xf': {'int64List': {'value': ['0']}}}}}]

	Type	Presence	Domain
Feature name
'age'	INT	required	-
'capital-gain'	INT	required	-
'capital-loss'	INT	required	-
'education'	STRING	required	'education'
'education-num'	INT	required	-
'fnlwgt'	INT	required	-
'hours-per-week'	INT	required	-
'label'	STRING	required	'label'
'marital-status'	STRING	required	'marital-status'
'native-country'	STRING	required	'native-country'
'occupation'	STRING	required	'occupation'
'race'	STRING	required	'race'
'relationship'	STRING	required	'relationship'
'sex'	STRING	required	'sex'
'workclass'	STRING	required	'workclass'

	Values
Domain
'education'	' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate', ' HS-grad', ' Masters', ' Preschool', ' Prof-school', ' Some-college'
'label'	' <=50K', ' >50K'
'marital-status'	' Divorced', ' Married-AF-spouse', ' Married-civ-spouse', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed'
'native-country'	' ?', ' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba', ' Dominican-Republic', ' Ecuador', ' El-Salvador', ' England', ' France', ' Germany', ' Greece', ' Guatemala', ' Haiti', ' Honduras', ' Hong', ' Hungary', ' India', ' Iran', ' Ireland', ' Italy', ' Jamaica', ' Japan', ' Laos', ' Mexico', ' Nicaragua', ' Outlying-US(Guam-USVI-etc)', ' Peru', ' Philippines', ' Poland', ' Portugal', ' Puerto-Rico', ' Scotland', ' South', ' Taiwan', ' Thailand', ' Trinadad&Tobago', ' United-States', ' Vietnam', ' Yugoslavia', ' Holand-Netherlands'
'occupation'	' ?', ' Adm-clerical', ' Armed-Forces', ' Craft-repair', ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners', ' Machine-op-inspct', ' Other-service', ' Priv-house-serv', ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support', ' Transport-moving'
'race'	' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White'
'relationship'	' Husband', ' Not-in-family', ' Other-relative', ' Own-child', ' Unmarried', ' Wife'
'sex'	' Female', ' Male'
'workclass'	' ?', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private', ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'

Ungraded Lab: Feature Engineering Pipeline¶

Setup¶

Import packages¶

Define paths¶

Preview the dataset¶

Create the Interactive Context¶

Run TFX components interactively¶

ExampleGen¶

StatisticsGen¶

SchemaGen¶

ExampleValidator¶

No anomalies found.

No anomalies found.

Transform¶