# coding: utf-8

import numpy as np
import tensorflow as tf
import pandas as pd
import sklearn
import sklearn.model_selection


# *Python Machine Learning 3rd Edition* by [Sebastian Raschka](https://sebastianraschka.com) & [Vahid Mirjalili](http://vahidmirjalili.com), Packt Publishing Ltd. 2019
#
# Code Repository: https://github.com/rasbt/python-machine-learning-book-3rd-edition
#
# Code License: [MIT License](https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/LICENSE.txt)

# # Chapter 14: Going Deeper -- the Mechanics of TensorFlow (Part 2/3)

# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).

# ## TensorFlow Estimators
#
# ##### Steps for using pre-made estimators
#
# * **Step 1:** Define the input function for importing the data
# * **Step 2:** Define the feature columns to bridge between the estimator and the data
# * **Step 3:** Instantiate an estimator or convert a Keras model to an estimator
# * **Step 4:** Use the estimator: `train()`, `evaluate()`, and `predict()`

tf.random.set_seed(1)
np.random.seed(1)

# ### Working with feature columns
#
# * See definition: https://developers.google.com/machine-learning/glossary/#feature_columns
# * Documentation: https://www.tensorflow.org/api_docs/python/tf/feature_column

dataset_path = tf.keras.utils.get_file(
    "auto-mpg.data",
    ("http://archive.ics.uci.edu/ml/machine-learning-databases"
     "/auto-mpg/auto-mpg.data"))

column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower',
                'Weight', 'Acceleration', 'ModelYear', 'Origin']

df = pd.read_csv(dataset_path, names=column_names,
                 na_values="?", comment='\t',
                 sep=" ", skipinitialspace=True)

df.tail()

# Count missing values per column, then drop the incomplete rows:
print(df.isna().sum())

df = df.dropna()
df = df.reset_index(drop=True)
df.tail()

# Split into training/test sets and standardize the numeric features,
# using statistics computed on the training set only:
df_train, df_test = sklearn.model_selection.train_test_split(df, train_size=0.8)
train_stats = df_train.describe().transpose()
train_stats

numeric_column_names = ['Cylinders', 'Displacement',
                        'Horsepower', 'Weight',
                        'Acceleration']

df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    df_train_norm.loc[:, col_name] = (df_train_norm.loc[:, col_name] - mean) / std
    df_test_norm.loc[:, col_name] = (df_test_norm.loc[:, col_name] - mean) / std

df_train_norm.tail()

# #### Numeric Columns

numeric_features = []
for col_name in numeric_column_names:
    numeric_features.append(tf.feature_column.numeric_column(key=col_name))

numeric_features

# Bucketize ModelYear into four intervals:
# (-inf, 73), [73, 76), [76, 79), and [79, +inf):
feature_year = tf.feature_column.numeric_column(key="ModelYear")

bucketized_features = []
bucketized_features.append(tf.feature_column.bucketized_column(
    source_column=feature_year,
    boundaries=[73, 76, 79]))

print(bucketized_features)

# Encode the categorical Origin feature (1=USA, 2=Europe, 3=Japan)
# as a one-hot indicator column:
feature_origin = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Origin',
    vocabulary_list=[1, 2, 3])

categorical_indicator_features = []
categorical_indicator_features.append(
    tf.feature_column.indicator_column(feature_origin))

print(categorical_indicator_features)
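# A minimal sanity-check sketch (not part of the original book code): a
# `tf.keras.layers.DenseFeatures` layer applies the feature-column
# transformations to a dict of raw tensors, which makes it easy to see the
# bucketized and one-hot encoded values before handing the columns to an
# estimator. The example input values below are hypothetical, chosen only
# for illustration.
inspection_layer = tf.keras.layers.DenseFeatures(
    bucketized_features + categorical_indicator_features)
example_input = {'ModelYear': tf.constant([70, 75, 82]),
                 'Origin': tf.constant([1, 2, 3])}
# each row concatenates the 4-dim bucketized ModelYear encoding
# and the 3-dim Origin one-hot encoding
print(inspection_layer(example_input))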
# ### Machine learning with pre-made Estimators

# The input function returns a `tf.data.Dataset` of (features-dict, target)
# pairs; the estimator calls it to obtain its training data.
def train_input_fn(df_train, batch_size=8):
    df = df_train.copy()
    train_x, train_y = df, df.pop('MPG')
    dataset = tf.data.Dataset.from_tensor_slices((dict(train_x), train_y))

    # shuffle, repeat, and batch the examples
    return dataset.shuffle(1000).repeat().batch(batch_size)


## inspection
ds = train_input_fn(df_train_norm)
batch = next(iter(ds))
print('Keys:', batch[0].keys())
print('Batch Model Years:', batch[0]['ModelYear'])

all_feature_columns = (numeric_features +
                       bucketized_features +
                       categorical_indicator_features)
print(all_feature_columns)

regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=[32, 10],
    model_dir='models/autompg-dnnregressor/')

EPOCHS = 1000
BATCH_SIZE = 8
total_steps = EPOCHS * int(np.ceil(len(df_train) / BATCH_SIZE))
print('Training Steps:', total_steps)

regressor.train(
    input_fn=lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE),
    steps=total_steps)

# Re-create the estimator from the checkpoints written to `model_dir`:
reloaded_regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=[32, 10],
    warm_start_from='models/autompg-dnnregressor/',
    model_dir='models/autompg-dnnregressor/')


def eval_input_fn(df_test, batch_size=8):
    df = df_test.copy()
    test_x, test_y = df, df.pop('MPG')
    dataset = tf.data.Dataset.from_tensor_slices((dict(test_x), test_y))
    return dataset.batch(batch_size)


eval_results = reloaded_regressor.evaluate(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

for key in eval_results:
    print('{:15s} {}'.format(key, eval_results[key]))

print('Average-Loss {:.4f}'.format(eval_results['average_loss']))

pred_res = regressor.predict(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

print(next(iter(pred_res)))

# #### Boosted Tree Regressor

boosted_tree = tf.estimator.BoostedTreesRegressor(
    feature_columns=all_feature_columns,
    n_batches_per_layer=20,
    n_trees=200)

# No `steps` argument is needed here: training stops automatically once
# `n_trees` trees have been built, so the infinitely repeated input is safe.
boosted_tree.train(
    input_fn=lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE))

eval_results = boosted_tree.evaluate(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

print(eval_results)
print('Average-Loss {:.4f}'.format(eval_results['average_loss']))
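# A short usage sketch (not part of the original notebook): `predict()`
# returns a generator of per-example dicts, each holding a `'predictions'`
# array of shape (1,). Materializing it lets us compare the boosted tree's
# predictions directly against the true MPG values.
pred_res = boosted_tree.predict(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

# collect the scalar prediction from every per-example dict
y_pred = np.array([item['predictions'][0] for item in pred_res])
y_true = df_test_norm['MPG'].values
print('Test MAE: {:.4f}'.format(np.mean(np.abs(y_true - y_pred))))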