The model
I am training a dense feed-forward NN using the Keras API on TensorFlow. Each sample of the training set defines $\mathbf{X_t}$ and $\mathbf{Y_t}$ for an observed time period $t\in T$. The input vector $\mathbf{X_t}$ consists of multiple predictor values; the output vector $\mathbf{Y_t}$ consists of multiple target values, which are to be predicted by a regression model. The model is high-dimensional and non-linear.
NOTE: I am aware that time-series characteristics (e.g. trend, seasonality) are not addressed by such a model.
The question
For setting the hyperparameters (eg. using Grid Search or Bayesian Tuning):
WHICH HYPERPARAMETERS SHOULD BE CONSIDERED IN WHICH ORDER?
Are there any references (publications?) which propose which hyperparameters to tune in which order?
For now I use an iterative approach similar to this:
[edit: Revised the steps]
Step 1 - grid to find basic training parameters:
- k-fold splits = [10, 20, .., 60]
- batch size = [5, 10, .., 50]
- epochs = [5, 15, .., 40]
Step 2 - grid to define the basic NN layout:
- layer 1 units = [80, 120, 160, 200]
- layer 2 units = [0, 80, 120, 160, 200]
- layer 3 units = [0, 80, 120, 160, 200]
- layer 1 dropout = [.2, .4, .6]
- layer 2 dropout = [.2, .4, .6]
- layer 3 dropout = [.2, .4, .6]
Step 3 - Survey: How big is the influence of the above tuned parameters?
- Mean and deviation of each parameter
- Printing best sets
- Manually checking plots (e.g. density plots of multiple values)
- Freezing 'best' parameters
Step 4&5 - (same as 2&3) with additional parameters
- activation layers 1-3 = [relu, elu]
- activation final layer = [relu, elu, linear]
- kernel_initializer layers 1-3 = [glorot_uniform, uniform, normal]
- optimizer = [Adam, Nadam, RMSprop]
- loss = [MSE, RMSE]
This approach leads to alright results but feels a bit random.
So, are there any ideas or papers on which hyperparameters to tune, and in which order, for such a multivariate regression model? Maybe also promising settings?
Also see: What's your methodology of tuning neural network hyperparameters?
Of course, there are auto-tuners and multiple publications focusing on the tuning of specific parameters, or specifically on convolutional NNs - but unfortunately I am not aware of a holistic concept in the domain of regression.
And yes: it's a cross-topic with some programming aspects, but I would prefer not to discuss those here :)
For the ones interested you will find the code attached:
from ModelBuilder import *
def main():
    """Run the hyperparameter grid search for the regression network.

    Reads the predictor and target tables named in ``FILENAMES``, defines the
    search grid, and passes both to ``ANN.gridsearch``, which writes its
    progress to ``PICKLEPATH``.

    :return: the list of evaluated parameter sets returned by
        ``ANN.gridsearch``
    """
    # Local import so the script stays runnable even if the star-import above
    # does not re-export pandas.
    import pandas as pd

    FILENAMES = {
        'filename_configurations': 'Data/Input/X.csv',
        'filename_orders': 'Data/Input/Y.csv',
    }
    PICKLEPATH = 'pickle.pickle'
    # Set to the path of an earlier checkpoint to resume instead of starting
    # a fresh grid.
    PICKLEINPUT = None

    # Load the data. ``ANN._single_fit`` calls ``.values`` on both objects, so
    # they have to be pandas DataFrames.
    # NOTE(review): assumes comma-separated files with a header row and no
    # index column -- TODO confirm against the actual CSV layout.
    x = pd.read_csv(FILENAMES['filename_configurations'])
    y = pd.read_csv(FILENAMES['filename_orders'])

    # dict with parameters; every key maps to the list of values to try and
    # the search evaluates the Cartesian product of all lists.
    GRIDPARAMS = {
        'kfold_splits': [20, 30, 40, 50, 60],
        'batch_size': [2],
        'layer_1': [120],
        'dropout_1': [0.2],
        'layer_2': [300],
        'dropout_2': [0.3],
        'layer_3': [0],
        'dropout_3': [0],
        'optimizer': ['Adam', 'Nadam', 'RMSprop'],
        'loss': ['logcosh', 'binary_crossentropy', 'mean_squared_error'],
        'activation_1': ['relu', 'elu'],
        'activation_2': ['relu', 'elu'],
        'kernel_initializer_1': ['glorot_uniform', 'uniform', 'normal'],
        'kernel_initializer_2': ['glorot_uniform', 'uniform', 'normal'],
    }

    model = ANN()
    results = model.gridsearch(
        x=x,
        y=y,
        steps=10,
        jobs=4,  # to avoid CPU overload, do not use more than 5
        pickle_path=PICKLEPATH,
        pickle_input=PICKLEINPUT,
        grid_params=GRIDPARAMS,
    )
    return results
The class of the model
from itertools import product
from joblib import Parallel, delayed
from sklearn.model_selection import KFold
import numpy as np
import math
import pickle
import time
class ANN:
    """Grid-search driver that fits and scores Keras feed-forward networks.

    The instance holds no state; data and parameters are passed to the
    methods directly.
    """

    def __init__(self):
        """Create the (stateless) grid-search helper."""

    def gridsearch(self, x, y, steps, jobs, pickle_path, pickle_input=None, grid_params=None,):
        """Evaluate every parameter set and checkpoint the progress.

        Parameter sets are processed in batches of ``steps``; after each
        batch the complete list (finished and pending sets) is pickled to
        ``pickle_path`` so an interrupted search can be resumed by passing
        that file as ``pickle_input``.

        :param x: predictors; must expose ``.values`` (see ``_single_fit``)
        :param y: targets; must expose ``.values`` (see ``_single_fit``)
        :param steps: number of parameter sets evaluated per batch
        :param jobs: ``n_jobs`` handed to ``joblib.Parallel``
        :param pickle_path: file the checkpoint is written to
        :param pickle_input: optional checkpoint to resume from; when given,
            ``grid_params`` is ignored
        :param grid_params: dict mapping a parameter name to the list of
            values to try; required when ``pickle_input`` is None
        :return: list of parameter dicts, each with its score under
            ``'metric'``
        :raises ValueError: if neither ``pickle_input`` nor ``grid_params``
            is given, or if ``steps`` selects no parameter set
        """
        if pickle_input is None:
            if grid_params is None:
                raise ValueError('Either pickle_input or grid_params must be given')
            # Generate product of all parameters
            parameters = self._dict_product(grid_params)
            print('{} Parameter sets are tested'.format(len(parameters)))
        else:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # resume from checkpoint files written by this class.
            with open(pickle_input, 'rb') as handle:
                print('Loading pickle file {}'.format(pickle_input))
                parameters = pickle.load(handle)

        # Loop as long as uncalculated ('metric' is None) elements are present
        while any(param['metric'] is None for param in parameters):
            # Indices of the next batch of pending parameter sets
            idx_to_calculate = [
                i for i, param in enumerate(parameters) if param['metric'] is None
            ][:steps]
            if not idx_to_calculate:
                raise ValueError('steps must select at least one parameter set')

            print(time.strftime("%d.%m.%Y %H:%M:%S"))
            print("Solving instances", min(idx_to_calculate) + 1, "to", max(idx_to_calculate) + 1)

            # Run parallel fitter
            sol = Parallel(n_jobs=jobs, verbose=True)(
                delayed(self._single_fit)(x, y, parameters[i]) for i in idx_to_calculate
            )

            # Pair every result with the index it was computed for. Indexing
            # by offset from the smallest index breaks when the pending
            # indices are not contiguous (possible in a resumed checkpoint).
            for i, metric in zip(idx_to_calculate, sol):
                parameters[i]['metric'] = metric

            # Checkpoint after every batch
            with open(pickle_path, 'wb') as handle:
                pickle.dump(parameters, handle)

        return parameters

    def _single_fit(self, x, y, params):
        """Cross-validate one parameter set and return its mean RMSE.

        For each K-fold split a fresh network is built from ``params``,
        trained on the training part and used to predict the held-out part;
        the out-of-fold predictions for all rows are then scored together.

        :param x: predictors, object exposing ``.values``
        :param y: targets, object exposing ``.values``; indexed as a 2-D
            array (``shape[1]`` is the number of outputs)
        :param params: dict with keys ``kfold_splits``, ``batch_size``,
            ``layer_1..3``, ``dropout_1..3``, ``activation_1..2``,
            ``kernel_initializer_1..2``, ``optimizer``, ``loss`` and an
            optional ``epochs`` (defaults to 25)
        :return: RMSE per target column, averaged over the columns
        """
        # TensorFlow is imported here rather than at module level, so it is
        # only loaded where a fit actually runs.
        import os
        # Silence the TensorFlow C++ log; set ahead of the import below.
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
        from tensorflow import keras
        import tensorflow as tf
        tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

        # Work on the raw arrays
        x = x.values
        y = y.values

        # Out-of-fold predictions, filled fold by fold
        predictions = np.zeros(y.shape)

        # Generate k-fold splits
        kf = KFold(n_splits=params['kfold_splits'])
        for train_index, test_index in kf.split(x):
            # A new model is built per fold; drop the global Keras state of
            # the previous one so memory does not grow with the fold count.
            keras.backend.clear_session()

            x_train, x_test = x[train_index], x[test_index]
            y_train = y[train_index]

            # Define layers
            layer = [
                keras.layers.Dense(
                    params['layer_1'],
                    activation=params['activation_1'],
                    kernel_initializer=params['kernel_initializer_1'],
                    input_shape=(x_train.shape[1],),
                ),
                keras.layers.Dropout(params['dropout_1']),
            ]
            # A unit count of 0 switches the optional hidden layer off
            if params['layer_2'] > 0:
                layer.append(keras.layers.Dense(
                    params['layer_2'],
                    activation=params['activation_2'],
                    kernel_initializer=params['kernel_initializer_2'],
                ))
                layer.append(keras.layers.Dropout(params['dropout_2']))
            if params['layer_3'] > 0:
                layer.append(keras.layers.Dense(params['layer_3'], activation='relu'))
                layer.append(keras.layers.Dropout(params['dropout_3']))
            # Output layer: one unit per target column
            layer.append(keras.layers.Dense(y_train.shape[1], activation='relu'))
            model = keras.Sequential(layer)

            # Compile and fit model
            model.compile(loss=params['loss'], optimizer=params['optimizer'], metrics=['mean_squared_error'])
            model.fit(
                x=x_train,
                y=y_train,
                batch_size=params['batch_size'],
                epochs=params.get('epochs', 25),
                verbose=0,
                shuffle=True,
            )
            predictions[test_index] = model.predict(x_test)

        # Calculate RMSE
        return self._get_mean_rmse(y, predictions)

    @staticmethod
    def _arguments_product(kwargs):
        """Yield one dict per combination of the value lists in ``kwargs``.

        :param kwargs: dict mapping a name to an iterable of candidate values
        :return: generator of dicts with one value chosen per name
        """
        keys = list(kwargs)
        # The value lists must be unpacked; product(vals) would iterate over
        # the lists themselves instead of combining their elements.
        for instance in product(*kwargs.values()):
            yield dict(zip(keys, instance))

    @staticmethod
    def _dict_product(dicts):
        """Expand a parameter grid into the list of all parameter sets.

        Every returned dict additionally carries ``'metric': None`` as the
        marker for "not yet evaluated".

        :param dicts: dict mapping a parameter name to its list of values;
            it is not modified
        :return: list of dicts, one per combination
        """
        # Work on a shallow copy so the caller's grid is left untouched
        grid = dict(dicts)
        grid['metric'] = [None]
        return [dict(zip(grid, combo)) for combo in product(*grid.values())]

    @staticmethod
    def _get_mean_rmse(observations, predictions):
        """Return the per-column RMSE averaged over all columns.

        :param observations: 2-D array-like (rows x columns) of true values
        :param predictions: 2-D array-like of the same shape
        :return: mean of the column-wise root-mean-square errors, as float
        :raises ValueError: if the inputs differ in shape, are not 2-D, or
            are empty
        """
        observations = np.asarray(observations, dtype=float)
        predictions = np.asarray(predictions, dtype=float)
        if observations.shape != predictions.shape:
            raise ValueError('observations and predictions must have the same shape')
        if observations.ndim != 2 or observations.size == 0:
            raise ValueError('observations must be a non-empty 2-D array')

        # Column-wise RMSE (mean over the rows), then the mean over columns
        rmse_per_col = np.sqrt(((observations - predictions) ** 2).mean(axis=0))
        return float(rmse_per_col.mean())
```