# Hyperparameter search for an SVR pipeline (feature selection + SVR),
# then report scores and persist the best model.
# NOTE(review): assumes X_train / y_train are defined earlier in the file.

# Repeated k-fold gives a more stable CV estimate than a single split.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = SVR()
fs = SelectKBest(score_func=f_regression)
pipeline = Pipeline(steps=[('sel', fs), ('svr', model)])
# One combined param_grid. The original code built a separate `grid` dict
# holding 'sel__k' but never passed it to GridSearchCV, so the number of
# selected features was never actually tuned — fold it in here.
param_grid = {
    'sel__k': list(range(1, X_train.shape[1] + 1)),  # features to keep
    'svr__C': [0.01, 0.1, 1, 10, 100, 1000],         # regularization
    'svr__epsilon': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'svr__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10],
}
search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    # Needed so cv_results_ contains mean_train_score for the
    # train-vs-test overfitting comparison below.
    return_train_score=True,
    verbose=1,
    cv=cv,  # was cv=5; the RepeatedKFold above was created but unused
    n_jobs=-1)
# perform the search
results = search.fit(X_train, y_train)
model = results.best_estimator_  # This is your model
# summarize best — scoring is negated MSE, so values are <= 0 and
# closer to 0 is better (the original label said "MAE", which was wrong).
print('Best neg. MSE: %.3f' % results.best_score_)
print('Best Config: %s' % results.best_params_)
# Overfitting check: compare train vs. validation score at the best
# config. A train score much better (closer to 0) than the test score
# indicates the model is overfitting the training data.
best_i = results.best_index_
print('Mean train score at best config: %.3f'
      % results.cv_results_['mean_train_score'][best_i])
print('Mean test score at best config:  %.3f'
      % results.cv_results_['mean_test_score'][best_i])
# summarize all
means = results.cv_results_['mean_test_score']
params = results.cv_results_['params']
for mean, param in zip(means, params):
    print(">%.3f with: %r" % (mean, param))
# save the model to disk
# NOTE: pickle files are only safe to load from trusted sources.
filename = 'svr_v2_model.sav'
with open(filename, 'wb') as f:  # context manager: was an unclosed handle
    pickle.dump(model, f)
You can see towards the end, I have a lot of commented-out code for different scores that I've gotten from looking around at other people's answers while trying to understand how to tell whether my model is overfitting my training data. I am totally confused about which score will tell me if the model is over-fitting.
Can anyone explain to me, which score/lines of code do I check, to tell me whether my model is overfit (note that this is regression and not classification)?