本質的にはclassificationのページと同じですが、こちらはregressionです。
対象データやRコードもほぼ同じです。違いは、目的変数(answer)をfactorへ変換せず、数値のままautomlにかけている点です。回帰では正答率を出せないので、代わりにRMSE(二乗平均平方根誤差)を算出しています。
# ---- Select the input files and the output folder, then load the data ----
# choose.files() / choose.dir() are the interactive dialogs from utils.
# multi = FALSE restricts each dialog to a single file, because read.csv()
# below accepts exactly one path.
trainFilename <- choose.files(caption = "Select Train Data File", multi = FALSE)
testFilename <- choose.files(caption = "Select Test Data File", multi = FALSE)
# A cancelled file dialog yields character(0); stop with a clear message
# instead of failing later inside dirname() / read.csv().
if (length(trainFilename) != 1L || length(testFilename) != 1L) {
  stop("A train data file and a test data file must both be selected.",
       call. = FALSE)
}
outputFoldername <- choose.dir(default = dirname(trainFilename),
                               caption = "Select Output Folder")
# A cancelled folder dialog yields NA, which would otherwise end up in the
# output paths built from outputFoldername.
if (is.na(outputFoldername)) {
  stop("An output folder must be selected.", call. = FALSE)
}
trainData <- read.csv(trainFilename)
testData <- read.csv(testFilename)
# ---- Start H2O and train AutoML as a regression problem ----
library(h2o)
h2o.init()
# Timer start; the elapsed time is reported after the models are saved.
startTime <- proc.time()
# Name of the target column; every other column is used as a predictor.
colAnswer <- "answer"
# Regression needs a numeric target. Fail early if the column is missing
# (NULL is not numeric) or holds non-numeric values.
if (!is.numeric(trainData[[colAnswer]])) {
  stop(sprintf("Column '%s' must exist in the train data and be numeric.",
               colAnswer),
       call. = FALSE)
}
colData <- setdiff(names(trainData), colAnswer)
hTrainData <- as.h2o(trainData)
# The target is deliberately NOT converted with as.factor(): a factor target
# makes h2o.automl() train classifiers, while this script evaluates the
# models with RMSE and therefore needs numeric (regression) predictions.
aml <- h2o.automl(
  x = colData,
  y = colAnswer,
  training_frame = hTrainData,
  max_models = 100
)
# ---- View and export the AutoML leaderboard ----
lb <- aml@leaderboard
# n = nrow(lb) prints every row instead of only the head of the frame.
print(lb, n = nrow(lb))
# Print the leading model explicitly: a bare top-level expression is not
# auto-printed when the script is run through source().
print(aml@leader)
# Write the full leaderboard as a text table into the chosen output folder.
write.table(
  as.data.frame(lb),
  file = sprintf("%s/leaderboard.txt", outputFoldername)
)
# ---- Save every model listed on the leaderboard ----
mod_ids <- aml@leaderboard$model_id
# h2o.getModel() takes a plain string id, so pull the ids out of the
# H2OFrame once as a character vector instead of indexing the frame on
# every iteration.
modelIdList <- as.character(as.data.frame(mod_ids)[[1]])
# Iterating over the vector itself is safe for an empty leaderboard,
# unlike 1:nrow(...), which would yield c(1, 0).
for (modelId in modelIdList) {
  aml1 <- h2o.getModel(modelId) # get model object in environment
  h2o.saveModel(object = aml1, path = outputFoldername)
}
# Report the elapsed time since startTime (training plus saving). Printed
# explicitly so it also shows up when the script is run through source().
finishTime <- proc.time()
print(finishTime - startTime)
# ---- Evaluate the top 5 leaderboard models on the test data ----
# The true values are read from the column named by colAnswer rather than
# from a hard-coded column position.
if (!is.numeric(testData[[colAnswer]])) {
  stop(sprintf("Column '%s' must exist in the test data and be numeric.",
               colAnswer),
       call. = FALSE)
}
hTestData <- as.h2o(testData)
# head() caps the list at 5 but also copes with a leaderboard that holds
# fewer than 5 models.
topModelIds <- head(
  as.character(as.data.frame(aml@leaderboard$model_id)[[1]]),
  5
)
for (modelId in topModelIds) {
  print(modelId)
  nowModel <- h2o.getModel(modelId)
  hResultData <- h2o.predict(nowModel, hTestData)
  resultData <- as.data.frame(hResultData)
  # For regression models the predictions are in the "predict" column.
  predicted <- resultData[["predict"]]
  if (!is.numeric(predicted)) {
    stop("Predictions are not numeric; RMSE needs a regression model.",
         call. = FALSE)
  }
  # RMSE = sqrt(mean((prediction - truth)^2))
  residual <- predicted - testData[[colAnswer]]
  rmse <- sqrt(mean(residual^2))
  print(rmse)
}
##
## predict
##
# Load a previously saved model chosen interactively and score it on the
# test frame (hTestData) created earlier in the script.
model1 <- h2o.loadModel(file.choose())
hResultData <- h2o.predict(model1, hTestData)
resultData <- as.data.frame(hResultData)
# For regression models the predictions are in the "predict" column.
predicted <- resultData[["predict"]]
# The true values come from the column named by colAnswer rather than from
# a hard-coded column position.
actual <- testData[[colAnswer]]
if (!is.numeric(predicted) || !is.numeric(actual)) {
  stop("RMSE needs numeric predictions and a numeric answer column.",
       call. = FALSE)
}
# RMSE = sqrt(mean((prediction - truth)^2))
residual <- predicted - actual
rmse <- sqrt(mean(residual^2))
print(rmse)