# Required libraries
library(mgcv)
library(ggplot2)
library(gamreg)
library(qgam)
I started by importing the data with read.csv(), which creates a data frame that I called df. Next I called summary(df), where we can see that x1 through x25 are quantitative variables (there are no categorical predictors) and that the only variable with missing values is the response y.
# Import data
df <- read.csv(file = "C:/Users/Juan Manuel/Documents/STAT 488 Predictive Analytics/Homework/HW 4/PredictiveHW4Dataset.csv")
# Summary
summary(df)
## x1 x2 x3
## Min. :0.0003418 Min. :0.01826 Min. :-9.9662
## 1st Qu.:0.2581589 1st Qu.:2.43705 1st Qu.:-4.6412
## Median :0.5101924 Median :4.83464 Median : 0.4002
## Mean :0.5072735 Mean :4.86289 Mean : 0.2351
## 3rd Qu.:0.7584385 3rd Qu.:7.21874 3rd Qu.: 5.0498
## Max. :0.9993030 Max. :9.96898 Max. : 9.9752
##
## x4 x5 x6 x7
## Min. :0.0008994 Min. :0.004926 Min. :4.015 Min. :0.001171
## 1st Qu.:0.2501172 1st Qu.:1.267886 1st Qu.:5.483 1st Qu.:0.258111
## Median :0.4776560 Median :2.559438 Median :6.991 Median :0.518469
## Mean :0.4959910 Mean :2.529178 Mean :6.990 Mean :0.508545
## 3rd Qu.:0.7404096 3rd Qu.:3.780419 3rd Qu.:8.473 3rd Qu.:0.754655
## Max. :0.9992324 Max. :4.993732 Max. :9.998 Max. :0.996565
##
## x8 x9 x10
## Min. : 0.06404 Min. :0.004432 Min. :-9.9404
## 1st Qu.:25.17412 1st Qu.:1.005989 1st Qu.:-6.8455
## Median :49.50348 Median :1.989346 Median :-3.5599
## Mean :49.33710 Mean :2.004622 Mean :-3.5713
## 3rd Qu.:73.54565 3rd Qu.:2.998530 3rd Qu.:-0.2997
## Max. :99.90955 Max. :3.984845 Max. : 2.9547
##
## x11 x12 x13
## Min. :-2.83653 Min. :-6.40146 Min. :-9.5153
## 1st Qu.:-0.62417 1st Qu.:-1.36190 1st Qu.:-1.9395
## Median : 0.02656 Median :-0.02123 Median : 0.1120
## Mean : 0.04550 Mean :-0.01235 Mean : 0.1383
## 3rd Qu.: 0.70953 3rd Qu.: 1.40472 3rd Qu.: 2.0369
## Max. : 3.56078 Max. : 6.58999 Max. :10.8543
##
## x14 x15 x16
## Min. :-12.74265 Min. :-33.3511 Min. :-11.9136
## 1st Qu.: -2.55358 1st Qu.: -5.8602 1st Qu.: -2.2774
## Median : 0.02752 Median : -0.1309 Median : 0.6647
## Mean : 0.11723 Mean : -0.2179 Mean : 0.5611
## 3rd Qu.: 2.89183 3rd Qu.: 5.5622 3rd Qu.: 3.1230
## Max. : 13.42808 Max. : 27.3892 Max. : 11.8955
##
## x17 x18 x19 x20
## Min. :-7.397 Min. :-15.8592 Min. :-3.0608 Min. :-55.724
## 1st Qu.: 2.087 1st Qu.: -4.8410 1st Qu.:-0.2339 1st Qu.:-10.815
## Median : 4.999 Median : 0.3511 Median : 0.4916 Median : 2.086
## Mean : 4.997 Mean : 0.2302 Mean : 0.4720 Mean : 2.316
## 3rd Qu.: 7.845 3rd Qu.: 5.1561 3rd Qu.: 1.2190 3rd Qu.: 15.634
## Max. :16.174 Max. : 13.8147 Max. : 4.1065 Max. : 68.661
##
## x21 x22 x23 x24
## Min. :-4.6082 Min. :-4.8332 Min. :-58.600 Min. :-54.471
## 1st Qu.:-0.5127 1st Qu.: 0.4815 1st Qu.:-11.959 1st Qu.:-12.353
## Median : 0.5114 Median : 2.4922 Median : 1.707 Median : 2.054
## Mean : 0.4974 Mean : 2.4643 Mean : 2.130 Mean : 2.503
## 3rd Qu.: 1.5091 3rd Qu.: 4.3080 3rd Qu.: 16.296 3rd Qu.: 16.900
## Max. : 4.6577 Max. :10.4535 Max. : 72.030 Max. : 69.993
##
## x25 y id
## Min. :-28.786 Min. :-121.86 Min. : 1.0
## 1st Qu.: -5.912 1st Qu.: 15.45 1st Qu.: 250.8
## Median : 1.925 Median : 55.71 Median : 500.5
## Mean : 1.381 Mean : 62.21 Mean : 500.5
## 3rd Qu.: 8.570 3rd Qu.: 106.38 3rd Qu.: 750.2
## Max. : 35.121 Max. : 316.23 Max. :1000.0
## NA's :100
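As a quick confirmation that y is the only column with missing values, the NA counts can also be tabulated directly (a minimal check using base R):
# Count the missing values in each column; only y should be non-zero
colSums(is.na(df))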
I continued the analysis by plotting the response y against each predictor to look for clear relationships. To do this I used ggplot() with a geom_smooth() layer.
The predictors that show a clear relationship with the response are x5, x10, x16, x22, and x25. None of the other plots shows a clear pattern between the predictor and the response.
# Scatter plot of y against each predictor, with a loess smooth
for (v in paste0("x", 1:25)) {
  print(
    ggplot(data = df, aes(x = .data[[v]], y = y)) +
      geom_point() +
      geom_smooth(method = "loess") +
      xlab(v)
  )
}
In the next sections I will fit several models on a training set, make predictions on a test set, compute the prediction MSE, and compare the models. To prepare for that, in this section I created the training and test sets.
I used 70% of the labeled observations for the training set and the remaining 30% for the test set.
# Keep the 900 labeled rows (y is missing only in rows 901-1000)
df2 <- df[1:900, ]
size <- round(900 * 0.7)
# Seed
set.seed(1234)
# Train and test
idx <- sample(x = nrow(df2), size = size, replace = FALSE)
train <- df2[idx, ]
test <- df2[-idx, ]
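A quick sanity check (under the split above) that the two sets have the expected sizes and are disjoint:
# The training and test sets should partition the 900 labeled rows
nrow(train)   # expected: 630
nrow(test)    # expected: 270
length(intersect(rownames(train), rownames(test)))   # expected: 0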
I continued the model fitting process with variable selection, using the function cv.gam() from the package gamreg. This function performs robust cross-validation and returns three objects: lambda, fit, and Rocv. According to the help file, Rocv contains the best model chosen by robust cross-validation.
# Cross validation
cv_g1 <- cv.gam(as.matrix(df2[ , 1:25]), as.matrix(df2[ , 26]))
##
## init.mode is sLTS
## The number of lambda is 50
## gamma is 0.1 gamma0 is 0.5
cv_g1$Rocv # This object has the cross-validation results
## $beta0
## [1] 7.311395
##
## $beta
## [,1]
## [1,] -12.12418607
## [2,] 0.00000000
## [3,] 0.10561612
## [4,] -2.56142476
## [5,] 24.56563624
## [6,] 0.00000000
## [7,] 0.00000000
## [8,] 0.00000000
## [9,] 0.00000000
## [10,] 0.00000000
## [11,] 0.00000000
## [12,] -0.24373767
## [13,] -0.42506751
## [14,] 0.00000000
## [15,] 0.00000000
## [16,] 0.07471367
## [17,] 0.00000000
## [18,] 0.00000000
## [19,] 0.00000000
## [20,] 0.00000000
## [21,] 0.00000000
## [22,] 0.00000000
## [23,] 0.00000000
## [24,] 0.03615384
## [25,] 0.00000000
##
## $sigma
## [1] 59.3488
This function sets to zero the coefficients of the variables x2, x6, x7, x8, x9, x10, x11, x14, x15, x17, x18, x19, x20, x21, x22, x23, and x25, keeping x1, x3, x4, x5, x12, x13, x16, and x24. In most cases this agrees with the plots made before.
However, even though x22 and x25 are dropped in this cross-validation step, they show a clear pattern when plotted against the response. For this reason, I will try including x22 and x25 in the model.
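Rather than reading the nonzero entries off the printout, the selected predictors can be extracted programmatically (a small sketch, assuming cv_g1$Rocv$beta is the coefficient matrix printed above):
# Predictors kept by robust cross-validation (nonzero coefficients)
selected <- which(cv_g1$Rocv$beta != 0)
paste0("x", selected)   # x1, x3, x4, x5, x12, x13, x16, x24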
In the model fitting step I used the function gam() from the mgcv package. This function can estimate the smoothing parameters by several methods; a number of online resources, as well as the package author’s pages, advise using REML, so I set method = "REML".
For fitting the model I also used select = TRUE. From the function help file: “if this is TRUE then gam can add an extra penalty to each term so that it can be penalized to zero. This means that the smoothing parameter estimation that is part of fitting can completely remove terms from the model.”
# Model g1 fit with training data
set.seed(1234)
g1 <- gam(y ~ s(x1) + s(x3) + s(x4) + s(x5) +
s(x12) + s(x13) + s(x16) + s(x23),
data = train, method = "REML", select = TRUE)
# Predictions model g1
g1_pred <- predict.gam(g1, newdata = test)
# MSEP model g1
MSEP1 <- mean((test[ , "y"] - g1_pred)^2)
MSEP1
## [1] 3535.515
# Model g2 fit with training data
set.seed(1234)
g2 <- gam(y ~ s(x1) + s(x3) + s(x4) + s(x5) +
s(x12) + s(x13) + s(x16) + s(x22) + s(x23) + s(x25),
data = train, method = "REML", select = TRUE)
# Predictions model g2
g2_pred <- predict.gam(g2, newdata = test)
# MSEP model g2
MSEP2 <- mean((test[ , "y"] - g2_pred)^2)
MSEP2
## [1] 3552.34
# Model g3 fit with training data
set.seed(1234)
g3 <- gam(y ~ s(x1) + s(x3) + s(x4) + s(x5) + s(x10) +
s(x12) + s(x13) + s(x16) + s(x22) + s(x23) + s(x25),
data = train, method = "REML", select = TRUE)
# Predictions model g3
g3_pred <- predict.gam(g3, newdata = test)
# MSEP model g3
MSEP3 <- mean((test[ , "y"] - g3_pred)^2)
MSEP3
## [1] 3379.061
To compare the models I used the prediction MSE, AIC and BIC.
# Data frame
gnames <- c("GAM 1", "GAM 2", "GAM 3")
gMSEP <- c(MSEP1, MSEP2, MSEP3)
# Bar plot (bars drawn first so the labels sit on top)
ggplot(data = data.frame(gnames, gMSEP), aes(gnames, gMSEP)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(gMSEP)), vjust = -0.25) +
  xlab("Model") +
  ylab("MSEP")
# AIC
AIC(g1, g2, g3)
## df AIC
## g1 13.02237 6880.359
## g2 16.43998 6847.962
## g3 20.72815 6820.788
# BIC
BIC(g1, g2, g3)
## df BIC
## g1 13.02237 6938.253
## g2 16.43998 6921.049
## g3 20.72815 6912.940
The comparison of MSEP, AIC, and BIC led me to choose model g3.
Next, I checked the fitted models using the function gam.check() from package mgcv, starting with model g1. With gam.check() I am checking whether the basis dimension chosen for each smooth is adequate. From help(gam.check): “Low p-values may indicate that the basis dimension, k, has been set too low”.
# Check model 1
gam.check(g1)
##
## Method: REML Optimizer: outer newton
## full convergence after 12 iterations.
## Gradient range [-0.0008104405,0.0004338619]
## (score 3438.999 & scale 3154.633).
## Hessian positive definite, eigenvalue range [7.510189e-05,314.5106].
## Model rank = 73 / 73
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(x1) 9.00000 0.54643 1.01 0.58
## s(x3) 9.00000 1.00043 0.93 0.03 *
## s(x4) 9.00000 0.38832 0.96 0.22
## s(x5) 9.00000 3.58898 1.00 0.49
## s(x12) 9.00000 0.46335 0.98 0.28
## s(x13) 9.00000 0.73958 1.02 0.68
## s(x16) 9.00000 1.42677 0.99 0.40
## s(x23) 9.00000 0.00269 1.07 0.97
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The first part of the output shows four diagnostic plots (a QQ plot of the residuals, residuals against the linear predictor, a histogram of the residuals, and the response against the fitted values), all of which look good; none shows a worrying pattern.
From the gam.check() output for model g1, we see that the p-value for s(x3) falls just below the \(\alpha = 0.05\) level. However, its edf (1.00) is far from k' (9), so, following the note in the output, a too-small basis dimension is unlikely to be the issue. The remaining p-values are greater than 0.05, which suggests that the basis dimensions for those smooths have not been set too low.
I conclude that the basis dimensions for the smooths have been set appropriately; the same check can be applied to the selected model g3, as sketched below.
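For completeness, the same diagnostic can be run on the selected model g3 (output omitted here; the call mirrors the one above):
# Check the basis dimensions of the selected model g3
gam.check(g3)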
Dr. Matthews: I came across the concept of concurvity and tried to use it to improve the model. However, the changes I made after finding concurvity did not improve the model.
It seems that concurvity can be a problem when the function is estimating the smoothing parameters. The only additional information I found about concurvity is in the book “Generalized Additive Models: An Introduction with R” by Simon Wood. On page 176 it says:
This sort of problem is particularly common in the presence of concurvity, for example when a model includes terms such as \(f_{1}(x_{i}) + f_{2}(z_{i})\), but \(z\) is itself a smooth function of \(x\): in this circumstance the smooth functions, \(f_{1}\) and \(f_{2}\), are confounded, and there may be very little information in \(\mathcal{V}_{u/g}\) from which to estimate their smoothing parameters separately: neglected dependencies in the performance iteration approach then become critical. Concurvity problems like this are very common in models which include a smooth of spatial location, and a number of covariates which themselves vary fairly smoothly over space.
To check for concurvity, I used the concurvity() function from the mgcv package. First, I set the argument full = TRUE, which measures the concurvity of each term with the whole of the rest of the model.
# Concurvity model g3: full = TRUE
concurvity(g3, full = TRUE)
## para s(x1) s(x3) s(x4) s(x5) s(x10)
## worst 3.793005e-23 0.2021970 0.2198401 0.2037815 0.7107613 0.2237647
## observed 3.793005e-23 0.1563738 0.1505923 0.1541325 0.6101097 0.1444437
## estimate 3.793005e-23 0.1574016 0.1249076 0.1528843 0.6315202 0.1653357
## s(x12) s(x13) s(x16) s(x22) s(x23) s(x25)
## worst 0.8377427 0.25775803 0.2153252 0.8830930 0.2186809 0.2017489
## observed 0.8324538 0.08892743 0.1146805 0.8789397 0.1445523 0.1405627
## estimate 0.6931655 0.10725153 0.1330316 0.7326667 0.1434403 0.1380775
I focused on the “worst” case and considered only the terms whose concurvity exceeds 0.8, namely the smooths s(x12) and s(x22). Next, I set the argument full = FALSE, which reports pairwise concurvity measures between each smooth term (as well as the parametric component).
# Concurvity model g3: full = FALSE
g3_concurvity <- concurvity(g3, full = FALSE)
g3_concurvity[[1]][ , c(7, 10)]
## s(x12) s(x22)
## para 1.026831e-26 2.941095e-23
## s(x1) 5.018234e-02 4.896626e-02
## s(x3) 4.548191e-02 2.929528e-02
## s(x4) 4.568168e-02 3.676689e-02
## s(x5) 4.216976e-02 3.098060e-01
## s(x10) 6.260630e-02 4.232561e-02
## s(x12) 1.000000e+00 6.204385e-01
## s(x13) 5.178870e-02 4.407153e-02
## s(x16) 5.115047e-02 2.646436e-02
## s(x22) 6.204385e-01 1.000000e+00
## s(x23) 4.681701e-02 4.018267e-02
## s(x25) 3.025618e-02 3.389602e-02
concurvity() returns a list whose first component holds the values for the “worst” case. From it, I select columns 7 and 10, which correspond to s(x12) and s(x22), respectively. Here we observe that the concurvity between these two smooths is 0.62, while their concurvity with the other smooths is much smaller.
I fit a new model, g4, which adds a bivariate smooth of x12 and x22, namely s(x12, x22), on top of the univariate terms. But as mentioned above, this does not meaningfully improve on model g3.
# Model g4 fit with training data
set.seed(1234)
g4 <- gam(y ~ s(x1) + s(x3) + s(x4) + s(x5) + s(x10) +
s(x12) + s(x13) + s(x16) + s(x22) + s(x23) + s(x25) + s(x12, x22),
data = train, method = "REML", select = TRUE)
# Predictions model g4
g4_pred <- predict.gam(g4, newdata = test)
# MSEP model g4
MSEP4 <- mean((test[ , "y"] - g4_pred)^2)
MSEP4
## [1] 3379.042
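A possible refinement, not pursued here, is to replace the isotropic bivariate smooth s(x12, x22) with a tensor-product smooth, which mgcv recommends when the covariates call for different amounts of smoothing. A hypothetical sketch (results not shown):
# Tensor-product alternative: te() allows a different amount of
# smoothing along each covariate and absorbs the main effects
g4_te <- gam(y ~ s(x1) + s(x3) + s(x4) + s(x5) + s(x10) +
  s(x13) + s(x16) + s(x23) + s(x25) + te(x12, x22),
  data = train, method = "REML", select = TRUE)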
# Data frame
gnames <- c(gnames, "GAM 4")
gMSEP <- c(gMSEP, MSEP4)
# Bar plot (bars drawn first so the labels sit on top)
ggplot(data = data.frame(gnames, gMSEP), aes(gnames, gMSEP)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(gMSEP)), vjust = -0.25) +
  xlab("Model") +
  ylab("MSEP")
# AIC
AIC(g1, g2, g3, g4)
## df AIC
## g1 13.02237 6880.359
## g2 16.43998 6847.962
## g3 20.72815 6820.788
## g4 20.71824 6820.776
# BIC
BIC(g1, g2, g3, g4)
## df BIC
## g1 13.02237 6938.253
## g2 16.43998 6921.049
## g3 20.72815 6912.940
## g4 20.71824 6912.884
Another approach to this problem is to fit a quantile GAM instead. To do so, I used the function qgam() from the package qgam. This function is similar to gam() in mgcv, but we must now specify the quantile of interest through the argument qu. In this case, I used qu = 0.5, i.e., the median.
set.seed(1234)
q1 <- qgam(y ~ s(x1) + s(x3) + s(x4) + s(x5) + s(x10) +
s(x12) + s(x13) + s(x16) + s(x22) + s(x23) + s(x25),
data = train, qu = 0.5, argGam = list(method = "REML", select = TRUE))
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.5.............done
q1_pred <- predict.gam(q1, newdata = test)
MSEP5 <- mean((test[ , "y"] - q1_pred)^2)
MSEP5
## [1] 3359.292
# Data set
gnames <- c(gnames, "Quantile GAM")
gMSEP <- c(gMSEP, MSEP5)
# Bar plot (bars drawn first so the labels sit on top)
ggplot(data = data.frame(gnames, gMSEP), aes(gnames, gMSEP)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(gMSEP)), vjust = -0.25) +
  xlab("Model") +
  ylab("MSEP")
# AIC
AIC(g1, g2, g3, g4, q1)
## df AIC
## g1 13.02237 6880.359
## g2 16.43998 6847.962
## g3 20.72815 6820.788
## g4 20.71824 6820.776
## q1 22.13493 6879.060
# BIC
BIC(g1, g2, g3, g4, q1)
## df BIC
## g1 13.02237 6938.253
## g2 16.43998 6921.049
## g3 20.72815 6912.940
## g4 20.71824 6912.884
## q1 22.13493 6977.465
Even though the MSEP is better for the quantile GAM q1, the AIC and BIC are better for model g3. For this reason I fitted models g3 and q1 several times, using different seeds to split the data set into training and test sets. With this I am trying to identify whether one model is genuinely better than the other, or whether the differences are due only to the particular split used to train and test.
# Different seeds
df2 <- df[1:900, ]
size <- round(900 * 0.7)
seeds <- c(1, 27, 337, 1001, 1234)
MSEPg3 <- rep(NA, 5)
AICg3 <- rep(NA, 5)
BICg3 <- rep(NA, 5)
MSEPq1 <- rep(NA, 5)
AICq1 <- rep(NA, 5)
BICq1 <- rep(NA, 5)
for (i in 1:5){
# Seed
set.seed(seeds[i])
# Train and test
idx <- sample(x = nrow(df2), size = size, replace = FALSE)
train <- df2[idx, ]
test <- df2[-idx, ]
# Model g3 fit with training data
g3 <- gam(y ~ s(x1) + s(x3) + s(x4) + s(x5) + s(x10) +
s(x12) + s(x13) + s(x16) + s(x22) + s(x23) + s(x25),
data = train, method = "REML", select = TRUE)
# Model q1 fit with training data
q1 <- qgam(y ~ s(x1) + s(x3) + s(x4) + s(x5) + s(x10) +
s(x12) + s(x13) + s(x16) + s(x22) + s(x23) + s(x25),
data = train, qu = 0.5, argGam = list(method = "REML", select = TRUE))
# Predictions model g3
g3_pred <- predict.gam(g3, newdata = test)
# Predictions model q1
q1_pred <- predict.gam(q1, newdata = test)
# MSEP model g3
MSEPg3[i] <- mean((test[ , "y"] - g3_pred)^2)
# MSEP model q1
MSEPq1[i] <- mean((test[ , "y"] - q1_pred)^2)
# AIC and BIC
AICg3[i] <- AIC(g3)
BICg3[i] <- BIC(g3)
AICq1[i] <- AIC(q1)
BICq1[i] <- BIC(q1)
}
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.5.............done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.5............done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.5.............done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.5...............done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.5.............done
mean(MSEPg3)
## [1] 3185.122
mean(MSEPq1)
## [1] 3211.298
mean(AICg3)
## [1] 6839.424
mean(AICq1)
## [1] 6890.016
mean(BICg3)
## [1] 6923.98
mean(BICq1)
## [1] 6979.091
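To make the comparison easier to read, the five-seed averages can be gathered into one table (a small sketch using the vectors computed in the loop above):
# Average MSEP, AIC, and BIC over the five seeds
data.frame(model = c("g3", "q1"),
  MSEP = c(mean(MSEPg3), mean(MSEPq1)),
  AIC = c(mean(AICg3), mean(AICq1)),
  BIC = c(mean(BICg3), mean(BICq1)))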
Using five different seeds and averaging the results, model g3 has the lower MSEP, AIC, and BIC.
I select model g3 and use it to make predictions for the 100 observations with missing values of y. Because I reused the names train, test, g3, etc. inside the for loop in section 9), I now refit model g3 for reproducibility.
# Keep the 900 labeled rows (y is missing only in rows 901-1000)
df2 <- df[1:900, ]
size <- round(900 * 0.7)
# Seed
set.seed(1234)
# Train and test
idx <- sample(x = nrow(df2), size = size, replace = FALSE)
train <- df2[idx, ]
test <- df2[-idx, ]
# Fit model g3 again
# Model g3 fit with training data
set.seed(1234)
g3 <- gam(y ~ s(x1) + s(x3) + s(x4) + s(x5) + s(x10) +
s(x12) + s(x13) + s(x16) + s(x22) + s(x23) + s(x25),
data = train, method = "REML", select = TRUE)
# Rows 901-1000 hold the observations with missing y
prediction_data <- df[901:1000, ]
# Predicted y values
y <- predict(g3, newdata = prediction_data)
# Submission file
id <- prediction_data$id
gam_submission <- data.frame(y, id)
write.csv(gam_submission, file = "C:/Users/Juan Manuel/Documents/STAT 488 Predictive Analytics/Homework/HW 4/gam_submission.csv", row.names = FALSE)
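As a final sanity check (a minimal sketch), the submission should contain exactly 100 predictions and no missing values:
# Verify the submission file contents before uploading
stopifnot(nrow(gam_submission) == 100, !anyNA(gam_submission$y))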