Kai Hall portfolio

Site for professional code and project showcases

View the content on GitHub kaimhall/portfolio

about | code

In this project I research some methods to mitigate negative impacts of missing values. Different imputation methods are experimented on, and effect on mean is explored.

iris_data <- read.table("iris.csv", header = TRUE, sep = ";", dec = ",")
head(iris_data)
##   Petal.Length Petal.Width Species
## 1          1.4         0.2  setosa
## 2          1.7         0.4  setosa
## 3          1.5          NA  setosa
## 4          1.5         0.4  setosa
## 5          1.7         0.2  setosa
## 6          1.6         0.2  setosa
plot(iris_data$Petal.Width,iris_data$Petal.Length)

Fitting linear model.

lm_fit <- lm(Petal.Width ~ Petal.Length, data = iris_data)
summary(lm_fit)
##
## Call:
## lm(formula = Petal.Width ~ Petal.Length, data = iris_data)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -0.3518 -0.1518  0.0254  0.1391  0.3810
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.46809    0.10416  -4.494    2e-04 ***
## Petal.Length  0.45906    0.02738  16.769 1.23e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2149 on 21 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.9305, Adjusted R-squared:  0.9272
## F-statistic: 281.2 on 1 and 21 DF,  p-value: 1.231e-13
lm_fit$coefficients
##  (Intercept) Petal.Length
##   -0.4680856    0.4590629

Here mean can’t be calculated with NA values.

mean(iris_data$Petal.Length)
## [1] NA
mean(iris_data$Petal.Width)
## [1] NA
summary(iris_data)
##   Petal.Length  Petal.Width      Species
##  Min.   :1.2   Min.   :0.200   Length:30
##  1st Qu.:1.6   1st Qu.:0.375   Class :character
##  Median :4.2   Median :1.350   Mode  :character
##  Mean   :3.4   Mean   :1.289
##  3rd Qu.:4.7   3rd Qu.:2.025
##  Max.   :5.7   Max.   :2.500
##  NA's   :5     NA's   :2

We have set omission as method of dealing with missing values. I iris data we have 23 complete observations -> no missing values

getOption('na.action')
## [1] "na.omit"
sum(complete.cases(iris_data))
## [1] 23

with na removal.

(mean_pl <- mean(iris_data$Petal.Length, na.rm = TRUE))
## [1] 3.4
(mean_pw <- mean(iris_data$Petal.Width, na.rm = TRUE))
## [1] 1.289286

No na removal, only complete rows, complete pairwise

cov(iris_data$Petal.Length, iris_data$Petal.Width) #returns NA
## [1] NA
cov(iris_data$Petal.Length, iris_data$Petal.Width, use = "complete.obs") # case wise deletion
## [1] 1.286047
cov(iris_data$Petal.Length, iris_data$Petal.Width, use = "pairwise.complete.obs") # calculated using all pair
## [1] 1.286047

mean imputation for missing values.

mean_imputation <- iris_data

idx_pw_na <- is.na(mean_imputation$Petal.Width) #find na
mean_imputation$Petal.Width[idx_pw_na] <- mean(mean_pw)
mean_pw_meanimpute <- mean(mean_imputation$Petal.Width)

idx_pl_na <- is.na(mean_imputation$Petal.Length) #find na
mean_imputation$Petal.Length[idx_pl_na] <- mean(mean_pl)
mean_pl_meanimpute <- mean(mean_imputation$Petal.Length)
mean_pl_meanimpute
## [1] 3.4

stratified mean imputation for missing values.

strat_mean_imputation <- iris_data

#petal width
mean_pw_setosa <- mean(subset(iris_data, Species == "setosa")$Petal.Width, na.rm = TRUE)
strat_mean_imputation$Petal.Width[is.na(strat_mean_imputation$Petal.Width) & strat_mean_imputation$Species == "setosa"] <- mean_pw_setosa

mean_pw_virginica <- mean(subset(iris_data, Species == "virginica")$Petal.Width, na.rm = TRUE)
strat_mean_imputation$Petal.Width[is.na(strat_mean_imputation$Petal.Width) & strat_mean_imputation$Species == "virginica"] <- mean_pw_virginica

mean_pw_versicolor <- mean(subset(iris_data, Species == "versicolor")$Petal.Width, na.rm = TRUE)
strat_mean_imputation$Petal.Width[is.na(strat_mean_imputation$Petal.Width) & strat_mean_imputation$Species == "versicolor"] <- mean_pw_versicolor

summary(strat_mean_imputation)
##   Petal.Length  Petal.Width      Species
##  Min.   :1.2   Min.   :0.200   Length:30
##  1st Qu.:1.6   1st Qu.:0.325   Class :character
##  Median :4.2   Median :1.306   Mode  :character
##  Mean   :3.4   Mean   :1.256
##  3rd Qu.:4.7   3rd Qu.:1.975
##  Max.   :5.7   Max.   :2.500
##  NA's   :5
mean_pw_strat <- mean(strat_mean_imputation$Petal.Width)

#petal length
mean_pl_virginica <- mean(subset(iris_data, Species == "virginica")$Petal.Length, na.rm = TRUE)
strat_mean_imputation$Petal.Length[is.na(strat_mean_imputation$Petal.Length) & strat_mean_imputation$Species == "virginica"] <- mean_pl_virginica

summary(strat_mean_imputation)
##   Petal.Length    Petal.Width      Species
##  Min.   :1.200   Min.   :0.200   Length:30
##  1st Qu.:1.625   1st Qu.:0.325   Class :character
##  Median :4.400   Median :1.306   Mode  :character
##  Mean   :3.730   Mean   :1.256
##  3rd Qu.:5.275   3rd Qu.:1.975
##  Max.   :5.700   Max.   :2.500
mean_pl_strat <-  mean(strat_mean_imputation$Petal.Length)
mean_pl_strat
## [1] 3.73

regression imputation for na values.

regr_imputation <- iris_data
summary(regr_imputation)
##   Petal.Length  Petal.Width      Species
##  Min.   :1.2   Min.   :0.200   Length:30
##  1st Qu.:1.6   1st Qu.:0.375   Class :character
##  Median :4.2   Median :1.350   Mode  :character
##  Mean   :3.4   Mean   :1.289
##  3rd Qu.:4.7   3rd Qu.:2.025
##  Max.   :5.7   Max.   :2.500
##  NA's   :5     NA's   :2
#petal width
subset(regr_imputation,is.na(Petal.Width))
##    Petal.Length Petal.Width    Species
## 3           1.5          NA     setosa
## 18          4.5          NA versicolor
# fit the linear model and impute predictions

lm_fit_width <- lm(Petal.Width ~ Petal.Length, data = iris_data)
regr_imputation$Petal.Width[is.na(regr_imputation$Petal.Width)] <- predict(lm_fit_width, subset(regr_imputation, is.na(Petal.Width)))

mean_pw_regr <- mean(regr_imputation$Petal.Width)
mean_pw_regr
## [1] 1.26394
#petal length
subset(regr_imputation,is.na(Petal.Length))
##    Petal.Length Petal.Width   Species
## 21           NA         2.5 virginica
## 22           NA         2.1 virginica
## 26           NA         1.8 virginica
## 27           NA         1.9 virginica
## 28           NA         2.3 virginica
lm_fit_length <- lm(Petal.Length ~ Petal.Width, data = iris_data)
regr_imputation$Petal.Length[is.na(regr_imputation$Petal.Length)] <- predict(lm_fit_length, subset(regr_imputation, is.na(Petal.Length)))
summary(regr_imputation)
##   Petal.Length    Petal.Width      Species
##  Min.   :1.200   Min.   :0.200   Length:30
##  1st Qu.:1.625   1st Qu.:0.325   Class :character
##  Median :4.400   Median :1.350   Mode  :character
##  Mean   :3.747   Mean   :1.264
##  3rd Qu.:5.085   3rd Qu.:1.975
##  Max.   :6.255   Max.   :2.500
mean_pl_regr <- mean(regr_imputation$Petal.Length)
mean_pl_regr
## [1] 3.747445

maximum likelihood for na values.

if (!require(mclust)) {
  install.packages("mclust")
  require(mclust)
}
## Loading required package: mclust

## Package 'mclust' version 5.4.10
## Type 'citation("mclust")' for citing this R package in publications.
iris_ml <- iris_data
trainingdata <- iris_data[complete.cases(iris_data),]

dens <- densityMclust(trainingdata[,c(1,2)])

summary(dens, parameters = TRUE)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling
## -------------------------------------------------------
##
## Mclust EEE (ellipsoidal, equal volume, shape and orientation) model with 5
## components:
##
##  log-likelihood  n df       BIC       ICL
##        3.332066 23 17 -46.63927 -46.65523
##
## Mixing probabilities:
##          1          2          3          4          5
## 0.39130435 0.26068872 0.08713737 0.08692027 0.17394930
##
## Means:
##                   [,1]     [,2]     [,3]     [,4]     [,5]
## Petal.Length 1.4888889 4.433393 3.651447 4.949938 5.449927
## Petal.Width  0.2666667 1.316722 1.050387 1.899959 2.324932
##
## Variances:
## [,,1]
##              Petal.Length Petal.Width
## Petal.Length  0.028012791 0.005388134
## Petal.Width   0.005388134 0.006131798
## [,,2]
##              Petal.Length Petal.Width
## Petal.Length  0.028012791 0.005388134
## Petal.Width   0.005388134 0.006131798
## [,,3]
##              Petal.Length Petal.Width
## Petal.Length  0.028012791 0.005388134
## Petal.Width   0.005388134 0.006131798
## [,,4]
##              Petal.Length Petal.Width
## Petal.Length  0.028012791 0.005388134
## Petal.Width   0.005388134 0.006131798
## [,,5]
##              Petal.Length Petal.Width
## Petal.Length  0.028012791 0.005388134
## Petal.Width   0.005388134 0.006131798
#plot(dens, what = "density", data = trainingdata[,c(1,2)])
#plot(dens, what = "density", type = "persp")

summary(iris_ml)
##   Petal.Length  Petal.Width      Species
##  Min.   :1.2   Min.   :0.200   Length:30
##  1st Qu.:1.6   1st Qu.:0.375   Class :character
##  Median :4.2   Median :1.350   Mode  :character
##  Mean   :3.4   Mean   :1.289
##  3rd Qu.:4.7   3rd Qu.:2.025
##  Max.   :5.7   Max.   :2.500
##  NA's   :5     NA's   :2
#width estimates
widths <- seq(0,4,0.1)

lengths <- rep(iris_data$Petal.Length[18], length(widths))
ml_est <- seq(0,4,0.1)[which.max(predict(dens, data.frame(Petal.Length = lengths, Petal.Width = widths)))]
iris_ml$Petal.Width[18] <- ml_est

lengths <- rep(iris_data$Petal.Length[3], length(widths))
ml_est <- seq(0,4,0.1)[which.max(predict(dens, data.frame(Petal.Length = lengths, Petal.Width = widths)))]
iris_ml$Petal.Width[3] <- ml_est

iris_ml
##    Petal.Length Petal.Width    Species
## 1           1.4         0.2     setosa
## 2           1.7         0.4     setosa
## 3           1.5         0.3     setosa
## 4           1.5         0.4     setosa
## 5           1.7         0.2     setosa
## 6           1.6         0.2     setosa
## 7           1.6         0.2     setosa
## 8           1.2         0.2     setosa
## 9           1.3         0.3     setosa
## 10          1.4         0.3     setosa
## 11          4.7         1.4 versicolor
## 12          4.5         1.3 versicolor
## 13          3.5         1.0 versicolor
## 14          4.4         1.4 versicolor
## 15          4.8         1.8 versicolor
## 16          4.4         1.4 versicolor
## 17          3.8         1.1 versicolor
## 18          4.5         1.3 versicolor
## 19          4.4         1.2 versicolor
## 20          4.2         1.2 versicolor
## 21           NA         2.5  virginica
## 22           NA         2.1  virginica
## 23          5.1         2.0  virginica
## 24          5.3         2.3  virginica
## 25          5.7         2.3  virginica
## 26           NA         1.8  virginica
## 27           NA         1.9  virginica
## 28           NA         2.3  virginica
## 29          5.6         2.4  virginica
## 30          5.2         2.3  virginica
mean_pw_ml <- mean(iris_ml$Petal.Width)
mean_pw_ml
## [1] 1.256667
#Length estimates
lengths <- seq(1,7,0.1)

widths <- rep(iris_data$Petal.Width[21], length(lengths))
ml_est <- seq(1,7,0.1)[which.max(predict(dens, data.frame(Petal.Length = lengths, Petal.Width = widths)))]
iris_ml$Petal.Length[21] <- ml_est

widths <- rep(iris_data$Petal.Width[22], length(lengths))
ml_est <- seq(1,7,0.1)[which.max(predict(dens, data.frame(Petal.Length = lengths, Petal.Width = widths)))]
iris_ml$Petal.Length[22] <- ml_est

widths <- rep(iris_data$Petal.Width[26], length(lengths))
ml_est <- seq(1,7,0.1)[which.max(predict(dens, data.frame(Petal.Length = lengths, Petal.Width = widths)))]
iris_ml$Petal.Length[26] <- ml_est

widths <- rep(iris_data$Petal.Width[27], length(lengths))
ml_est <- seq(1,7,0.1)[which.max(predict(dens, data.frame(Petal.Length = lengths, Petal.Width = widths)))]
iris_ml$Petal.Length[27] <- ml_est

widths <- rep(iris_data$Petal.Width[28], length(lengths))
ml_est <- seq(1,7,0.1)[which.max(predict(dens, data.frame(Petal.Length = lengths, Petal.Width = widths)))]
iris_ml$Petal.Length[28] <- ml_est

iris_ml
##    Petal.Length Petal.Width    Species
## 1           1.4         0.2     setosa
## 2           1.7         0.4     setosa
## 3           1.5         0.3     setosa
## 4           1.5         0.4     setosa
## 5           1.7         0.2     setosa
## 6           1.6         0.2     setosa
## 7           1.6         0.2     setosa
## 8           1.2         0.2     setosa
## 9           1.3         0.3     setosa
## 10          1.4         0.3     setosa
## 11          4.7         1.4 versicolor
## 12          4.5         1.3 versicolor
## 13          3.5         1.0 versicolor
## 14          4.4         1.4 versicolor
## 15          4.8         1.8 versicolor
## 16          4.4         1.4 versicolor
## 17          3.8         1.1 versicolor
## 18          4.5         1.3 versicolor
## 19          4.4         1.2 versicolor
## 20          4.2         1.2 versicolor
## 21          5.6         2.5  virginica
## 22          5.2         2.1  virginica
## 23          5.1         2.0  virginica
## 24          5.3         2.3  virginica
## 25          5.7         2.3  virginica
## 26          4.9         1.8  virginica
## 27          4.9         1.9  virginica
## 28          5.4         2.3  virginica
## 29          5.6         2.4  virginica
## 30          5.2         2.3  virginica
mean_pl_ml <- mean(iris_ml$Petal.Length)
mean_pl_ml
## [1] 3.7

back