Kai Hall portfolio

Site for professional code and project showcases

View the content on GitHub kaimhall/portfolio

about | code

This project is about data preprocessing. Here are examples of different normalizatons and standardizations.

Neural Network is trained with and without principal component analysis. PCA reduces dimensionality in data by linear combinations.

load('data_PCA_E7.RData')
summary(X)
##       Var1            Var2            Var3            Var4
##  Min.   :505.0   Min.   :18900   Min.   :0.073   Min.   : 12000
##  1st Qu.:610.0   1st Qu.:26900   1st Qu.:0.145   1st Qu.:168000
##  Median :645.0   Median :37200   Median :0.178   Median :219000
##  Mean   :635.3   Mean   :36101   Mean   :0.418   Mean   :203439
##  3rd Qu.:665.0   3rd Qu.:41900   3rd Qu.:0.246   3rd Qu.:229000
##  Max.   :725.0   Max.   :51200   Max.   :1.950   Max.   :756000
x.pca<-prcomp(X)
print(x.pca)
## Standard deviations (1, .., p=4):
## [1] 1.196981e+05 6.207397e+03 4.548899e+01 4.669683e-01
##
## Rotation (n x k) = (4 x 4):
##                PC1           PC2           PC3           PC4
## Var1 -2.306577e-05  1.185786e-03 -9.999895e-01 -4.427279e-03
## Var2  4.770627e-02  9.988607e-01  1.183337e-03  2.167368e-06
## Var3 -3.746301e-07 -3.057227e-06  4.427278e-03 -9.999902e-01
## Var4 -9.988614e-01  4.770621e-02  7.960712e-05  5.808034e-07
plot(x.pca,type='l')

print(summary(x.pca))
## Importance of components:
##                              PC1       PC2   PC3   PC4
## Standard deviation     1.197e+05 6.207e+03 45.49 0.467
## Proportion of Variance 9.973e-01 2.680e-03  0.00 0.000
## Cumulative Proportion  9.973e-01 1.000e+00  1.00 1.000

z-score standardization

x.pca <- prcomp(X,center=TRUE,scale=TRUE)
print(x.pca)
## Standard deviations (1, .., p=4):
## [1] 1.3076326 1.1684277 0.7876379 0.5518154
##
## Rotation (n x k) = (4 x 4):
##             PC1        PC2       PC3        PC4
## Var1 -0.1547026 -0.7010420 0.6670785 -0.1990313
## Var2 -0.6832255  0.1556693 0.2086951  0.6822143
## Var3  0.2508370  0.6504772 0.7078286 -0.1137494
## Var4  0.6680949 -0.2473593 0.1021336  0.6942847
plot(x.pca,type='l')

print(summary(x.pca))
## Importance of components:
##                           PC1    PC2    PC3     PC4
## Standard deviation     1.3076 1.1684 0.7876 0.55182
## Proportion of Variance 0.4275 0.3413 0.1551 0.07613
## Cumulative Proportion  0.4275 0.7688 0.9239 1.00000

NN without PCA

library(nnet)
rm(X)
rm(y)
load('data_NN_E7.RData')
summary(X)
##       Var1             Var2            Var3             Var4
##  Min.   : 22.00   Min.   :505.0   Min.   :0.0000   Min.   :0.1830
##  1st Qu.: 40.00   1st Qu.:605.0   1st Qu.:0.0000   1st Qu.:0.3230
##  Median : 55.00   Median :630.0   Median :1.0000   Median :0.3680
##  Mean   : 63.63   Mean   :628.9   Mean   :0.6826   Mean   :0.3562
##  3rd Qu.: 80.00   3rd Qu.:655.0   3rd Qu.:1.0000   3rd Qu.:0.4120
##  Max.   :200.00   Max.   :725.0   Max.   :1.0000   Max.   :0.5120
##       Var5              Var6             Var7             Var8
##  Min.   :0.00500   Min.   :0.0730   Min.   :0.0120   Min.   :0.00400
##  1st Qu.:0.00900   1st Qu.:0.1510   1st Qu.:0.1720   1st Qu.:0.01000
##  Median :0.01100   Median :0.1950   Median :0.2210   Median :0.01200
##  Mean   :0.01249   Mean   :0.5973   Mean   :0.2181   Mean   :0.01515
##  3rd Qu.:0.01500   3rd Qu.:1.3710   3rd Qu.:0.2300   3rd Qu.:0.01600
##  Max.   :0.03000   Max.   :2.6000   Max.   :0.7640   Max.   :0.06000
##       Var9             Var10
##  Min.   :0.00000   Min.   :0.0000
##  1st Qu.:0.00000   1st Qu.:0.0000
##  Median :0.00000   Median :1.0000
##  Mean   :0.01709   Mean   :0.5289
##  3rd Qu.:0.00000   3rd Qu.:1.0000
##  Max.   :1.00000   Max.   :1.0000

min-max scaling

minx=apply(X,2,min)
maxx=apply(X,2,max)

miny=min(y)
maxy=max(y)

X_S=scale(X,minx,maxx-minx)
y_s=scale(y,miny,maxy-miny)

NN model

model_nn <- nnet(X_S, y_s, size=20,
                 maxit=300, decay=0.03, linout=TRUE, reltol=1.e-6, MaxNWts=100000)
## # weights:  241
## initial  value 12074.926569
## iter  10 value 53.040354
## iter  20 value 39.697988
## iter  30 value 34.312506
## iter  40 value 31.351618
## iter  50 value 28.171835
## iter  60 value 26.879735
## iter  70 value 25.852764
## iter  80 value 25.353068
## iter  90 value 25.059211
## iter 100 value 24.893339
## iter 110 value 24.763462
## iter 120 value 24.611410
## iter 130 value 24.505750
## iter 140 value 24.425072
## iter 150 value 24.307079
## iter 160 value 24.157097
## iter 170 value 23.999051
## iter 180 value 23.881802
## iter 190 value 23.792141
## iter 200 value 23.710257
## iter 210 value 23.643505
## iter 220 value 23.589224
## iter 230 value 23.534901
## iter 240 value 23.494990
## iter 250 value 23.469023
## iter 260 value 23.439473
## iter 270 value 23.415806
## iter 280 value 23.392344
## iter 290 value 23.275909
## iter 300 value 23.187084
## final  value 23.187084
## stopped after 300 iterations

produce the predictions

y_s.predict <- predict(model_nn, X_S)
print(cor(y_s.predict,y_s))
##           [,1]
## [1,] 0.9388577
str(X)
## 'data.frame':    7196 obs. of  10 variables:
##  $ Var1 : num  90 90 90 90 38 38 38 38 65 90 ...
##  $ Var2 : num  710 710 710 710 580 580 580 580 655 620 ...
##  $ Var3 : num  1 1 1 1 1 1 1 1 0 1 ...
##  $ Var4 : num  0.463 0.463 0.463 0.463 0.247 0.247 0.247 0.247 0.428 0.337 ...
##  $ Var5 : num  0.011 0.011 0.011 0.011 0.013 0.013 0.013 0.013 0.011 0.008 ...
##  $ Var6 : num  0.151 0.151 0.151 0.151 0.208 ...
##  $ Var7 : num  0.027 0.027 0.027 0.027 0.219 0.219 0.219 0.219 0.172 0.226 ...
##  $ Var8 : num  0.011 0.011 0.011 0.011 0.01 0.01 0.01 0.01 0.008 0.016 ...
##  $ Var9 : num  1 1 1 1 0 0 0 0 0 0 ...
##  $ Var10: num  0 0 0 0 1 1 1 1 1 0 ...

NN with PCA

remove binary variables and standardize X.



miny=min(y)
maxy=max(y)
y_s=scale(y,miny,maxy-miny)

x.pca <- prcomp(X[,c(1,2,4,5,6,7,8)],center=TRUE,scale=TRUE)
pred_pca <- predict(x.pca)

print(pred_pca[1:5,])
##            PC1        PC2        PC3        PC4        PC5       PC6        PC7
## [1,]  2.769676  0.9674481  0.5816383 -1.7773290 -0.4168766 0.9762095 0.78995243
## [2,]  2.769676  0.9674481  0.5816383 -1.7773290 -0.4168766 0.9762095 0.78995243
## [3,]  2.769676  0.9674481  0.5816383 -1.7773290 -0.4168766 0.9762095 0.78995243
## [4,]  2.769676  0.9674481  0.5816383 -1.7773290 -0.4168766 0.9762095 0.78995243
## [5,] -1.412512 -0.0040778 -1.3465561  0.7394715 -1.0735162 0.2160057 0.03617177
print(summary(x.pca))
## Importance of components:
##                          PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.328 1.1657 1.1342 0.9897 0.8571 0.77783 0.52149
## Proportion of Variance 0.252 0.1941 0.1838 0.1399 0.1049 0.08643 0.03885
## Cumulative Proportion  0.252 0.4461 0.6299 0.7698 0.8747 0.96115 1.00000

binary variables are added back to the data


N=7
x_data <- data.frame(X[,c(3,9,10)],pred_pca[,1:N])

NN model with pca


model_pcann <- nnet(x_data, y_s, size=20,
                 maxit=300, decay=0.03, linout=TRUE, reltol=1.e-6, MaxNWts=100000)
## # weights:  241
## initial  value 2548.425480
## iter  10 value 60.234397
## iter  20 value 29.770896
## iter  30 value 25.437188
## iter  40 value 23.035419
## iter  50 value 21.986917
## iter  60 value 20.768992
## iter  70 value 20.073042
## iter  80 value 19.625506
## iter  90 value 19.356206
## iter 100 value 19.097833
## iter 110 value 18.938737
## iter 120 value 18.784620
## iter 130 value 18.635220
## iter 140 value 18.516993
## iter 150 value 18.435954
## iter 160 value 18.361717
## iter 170 value 18.281183
## iter 180 value 18.209129
## iter 190 value 18.142622
## iter 200 value 18.071377
## iter 210 value 18.010881
## iter 220 value 17.943529
## iter 230 value 17.878284
## iter 240 value 17.816297
## iter 250 value 17.788753
## iter 260 value 17.768241
## iter 270 value 17.752451
## iter 280 value 17.736956
## iter 290 value 17.719340
## iter 300 value 17.698402
## final  value 17.698402
## stopped after 300 iterations

predict and eval

y_s.pred <- predict(model_pcann, x_data)
print(cor(y_s.pred, y_s))
##           [,1]
## [1,] 0.9514857
plot(y_s.pred,y_s)

back