4 min read

R Machine Learning Codebook (Part 3)

  1. Clustering
  • unsupervised learning, without response variable
  1. Hierarchical Clustering
# Preview the first six rows of the built-in USArrests data set.
head(USArrests, n = 6L)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
# Standardise every column (subtract the column mean, divide by the column
# standard deviation) so that no single variable dominates the distances
# used by the clustering steps below.
zUSArrests <- scale(USArrests, center = TRUE, scale = TRUE)

# Per-column five-number summary; the means should all print as 0.
summary(zUSArrests)
##      Murder           Assault           UrbanPop             Rape        
##  Min.   :-1.6044   Min.   :-1.5090   Min.   :-2.31714   Min.   :-1.4874  
##  1st Qu.:-0.8525   1st Qu.:-0.7411   1st Qu.:-0.76271   1st Qu.:-0.6574  
##  Median :-0.1235   Median :-0.1411   Median : 0.03178   Median :-0.1209  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.7949   3rd Qu.: 0.9388   3rd Qu.: 0.84354   3rd Qu.: 0.5277  
##  Max.   : 2.2069   Max.   : 1.9948   Max.   : 1.75892   Max.   : 2.6444

Clustering - average-linkage clustering, with rectangles drawn around the 5 clusters

# Agglomerative clustering with average linkage on the Euclidean distances
# between the standardised observations.
hc1 <- hclust(
  d = dist(zUSArrests, method = "euclidean"),
  method = "average"
)

# Dendrogram; a negative `hang` makes all leaf labels hang down from 0.
plot(hc1, hang = -1)

# Outline the branches that form the 5-cluster solution.
rect.hclust(tree = hc1, k = 5)

Show the clusters

# Cluster label (1-5) assigned to each state by cutting the tree into 5 groups.
cutree(tree = hc1, k = 5)
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              2              3              4              3 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              3              4              4              3              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              4              4              3              4              5 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              4              4              1              5              3 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              4              3              4              1              3 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              4              4              3              5              4 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              3              3              1              5              4 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              4              4              4              4              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              5              1              3              4              5 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              4              4              5              4              4

Center points for each cluster (variable means computed on the original, unscaled data)

# Cluster membership from the 5-group cut of the hierarchical tree.
cluster <- cutree(hc1, k = 5)

# Per-cluster means of the original (unscaled) variables, one row per
# cluster. vapply() builds the result in one pass instead of growing a
# matrix with rbind() inside a loop, and the number of clusters is taken
# from `cluster` rather than hard-coded a second time.
cent <- t(vapply(
  seq_len(max(cluster)),
  function(k) colMeans(USArrests[cluster == k, , drop = FALSE]),
  numeric(ncol(USArrests))
))
cent
##         Murder   Assault UrbanPop      Rape
## [1,] 14.671429 251.28571 54.28571 21.685714
## [2,] 10.000000 263.00000 48.00000 44.500000
## [3,] 10.883333 256.91667 78.33333 32.250000
## [4,]  5.530435 129.43478 68.91304 17.786957
## [5,]  2.700000  65.14286 46.28571  9.885714
  1. K-means Clustering
# K-means on the standardised data with 4 clusters.
# Given a count, kmeans() chooses its starting centers at random, so the
# cluster numbering and sizes can differ from one run to the next.
kmc1 <- kmeans(x = zUSArrests, centers = 4)

# Print sizes, cluster means, assignments and within-cluster sums of squares.
kmc1
## K-means clustering with 4 clusters of sizes 10, 20, 10, 10
## 
## Cluster means:
##       Murder    Assault   UrbanPop       Rape
## 1 -0.2084716 -0.4110987 -0.3412836 -0.2030666
## 2  1.0049340  1.0138274  0.1975853  0.8469650
## 3 -0.6286291 -0.4086988  0.9506200 -0.3888373
## 4 -1.1727674 -1.2078573 -1.0045069 -1.1020261
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              2              2              1              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              3              2              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              4              2              1              4 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              1              1              2              4              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              2              4              2              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              1              1              2              4              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              2              4              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              1              1              3              3              2 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              4              2              2              3              4 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              1              3              4              4              1 
## 
## Within cluster sum of squares by cluster:
## [1]  6.148786 46.747955  9.326266  7.443899
##  (between_SS / total_SS =  64.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Scatterplot

# Scatterplot matrix of the standardised variables; points are filled
# circles coloured by their k-means cluster.
pairs(zUSArrests, col = kmc1[["cluster"]], pch = 16)

Elbow point - choose the k at which the curve stops dropping sharply and starts to flatten - in this case, k = 4

# Total within-cluster sum of squares for k = 1, ..., 10 clusters.
# - `centers` is spelled out in full instead of relying on partial
#   argument matching (`center=`).
# - vapply() returns a numeric vector of known length rather than growing
#   `wss` element by element from a scalar.
# - `tot.withinss` is the total that kmeans() already reports, equal to
#   sum(withinss).
wss <- vapply(
  1:10,
  function(k) kmeans(zUSArrests, centers = k)$tot.withinss,
  numeric(1)
)

# Elbow plot: look for the k after which the curve flattens out.
plot(
  seq_along(wss), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within group sum of squares"
)

  1. K-medoid Clustering
library(fpc)
# Partitioning around medoids via fpc::pamk(), restricted to exactly
# 4 clusters (krange = 4).
kmed <- pamk(zUSArrests, krange = 4)

# Print the fitted object: medoids, assignments, chosen k and criterion.
kmed
## $pamobject
## Medoids:
##               ID     Murder    Assault   UrbanPop         Rape
## Alabama        1  1.2425641  0.7828393 -0.5209066 -0.003416473
## Michigan      22  0.9900104  1.0108275  0.5844655  1.480613993
## Oklahoma      36 -0.2727580 -0.2371077  0.1699510 -0.131534211
## New Hampshire 29 -1.3059321 -1.3650491 -0.6590781 -1.252564419
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              2              2              1              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              3              2              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              4              2              3              4 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              3              1              4              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              2              4              1              3 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              2              4              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              1              4              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              4              1              2              3              4 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              4              4              3 
## Objective function:
##    build     swap 
## 1.035116 1.027102 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"      
## 
## $nc
## [1] 4
## 
## $crit
## [1] 0.0000000 0.0000000 0.0000000 0.3389904