# Clustering demo script.
#
# Walks through: k-means on random noise and on two concentric circles,
# two ways of reducing the size of a data set (random subsample, cluster
# centroids), k-means on the prostate data, and single-linkage hierarchical
# clustering with dendrograms.

# Packages ----
# TDA provides circleUnif(), used below to sample points from circles.
library("TDA")

# Clear Memory
# WARNING: This deletes everything!!!!!
# Wiping the workspace from inside a script destroys whatever the caller had
# loaded, so the call is left commented out. Run it by hand if you really
# want to start from an empty session.
# rm(list = ls())

# Noise data ----
# Create Noise data set containing 10 pts randomly
# chosen such that 0 < x < 1 and 2 < y < 3
# Note runif(n, x1, x2) chooses n points randomly
# with uniform distribution between x1 and x2.
# For more info on runif, use ?runif
# cbind binds together the two runif columns,
# creating a matrix whose rows are points in R^2.
Noise1 <- cbind(runif(10, 0, 1), runif(10, 2, 3))

# create two more datasets
Noise2 <- cbind(runif(5, 0, 1), runif(5, 0, 0.5))
Noise3 <- cbind(runif(4, 2.5, 3), runif(4, 0, 1))

# combine these 3 data sets
# cbind binds together columns while
# rbind binds together rows
Noise <- rbind(Noise1, Noise2, Noise3)

# plot Noise with title at bottom and aspect ratio = 1
plot(Noise, sub = "19 points from noise.", asp = 1)

cl <- kmeans(Noise, 3)                  # Cluster the data using k-means, k = 3
plot(Noise, col = cl$cluster, asp = 1)  # Plot noise with clusters colored

###########################################################
## create data set from 2 concentric circles and cluster ##
###########################################################

# randomly choose 100 points from circle of radius 1
circle1 <- circleUnif(100, r = 1)

# randomly choose 100 points from circle of radius 2
circle2 <- circleUnif(100, r = 2)

# combine the 2 circle data sets
circle <- rbind(circle1, circle2)
plot(circle, asp = 1)  # Plot circle with aspect ratio = 1

# 2-means cluster this data set
cl <- kmeans(circle, 2)                  # Cluster the data using k-means, k = 2
plot(circle, col = cl$cluster, asp = 1)  # Plot circle with clusters colored

###################################################
#####       Reduce size of data set           #####
###################################################

## One way to reduce the size of a data set is to
## take a random subset of your data set

# In the following example, we first create a data
# set of 300 points from a circle of radius one.
# We then choose a random subsample of 15 points
# from these 300 points.
circle <- circleUnif(300, r = 1)
sample_rows <- sample(seq_len(nrow(circle)), 15, replace = FALSE)
sample_from_circle <- circle[sample_rows, ]

# Full data set with the subsample overlaid as red stars ...
plot(circle, asp = 1)
points(sample_from_circle, pch = 8, cex = 2, col = rgb(1, 0, 0))
# ... and the subsample on its own.
plot(sample_from_circle, pch = 8, cex = 2, col = rgb(1, 0, 0), asp = 1)

## A 2nd method to reduce a data set is to
## choose centroids of clusters

# Note we can also reduce our data set using clustering:
# instead of using all data points, one can instead use
# centroids of clusters (which need not be data points).

# In this example, we create a data set with 300 points.
# We use 10-means clustering to obtain a simplified
# version with only 10 points. Note these 10 points
# are not part of the original data set.
circle <- circleUnif(300, r = 1)
plot(circle, asp = 1)
cl <- kmeans(circle, 10)
plot(circle, col = cl$cluster, asp = 1)
points(cl$centers, pch = 8, cex = 2)  # plot centroids on top of the data
plot(cl$centers, asp = 1)             # plot centroids alone

###############################################################
#####                  Prostate data                      #####
###############################################################

# Install ElemStatLearn only when it is missing, instead of reinstalling it
# on every run of the script.
# NOTE(review): the package may no longer be installable straight from CRAN;
# if install.packages() fails, check the CRAN archive.
if (!requireNamespace("ElemStatLearn", quietly = TRUE)) {
  install.packages("ElemStatLearn")
}
library("ElemStatLearn")
data(prostate)

# NOTE(review): this clusters on every column of prostate, unscaled --
# confirm that is intended.
cl <- kmeans(prostate, 10)
centroids <- cl$centers

data <- cbind(prostate$lcp, prostate$gleason)
plot(data)

###############################################################
#####  Create dendrograms for a variety of data sets      #####
###############################################################

## Create dendrograms for a variety of data sets including some of the data
## sets that you worked with earlier this week. Also create dendrograms for
## data sets containing circles, noise, circles plus noise. See above for
## how to create these data sets.

## Recall that you can determine how to use a command using ? or help

plot(Noise, asp = 1)
d <- dist(as.matrix(Noise))         # find distance matrix for dataset
hc <- hclust(d, method = "single")  # apply hierarchical clustering
plot(hc)                            # plot the dendrogram

?hclust

# Cut the tree into k = 2, ..., 15 clusters in a single call. cutree() returns
# one column per k; transposing gives one row per k (row names "2".."15"),
# so clusterCut[i, ] holds the cluster label of every point for the i-th cut.
cut_sizes <- 2:15
clusterCut <- t(cutree(hc, k = cut_sizes))

# One scatter plot of the points per cut, colored by cluster membership.
# The labels describe the points (rows of Noise), not the pairwise
# distances in d, so it is Noise that gets plotted.
for (i in seq_len(nrow(clusterCut))) {
  plot(
    Noise,
    col = clusterCut[i, ],
    asp = 1,
    main = paste("Single linkage, k =", cut_sizes[i])
  )
}

# Show a cut directly on the dendrogram: box the 3 clusters.
plot(hc)
rect.hclust(hc, k = 3, border = 2:4)