# KS statistic (2-sided)
# Author: Isabel Darcy
# Date: Feb 5, 2017
#
# Draws three small samples (two sine-transformed, one squared), plots the
# raw values and their empirical CDFs, and runs pairwise two-sample
# Kolmogorov-Smirnov tests. Then repeats the sine-vs-sine comparison with
# much larger samples to show the effect of sample size on the test.

# Small samples ----

a <- sort(runif(30, 0, 3))  # 30 random points between 0 and 3, increasing
sa <- sin(a)                # sine of these 30 points

b <- sort(runif(25, 0, 3))  # 25 random points between 0 and 3
sb <- sin(b)                # sine of these 25 points

# NOTE: the variable `c` shadows the name of base::c(); calls such as c(1, 2)
# still work because R skips non-function bindings when looking up a call.
c <- sort(runif(30, 0, 3))  # 30 random points between 0 and 3
sc <- c^2                   # square of these 30 points

# Scatter plots of the raw data ----

# plot the data set sa, title = main
# pch = 17: choose triangle for shape of data points
## see http://www.sthda.com/english/wiki/r-plot-pch-symbols-the-different-point-shapes-available-in-r
# cex.main = 1.5 increases font size of the title by 50%
# cex.lab  = 1.7 increases font size of the axis labels by 70%
# cex.axis = 2   doubles the font size of the axis tick annotations
plot(sa, main = "data", col = "blue", pch = 17,
     cex.main = 1.5, cex.lab = 1.7, cex.axis = 2)
points(sb, col = "red", pch = 19)  # add sb dataset to previous plot
points(sc, pch = 10, cex = 2)      # add sc dataset to previous plot

# Same three data sets, but drawn with sc first so the axis limits are taken
# from sc and none of its points fall outside the plotting region.
plot(sc, main = "data", pch = 10, cex = 2,
     cex.main = 1.5, cex.lab = 1.7, cex.axis = 2)
points(sb, col = "red", pch = 19)
points(sa, col = "blue", pch = 17)

# Empirical cumulative distribution functions ----

# Plot empirical cumulative distribution function
## for these 3 data sets
plot(ecdf(sa))
plot(ecdf(sb), add = TRUE, col = "red")
plot(ecdf(sc), add = TRUE, col = "blue")

# Again with sc first, so the x-range of the plot is taken from sc.
plot(ecdf(sc), col = "blue")
plot(ecdf(sb), add = TRUE, col = "red")
plot(ecdf(sa), add = TRUE)

# KS statistics for each pair of small data sets ----

# print() makes the results visible under source() as well as interactively.
print(ks.test(sa, sb))
print(ks.test(sc, sb))
print(ks.test(sa, sc))

# Larger data sets ----

# generate larger datasets
a1 <- sort(runif(30000, 0, 3))
saLarge <- sin(a1)
b1 <- sort(runif(2500, 0, 3))
sbLarge <- sin(b1)
c1 <- sort(runif(3000, 0, 3))
scLarge <- c1^2  # own name, so the 30-point sc above is not overwritten

# Fresh small samples for a side-by-side comparison with the large ones.
a <- sort(runif(30, 0, 3))  # take 30 random points between 0 and 3
                            ## sort in increasing order
saSmall <- sin(a)           # take sine of these 30 points
b <- sort(runif(25, 0, 3))  # 25 random points between 0 and 3
sbSmall <- sin(b)           # take sine of these 25 points

# Same pair of transformations, small vs. large sample sizes.
print(ks.test(saSmall, sbSmall))
print(ks.test(saLarge, sbLarge))

###############################################################
#####   Create dendrograms for a variety of data sets     #####
###############################################################

## Create dendrograms for a variety of data sets including some of the data
## sets that you worked with earlier this week. Also create dendrograms for
## data sets containing circles, noise, circles plus noise. See below for
## how to create these data sets.

## Recall that you can determine how to use a command using ? or help

# Small example ----

# Five one-dimensional points. To analyse a different data set, assign it to
# mydata instead (e.g. mydata <- Noise, once a Noise data set has been
# created; it is not defined at this point in the script).
mydata <- c(0, 7, 2, 10, 6)

d <- dist(as.matrix(mydata))        # find distance matrix for dataset
hc <- hclust(d, method = "single")  # apply hierarchical clustering
plot(hc)                            # plot the dendrogram
print(d)                            # show the pairwise distances

?hclust

# Same data via the dendrogram class; hclust() here uses its default linkage.
plot(as.dendrogram(hclust(dist(mydata))))
k <- 3
# x <- identify(hclust(dist(mydata)))
?identify
print(hc)  # summary of the clustering object

# Data set read from file ----

# Expects a space-separated file without a header row in the working
# directory; read.csv() stops with an error if it cannot be opened.
dd <- read.csv("test2.txt", sep = " ", header = FALSE)
plot(dd)

d <- dist(as.matrix(dd))            # find distance matrix for dataset
hc <- hclust(d, method = "single")  # apply hierarchical clustering

# NOTE(review): `a` is not used in this section; kept in case later code
# relies on it.
a <- 2

# Cut the tree into 2, 3, ..., 15 clusters (never more clusters than
# observations). cutree() accepts a vector of k and returns one column per
# value; transposing gives one row of cluster labels per k.
cluster_counts <- 2:min(15L, nrow(dd))
clusterCut <- t(cutree(hc, k = cluster_counts))

# One plot per clustering, points coloured by cluster label. Labels index the
# current palette(), so colours repeat once k exceeds the palette length.
for (i in seq_len(nrow(clusterCut))) {
  plot(dd, col = clusterCut[i, ], main = paste("k =", cluster_counts[i]))
}