# Start from a fresh R session (e.g. "Session > Restart R" in RStudio)
# instead of calling rm(list = ls()).  That call only deletes the objects in
# the global environment -- attached packages, options() and par() settings
# all survive -- and it silently destroys the workspace of anyone who
# source()s this file.

# Create a Noise data set containing 10 points chosen uniformly at random
# such that 0 < x < 1 and 2 < y < 3.
# runif(n, min, max) draws n values uniformly between min and max
# (for more info use ?runif).  cbind() binds the two runif columns together,
# so each row of the result is one point in R^2.
Noise1 <- cbind(runif(10, min = 0, max = 1), runif(10, min = 2, max = 3))

# Two more noise patches:
#   5 points with 0   < x < 1 and 0 < y < 0.5
#   4 points with 2.5 < x < 3 and 0 < y < 1
Noise2 <- cbind(runif(5, min = 0, max = 1), runif(5, min = 0, max = 0.5))
Noise3 <- cbind(runif(4, min = 2.5, max = 3), runif(4, min = 0, max = 1))

# Combine the three patches into one 19-point data set.
# cbind() binds columns together while rbind() binds rows.
Noise <- rbind(Noise1, Noise2, Noise3)

# Plot Noise with a subtitle at the bottom and aspect ratio = 1
plot(Noise, sub = "19 points from noise.", asp = 1)

# Cluster the noise with k-means (k = 3), then re-plot the points coloured
# by the cluster each one was assigned to.
cl <- kmeans(Noise, centers = 3)
plot(Noise, col = cl$cluster, asp = 1)

###########################################################
## create data set from 2 concentric circles and cluster ##
###########################################################
# circleUnif() used below is called after attaching the TDA package
library("TDA")

# Build the data set: 100 random points from a circle of radius 1,
# 100 random points from a circle of radius 2, stacked row-wise.
circle1 <- circleUnif(n = 100, r = 1)
circle2 <- circleUnif(n = 100, r = 2)
circle <- rbind(circle1, circle2)

# Raw data, drawn with aspect ratio = 1
plot(circle, asp = 1)

# Run k-means with k = 2 on the combined data and redraw it with every
# point coloured by its assigned cluster.
cl <- kmeans(circle, centers = 2)
plot(circle, col = cl$cluster, asp = 1)

###############################################################
###############################################################
###############################################################

###################################################
#####       Reduce size of data set           #####
###################################################

## Method 1: shrink a data set by keeping only a random subset of it.
# Here the full data set is 300 points from a circle of radius one, and
# the reduced data set is a random subsample of 15 of those points
# (drawn without replacement, so no point is picked twice).

circle <- circleUnif(n = 300, r = 1)
sample_from_circle <- circle[
  sample(seq_len(nrow(circle)), size = 15, replace = FALSE),
]

# Full data set, with the subsample overlaid as large red stars ...
plot(circle, asp = 1)
points(sample_from_circle, pch = 8, cex = 2, col = rgb(1, 0, 0))
# ... and the subsample on its own.
plot(sample_from_circle, pch = 8, cex = 2, col = rgb(1, 0, 0), asp = 1)

## Method 2: shrink a data set by replacing it with cluster centroids.
# Instead of using all data points, cluster them and keep only the
# centroid of each cluster.  Below, a 300-point data set is clustered
# with 10-means, giving a simplified version with only 10 points.
# Note that centroids need not be data points: these 10 points are
# generally not part of the original data set.

circle <- circleUnif(n = 300, r = 1)
plot(circle, asp = 1)

cl <- kmeans(circle, centers = 10)

# Data coloured by cluster, with the centroids overlaid as large stars
plot(circle, col = cl$cluster, asp = 1)
points(cl$centers, pch = 8, cex = 2)

# The reduced data set: the centroids on their own
plot(cl$centers, asp = 1)

###############################################################
###############################################################
###############################################################
# Install ElemStatLearn only when it is missing, so that re-running the
# script does not re-download the package every time.
# NOTE(review): ElemStatLearn appears to have been archived on CRAN; if the
# install below fails, install it from the CRAN archive instead.
if (!requireNamespace("ElemStatLearn", quietly = TRUE)) {
  install.packages("ElemStatLearn")
}
library("ElemStatLearn")
data(prostate)

# 10-means clustering of the prostate data.  The `train` column is dropped
# first: presumably it is the train/test split indicator rather than a
# measurement (verify with ?prostate), so it should not influence distances.
# setdiff() makes this a no-op if the column is absent.
cl <- kmeans(
  prostate[, setdiff(names(prostate), "train"), drop = FALSE],
  centers = 10
)
centroids <- cl$centers   # one row per cluster centre

# Scatter plot of two of the variables; naming the columns gives the plot
# readable axis labels instead of "data[,1]" / "data[,2]".
data <- cbind(lcp = prostate$lcp, gleason = prostate$gleason)
plot(data)

###############################################################
##### Create dendrograms for a variety of data sets       #####
###############################################################

## Create dendrograms for a variety of data sets including some of the data
## sets that you worked with earlier this week.  Also create dendrograms for
## data sets containing circles, noise, circles plus noise. See below for 
## how to create these data sets.

## Recall that you can determine how to use a command using ? or help
plot(Noise, asp = 1)
d <- dist(as.matrix(Noise))          # pairwise distances between the points
hc <- hclust(d, method = "single")   # single-linkage hierarchical clustering
plot(hc)                             # plot the dendrogram
?hclust

# Cut the dendrogram into k = 2, 3, ..., 15 clusters (never more clusters
# than there are points).  Given a vector of k values, cutree() returns a
# matrix with one row per observation and one column per k; transposing it
# keeps the layout "one row of cluster labels per k".
k_values <- 2:min(15, nrow(Noise))
clusterCut <- t(cutree(hc, k = k_values))

# For every k, plot the data points (not the distance object) coloured by
# the cluster each point falls into at that cut.
for (i in seq_len(nrow(clusterCut))) {
  plot(Noise, col = clusterCut[i, ], asp = 1,
       main = paste("Single linkage, k =", k_values[i]))
}

# Show one of the cuts on the dendrogram itself: rect.hclust() draws a box
# around each of the k clusters.
plot(hc)
rect.hclust(hc, k = 3, border = 2:4)