# KS statistic (2-sided)
# Author:  Isabel K Darcy 
# Date:  Feb 13, 2018

# Draw three sets of uniform random points on [0, 3], each sorted into
# increasing order: 30 points, then 25 points, then 30 points.
a <- sort(runif(n = 30, min = 0, max = 3))
b <- sort(runif(n = 25, min = 0, max = 3))
c <- sort(runif(n = 30, min = 0, max = 3))

# Transform the points into the three samples compared below.
sa <- sin(a)   # sine of the 30 points in a
sb <- sin(b)   # sine of the 25 points in b
sc <- c^2      # square of the 30 points in c

# Figure 1: plot sa against its index; main sets the plot title.
# pch selects the point shape (17 = filled triangle); for the full list see
## http://www.sthda.com/english/wiki/r-plot-pch-symbols-the-different-point-shapes-available-in-r
# cex.main = 1.5 makes the title 50% larger than the default,
# cex.lab  = 1.7 makes the axis labels 70% larger than the default,
# cex.axis = 2   doubles the size of the axis tick annotations.
plot(
  sa,
  pch = 17, col = "blue",
  main = "data",
  cex.main = 1.5, cex.lab = 1.7, cex.axis = 2
)
# Overlay the other two samples on the existing plot.
points(sb, pch = 19, col = "red")
points(sc, pch = 10, cex = 2)   # cex = 2 draws these points at double size

# Figure 2: the same three samples, but sc is drawn first so that the
# axes are set from sc rather than from sa.
plot(
  sc,
  pch = 10, cex = 2,
  main = "data",
  cex.main = 1.5, cex.lab = 1.7, cex.axis = 2
)
points(sb, pch = 19, col = "red")
points(sa, pch = 17, col = "blue")

# Empirical cumulative distribution functions (ECDFs) of the three samples.
# Figure 1: start from the ECDF of sa, then overlay sb and sc (add = TRUE
# draws onto the existing plot instead of opening a new one).
plot(ecdf(sa), col = "blue")
plot(ecdf(sb), col = "red", add = TRUE)
plot(ecdf(sc), add = TRUE)

# Figure 2: same three ECDFs, starting from sc so the axes come from sc.
plot(ecdf(sc))
plot(ecdf(sb), col = "red", add = TRUE)
plot(ecdf(sa), col = "blue", add = TRUE)

# Two-sample Kolmogorov-Smirnov test for each pair of samples:
# sa vs sb, sc vs sb, and sa vs sc.
ks.test(x = sa, y = sb)
ks.test(x = sc, y = sb)
ks.test(x = sa, y = sc)

# Draw the underlying uniform points on [0, 3], sorted into increasing order.
# Large samples: 30000 and 2500 points.
a1 <- sort(runif(n = 30000, min = 0, max = 3))
b1 <- sort(runif(n = 2500, min = 0, max = 3))
# Small samples: 30 and 25 points (a and b are reassigned here).
a <- sort(runif(n = 30, min = 0, max = 3))
b <- sort(runif(n = 25, min = 0, max = 3))

# Take the sine of every point to build the samples that are compared.
saLarge <- sin(a1)
sbLarge <- sin(b1)
saSmall <- sin(a)
sbSmall <- sin(b)

# Run the two-sample KS test on the small pair, then on the large pair,
# to compare the results at the two sample sizes.
ks.test(x = saSmall, y = sbSmall)
ks.test(x = saLarge, y = sbLarge)