data()

datairis <- iris
?iris

library(dplyr)
data1 <- tbl_df(datairis)
data1 <- datairis
?select  # remove columns
data2 <- select(data1, -Species)
?filter  # remove rows
data3 <- filter(data2, Petal.Length > 6.5)
data3
select(data3, Sepal.Length:Petal.Length)
select(data3, -(Sepal.Length:Petal.Length))
select(data3, Sepal.Length, Petal.Length)
dataTemp <- rbind(data3, c(1, 2, NA, 3))
dataTemp
na.omit(dataTemp) # remove all rows with NAs

# one way to normalize data
scaledata2 <- scale(data2)
colMeans(scaledata2)  # faster version of apply(scaled.dat, 2, mean)
apply(scaledata2, 2, sd)

# save data to current working directory
# as a text file
write.table(scaledata2, "data.txt", sep=" ", 
            row.names = FALSE,
            col.names = FALSE)

# install.packages("TDAmapper")
library("TDAmapper")

# The fastcluster package is not necessary.  By loading the
# fastcluster package, the fastcluster::hclust() function 
# automatically replaces the slower stats::hclust() function
# whenever hclust() is called.
# install.packages("fastcluster") 
require(fastcluster) 

# The igraph package is necessary to view simplicial complexes
# (undirected graph) resulting from mapper1D().

library(igraph)
P<- select(tbl_df(scaledata2), Petal.Length)
# Apply mapper
m1 <- mapper1D(
  distance_matrix = dist(data.frame(scaledata2)),
  filter_values = P,
  num_intervals = 10,
  percent_overlap = 50,
  num_bins_when_clustering = 10)

# create and plot mapper graph
g1 <- graph.adjacency(m1$adjacency, mode="undirected")
plot(g1, layout = layout.auto(g1) )


## load package TDA

library("TDA")
?ripsDiag
## ripsDiag computes the persistence diagram of the Rips filtration built on 
## top of a point cloud.
## First argument = data set (matrix of data points or distances, see dist).
## Second argument = max dimension of the homological features to be computed.
## Third argument =  maximum value of the rips filtration (largest threshold).
## Optional arguments:
## dist
##     dist = "euclidean" :  The euclidean distance is used to calculate 
##                           distances between points in your data set
##     dist = "arbitrary" :  Assumes your data set is NOT a matrix of data points
##                           assumes your data set is a matrix of distances 
##                           between your data points.  Note only distances 
##                           between data points are used, not the actual 
##                           data points
## library
##     library = "GUDHI"    :  The GUDHI program is used to calculate Rips PD.
##     library = "Dionysis" :  The Dionysus program is used to calculate Rips PD.
##     GUDHI is default unless distance matrix used (ie, dist = "arbitrary"). 
## printProgress
##      printProgress = TRUE  :   progress bar printed
##      printProgress = FALSE :   progress bar not printed
## The output is a list containing the object of class diagram, a P by 3 matrix, 
## where P is the number of cycles in the barcode and 
## the columns correspond to dimension, birth, death of the cycles.
## To access the diagram in the list, use $diagram

maxscale=2  # max radius = 2
maxdimension=2  # to compute H0, H1, H2
Diag=ripsDiag(scaledata2,maxdimension,maxscale, printProgress=TRUE)$diagram

Diag  # View birth and death times of each cycle 

## Plot barcode

plot(Diag, diagLim=NULL, dimension=NULL, col=NULL, rotated=FALSE, barcode=TRUE, 
     band=NULL, lab.line=2.2, colorBand="pink", colorBorder=NA, add = FALSE, sub = "Barcode 
     for data set consisting of 60 points from circle plus 20 points from noise.")

## Plot persistence diagram

plot(Diag, diagLim=NULL, dimension=NULL, col=NULL, rotated=FALSE, barcode=FALSE, 
     band=NULL, lab.line=2.2, colorBand="pink", colorBorder=NA, add = FALSE, sub = 
       "Persistence Diagram for data set consisting of 60 points from circle plus 20 points 
     from noise.")


########################################


############################################################################

##  The TDA package has several commands for generating data sets 
##  with known topology:

# choose 300 points randomly (with uniform distribution)from a circle of radius 1.

Circle = circleUnif(300, r = 1)  

# choose 300 points randomly (with uniform distribution)from a 2-dimensional 
# sphere of radius 1.  Note the 2nd parameter determines the dimension of the 
# sphere.  
# 2nd parameter = 1: 1-d sphere in R^2 = circle in R^2 
# 2nd parameter = 2: 2-d sphere in R^3 
# 2nd parameter = 3: 3-d sphere in R^4 

Sphere = sphereUnif(300, 2, r = 1)

# choose 300 points randomly (with uniform distribution)from a torus with tube  
# radius 1 where the center of the tube is a circle of radius 2 centered at the 
# origin.  Note the center of the tube is not part of the torus.

Torus = torusUnif(300,  1, 2)

############################################################################

## To plot your points in a new window of size 4x5

windows(4,5);plot(torus)

## To save your data points to the csv file, circle.csv
## Note ncolumns = dimension of your data set (unless you also have a column 
## containing the names of your rows).

write(Circle, file = "circle.csv", ncolumns = 2, sep = ",")

# To determine the directory where your file was saved

getwd()

# Recall you can specify where you would like to save your file, by giving the 
# path to the directory.  For example,

write(torus, file = "C:/users/YourUserName/Downloads/torus.csv", ncolumns = 3, sep = 
        ",")

############################################################################

## We will first generate a data set where we know the topology
## circleUnif(30, r = 1) chooses 30 points randomly (with uniform distribution)
## from a circle of radius 1.
## A set of data points is called a point cloud.
## Thus XX is a point cloud.

XX = circleUnif(30, r = 1)

# plot XX with title at bottom and aspect ratio = 1

plot (XX, sub = "30 points from circle.", asp = 1) 

?ripsDiag
## ripsDiag computes the persistence diagram of the Rips filtration built on 
## top of a point cloud.
## First argument = data set (matrix of data points or distances, see dist).
## Second argument = max dimension of the homological features to be computed.
## Third argument =  maximum value of the rips filtration (largest threshold).
## Optional arguments:
## dist
##     dist = "euclidean" :  The euclidean distance is used to calculate 
##                           distances between points in your data set
##     dist = "arbitrary" :  Assumes your data set is NOT a matrix of data points
##                           assumes your data set is a matrix of distances 
##                           between your data points.  Note only distances 
##                           between data points are used, not the actual 
##                           data points
## library
##     library = "GUDHI"    :  The GUDHI program is used to calculate Rips PD.
##     library = "Dionysis" :  The Dionysus program is used to calculate Rips PD.
##     GUDHI is default unless distance matrix used (ie, dist = "arbitrary"). 
## printProgress
##      printProgress = TRUE  :   progress bar printed
##      printProgress = FALSE :   progress bar not printed
## The output is a list containing the object of class diagram, a P by 3 matrix, 
## where P is the number of cycles in the barcode and 
## the columns correspond to dimension, birth, death of the cycles.
## To access the diagram in the list, use $diagram

maxscale=2.5  # max radius = 2.5
maxdimension=2  # to compute H0, H1, H2
Diag=ripsDiag(XX,maxdimension,maxscale, printProgress=TRUE)$diagram

Diag  # View birth and death times of each cycle 

## Plot barcode

plot(Diag, diagLim=NULL, dimension=NULL, col=NULL, rotated=FALSE, barcode=TRUE, 
     band=NULL, lab.line=2.2, colorBand="pink", colorBorder=NA, add = FALSE, sub = "Barcode 
     for data set consisting of 30 points from circle.")

## Plot persistence diagram

plot(Diag, diagLim=NULL, dimension=NULL, col=NULL, rotated=FALSE, barcode=FALSE, 
     band=NULL, lab.line=2.2, colorBand="pink", colorBorder=NA, add = FALSE, sub = 
       "Persistence Diagram for data set consisting of 30 points from circle.")


## Let's now add noise to our data set:

?runif
noise = cbind(runif(20, -2,2),runif(20, -2,2))
plot(noise)
noise

## Add 20 more data points from noise to 30 point data set XX
## for a total of 50 data points in XXnoisy 

XXnoisy=rbind(XX, noise)
plot(XXnoisy, asp = 1)

# Caculate ripsDiag

DiagN=ripsDiag(XXnoisy,maxdimension,maxscale, printProgress=TRUE)$diagram

## Plot barcode

plot(DiagN, diagLim=NULL, dimension=NULL, col=NULL, rotated=FALSE, barcode=TRUE, 
     band=NULL, lab.line=2.2, colorBand="pink", colorBorder=NA, add = FALSE)

## Plot persistence diagram

plot(DiagN, diagLim=NULL, dimension=NULL, col=NULL, rotated=FALSE, barcode=FALSE, 
     band=NULL, lab.line=2.2, colorBand="pink", colorBorder=NA, add = FALSE)

?bottleneck
print( bottleneck(Diag, DiagN, dimension=0) )
? wasserstein
print( wasserstein(Diag, DiagN, p=2, dimension=0) )

print( bottleneck(Diag, DiagN, dimension=1) )
print( wasserstein(Diag, DiagN, p=2, dimension=1) )

print( bottleneck(Diag, DiagN, dimension=2) )
print( wasserstein(Diag, DiagN, p=2, dimension=2) )

##  Add even more noise

noise=cbind(runif(40, -2,2),runif(40, -2,2))

## Add 40 more data points from noise to 30 point data set XX
## for a total of 70 data points in XXnoisy 

XXnoisy2=rbind(XX, noise)
plot(XXnoisy2, asp = 1)

# Caculate ripsDiag

DiagNN=ripsDiag(XXnoisy2,maxdimension,maxscale, printProgress=TRUE)$diagram

## Plot barcode

plot(DiagNN, diagLim=NULL, dimension=NULL, col=NULL, rotated=FALSE, barcode=TRUE, 
     band=NULL, lab.line=2.2, colorBand="pink", colorBorder=NA, add = FALSE)

## Plot persistence diagram

plot(DiagNN, diagLim=NULL, dimension=NULL, col=NULL, rotated=FALSE, barcode=FALSE, 
     band=NULL, lab.line=2.2, colorBand="pink", colorBorder=NA, add = FALSE)

?bottleneck
print( bottleneck(Diag, DiagNN, dimension=0) )
? wasserstein
print( wasserstein(Diag, DiagNN, p=2, dimension=0) )

print( bottleneck(Diag, DiagNN, dimension=1) )
print( wasserstein(Diag, DiagNN, p=2, dimension=1) )

print( bottleneck(Diag, DiagNN, dimension=2) )
print( wasserstein(Diag, DiagNN, p=2, dimension=2) )

############################################################################
############################################################################
############################################################################

## You can write for loops in R
## To get help for some special characters and words, one must use quotes
##  NOTE sapply is much more efficient than using for loops.
help("for")

# Create Noise data set containing 20 pts randomly chosen such that -2 < x,y < 2

Noise =cbind(runif(20, -2,2),runif(20, -2,2))

# Create a list where first element on the list is the Noise data set with 20pts.

NoiseList <- list(Noise)

# For each i = 2, 3, 4, 5, we create a new data set of noisy points by adding
# 20 more points to the previously created noisy data set.

for (i in 2:2)
{ 
  MoreNoise <- cbind(runif(20, -2,2),runif(20, -2,2)) 
  NoiseList[i] <- list(rbind(Noise, MoreNoise))    
  Noise <- NoiseList[[i]]
  windows(5,5); plot(Noise, sub = "20i points from noise.", asp = 1)
  filename <- "Noise.csv" 
  write(Noise, file = filename, ncolumns = 2, sep = ",")
}


##############################################################################