library(readxl)
library(dplyr)
library(moments)
library(ggplot2)

# Before installing «modeest» we need to install the package «genefilter», as described at this link:
# http://www.bioconductor.org/packages/release/bioc/html/genefilter.html
library(modeest)




# -------------------------------------------------------------------------- -
#  Loading the dataset ----
# -------------------------------------------------------------------------- -
# Importing the Darts.xls dataset into R
darts = read_excel("/home/adelo/1-system/desktop/it_cct/3-Big_Data_Integration/6-5-6-CA3/Darts.xls")



#  ================================================================== ----
#  Exploratory data analysis           ----
#  ----------------------------------- ----
#  ================================================================== =

# ====================================================================== =
#  Shape of Distribution: ----
# ====================================================================== =
# Another time when we usually prefer the median over the mean (or mode) is
# when our data is skewed (i.e., the frequency distribution for our data is skewed).
# If we consider the normal distribution - as this is the most frequently assessed 
# in statistics - when the data is perfectly normal, the mean, median and mode 
# are identical. Moreover, they all represent the most typical value in the 
# data set. However, as the data becomes skewed the mean loses its ability to
# provide the best central location for the data because the skewed data is dragging 
# it away from the typical value. However, the median best retains this position and is
# not as strongly influenced by the skewed values.


# -------------------------------------------------------------------------- -
#  * Skewness ----
# -------------------------------------------------------------------------- -
## Skewness is a method for quantifying the lack of symmetry in the distribution of a variable.
## A skewness of zero indicates that the variable is distributed symmetrically. A positive value indicates a longer right tail (right-skewed: most values sit on the left), a negative value indicates a longer left tail (left-skewed: most values sit on the right).
skewness(darts$`Expert #1`)

# -------------------------------------------------------------------------- -
#  * Kurtosis  ----
# -------------------------------------------------------------------------- -
## Kurtosis is a measure that gives indication in terms of the peak of the distribution.
## Variables with a pronounced peak toward the mean have a high Kurtosis score and variables with a flat peak have a low Kurtosis score.
kurtosis(darts$`Expert #1`)


# -------------------------------------------------------------------------- -
#  * Histogram  ----
# -------------------------------------------------------------------------- -
hist(darts$`Expert #1`,
     border="grey",
     col="blue"
     )
abline(h=0, col="red")
box();


# -------------------------------------------------------------------------- -
#  * Skewness, kurtosis and Histogram of `Expert #4`  ----
# -------------------------------------------------------------------------- -
skewness(darts$`Expert #4`)
kurtosis(darts$`Expert #4`)
hist(darts$`Expert #4`,
     border="grey",
     col="blue"
)
abline(h=0, col="red")
box();


# -------------------------------------------------------------------------- -
#  * Skewness, kurtosis and Histogram of `Dart #1`  ----
# -------------------------------------------------------------------------- -
skewness(darts$`Dart #1`)
kurtosis(darts$`Dart #1`)
hist(darts$`Dart #1`,
     border="grey",
     col="blue"
)
abline(h=0, col="red")
box();


# -------------------------------------------------------------------------- -
#  * Skewness, kurtosis and Histogram of `Dart #4`  ----
# -------------------------------------------------------------------------- -
skewness(darts$`Dart #4`)
kurtosis(darts$`Dart #4`)
hist(darts$`Dart #4`,
     border="grey",
     col="blue"
)
abline(h=0, col="red")
box();


# -------------------------------------------------------------------------- -
#  * Skewness, kurtosis and Histogram of `DJIA`  ----
# -------------------------------------------------------------------------- -
skewness(darts$DJIA)
kurtosis(darts$DJIA)
hist(darts$DJIA,
     border="grey",
     col="blue"
)
abline(h=0, col="red")
box();


# Examples of left-skewed, right-skewed and symmetric distributions (in that order):
# hist(rbeta(10000,5,2))
# hist(rbeta(10000,2,5))
# hist(rbeta(10000,5,5))




#  ================================================================== ----
#  Descriptive Data Analysis           ----
#  ----------------------------------- ----
#  ================================================================== =

# ====================================================================== =
#  Central tendency: ----
# ====================================================================== =

# -------------------------------------------------------------------------- -
#  * Mean ----
# -------------------------------------------------------------------------- -
mean(darts$`Expert #1`)


# -------------------------------------------------------------------------- -
#  * Media ----
# -------------------------------------------------------------------------- -
median(darts$`Expert #1`)


# -------------------------------------------------------------------------- -
#  * Mode ----
# -------------------------------------------------------------------------- -
mean(mfv(darts$`Expert #1`, method='mfv'))


# -------------------------------------------------------------------------- -
#  * Mean, Media and Mode for `Expert #4` ----
# -------------------------------------------------------------------------- -
mean(darts$`Expert #4`)
median(darts$`Expert #4`)
mean(mfv(darts$`Expert #4`, method='mfv'))


# -------------------------------------------------------------------------- -
#  * Mean, Media and Mode for `Dart #1` ----
# -------------------------------------------------------------------------- -
mean(darts$`Dart #1`)
median(darts$`Dart #1`)
mean(mfv(darts$`Dart #1`, method='mfv'))


# -------------------------------------------------------------------------- -
#  * Mean, Media and Mode for `Dart #4` ----
# -------------------------------------------------------------------------- -
mean(darts$`Dart #4`)
median(darts$`Dart #4`)
mean(mfv(darts$`Dart #4`, method='mfv'))


# -------------------------------------------------------------------------- -
#  * Mean, Media and Mode for DJIA ----
# -------------------------------------------------------------------------- -
mean(darts$DJIA)
median(darts$DJIA)
mean(mfv(darts$DJIA, method='mfv'))




# ====================================================================== =
#  Measures of Variation: ----
# ====================================================================== =

# -------------------------------------------------------------------------- -
#  * Min, Max and Range ----
# -------------------------------------------------------------------------- -
min(darts$`Dart #1`)
max(darts$`Dart #1`)
range(darts$`Dart #1`)


min(darts$`Expert #1`)
max(darts$`Expert #1`)
range(darts$`Expert #1`)


range(darts$DJIA)


# -------------------------------------------------------------------------- -
#  * Computing the Quantiles and visualizing the result using Box Plots: ----
# -------------------------------------------------------------------------- -

# -------------------------------------------------------------------------- -
#  ** Quantiles for `Expert 1` ----
# -------------------------------------------------------------------------- -
quantile(darts$`Expert #1`)
boxplot(darts$`Expert #1`,
        col = "blue", 
        main="Expert #1", 
        ylab = "Gain (%)")


# -------------------------------------------------------------------------- -
#  ** Quantiles for `Expert 3` ----
# -------------------------------------------------------------------------- -
quantile(darts$`Expert #3`)
boxplot(darts$`Expert #3`,
        col = "blue", 
        main="Expert #3", 
        ylab = "Gain (%)")


# -------------------------------------------------------------------------- -
#  ** Quantiles for `Dart 1` ----
# -------------------------------------------------------------------------- -
quantile(darts$`Dart #1`)
boxplot(darts$`Dart #1`,
        col = "blue", 
        main="Dart #1", 
        ylab = "Gain (%)")


# -------------------------------------------------------------------------- -
#  ** Quantiles for `Dart 3` ----
# -------------------------------------------------------------------------- -
quantile(darts$`Dart #3`)
boxplot(darts$`Dart #3`,
        col = "blue", 
        main="Dart #3", 
        ylab = "Gain (%)")


# -------------------------------------------------------------------------- -
#  ** Quantiles for `DJIA` ----
# -------------------------------------------------------------------------- -
quantile(darts$DJIA)
boxplot(darts$DJIA,
        col = "blue", 
        main="DJIA", 
        ylab = "Gain (%)")




# -------------------------------------------------------------------------- -
#  * Variance ----
# -------------------------------------------------------------------------- -
var(darts$`Expert #1`)
var(darts$`Dart #1`)
var(darts$DJIA)




# -------------------------------------------------------------------------- -
#  * Standard Deviation ----
# -------------------------------------------------------------------------- -
sd(darts$`Expert #1`)
sd(darts$`Dart #1`)
sd(darts$DJIA)




# -------------------------------------------------------------------------- -
#  * Z-score ----
# -------------------------------------------------------------------------- -
zscore_expert1 <-((darts$`Expert #1`) - mean(darts$`Expert #1`))/sd(darts$`Expert #1`)
zscore_expert1[10]




# -------------------------------------------------------------------------- -
#  Simple Linear Correlation ----
# -------------------------------------------------------------------------- -
ggplot(darts, aes(x=meanExpertsByMonth, y=meanDartsByMonth)) + 
  geom_point(color='black') + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)

ggplot(darts, aes(x=meanExpertsByMonth, y=DJIA)) + 
  geom_point(color='black') + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)

ggplot(darts, aes(x=meanDartsByMonth, y=DJIA)) + 
  geom_point(color='black') + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)



ggplot(darts, aes(x=`Expert #1`, y=`Dart #1`)) + 
  geom_point(color='black') + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)

ggplot(darts, aes(x=`Expert #2`, y=`Dart #2`)) + 
  geom_point(color='black') + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)

ggplot(darts, aes(x=`Expert #3`, y=`Dart #3`)) + 
  geom_point(color='black') + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)

ggplot(darts, aes(x=`Expert #4`, y=`Dart #4`)) + 
  geom_point(color='black') + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)




#  ================================================================== ----
#  A little deeper analysis of the Dart Dataset ----
#  ------------- ----
#  ================================================================== =

# -------------------------------------------------------------------------- -
#  Computing the Mean ''by MONTH'' of the 4 Experts combined and the Mean of the 4 Darts combined ----
# -------------------------------------------------------------------------- -
# meanExpertsByMonth and  meanDartsByMonth are the Mean of the 4 Experts and the Mean of the 4 Darts, respectively.

darts$meanExpertsByMonth <- NA
darts$meanDartsByMonth   <- NA
for (i in 1:dim(darts)[1]){
    darts$meanExpertsByMonth[i] = mean(c(darts$`Expert #1`[i],
                                         darts$`Expert #2`[i],
                                         darts$`Expert #3`[i],
                                         darts$`Expert #4`[i]))
    
    darts$meanDartsByMonth[i]   = mean(c(darts$`Dart #1`[i],
                                         darts$`Dart #2`[i],
                                         darts$`Dart #3`[i],
                                         darts$`Dart #4`[i]))
}




# -------------------------------------------------------------------------- -
#  Computing the Mean ''by YEAR'' of the 4 Experts combined, the Mean of the 4 Darts combined and Mean of the DJIA ----
# -------------------------------------------------------------------------- -
meanExpertsByYear = c(1990:2002)
meanDartsByYear   = c(1990:2002)
meanDJIAByYear    = c(1990:2002)
Year              = c(1990:2002)

j = 1
for (i in c(1990:2002)) {
  meanExpert_1_ByYear  = mean(darts$`Expert #1`[which(darts$Year==i)])
  meanExpert_2_ByYear  = mean(darts$`Expert #2`[which(darts$Year==i)])
  meanExpert_3_ByYear  = mean(darts$`Expert #3`[which(darts$Year==i)])
  meanExpert_4_ByYear  = mean(darts$`Expert #4`[which(darts$Year==i)])
  
  meanDart_1_ByYear    = mean(darts$`Dart #1`[which(darts$Year==i)])
  meanDart_2_ByYear    = mean(darts$`Dart #2`[which(darts$Year==i)])
  meanDart_3_ByYear    = mean(darts$`Dart #3`[which(darts$Year==i)])
  meanDart_4_ByYear    = mean(darts$`Dart #4`[which(darts$Year==i)])
  
  meanExpertsByYear[j] = mean(c(meanExpert_1_ByYear,
                                meanExpert_2_ByYear,
                                meanExpert_3_ByYear,
                                meanExpert_4_ByYear))
  
  meanDartsByYear[j]   = mean(c(meanDart_1_ByYear,
                                meanDart_2_ByYear,
                                meanDart_3_ByYear,
                                meanDart_4_ByYear))
  
  meanDJIAByYear[j]    = mean(darts$DJIA[which(darts$Year==i)])
  j = j+1
}
meanByYear <- data.frame(Year, meanExpertsByYear, meanDartsByYear, meanDJIAByYear)




# -------------------------------------------------------------------------- -
#  Making a BarChart of the total Mean by Year ----
# -------------------------------------------------------------------------- -
# https://stackoverflow.com/questions/37931327/barplot-with-multiple-columns-in-r
cols <- c('red', 'blue', 'grey');
ylim <- c(min(meanByYear[c('meanExpertsByYear', 'meanDartsByYear', 'meanDJIAByYear')])*1.2,max(meanByYear[c('meanExpertsByYear', 'meanDartsByYear', 'meanDJIAByYear')])*1.2);
par(lwd=0);
barplot(
  t(meanByYear[c('meanExpertsByYear', 'meanDartsByYear', 'meanDJIAByYear')]),
  beside=T,
  ylim=ylim,
  border=cols,
  col=cols,
  names.arg=meanByYear$Year,
  ylab='Gain (%)'
);
abline(h=0, col="yellow")
box();




# -------------------------------------------------------------------------- -
#  Computing total Mean (for the whole dataset: from 1990 to 2002) of each Expert and each Dart  ----
# -------------------------------------------------------------------------- -
meanExpert1 = mean(darts$`Expert #1`)
meanExpert2 = mean(darts$`Expert #2`)
meanExpert3 = mean(darts$`Expert #3`)
meanExpert4 = mean(darts$`Expert #4`)

meanDart1 = mean(darts$`Dart #1`)
meanDart2 = mean(darts$`Dart #2`)
meanDart3 = mean(darts$`Dart #3`)
meanDart4 = mean(darts$`Dart #4`)




# -------------------------------------------------------------------------- -
#  Computing total Mean (for the whole dataset: from 1990 to 2002) of the 4 Expert combined, the 4 Dart combined and the DJIA ----
# -------------------------------------------------------------------------- -
meanExperts = mean(c(meanExpert1, meanExpert2, meanExpert3, meanExpert4))
meanDarts   = mean(c(meanDart1, meanDart2, meanDart3, meanDart4))
meanDJIA    = mean(darts$DJIA)

means = c(meanExperts, meanDarts, meanDJIA)




# -------------------------------------------------------------------------- -
#  Making a BarChart of the total Mean----
# -------------------------------------------------------------------------- -
ylim <- c(0,15);
barplot(means,
        main = "Total mean for Experts, Darts and DJIA",
        col = c("red","blue", "grey"),
        xlab='From 1990 to 2002',
        ylab='Gain (%)',
        names.arg=c("Experts", "Darts", "DJIA"),
        ylim=ylim
)
box();
abline(h=0, col="blue")
# legend("topright",
#        c("Experts","Darts", "DJIA"),
#        fill = c("red","green", "blue"),
#        ncol=1,
#        cex = 0.70
# )




# -------------------------------------------------------------------------- -
#  Creating subset by year ----
# -------------------------------------------------------------------------- -
year1991 <- subset(darts, Year == 1991)
# year1991 <- darts[which(darts$Year==1991),]  # This is another way of doing it

year1991_expert1 <- year1991$`Expert #1`
year1991_dart1   <- year1991$`Dart #1`


year2002 <- subset(darts, Year == 2002)





# https://stackoverflow.com/questions/37931327/barplot-with-multiple-columns-in-r
cols <- c('red', 'blue', 'grey');
ylim <- c(min(year1991[c('meanExpertsByMonth', 'meanDartsByMonth', 'DJIA')])*1.2,max(year1991[c('meanExpertsByMonth', 'meanDartsByMonth', 'DJIA')])*1.2);
par(lwd=0);
barplot(
  t(year1991[c('meanExpertsByMonth', 'meanDartsByMonth', 'DJIA')]),
  beside=T,
  ylim=ylim,
  border=cols,
  col=cols,
  names.arg=year1991$Month,
  xlab='1991',
  ylab='Gain (%)'
);
abline(h=0, col="yellow")
box()


cols <- c('red', 'blue', 'grey');
ylim <- c(min(year2002[c('meanExpertsByMonth', 'meanDartsByMonth', 'DJIA')])*1.2,max(year2002[c('meanExpertsByMonth', 'meanDartsByMonth', 'DJIA')])*1.2);
par(lwd=0);
barplot(
  t(year2002[c('meanExpertsByMonth', 'meanDartsByMonth', 'DJIA')]),
  beside=T,
  ylim=ylim,
  border=cols,
  col=cols,
  names.arg=year2002$Month,
  xlab='1991',
  ylab='Gain (%)'
);
abline(h=0, col="yellow")
box()




