# solution WS 2 Descriptives

# importing data
library(readxl)
# Read the worksheet data from the Excel file.
# NOTE(review): "path" looks like a placeholder -- point it at the real file.
data <- read_excel("path")

#  categorical variables: turn the raw codes into labelled factors
data$sex <- factor(data$sex,
                   levels = c("F", "M"),
                   labels = c("female", "male"))
data$pstat <- factor(data$pstat,
                     levels = c("0", "1"),
                     labels = c("no", "yes"))
data$death <- factor(data$death,
                     levels = c("0", "1"),
                     labels = c("no", "yes"))

#  continuous variables: make sure they are stored as numbers
data$hgb <- as.numeric(data$hgb)
data$creat <- as.numeric(data$creat)
data$mspike <- as.numeric(data$mspike)

# ex. 1
# a. frequency tables for categorical variables
summary(data$sex)
summary(data$pstat)
summary(data$death)

#  rounded relative frequencies; the denominator is the number of rows in
#  the data, so it does not have to be typed in (and kept in sync) by hand
round(summary(data$sex) / nrow(data), 2)
round(summary(data$pstat) / nrow(data), 2)
round(summary(data$death) / nrow(data), 2)

# b. bar plots for categorical variables (relative frequencies)
par(mfrow = c(1, 1)) # one plot per page
#  counts are divided by the number of rows in the data, the same
#  denominator as for the relative frequency tables
barplot(table(data$sex) / nrow(data),
        main = "sex",
        ylim = c(0, 0.8)) # common y-axis so the three plots are comparable
barplot(table(data$pstat) / nrow(data),
        main = "pstat",
        ylim = c(0, 0.8))
barplot(table(data$death) / nrow(data),
        main = "death",
        ylim = c(0, 0.8))


# c. contingency table: gender by death
#  counts of each variable on its own
table(data[["sex"]])
table(data[["death"]])
#  joint counts, first printed, then drawn as a bar plot
table(data[["sex"]], data[["death"]])
barplot(table(data[["sex"]], data[["death"]]))
#  crosstab() from the descr package gives a more detailed table and a graph
library(descr)
crosstab(data$sex, data$death,
         xlab = "death", # label for the x-axis
         ylab = "sex") # label for the y-axis

# d. do more men or more women in the sample die?
#  share of deaths within each sex (row proportions of the sex-by-death
#  table), computed from the data instead of typing the counts in by hand
prop.table(table(data$sex, data$death), margin = 1)[, "yes"]
#  for the worksheet data this gave 423/631 = 0.67 and 540/753 = 0.72
#  more men die than women in this sample.
#  descriptive summary of age
summary(data$age)

# ex. 2
# a. mean and median, interpret distribution
#  eleven observations; the last value (100) lies far away from the others
z <- c(4, 5, 6, 6, 7, rep(8, times = 3), 9, 10, 100)
#  minimum, quartiles, mean and maximum in one call
summary(z)
#  alternatively, the two location measures on their own:
mean(x = z)
median(x = z)
#  fuller set of descriptives via the psych package
library(psych)
describe(x = z)    # skew 2.45
#  shape of the distribution
hist(x = z) # skewed to the right, not symmetric

# b. descriptives for hgb and creat
#  both columns were already converted to numeric right after the import,
#  so they can be summarised directly
summary(data$hgb)
summary(data$creat)
#  alternatively:
library(psych)
describe(data$hgb)

#  or manually for creat:
mean(data$creat,
     na.rm = TRUE) # drop missing values; with NAs present the mean is NA
median(data$creat,
       na.rm = TRUE)
sd(data$creat, # standard deviation
   na.rm = TRUE)
quantile(data$creat, # minimum, quartiles and maximum
         na.rm = TRUE)
range(data$creat, # minimum and maximum
      na.rm = TRUE)

# c. descriptives for subgroups gender
#  descriptives of each variable, computed separately per level of sex
with(data, describeBy(hgb, group = sex))
with(data, describeBy(creat, group = sex))

# d. histogram for hgb and creat
#  two panels side by side; the outer top margin leaves room for one
#  shared title above both panels
par(mfrow = c(1, 2), oma = c(0, 0, 2, 0))
hist(data[["hgb"]], main = "", xlab = "haemoglobin")
hist(data[["creat"]], main = "", xlab = "creatinine",
     breaks = 50) # controls how many bars are used
#  shared title, written into the outer margin
mtext("Histograms of haemoglobin and creatinine", outer = TRUE, cex = 1.5)

# e. add normal curves
#  two panels side by side, with an outer top margin for the shared title
par(mfrow = c(1, 2), oma = c(0, 0, 2, 0))
#  probability = TRUE puts the histogram on the density scale, so the
#  normal density can be drawn on the same y-axis
hist(data$hgb,
     xlab = "haemoglobin",
     main = "",
     probability = TRUE)
#  normal density using the mean and sd of the variable itself
curve(dnorm(x,
            mean = mean(data$hgb, na.rm = TRUE),
            sd = sd(data$hgb, na.rm = TRUE)),
      add = TRUE, # added to previous graph
      col = "red")
hist(data$creat,
     breaks = 50, # controls how many bars are used
     xlab = "creatinine",
     main = "",
     probability = TRUE)
curve(dnorm(x,
            mean = mean(data$creat, na.rm = TRUE),
            sd = sd(data$creat, na.rm = TRUE)),
      add = TRUE, # added to previous graph
      col = "red")
#  shared title, written into the outer margin
mtext("Histograms of haemoglobin and creatinine", cex = 1.5, outer = TRUE)

# f. normally distributed? haemoglobin is close to normally distributed; creatinine is not.