# Solution WS 2: Descriptives
#
# Descriptive statistics and plots for the worksheet data set: frequency
# tables and bar plots for the categorical variables (sex, pstat, death),
# and summaries and histograms for the continuous variables (hgb, creat).

# Packages ----
library(readxl) # read_excel()
library(descr)  # crosstab()
library(psych)  # describe(), describeBy()

# Importing data ----
data <- read_excel("path")

# Categorical variables: recode the raw codes into labelled factors.
data$sex <- factor(
  data$sex,
  levels = c("F", "M"),
  labels = c("female", "male")
)
data$pstat <- factor(
  data$pstat,
  levels = c("0", "1"),
  labels = c("no", "yes")
)
data$death <- factor(
  data$death,
  levels = c("0", "1"),
  labels = c("no", "yes")
)

# Continuous variables: make sure they are stored as numeric.
data$hgb <- as.numeric(data$hgb)
data$creat <- as.numeric(data$creat)
data$mspike <- as.numeric(data$mspike)

# Number of observations, used as the denominator for all relative
# frequencies. The earlier version hard-coded this value and used 1384 in
# the tables but 1382 in the bar plots; taking it from the data keeps both
# consistent.
n_obs <- nrow(data)

# Ex. 1 ----

# a. Frequency tables for categorical variables
summary(data$sex)
summary(data$pstat)
summary(data$death)

# Rounded relative frequencies
round(summary(data$sex) / n_obs, 2)
round(summary(data$pstat) / n_obs, 2)
round(summary(data$death) / n_obs, 2)

# b. Bar plots (relative frequencies) for categorical variables
par(mfrow = c(1, 1))
barplot(table(data$sex) / n_obs, main = "sex", ylim = c(0, 0.8))
barplot(table(data$pstat) / n_obs, main = "pstat", ylim = c(0, 0.8))
barplot(table(data$death) / n_obs, main = "death", ylim = c(0, 0.8))

# c. Contingency table gender and death
table(data$sex)
table(data$death)
table(data$sex, data$death)
barplot(table(data$sex, data$death))

# crosstab() for a more detailed table and graph
crosstab(
  data$sex,
  data$death,
  xlab = "death", # description for x-axis
  ylab = "sex"    # description for y-axis
)

# d. Do more men or more women in the sample die?
# Row proportions: share of each death status within each sex.
prop.table(table(data$sex, data$death), margin = 1)
# Worksheet figures: 423/631 = 0.67 and 540/753 = 0.72.
# More men die than women in this sample.

summary(data$age)

# Ex. 2 ----

# a. Mean and median, interpret distribution
z <- c(4, 5, 6, 6, 7, 8, 8, 8, 9, 10, 100)
summary(z)

# Alternatively:
mean(z)
median(z)

# More detailed
describe(z) # skew 2.45

# Distribution of data
hist(z) # is skewed to the right, asymmetrical

# b. Descriptives for hgb and creat
# (hgb and creat were already converted to numeric above.)
summary(data$hgb)
summary(data$creat)

# Alternatively:
describe(data$hgb)

# Or manually for creat. na.rm = TRUE removes missing values; otherwise the
# result is NA whenever the variable contains missing values.
mean(data$creat, na.rm = TRUE)
median(data$creat, na.rm = TRUE)
sd(data$creat, na.rm = TRUE)       # standard deviation
quantile(data$creat, na.rm = TRUE) # quartiles
range(data$creat, na.rm = TRUE)    # minimum and maximum

# c. Descriptives for subgroups gender
describeBy(data$hgb, group = data$sex)
describeBy(data$creat, group = data$sex)

# d. Histograms for hgb and creat
# Two panels side by side with room for a shared title; the previous
# graphics settings are saved so they can be restored at the end.
old_par <- par(mfrow = c(1, 2), oma = c(0, 0, 2, 0))
hist(data$hgb, xlab = "haemoglobin", main = "")
hist(
  data$creat,
  breaks = 50, # determines how many bars are used
  xlab = "creatinine",
  main = ""
)
mtext("Histograms of haemoglobin and creatinine", cex = 1.5, outer = TRUE)

# e. Add normal curves
par(mfrow = c(1, 2), oma = c(0, 0, 2, 0))
hist(data$hgb, xlab = "haemoglobin", main = "", probability = TRUE)
curve(
  dnorm(
    x,
    mean = mean(data$hgb, na.rm = TRUE), # use mean and sd of variable
    sd = sd(data$hgb, na.rm = TRUE)
  ),
  add = TRUE, # added to previous graph
  col = "red"
)
hist(
  data$creat,
  breaks = 50, # determines how many bars are used
  xlab = "creatinine",
  main = "",
  probability = TRUE
)
curve(
  dnorm(
    x,
    mean = mean(data$creat, na.rm = TRUE), # use mean and sd of variable
    sd = sd(data$creat, na.rm = TRUE)
  ),
  add = TRUE, # added to previous graph
  col = "red"
)
mtext("Histograms of haemoglobin and creatinine", cex = 1.5, outer = TRUE)

# Restore the graphics settings that were active before part d.
par(old_par)

# f. Normally distributed?
# Haemoglobin is close to normally distributed, creatinine is not.