# solution WS 1 Data Management
#
# Imports the mgus2 data from Excel, declares variable types, derives a few
# categorical variables (follow-up time and age groups), exports the female
# subset to Excel and draws two histograms of age.

library(readxl)
library(xlsx)
library(stats)

# File locations used by this script; adjust them to the local setup.
input_path <- "path"
output_path <- file.path(
  "E:/Malawi 2024/Summer School Malawi/Data",
  "mgus2_female.xlsx"
)

# ex. 1 ----

# a. importing file mgus2.xlsx from wd
mgus2 <- read_excel(input_path)

# Working copy for exercises 1 and 2. `mgus2` keeps the data as imported and
# is used again for the age summaries and plots at the end of the script.
data <- mgus2

# b. list of variables
names(data)

# c. meaning of variables
# The help page lives in the survival package; naming the package explicitly
# finds it even when survival is not attached.
help("mgus2", package = "survival")

# d. checking categorical and numeric variables
# categorical variables
data$sex <- factor(
  data$sex,
  levels = c("F", "M"),
  labels = c("female", "male")
)
data$pstat <- factor(
  data$pstat,
  levels = c("0", "1"),
  labels = c("no", "yes")
)
data$death <- factor(
  data$death,
  levels = c("0", "1"),
  labels = c("no", "yes")
)

# continuous variables
data$hgb <- as.numeric(data$hgb)
data$creat <- as.numeric(data$creat)
data$mspike <- as.numeric(data$mspike)

# ex. 2 ----

# a. subset female patients
female <- subset(data, sex == "female")

# b. save as excel file
# read_excel() returns a tibble; convert to a plain data.frame before handing
# it to xlsx::write.xlsx().
write.xlsx(as.data.frame(female), file = output_path)

# c. deleting variable id
data$id <- NULL

# d. add categorical variable "fu_cat"
# Left-closed intervals: [min, 60), [60, 120), [120, 180), [180, max].
# The labels imply futime is measured in months (60 = 5 yrs) -- TODO confirm.
data$fu_cat <- cut(
  data$futime,
  breaks = c(-Inf, 60, 120, 180, Inf),
  labels = c("< 5 yrs", "5 - 10 yrs", "10 - 15 yrs", "> 15 yrs"),
  right = FALSE
)

# e. add categorical variable "age_qt" using quartiles
# find quartiles
quantile(data$age)

# create new variable, declared as categorical with readable labels
# The cut points 63, 72 and 79 are hard-coded; they are presumably the
# quartiles printed by quantile() above -- re-check them if the data change.
data$age_qt <- cut(
  data$age,
  breaks = c(-Inf, 63, 72, 79, Inf),
  labels = c("< 63 yrs", "63 - 71 yrs", "72 - 78 yrs", "> 78 yrs"),
  right = FALSE
)

# View() opens a data viewer and is only meaningful in an interactive session.
if (interactive()) {
  View(data)
}

# Age groups on the imported data ----

min(mgus2$age)
max(mgus2$age)

# Left-closed intervals: [min, 40), [40, 60), [60, 80), [80, max].
mgus2$agecat <- cut(
  mgus2$age,
  breaks = c(-Inf, 40, 60, 80, Inf),
  labels = c("< 40 yrs", "40 - 60 yrs", "60 - 80 yrs", "> 80 yrs"),
  right = FALSE
)

summary(mgus2$age)
table(mgus2$agecat)

# Histograms of age ----

# Two panels side by side; remember the previous settings to restore them.
old_par <- par(mfrow = c(1, 2))

# Left panel: density scale, so the fitted normal density can be overlaid.
hist(
  mgus2$age,
  breaks = 50, # variable to determine how many bars are used
  xlab = "age",
  main = "",
  probability = TRUE
)
curve(
  dnorm(
    x,
    mean = mean(mgus2$age, na.rm = TRUE), # use mean and sd of variable
    sd = sd(mgus2$age, na.rm = TRUE)
  ),
  add = TRUE, # added to previous graph
  col = "red"
)

# Right panel: absolute frequencies (counts).
hist(
  mgus2$age,
  breaks = 50, # variable to determine how many bars are used
  xlab = "age",
  main = "",
  probability = FALSE
)

par(old_par)