Exercises

Data

Hypoxia MAP Treatment Dataset

The Hypoxia MAP dataset was contributed by Dr. Amy Nowacki, Associate Professor, Cleveland Clinic. Please refer to this resource as: Amy S. Nowacki, “Hypoxia MAP Treatment Dataset”, TSHS Resources Portal (2022). Available at causeweb.org/tshs/hypoxia. The data is licensed under a Creative Commons Attribution-Non Commerical-Share Alike 4.0 International (CC BY-NC-SA 4.0) license, and is intended for educational purposes only.

Download CSV: hypoxia.csv

Initial R Script

The original messy R script used for the exercises can be downloaded below.

Download .R Script: messy_exercises_script.R

See initial R script
messy_exercises_script.R
install.packages("readr")
install.packages("writexl")
library(readxl)
library(readexcel)
library(readr)
df <- read_csv("C:/Users/username/OneDrive/R code/hypoxia.csv")

View(df)

# histograms
hist(df$Age) # don't use this
hist(df$BMI)
hist(df$Sleeptime)
hist(df$AHI)
# An AHI of 5-14 is mild; 15-29 is moderate and 30 or more events per hour characterizes severe sleep apnea.
# All normal????
# 1 = (AHI < 5); 2 = (5 ≤ AHI < 15);
# 3 = (15 ≤ AHI < 30); 4 = (AHI ≥ 30)

myfunction <- function() {
  df <- df |> mutate(Sex = if_else(Female == 1, "Female", "Male"), .after = Female) |> mutate(AHI = factor(AHI))
  return(df)
}
df <- myfunction()

library(ggplot2)
ggplot(df) +
  geom_histogram(aes(Age))
ggplot(df) +
  geom_histogram(aes(AHI))
ggplot(df) +
  geom_bar(aes(AHI))

library(ggcorrplot)
# doesn't work error 
# ggcorrplot::ggcorrplot(cor(df))

df |> 
  select(where(is.numeric)) |> 
  cor(use = "complete.obs") |> 
  ggcorrplot()

# save as plot - PNG format
# width 5 inch, height = 5 inch for paper

ggplot(df) +
  geom_bar(aes(AHI)) +
  facet_wrap(~Sex)

ggplot(df) +
  geom_bar(aes(AHI)) +
  facet_wrap(Race~Sex) 

ggplot(df) +
  geom_bar(aes(AHI)) +
  facet_grid(Race~Sex) 


ggplot(df) +
  geom_bar(aes(AHI, fill = Diabetes)) # no colours


df <- df |> 
  mutate(Diabetesfct = factor(Diabetes))
ggplot(df) +   geom_bar(aes(AHI, fill=Diabetesfct),position ="fill")+scale_fill_brewer(palette ="Dark2")+ theme_minimal() # no colours

library(dplyr)

mean(df$Age) #50.65667
sd(df$Age)
mean(df$BMI) #46.74875
table(df$AHI) #?missing
table(df$`Duration of Surg`)
mean(df$`Duration of Surg`) #4.314947
# error - don't know why

#tests
chisq.test(table(df$AHI, df$Sex))
chisq.test(table(df$AHI, df$Race))
chisq.test(table(df$AHI, df$Smoking))
chisq.test(table(df$AHI, df$Diabetes))

BMI1 <- df |> filter(BMI <= median(BMI))
BMI2 <- df |> filter(BMI > median(BMI))
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = T)
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = T)$p.value

# models

df <- df |> 
  mutate(severeAHI = if_else(AHI ==4, 1, 0))

mod1 <- glm(severeAHI~., data = df, family = "binomial")
summary(mod1)

mod_data <- df |> select(-c(Female, AHI))
mod1 <- glm(severe_AHI~., data = mod_data, family = "binomial")
summary(mod1)

mod2 <- glm(severe_AHI~Age+Sex+BMI, data=mod_data, family = "binomial")
summary(mod2)

mod3 <- glm(severe_AHI~Age+Sex, data=mod_data, family= "binomial")
summary(mod3)

mod4 <- glm(severe_AHI~Age+BMI, data=mod_data, family ="binomial")
summary(mod4)

mod5 <- glm(severe_AHI~Sex+BMI, data=mod_data, family ="binomial")
summary(mod5)

mod6 <- glm(severe_AHI~Age, data=mod_data, family ="binomial")
summary(mod6)

mod7 <- glm(severe_AHI~Sex, data=mod_data, family ="binomial")
summary(mod7)

mod8 <- glm(severe_AHI~BMI, data=mod_data, family ="binomial")
summary(mod8)

c(mod2$aic, mod3$aic, mod4$aic, mod5$aic, mod6$aic, mod7$aic, mod8$aic)
# mod2 lowest 

# results
library(ggforestplot)
library(forestmodel)
forest_model(mod2)
ggsave("forestplot.png")
# saved as forestplot.png

Exercises

Exercise 1: Projects and relative file paths

  1. Create an R project for today’s workshop - name it something sensible!

  2. Download the messy_exercises_script.R script from nrennie.rbind.io/training-better-r-code/exercises.html and the hypoxia.csv data.

  3. Add those files to your R project in a sensible way.

  4. Edit the Global / Project options appropriately.

  5. Edit the script to use relative file paths.

Solution

Folder structure:

project folder
│   messy_exercises_script.R
|   hypoxia.csv
|   project_name.Rproj

R script:

messy_exercises_script.R
install.packages("readr")
install.packages("writexl")
library(readxl)
library(readexcel)
library(readr)
df <- read_csv("hypoxia.csv")

View(df)

# histograms
hist(df$Age) # don't use this
hist(df$BMI)
hist(df$Sleeptime)
hist(df$AHI)
# An AHI of 5-14 is mild; 15-29 is moderate and 30 or more events per hour characterizes severe sleep apnea.
# All normal????
# 1 = (AHI < 5); 2 = (5 ≤ AHI < 15);
# 3 = (15 ≤ AHI < 30); 4 = (AHI ≥ 30)

myfunction <- function() {
  df <- df |> mutate(Sex = if_else(Female == 1, "Female", "Male"), .after = Female) |> mutate(AHI = factor(AHI))
  return(df)
}
df <- myfunction()

library(ggplot2)
ggplot(df) +
  geom_histogram(aes(Age))
ggplot(df) +
  geom_histogram(aes(AHI))
ggplot(df) +
  geom_bar(aes(AHI))

library(ggcorrplot)
# doesn't work error 
# ggcorrplot::ggcorrplot(cor(df))

df |> 
  select(where(is.numeric)) |> 
  cor(use = "complete.obs") |> 
  ggcorrplot()

# save as plot - PNG format
# width 5 inch, height = 5 inch for paper

ggplot(df) +
  geom_bar(aes(AHI)) +
  facet_wrap(~Sex)

ggplot(df) +
  geom_bar(aes(AHI)) +
  facet_wrap(Race~Sex) 

ggplot(df) +
  geom_bar(aes(AHI)) +
  facet_grid(Race~Sex) 


ggplot(df) +
  geom_bar(aes(AHI, fill = Diabetes)) # no colours


df <- df |> 
  mutate(Diabetesfct = factor(Diabetes))
ggplot(df) +   geom_bar(aes(AHI, fill=Diabetesfct),position ="fill")+scale_fill_brewer(palette ="Dark2")+ theme_minimal() # no colours

library(dplyr)

mean(df$Age) #50.65667
sd(df$Age)
mean(df$BMI) #46.74875
table(df$AHI) #?missing
table(df$`Duration of Surg`)
mean(df$`Duration of Surg`) #4.314947
# error - don't know why

#tests
chisq.test(table(df$AHI, df$Sex))
chisq.test(table(df$AHI, df$Race))
chisq.test(table(df$AHI, df$Smoking))
chisq.test(table(df$AHI, df$Diabetes))

BMI1 <- df |> filter(BMI <= median(BMI))
BMI2 <- df |> filter(BMI > median(BMI))
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = T)
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = T)$p.value

# models

df <- df |> 
  mutate(severeAHI = if_else(AHI ==4, 1, 0))

mod1 <- glm(severeAHI~., data = df, family = "binomial")
summary(mod1)

mod_data <- df |> select(-c(Female, AHI))
mod1 <- glm(severe_AHI~., data = mod_data, family = "binomial")
summary(mod1)

mod2 <- glm(severe_AHI~Age+Sex+BMI, data=mod_data, family = "binomial")
summary(mod2)

mod3 <- glm(severe_AHI~Age+Sex, data=mod_data, family= "binomial")
summary(mod3)

mod4 <- glm(severe_AHI~Age+BMI, data=mod_data, family ="binomial")
summary(mod4)

mod5 <- glm(severe_AHI~Sex+BMI, data=mod_data, family ="binomial")
summary(mod5)

mod6 <- glm(severe_AHI~Age, data=mod_data, family ="binomial")
summary(mod6)

mod7 <- glm(severe_AHI~Sex, data=mod_data, family ="binomial")
summary(mod7)

mod8 <- glm(severe_AHI~BMI, data=mod_data, family ="binomial")
summary(mod8)

c(mod2$aic, mod3$aic, mod4$aic, mod5$aic, mod6$aic, mod7$aic, mod8$aic)
# mod2 lowest 

# results
library(ggforestplot)
library(forestmodel)
forest_model(mod2)
ggsave("forestplot.png")
# saved as forestplot.png

Exercise 2: Organising a script

  1. Reorganise the messy R script by adding sections and subsections.

  2. Edit the comments in the document to make them more useful.

  3. Rename variables and functions if you think they need to be renamed.

Solution
messy_exercises_script.R
# Load packages -----------------------------------------------------------

library(readr)
library(ggplot2)
library(dplyr)
library(ggforestplot)
library(forestmodel)
library(ggcorrplot)


# Read data ---------------------------------------------------------------

hypoxia <- read_csv("data/hypoxia.csv")
View(hypoxia)


# Useful functions --------------------------------------------------------

# An AHI of 5-14 is mild; 15-29 is moderate and 
# 30 or more events per hour characterizes severe sleep apnea.
# 1 = (AHI < 5); 2 = (5 ≤ AHI < 15);
# 3 = (15 ≤ AHI < 30); 4 = (AHI ≥ 30)

# Convert binary `Female` column to characters for Male and Female
convert_to_factors <- function(data = hypoxia) {
  output <- data |> mutate(Sex = if_else(Female == 1, "Female", "Male"), .after = Female) |> mutate(AHI = factor(AHI))
  return(output)
}
hypoxia <- convert_to_factors()


# Exploratory plots -------------------------------------------------------

# Histograms and bar charts
ggplot(hypoxia) +
  geom_histogram(aes(Age))
ggplot(hypoxia) +
  geom_histogram(aes(AHI))
ggplot(hypoxia) +
  geom_bar(aes(AHI))

# Correlation
hypoxia |> 
  select(where(is.numeric)) |> 
  cor(use = "complete.obs") |> 
  ggcorrplot()
ggsave("corr_plot.png", width = 5, height = 5)

# Multiple factors
ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_wrap(~Sex)

ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_wrap(Race~Sex) 

ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_grid(Race~Sex) 

hypoxia <- hypoxia |> 
  mutate(Diabetesfct = factor(Diabetes))
ggplot(hypoxia) +   geom_bar(aes(AHI, fill=Diabetesfct),position ="fill")+scale_fill_brewer(palette ="Dark2")+ theme_minimal() # no colours


# Summary statistics ------------------------------------------------------

# Mean
mean(hypoxia$Age)
mean(hypoxia$BMI)
mean(hypoxia$`Duration of Surg`)

# Standard Deviation
sd(hypoxia$Age)

# Counts
table(hypoxia$AHI)
table(hypoxia$`Duration of Surg`)


# Statistical tests -------------------------------------------------------

# Chi-squared tests of AHI factors
chisq.test(table(hypoxia$AHI, hypoxia$Sex))
chisq.test(table(hypoxia$AHI, hypoxia$Race))
chisq.test(table(hypoxia$AHI, hypoxia$Smoking))
chisq.test(table(hypoxia$AHI, hypoxia$Diabetes))

# T-tests for BMI and Sleep time
# Assume variance equal 
BMI1 <- hypoxia |> filter(BMI <= median(BMI))
BMI2 <- hypoxia |> filter(BMI > median(BMI))
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = T)
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = T)$p.value


# Modelling ---------------------------------------------------------------

mod_data <- hypoxia |> 
  mutate(severeAHI = if_else(AHI ==4, 1, 0)) |> 
  select(-c(Female, AHI))

mod1 <- glm(severe_AHI~., data = mod_data, family = "binomial")
summary(mod1)

mod2 <- glm(severe_AHI~Age+Sex+BMI, data=mod_data, family = "binomial")
summary(mod2)

mod3 <- glm(severe_AHI~Age+Sex, data=mod_data, family= "binomial")
summary(mod3)

mod4 <- glm(severe_AHI~Age+BMI, data=mod_data, family ="binomial")
summary(mod4)

mod5 <- glm(severe_AHI~Sex+BMI, data=mod_data, family ="binomial")
summary(mod5)

mod6 <- glm(severe_AHI~Age, data=mod_data, family ="binomial")
summary(mod6)

mod7 <- glm(severe_AHI~Sex, data=mod_data, family ="binomial")
summary(mod7)

mod8 <- glm(severe_AHI~BMI, data=mod_data, family ="binomial")
summary(mod8)

# Results -----------------------------------------------------------------

# AIC
aic_results <- c(mod2$aic, mod3$aic, mod4$aic, mod5$aic, mod6$aic, mod7$aic, mod8$aic)

# Forest plot of best model
forest_model(mod2)
ggsave("forestplot.png")

Exercise 3: Styling scripts

  1. Install and load the lintr and styler packages if you don’t already use them.

  2. Run lint() on the messy R script. Do you understand all of the messages?

  3. Run style_file(). What has changed in your script?

  4. Re-run lint() on the script. Have all of the issues been fixed? If not, manually implement changes to the file.

  5. Bonus: Add an RStudio keyboard shortcut for style_active_file().

Solution

Running lintr::lint("messy_exercises_script.R"):

After running styler::style_active_file():

messy_exercises_script.R
# Load packages -----------------------------------------------------------

library(readr)
library(ggplot2)
library(dplyr)
library(ggforestplot)
library(forestmodel)
library(ggcorrplot)


# Read data ---------------------------------------------------------------

hypoxia <- read_csv("data/hypoxia.csv")
View(hypoxia)


# Useful functions --------------------------------------------------------

# An AHI of 5-14 is mild; 15-29 is moderate and
# 30 or more events per hour characterizes severe sleep apnea.
# 1 = (AHI < 5); 2 = (5 ≤ AHI < 15);
# 3 = (15 ≤ AHI < 30); 4 = (AHI ≥ 30)

# Convert binary `Female` column to characters for Male and Female
convert_to_factors <- function(data = hypoxia) {
  output <- data |>
    mutate(Sex = if_else(Female == 1, "Female", "Male"), .after = Female) |>
    mutate(AHI = factor(AHI))
  return(output)
}
hypoxia <- convert_to_factors()


# Exploratory plots -------------------------------------------------------

# Histograms and bar charts
ggplot(hypoxia) +
  geom_histogram(aes(Age))
ggplot(hypoxia) +
  geom_histogram(aes(AHI))
ggplot(hypoxia) +
  geom_bar(aes(AHI))

# Correlation
hypoxia |>
  select(where(is.numeric)) |>
  cor(use = "complete.obs") |>
  ggcorrplot()
ggsave("corr_plot.png", width = 5, height = 5)

# Multiple factors
ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_wrap(~Sex)

ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_wrap(Race ~ Sex)

ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_grid(Race ~ Sex)

hypoxia <- hypoxia |>
  mutate(Diabetesfct = factor(Diabetes))
ggplot(hypoxia) +
  geom_bar(aes(AHI, fill = Diabetesfct), position = "fill") +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal() # no colours


# Summary statistics ------------------------------------------------------

# Mean
mean(hypoxia$Age)
mean(hypoxia$BMI)
mean(hypoxia$`Duration of Surg`)

# Standard Deviation
sd(hypoxia$Age)

# Counts
table(hypoxia$AHI)
table(hypoxia$`Duration of Surg`)


# Statistical tests -------------------------------------------------------

# Chi-squared tests of AHI factors
chisq.test(table(hypoxia$AHI, hypoxia$Sex))
chisq.test(table(hypoxia$AHI, hypoxia$Race))
chisq.test(table(hypoxia$AHI, hypoxia$Smoking))
chisq.test(table(hypoxia$AHI, hypoxia$Diabetes))

# T-tests for BMI and Sleep time
# Assume variance equal
BMI1 <- hypoxia |> filter(BMI <= median(BMI))
BMI2 <- hypoxia |> filter(BMI > median(BMI))
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = T)
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = T)$p.value


# Modelling ---------------------------------------------------------------

mod_data <- hypoxia |>
  mutate(severeAHI = if_else(AHI == 4, 1, 0)) |>
  select(-c(Female, AHI))

mod1 <- glm(severe_AHI ~ ., data = mod_data, family = "binomial")
summary(mod1)

mod2 <- glm(severe_AHI ~ Age + Sex + BMI, data = mod_data, family = "binomial")
summary(mod2)

mod3 <- glm(severe_AHI ~ Age + Sex, data = mod_data, family = "binomial")
summary(mod3)

mod4 <- glm(severe_AHI ~ Age + BMI, data = mod_data, family = "binomial")
summary(mod4)

mod5 <- glm(severe_AHI ~ Sex + BMI, data = mod_data, family = "binomial")
summary(mod5)

mod6 <- glm(severe_AHI ~ Age, data = mod_data, family = "binomial")
summary(mod6)

mod7 <- glm(severe_AHI ~ Sex, data = mod_data, family = "binomial")
summary(mod7)

mod8 <- glm(severe_AHI ~ BMI, data = mod_data, family = "binomial")
summary(mod8)

# Results -----------------------------------------------------------------

# AIC
aic_results <- c(mod2$aic, mod3$aic, mod4$aic, mod5$aic, mod6$aic, mod7$aic, mod8$aic)

# Forest plot of best model
forest_model(mod2)
ggsave("forestplot.png")

Running lintr::lint("messy_exercises_script.R") again:

After manual tidy up:

messy_exercises_script.R
# Load packages -----------------------------------------------------------

library(readr)
library(ggplot2)
library(dplyr)
library(ggforestplot)
library(forestmodel)
library(ggcorrplot)


# Read data ---------------------------------------------------------------

hypoxia <- read_csv("data/hypoxia.csv")
View(hypoxia)


# Useful functions --------------------------------------------------------

# An AHI of 5-14 is mild; 15-29 is moderate and
# 30 or more events per hour characterizes severe sleep apnea.
# 1 = (AHI < 5); 2 = (5 ≤ AHI < 15);
# 3 = (15 ≤ AHI < 30); 4 = (AHI ≥ 30)

# Convert binary `Female` column to characters for Male and Female
convert_to_factors <- function(data = hypoxia) {
  output <- data |>
    mutate(
      Sex = if_else(.data$Female == 1, "Female", "Male"),
      .after = .data$Female
    ) |>
    mutate(AHI = factor(.data$AHI))
  return(output)
}
hypoxia <- convert_to_factors()


# Exploratory plots -------------------------------------------------------

# Histograms and bar charts
ggplot(hypoxia) +
  geom_histogram(aes(Age))
ggplot(hypoxia) +
  geom_histogram(aes(AHI))
ggplot(hypoxia) +
  geom_bar(aes(AHI))

# Correlation
hypoxia |>
  select(where(is.numeric)) |>
  cor(use = "complete.obs") |>
  ggcorrplot()
ggsave("corr_plot.png", width = 5, height = 5)

# Multiple factors
ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_wrap(~Sex)

ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_wrap(Race ~ Sex)

ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_grid(Race ~ Sex)

hypoxia <- hypoxia |>
  mutate(Diabetesfct = factor(Diabetes))
ggplot(hypoxia) +
  geom_bar(aes(AHI, fill = Diabetesfct), position = "fill") +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal() # no colours


# Summary statistics ------------------------------------------------------

# Mean
mean(hypoxia$Age)
mean(hypoxia$BMI)
mean(hypoxia$`Duration of Surg`)

# Standard Deviation
sd(hypoxia$Age)

# Counts
table(hypoxia$AHI)
table(hypoxia$`Duration of Surg`)


# Statistical tests -------------------------------------------------------

# Chi-squared tests of AHI factors
chisq.test(table(hypoxia$AHI, hypoxia$Sex))
chisq.test(table(hypoxia$AHI, hypoxia$Race))
chisq.test(table(hypoxia$AHI, hypoxia$Smoking))
chisq.test(table(hypoxia$AHI, hypoxia$Diabetes))

# T-tests for BMI and Sleep time
# Assume variance equal
BMI1 <- hypoxia |> filter(BMI <= median(BMI)) # nolint
BMI2 <- hypoxia |> filter(BMI > median(BMI)) # nolint
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = TRUE)
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = TRUE)$p.value


# Modelling ---------------------------------------------------------------

mod_data <- hypoxia |>
  mutate(severeAHI = if_else(AHI == 4, 1, 0)) |>
  select(-c(Female, AHI))

mod1 <- glm(severe_AHI ~ ., data = mod_data, family = "binomial")
summary(mod1)

mod2 <- glm(severe_AHI ~ Age + Sex + BMI, data = mod_data, family = "binomial")
summary(mod2)

mod3 <- glm(severe_AHI ~ Age + Sex, data = mod_data, family = "binomial")
summary(mod3)

mod4 <- glm(severe_AHI ~ Age + BMI, data = mod_data, family = "binomial")
summary(mod4)

mod5 <- glm(severe_AHI ~ Sex + BMI, data = mod_data, family = "binomial")
summary(mod5)

mod6 <- glm(severe_AHI ~ Age, data = mod_data, family = "binomial")
summary(mod6)

mod7 <- glm(severe_AHI ~ Sex, data = mod_data, family = "binomial")
summary(mod7)

mod8 <- glm(severe_AHI ~ BMI, data = mod_data, family = "binomial")
summary(mod8)

# Results -----------------------------------------------------------------

# AIC
aic_results <- c(
  mod2$aic, mod3$aic, mod4$aic,
  mod5$aic, mod6$aic, mod7$aic, mod8$aic
)

# Forest plot of best model
forest_model(mod2)
ggsave("forestplot.png")

Note: linters aren’t always correct - use # nolint to ignore a line.

Exercise 4: Multiple scripts and folders

  1. Re-organise your R project folder with multiple sub-directories.

  2. Split your messy R script into multiple files, that are appropriately named.

  3. What is the order and dependencies of each script?

Solution

Folder structure:

project folder
│   project_name.Rproj
└───data
│   │   hypoxia.csv
└───plots
│   │   forestplot.png
│   │   corr_plot.png.png
└───R
│   │   00_packages.R
│   │   01_load_data.R
│   │   02_exploratory_analysis.R
│   │   03_modelling.R
R/00_packages.R
# Load packages -----------------------------------------------------------

library(readr)
library(ggplot2)
library(dplyr)
library(ggforestplot)
library(forestmodel)
library(ggcorrplot)

# Useful functions --------------------------------------------------------

# An AHI of 5-14 is mild; 15-29 is moderate and
# 30 or more events per hour characterizes severe sleep apnea.
# 1 = (AHI < 5); 2 = (5 ≤ AHI < 15);
# 3 = (15 ≤ AHI < 30); 4 = (AHI ≥ 30)

# Convert binary `Female` column to characters for Male and Female
convert_to_factors <- function(data = hypoxia) {
  output <- data |>
    mutate(
      Sex = if_else(.data$Female == 1, "Female", "Male"),
      .after = .data$Female
    ) |>
    mutate(AHI = factor(.data$AHI))
  return(output)
}
R/01_load_data.R
source("R/00_packages.R")
hypoxia <- read_csv("data/hypoxia.csv")
hypoxia <- convert_to_factors(hypoxia)
R/02_exploratory_analysis.R
02_exploratory_analysis.R
source("R/01_load_data.R")

# Exploratory plots -------------------------------------------------------

# Histograms and bar charts
ggplot(hypoxia) +
  geom_histogram(aes(Age))
ggplot(hypoxia) +
  geom_histogram(aes(AHI))
ggplot(hypoxia) +
  geom_bar(aes(AHI))

# Correlation
hypoxia |>
  select(where(is.numeric)) |>
  cor(use = "complete.obs") |>
  ggcorrplot()
ggsave("plots/corr_plot.png", width = 5, height = 5)

# Multiple factors
ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_wrap(~Sex)

ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_wrap(Race ~ Sex)

ggplot(hypoxia) +
  geom_bar(aes(AHI)) +
  facet_grid(Race ~ Sex)

hypoxia <- hypoxia |>
  mutate(Diabetesfct = factor(Diabetes))
ggplot(hypoxia) +
  geom_bar(aes(AHI, fill = Diabetesfct), position = "fill") +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal() # no colours


# Summary statistics ------------------------------------------------------

# Mean
mean(hypoxia$Age)
mean(hypoxia$BMI)
mean(hypoxia$`Duration of Surg`)

# Standard Deviation
sd(hypoxia$Age)

# Counts
table(hypoxia$AHI)
table(hypoxia$`Duration of Surg`)


# Statistical tests -------------------------------------------------------

# Chi-squared tests of AHI factors
chisq.test(table(hypoxia$AHI, hypoxia$Sex))
chisq.test(table(hypoxia$AHI, hypoxia$Race))
chisq.test(table(hypoxia$AHI, hypoxia$Smoking))
chisq.test(table(hypoxia$AHI, hypoxia$Diabetes))

# T-tests for BMI and Sleep time
# Assume variance equal
BMI1 <- hypoxia |> filter(BMI <= median(BMI)) # nolint
BMI2 <- hypoxia |> filter(BMI > median(BMI)) # nolint
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = TRUE)
t.test(BMI1$Sleeptime, BMI2$Sleeptime, var.equal = TRUE)$p.value
R/03_modelling.R
source("R/01_load_data.R")

# Modelling ---------------------------------------------------------------

mod_data <- hypoxia |>
  mutate(severeAHI = if_else(AHI == 4, 1, 0)) |>
  select(-c(Female, AHI))

mod1 <- glm(severe_AHI ~ ., data = mod_data, family = "binomial")
summary(mod1)

mod2 <- glm(severe_AHI ~ Age + Sex + BMI, data = mod_data, family = "binomial")
summary(mod2)

mod3 <- glm(severe_AHI ~ Age + Sex, data = mod_data, family = "binomial")
summary(mod3)

mod4 <- glm(severe_AHI ~ Age + BMI, data = mod_data, family = "binomial")
summary(mod4)

mod5 <- glm(severe_AHI ~ Sex + BMI, data = mod_data, family = "binomial")
summary(mod5)

mod6 <- glm(severe_AHI ~ Age, data = mod_data, family = "binomial")
summary(mod6)

mod7 <- glm(severe_AHI ~ Sex, data = mod_data, family = "binomial")
summary(mod7)

mod8 <- glm(severe_AHI ~ BMI, data = mod_data, family = "binomial")
summary(mod8)

# Results -----------------------------------------------------------------

# AIC
aic_results <- c(
  mod2$aic, mod3$aic, mod4$aic,
  mod5$aic, mod6$aic, mod7$aic, mod8$aic
)

# Forest plot of best model
forest_model(mod2)
ggsave("plots/forestplot.png")

Exercise 5: Other useful tips

  1. The following code doesn’t work. Imagine you are not allowed to share the hypoxia data with anyone else. Build a reprex that you could share with someone else.
library(ggplot2)
ggplot(hypoxia) +
  geom_col(aes(AHI, fill = Smoking))
Solution
```
library(ggplot2)
library(tibble)

hypoxia_reprex <- tribble(
  ~AHI, ~Smoking,
  2, 0,
  3, 1,
  2, 0, 
  4, 0, 
  1, 0,
  1, 0,
  3, 0
)

ggplot(hypoxia_reprex) +
  geom_col(aes(AHI, fill = Smoking))
#> Error in `geom_col()`:
#> ! Problem while setting up geom.
#> ℹ Error occurred in the 1st layer.
#> Caused by error in `compute_geom_1()`:
#> ! `geom_col()` requires the following missing aesthetics: y.
```

```
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.4.1 (2024-06-14 ucrt)
#>  os       Windows 11 x64 (build 22000)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_United Kingdom.utf8
#>  ctype    English_United Kingdom.utf8
#>  tz       Europe/London
#>  date     2024-10-21
#>  pandoc   3.1.11 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  cli           3.6.2   2023-12-11 [1] CRAN (R 4.3.2)
#>  colorspace    2.1-1   2023-10-23 [1] R-Forge (R 4.3.1)
#>  digest        0.6.35  2024-03-11 [1] CRAN (R 4.4.0)
#>  dplyr         1.1.4   2023-11-17 [1] CRAN (R 4.3.2)
#>  evaluate      0.23    2023-11-01 [1] CRAN (R 4.3.2)
#>  fansi         1.0.6   2023-12-08 [1] CRAN (R 4.3.2)
#>  farver        2.1.1   2022-07-06 [1] CRAN (R 4.3.0)
#>  fastmap       1.2.0   2024-05-15 [1] CRAN (R 4.4.0)
#>  fs            1.6.4   2024-04-25 [1] CRAN (R 4.4.0)
#>  generics      0.1.3   2022-07-05 [1] CRAN (R 4.3.0)
#>  ggplot2     * 3.5.1   2024-04-23 [1] CRAN (R 4.3.3)
#>  glue          1.7.0   2024-01-09 [1] CRAN (R 4.3.2)
#>  gtable        0.3.5   2024-04-22 [1] CRAN (R 4.3.3)
#>  htmltools     0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
#>  knitr         1.47    2024-05-29 [1] CRAN (R 4.3.3)
#>  lifecycle     1.0.4   2023-11-07 [1] CRAN (R 4.3.2)
#>  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.0)
#>  munsell       0.5.1   2024-04-01 [1] CRAN (R 4.3.3)
#>  pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.0)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.0)
#>  purrr         1.0.2   2023-08-10 [1] CRAN (R 4.3.1)
#>  R.cache       0.16.0  2022-07-21 [1] CRAN (R 4.3.0)
#>  R.methodsS3   1.8.2   2022-06-13 [1] CRAN (R 4.3.0)
#>  R.oo          1.26.0  2024-01-24 [1] CRAN (R 4.3.2)
#>  R.utils       2.12.3  2023-11-18 [1] CRAN (R 4.3.2)
#>  R6            2.5.1   2021-08-19 [1] CRAN (R 4.3.0)
#>  reprex        2.1.0   2024-01-11 [1] CRAN (R 4.3.2)
#>  rlang         1.1.3   2024-01-10 [1] CRAN (R 4.3.2)
#>  rmarkdown     2.27    2024-05-17 [1] CRAN (R 4.3.3)
#>  rstudioapi    0.16.0  2024-03-24 [1] CRAN (R 4.3.3)
#>  scales        1.3.0   2023-11-28 [1] CRAN (R 4.3.2)
#>  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.0)
#>  styler        1.10.3  2024-04-07 [1] CRAN (R 4.3.3)
#>  tibble      * 3.2.1   2023-03-20 [1] CRAN (R 4.3.0)
#>  tidyselect    1.2.1   2024-03-11 [1] CRAN (R 4.3.3)
#>  utf8          1.2.4   2023-10-22 [1] CRAN (R 4.3.2)
#>  vctrs         0.6.5   2023-12-01 [1] CRAN (R 4.3.2)
#>  withr         3.0.0   2024-01-16 [1] CRAN (R 4.3.2)
#>  xfun          0.44    2024-05-15 [1] CRAN (R 4.4.0)
#>  yaml          2.3.8   2023-12-11 [1] CRAN (R 4.3.2)
#> 
#>  [1] C:/Users/rennien2/AppData/Local/R/win-library/4.4
#>  [2] C:/Users/rennien2/AppData/Local/Programs/R/R-4.4.1/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────
```
Collaborating on code with Git and GitHub for R users

The exercise materials for the session on collaborating on code with Git and GitHub for R users can be found at nrennie.rbind.io/training-git-r/exercises.html.