Examples

Data

Length of Stay Model Data

The LOS dataset is a simulated hospital length-of-stay dataset, available through the {NHSRdatasets} R package. The dataset is licensed under creativecommons.org/publicdomain/zero/1.0.

Download CSV: LOS.csv

Initial R Script

The original messy R script used in the examples can be downloaded below.

Download .R Script: messy_example_script.R

See initial R script

messy_example_script.R

# NOTE: this is the original messy script that the examples further down
# progressively tidy. The code is left exactly as written; the NOTE(review)
# comments only point out what the later examples address.
install.packages("readr")
library(readr)
# NOTE(review): absolute path to one user's machine; Example 1 replaces it
# with a path relative to the project folder.
data <- read_csv("C:/Users/username/OneDrive/R code/LOS.csv")

View(data)

# plot age
hist(data$Age) # don't use this
# NOTE(review): ggplot() is called here before library(ggplot2) is attached
# two statements below.
ggplot(data) +
  geom_histogram(aes(Age))
library(ggplot2)
library(ggcorrplot)
ggplot(data) +
  geom_histogram(aes(LOS))

ggplot(data) +
  geom_histogram(aes(Age)) +
  facet_wrap(~Organisation)

library(dplyr)
library(tidyverse)
library(forcats)
# Convert Organisation to a factor, then move the "Trust10" level to the end
# (after = Inf) before re-drawing the faceted histogram.
data <- data |> mutate(Organisation = factor(Organisation), Organisation = fct_relevel(Organisation, "Trust10", after = Inf))
ggplot(data) +
  geom_histogram(aes(Age)) +
  facet_wrap(~Organisation)

mean(data$Age) #50.65667
sd(data$Age)
mean(data$LOS) #4.936667
mean(data$Death) #0.1766667
# NOTE(review): the column is spelled `Organisation` (capital O) everywhere
# else in this script.
table(data$organisation)
# error - don't know why

# Split the data at Age 50 and compare LOS between the two groups.
age1 <- data |> 
  filter(Age <= 50)
age2 <- data |> 
  filter(Age > 50)
t.test(age1$LOS, age2$LOS, var.equal = T)
t.test(age1$LOS, age2$LOS, var.equal = T)$p.value

# models
mod1 <- glm(Death~Age, data = data, family = "binomial")
summary(mod1)

mod2 <- glm(Death~Age+LOS, data=data, family = "binomial")
summary(mod2)

mod3 <- glm(Death~Age+LOS+Organisation, data = data, family= "binomial")
summary(mod3)

mod4 <- glm(Death~LOS, data = data, family ="binomial")
summary(mod4)

# results
library(gtsummary)
# Only mod3 is turned into a regression table and written to table.docx.
modelTable = tbl_regression(mod3)

modelTable |> 
  as_gt() |> 
  gtsave("table.docx")

Examples

This section includes code for the examples shown. These may differ slightly from the examples shown in the live demonstration.

Example 1: Projects and relative file paths

See example

Creating a project folder
Downloading data
Project settings

Folder structure:

better_R
│   messy_example_script.R
│   LOS.csv
│   better_R.Rproj

R script:

messy_example_script.R

# Install and attach {readr}, which is loaded here before read_csv() is used.
install.packages("readr")
library(readr)
# Relative path: LOS.csv sits next to better_R.Rproj in the folder structure
# shown above (assumes the working directory is the better_R project folder).
data <- read_csv("LOS.csv")

### REST OF SCRIPT UNCHANGED

Example 2: Organising a script

Restructuring scripts
Adding sections
Renaming variables
Namespacing

See example

messy_example_script.R

# Load packages -----------------------------------------------------------

library(gtsummary)
library(ggcorrplot)
library(tidyverse)
library(forcats)


# Load data ---------------------------------------------------------------

# Relative path (as introduced in Example 1) rather than a path that only
# exists on one user's machine.
LOS <- read_csv("LOS.csv")

# Store Organisation as a factor and move "Trust10" to the end of its levels.
LOS <- LOS |> mutate(Organisation = factor(Organisation), Organisation = fct_relevel(Organisation, "Trust10", after = Inf))


# Exploratory plots -------------------------------------------------------

ggplot(LOS) +
  geom_histogram(aes(Age))
ggplot(LOS) +
  geom_histogram(aes(LOS))
ggplot(LOS) +
  geom_histogram(aes(Age)) +
  facet_wrap(~Organisation)


# Summary statistics ------------------------------------------------------

mean(LOS$Age)
mean(LOS$LOS)
mean(LOS$Death)
sd(LOS$Age)
# Column names are case-sensitive: the column is `Organisation`.
table(LOS$Organisation)


# Statistical tests -------------------------------------------------------

# Compare length of stay between patients aged <= 50 and > 50.
age1 <- LOS |> 
  filter(Age <= 50)
age2 <- LOS |> 
  filter(Age > 50)
t.test(age1$LOS, age2$LOS, var.equal = T)
t.test(age1$LOS, age2$LOS, var.equal = T)$p.value


# Modelling ---------------------------------------------------------------

# Logistic regressions for Death with different sets of predictors.
mod1 <- glm(Death~Age, data = LOS, family = "binomial")
summary(mod1)

mod2 <- glm(Death~Age+LOS, data=LOS, family = "binomial")
summary(mod2)

mod3 <- glm(Death~Age+LOS+Organisation, data = LOS, family= "binomial")
summary(mod3)

mod4 <- glm(Death~LOS, data = LOS, family ="binomial")
summary(mod4)


# Results -----------------------------------------------------------------

# Export the regression table for mod3 as a Word document.
modelTable = tbl_regression(mod3)

modelTable |> 
  as_gt() |> 
  gtsave("table.docx")

Example 3: Styling scripts

Linting
Styling

See example

messy_example_script.R

# Load packages -----------------------------------------------------------

library(gtsummary)
library(ggcorrplot)
library(tidyverse)
library(forcats)


# Load data ---------------------------------------------------------------

LOS <- read_csv("LOS.csv") # nolint

# Store Organisation as a factor and move "Trust10" to the end of its levels.
LOS <- LOS |> # nolint
  mutate(
    Organisation = factor(Organisation),
    Organisation = fct_relevel(Organisation, "Trust10", after = Inf)
  )


# Exploratory plots -------------------------------------------------------

ggplot(LOS) +
  geom_histogram(aes(Age))

ggplot(LOS) +
  geom_histogram(aes(LOS))

ggplot(LOS) +
  geom_histogram(aes(Age)) +
  facet_wrap(~Organisation)


# Summary statistics ------------------------------------------------------

mean(LOS$Age)
sd(LOS$Age)
mean(LOS$LOS)
mean(LOS$Death)
# Column names are case-sensitive: the column is `Organisation`.
table(LOS$Organisation)


# Statistical tests -------------------------------------------------------

# Compare length of stay between patients aged <= 50 and > 50.
age1 <- LOS |>
  filter(Age <= 50)
age2 <- LOS |>
  filter(Age > 50)
t.test(age1$LOS, age2$LOS, var.equal = TRUE)
t.test(age1$LOS, age2$LOS, var.equal = TRUE)$p.value


# Modelling ---------------------------------------------------------------

# Logistic regressions for Death with different sets of predictors.
mod1 <- glm(Death ~ Age, data = LOS, family = "binomial")
summary(mod1)

mod2 <- glm(Death ~ Age + LOS, data = LOS, family = "binomial")
summary(mod2)

mod3 <- glm(Death ~ Age + LOS + Organisation, data = LOS, family = "binomial")
summary(mod3)

mod4 <- glm(Death ~ LOS, data = LOS, family = "binomial")
summary(mod4)


# Results -----------------------------------------------------------------

# Export the regression table for mod3 as a Word document.
model_table <- tbl_regression(mod3)

model_table |>
  as_gt() |>
  gtsave("regression_table.docx")

Sometimes you’ll see me write e.g. dplyr::filter() rather than just filter(). This means “use the filter() function from the dplyr package”, and saves you having to run library(dplyr). It also avoids confusion between two functions from different packages that share the same name.

Example 4: Multiple scripts and folders

See example

Folder structure:

project folder
│   project_name.Rproj
├───data
│       LOS.csv
├───outputs
│       regression_table.docx
└───R
        00_packages.R
        01_load_data.R
        02_exploratory_analysis.R
        03_modelling.R

R/00_packages.R

# Load packages -----------------------------------------------------------

# Every package used by the project is attached here, in one place;
# R/01_load_data.R sources this file before reading the data.
library(gtsummary)
library(ggcorrplot)
library(tidyverse)
library(forcats)

R/01_load_data.R

source("R/00_packages.R")

# Load data ---------------------------------------------------------------

# Read the raw CSV and, in the same pipeline, store Organisation as a factor
# with "Trust10" moved to the end of its levels.
LOS <- read_csv("data/LOS.csv") |> # nolint
  mutate(
    Organisation = fct_relevel(factor(Organisation), "Trust10", after = Inf)
  )

R/02_exploratory_analysis.R

# Attaches the packages and creates the LOS data set.
source("R/01_load_data.R")

# Exploratory plots -------------------------------------------------------

ggplot(LOS) +
  geom_histogram(aes(Age))

ggplot(LOS) +
  geom_histogram(aes(LOS))

ggplot(LOS) +
  geom_histogram(aes(Age)) +
  facet_wrap(~Organisation)


# Summary statistics ------------------------------------------------------

mean(LOS$Age)
mean(LOS$LOS)
mean(LOS$Death)
sd(LOS$Age)
# Column names are case-sensitive: the column is `Organisation`.
table(LOS$Organisation)


# Statistical tests -------------------------------------------------------

# Compare length of stay between patients aged <= 50 and > 50.
age1 <- LOS |>
  filter(Age <= 50)
age2 <- LOS |>
  filter(Age > 50)
t.test(age1$LOS, age2$LOS, var.equal = TRUE)
t.test(age1$LOS, age2$LOS, var.equal = TRUE)$p.value

R/03_modelling.R

# Attaches the packages and creates the LOS data set.
source("R/01_load_data.R")

# Modelling ---------------------------------------------------------------

# Fit the four logistic regressions for Death first ...
mod1 <- glm(Death ~ Age, data = LOS, family = "binomial")
mod2 <- glm(Death ~ Age + LOS, data = LOS, family = "binomial")
mod3 <- glm(Death ~ Age + LOS + Organisation, data = LOS, family = "binomial")
mod4 <- glm(Death ~ LOS, data = LOS, family = "binomial")

# ... then inspect them in the same order.
summary(mod1)
summary(mod2)
summary(mod3)
summary(mod4)


# Results -----------------------------------------------------------------

# Regression table for mod3, written to the outputs folder as a Word file.
model_table <- tbl_regression(mod3)

gtsave(as_gt(model_table), "outputs/regression_table.docx")

Example 5: Other useful tips

See example

This code returns an error:

# The problem call that we want to ask someone for help with.
mean(LOS$Organisation)

If we just tell someone that the code mean(LOS$Organisation) returns an error, the person we’re asking for help doesn’t know what LOS looks like, what the Organisation column contains, or which versions of R and any packages we’re using. We can instead create a reproducible example to send them.

# Attach {reprex} and build the reproducible example.
# NOTE(review): with no arguments, reprex() presumably takes its input from
# code copied to the clipboard; confirm against the reprex documentation.
library(reprex)
reprex()

If you can share the data, you can copy in the output of dput(LOS). If you can’t share the data, you’ll need to make a synthetic data set or use a built-in one for the example.

Collaborating on code with Git and GitHub for R users

The example materials for the session on collaborating on code with Git and GitHub for R users can be found at nrennie.rbind.io/training-git-r/examples.html.