Data Exercise

Disclaimer: used ChatGPT to generate code for synthetic data

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
set.seed(123)

n <- 500

synthetic_data <- tibble(
  age = rnorm(n, mean = 35, sd = 10),
  exposure = rbinom(n, size = 1, prob = 0.4),
  dose = runif(n, 0, 10)
) %>%
  mutate(
    # Linear outcome with known effects
    outcome_continuous =
      5 +
      0.4 * age +        # age effect
      2.5 * exposure +  # exposure effect
      1.2 * dose +      # dose effect
      rnorm(n, 0, 5),

    # Logistic outcome (binary)
    outcome_binary =
      rbinom(
        n,
        size = 1,
        prob = plogis(
          -6 +
          0.05 * age +
          1.3 * exposure +
          0.25 * dose
        )
      )
  )
skim(synthetic_data)
Data summary
Name synthetic_data
Number of rows 500
Number of columns 5
_______________________
Column type frequency:
numeric 5
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 35.35 9.73 8.39 29.25 35.21 41.85 67.41 ▁▆▇▃▁
exposure 0 1 0.38 0.48 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
dose 0 1 5.03 2.88 0.05 2.58 4.99 7.53 9.96 ▇▇▇▇▇
outcome_continuous 0 1 26.24 7.39 5.26 21.02 26.34 31.03 49.03 ▁▅▇▃▁
outcome_binary 0 1 0.11 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
synthetic_data <- synthetic_data %>%
  mutate(
    age_group = cut(
      age,
      breaks = seq(15, 75, by = 10),
      include.lowest = TRUE
    )
  )

age_outcome_summary <- synthetic_data %>%
  group_by(age_group) %>%
  summarize(
    proportion_positive = mean(outcome_binary),
    n = n(),
    .groups = "drop"
  )
plot1 <- ggplot(synthetic_data, aes(x = dose, y = outcome_continuous, color = age_group)) +
  geom_point(alpha = 0.4) +
  theme_bw()

plot1

plot2 <- synthetic_data %>%
  filter(outcome_binary == 1) %>%
  ggplot(aes(x = age_group)) +
  geom_bar() +
  labs(
    x = "Age group",
    y = "Count of positive outcomes",
    title = "Count of binary outcomes by age group"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot2

lm_fit <- lm(
  outcome_continuous ~ age + exposure + dose,
  data = synthetic_data
)

summary(lm_fit)

Call:
lm(formula = outcome_continuous ~ age + exposure + dose, data = synthetic_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-14.0025  -3.6045   0.1997   3.3425  16.9875 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.48969    0.92793   4.838 1.75e-06 ***
age          0.43068    0.02285  18.847  < 2e-16 ***
exposure     2.55416    0.45829   5.573 4.11e-08 ***
dose         1.10777    0.07709  14.370  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.963 on 496 degrees of freedom
Multiple R-squared:  0.5518,    Adjusted R-squared:  0.549 
F-statistic: 203.5 on 3 and 496 DF,  p-value: < 2.2e-16