Data Exercise

Disclaimer: used ChatGPT to generate code for synthetic data

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(skimr)

set.seed(123)

n <- 500

synthetic_data <- tibble(
  age = rnorm(n, mean = 35, sd = 10),
  exposure = rbinom(n, size = 1, prob = 0.4),
  dose = runif(n, 0, 10)
) %>%
  mutate(
    # Linear outcome with known effects
    outcome_continuous =
      5 +
      0.4 * age +        # age effect
      2.5 * exposure +  # exposure effect
      1.2 * dose +      # dose effect
      rnorm(n, 0, 5),

    # Logistic outcome (binary)
    outcome_binary =
      rbinom(
        n,
        size = 1,
        prob = plogis(
          -6 +
          0.05 * age +
          1.3 * exposure +
          0.25 * dose
        )
      )
  )

skim(synthetic_data)

Data summary
Name	synthetic_data
Number of rows	500
Number of columns	5
_______________________
Column type frequency:
numeric	5
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
age	1	35.35	9.73	8.39	29.25	35.21	41.85	67.41	▁▆▇▃▁
exposure	1	0.38	0.48	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▅
dose	1	5.03	2.88	0.05	2.58	4.99	7.53	9.96	▇▇▇▇▇
outcome_continuous	1	26.24	7.39	5.26	21.02	26.34	31.03	49.03	▁▅▇▃▁
outcome_binary	1	0.11	0.31	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁

synthetic_data <- synthetic_data %>%
  mutate(
    age_group = cut(
      age,
      breaks = seq(15, 75, by = 10),
      include.lowest = TRUE
    )
  )

age_outcome_summary <- synthetic_data %>%
  group_by(age_group) %>%
  summarize(
    proportion_positive = mean(outcome_binary),
    n = n(),
    .groups = "drop"
  )

plot1 <- ggplot(synthetic_data, aes(x = dose, y = outcome_continuous, color = age_group)) +
  geom_point(alpha = 0.4) +
  theme_bw()

plot1

plot2 <- synthetic_data %>%
  filter(outcome_binary == 1) %>%
  ggplot(aes(x = age_group)) +
  geom_bar() +
  labs(
    x = "Age group",
    y = "Count of positive outcomes",
    title = "Count of binary outcomes by age group"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot2

lm_fit <- lm(
  outcome_continuous ~ age + exposure + dose,
  data = synthetic_data
)

summary(lm_fit)


Call:
lm(formula = outcome_continuous ~ age + exposure + dose, data = synthetic_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-14.0025  -3.6045   0.1997   3.3425  16.9875 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.48969    0.92793   4.838 1.75e-06 ***
age          0.43068    0.02285  18.847  < 2e-16 ***
exposure     2.55416    0.45829   5.573 4.11e-08 ***
dose         1.10777    0.07709  14.370  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.963 on 496 degrees of freedom
Multiple R-squared:  0.5518,    Adjusted R-squared:  0.549 
F-statistic: 203.5 on 3 and 496 DF,  p-value: < 2.2e-16