Chapter 2 Linear models | Omics Data Analysis (original) (raw)
library("tidyverse")
## Prepare the data
x <-
data.frame(lmdata) |>
rownames_to_column() |>
pivot_longer(names_to = "sample", values_to = "expression", -rowname) |>
full_join(lmannot) |>
mutate(sample = sub("sample_", "", sample))
## Analyse gene 1
x_i <- filter(x, rowname == "gene_1")
x_i
## # A tibble: 12 × 5
## rowname sample expression condition cell_type
## <chr> <chr> <dbl> <chr> <chr>
## 1 gene_1 1 4.44 ctrl A
## 2 gene_1 2 4.77 ctrl B
## 3 gene_1 3 6.56 ctrl A
## 4 gene_1 4 5.07 ctrl B
## 5 gene_1 5 5.13 ctrl A
## 6 gene_1 6 6.72 ctrl B
## 7 gene_1 7 5.46 cond A
## 8 gene_1 8 3.73 cond B
## 9 gene_1 9 4.31 cond A
## 10 gene_1 10 4.55 cond B
## 11 gene_1 11 6.22 cond A
## 12 gene_1 12 5.36 cond B
mod_i <- lm(expression ~ condition * cell_type, data = x_i)
summary(mod_i)
##
## Call:
## lm(formula = expression ~ condition * cell_type, data = x_i)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0196 -0.7652 -0.1210 0.8304 1.1966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.33272 0.56643 9.415 1.33e-05 ***
## conditionctrl 0.04312 0.80105 0.054 0.958
## cell_typeB -0.78302 0.80105 -0.977 0.357
## conditionctrl:cell_typeB 0.92564 1.13286 0.817 0.438
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9811 on 8 degrees of freedom
## Multiple R-squared: 0.1824, Adjusted R-squared: -0.1242
## F-statistic: 0.595 on 3 and 8 DF, p-value: 0.6357
ggplot(x_i, aes(x = sample, y = expression,
colour = condition,
shape = cell_type)) +
geom_point()

## Analyse gene 2
x_i <- filter(x, rowname == "gene_2")
x_i
## # A tibble: 12 × 5
## rowname sample expression condition cell_type
## <chr> <chr> <dbl> <chr> <chr>
## 1 gene_2 1 6.16 ctrl A
## 2 gene_2 2 6.04 ctrl B
## 3 gene_2 3 5.78 ctrl A
## 4 gene_2 4 6.71 ctrl B
## 5 gene_2 5 6.20 ctrl A
## 6 gene_2 6 5.21 ctrl B
## 7 gene_2 7 4.70 cond A
## 8 gene_2 8 3.53 cond B
## 9 gene_2 9 2.93 cond A
## 10 gene_2 10 3.78 cond B
## 11 gene_2 11 2.97 cond A
## 12 gene_2 12 3.27 cond B
mod_i <- lm(expression ~ condition * cell_type, data = x_i)
summary(mod_i)
##
## Call:
## lm(formula = expression ~ condition * cell_type, data = x_i)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.77745 -0.34149 0.02695 0.17889 1.16551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.535847 0.376883 9.382 1.36e-05 ***
## conditionctrl 2.509857 0.532992 4.709 0.00152 **
## cell_typeB -0.009063 0.532992 -0.017 0.98685
## conditionctrl:cell_typeB -0.045843 0.753765 -0.061 0.95300
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6528 on 8 degrees of freedom
## Multiple R-squared: 0.8448, Adjusted R-squared: 0.7866
## F-statistic: 14.52 on 3 and 8 DF, p-value: 0.001335
ggplot(x_i, aes(x = sample, y = expression, colour = cell_type)) +
geom_point() +
facet_wrap(~ condition)

## Analyse gene 3
x_i <- filter(x, rowname == "gene_3")
x_i
## # A tibble: 12 × 5
## rowname sample expression condition cell_type
## <chr> <chr> <dbl> <chr> <chr>
## 1 gene_3 1 2.69 ctrl A
## 2 gene_3 2 3.31 ctrl B
## 3 gene_3 3 3.42 ctrl A
## 4 gene_3 4 5.15 ctrl B
## 5 gene_3 5 2.43 ctrl A
## 6 gene_3 6 6.25 ctrl B
## 7 gene_3 7 3.21 cond A
## 8 gene_3 8 4.70 cond B
## 9 gene_3 9 3.45 cond A
## 10 gene_3 10 5.88 cond B
## 11 gene_3 11 3.41 cond A
## 12 gene_3 12 5.69 cond B
mod_i <- lm(expression ~ condition * cell_type, data = x_i)
summary(mod_i)
##
## Call:
## lm(formula = expression ~ condition * cell_type, data = x_i)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.59352 -0.22242 0.07198 0.31211 1.34698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.357193 0.490117 6.850 0.000131 ***
## conditionctrl -0.511427 0.693130 -0.738 0.481685
## cell_typeB 2.066707 0.693130 2.982 0.017555 *
## conditionctrl:cell_typeB -0.005643 0.980234 -0.006 0.995547
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8489 on 8 degrees of freedom
## Multiple R-squared: 0.7019, Adjusted R-squared: 0.5901
## F-statistic: 6.278 on 3 and 8 DF, p-value: 0.01696
ggplot(x_i, aes(x = sample, y = expression, colour = condition)) +
geom_point() +
facet_wrap(~ cell_type)

## Analyse gene 4
x_i <- filter(x, rowname == "gene_4")
x_i
## # A tibble: 12 × 5
## rowname sample expression condition cell_type
## <chr> <chr> <dbl> <chr> <chr>
## 1 gene_4 1 7.11 ctrl A
## 2 gene_4 2 5.88 ctrl B
## 3 gene_4 3 5.39 ctrl A
## 4 gene_4 4 5.24 ctrl B
## 5 gene_4 5 4.61 ctrl A
## 6 gene_4 6 5.58 ctrl B
## 7 gene_4 7 3.47 cond A
## 8 gene_4 8 10.3 cond B
## 9 gene_4 9 8.42 cond A
## 10 gene_4 10 3.75 cond B
## 11 gene_4 11 5.19 cond A
## 12 gene_4 12 5.07 cond B
mod_i <- lm(expression ~ condition * cell_type, data = x_i)
summary(mod_i)
##
## Call:
## lm(formula = expression ~ condition * cell_type, data = x_i)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6323 -1.1485 -0.3208 0.5837 3.9518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.693120 1.296865 4.390 0.00232 **
## conditionctrl 0.009047 1.834044 0.005 0.99619
## cell_typeB 0.693007 1.834044 0.378 0.71537
## conditionctrl:cell_typeB -0.828703 2.593730 -0.320 0.75753
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.246 on 8 degrees of freedom
## Multiple R-squared: 0.02982, Adjusted R-squared: -0.334
## F-statistic: 0.08197 on 3 and 8 DF, p-value: 0.968
ggplot(x_i, aes(x = sample, y = expression,
colour = condition,
shape = cell_type)) +
geom_point()

## Analyse gene 5
x_i <- filter(x, rowname == "gene_5")
x_i
## # A tibble: 12 × 5
## rowname sample expression condition cell_type
## <chr> <chr> <dbl> <chr> <chr>
## 1 gene_5 1 6.56 ctrl A
## 2 gene_5 2 4.83 ctrl B
## 3 gene_5 3 5.52 ctrl A
## 4 gene_5 4 4.14 ctrl B
## 5 gene_5 5 5.75 ctrl A
## 6 gene_5 6 5.18 ctrl B
## 7 gene_5 7 4.09 cond A
## 8 gene_5 8 6.21 cond B
## 9 gene_5 9 4.65 cond A
## 10 gene_5 10 5.43 cond B
## 11 gene_5 11 4.87 cond A
## 12 gene_5 12 5.54 cond B
mod_i <- lm(expression ~ condition * cell_type, data = x_i)
summary(mod_i)
##
## Call:
## lm(formula = expression ~ condition * cell_type, data = x_i)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57907 -0.32745 -0.03921 0.36578 0.61935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.5338 0.2775 16.338 1.98e-07 ***
## conditionctrl 1.4092 0.3924 3.591 0.00708 **
## cell_typeB 1.1929 0.3924 3.040 0.01607 *
## conditionctrl:cell_typeB -2.4201 0.5550 -4.361 0.00241 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4806 on 8 degrees of freedom
## Multiple R-squared: 0.7094, Adjusted R-squared: 0.6005
## F-statistic: 6.511 on 3 and 8 DF, p-value: 0.01536
group_by(x_i, cell_type, condition) |> summarise(mean = mean(expression))
## `summarise()` has grouped output by 'cell_type'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: cell_type [2]
## cell_type condition mean
## <chr> <chr> <dbl>
## 1 A cond 4.53
## 2 A ctrl 5.94
## 3 B cond 5.73
## 4 B ctrl 4.72
group_by(x_i, cell_type) |> summarise(mean = mean(expression))
## # A tibble: 2 × 2
## cell_type mean
## <chr> <dbl>
## 1 A 5.24
## 2 B 5.22
group_by(x_i, condition) |> summarise(mean = mean(expression))
## # A tibble: 2 × 2
## condition mean
## <chr> <dbl>
## 1 cond 5.13
## 2 ctrl 5.33
ggplot(x_i, aes(x = sample, y = expression,
colour = condition,
shape = cell_type)) +
geom_point() +
facet_grid(cell_type ~ condition)
