Confirmatory Factor Analysis using brms
The tiny ESS values are likely due to non-identifiability of the factor loadings across HMC chains. When fitting these models there are often multiple solutions that lead to the same linear predictors: the loading coefficients in \lambda and the random draws for the factors \eta can both be sign-flipped and still yield the same product. Below I demonstrate a strategy to deal with this, which is implemented in the {mvgam} package. The first model applies a sign-flipping correction in the generated quantities block to ensure convergence of the loadings across chains, while the second model does not (in my own models I am usually interested not in the loadings themselves but in the product of the loadings and the factors). You should be able to see the non-identifiability issue clearly when inspecting the parameter estimates from the second model. Note that I changed the model slightly, as {mvgam} cannot yet model variation in the observation-level scale parameters \sigma.
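To see why, here is a tiny standalone sketch (toy numbers, unrelated to the simulation below): flipping the signs of both the loadings and the factor scores leaves every linear predictor unchanged, so the likelihood cannot tell the two modes apart.
# Sign-flip invariance: lambda * eta and (-lambda) * (-eta) give
# identical linear predictors
lambda_demo <- c(1, -1, 2)
eta_demo <- c(0.5, -0.3)
pred1 <- outer(eta_demo, lambda_demo)   # factors x loadings
pred2 <- outer(-eta_demo, -lambda_demo) # both sign-flipped
all.equal(pred1, pred2)
#> [1] TRUE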
library(mvgam)
#> Loading 'mvgam' (version 1.1.56). Useful instructions can be found by
#> typing help('mvgam'). A more detailed introduction to the package is
#> available through vignette('mvgam_overview').
# Simulate data, but use the same observation variance for all
# items, as mvgam cannot (yet) model item-specific variances
set.seed(0)
N <- 100
J <- 5
lambda <- c(1, -1, 1, -1, 2)
Y <- matrix(NA, N, J)
eta <- rnorm(N)
for (j in 1:J) {
  Y[, j] <- lambda[j] * eta + rnorm(N, sd = 0.5)
}
df <- data.frame(
  y = as.vector(t(Y)),
  # each item is a separate 'series' in mvgam language
  item = as.factor(rep(1:J, times = N)),
  series = as.factor(rep(1:J, times = N)),
  # each subject is a separate 'time point' in mvgam language
  subject = rep(1:N, each = J),
  time = rep(1:N, each = J)
)
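As a quick sanity check on this long format (output omitted here, since the values depend on the seed), the first J rows should be all five items for subject/time 1:
# First J rows: all five items for subject/time 1, then subject 2, ...
head(df, J)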
# Dynamic factor model allowing the factor to evolve as a random walk
# over subjects; this model imposes the sign-flipping strategy to ensure
# convergence of factor loadings (lambda)
fit <- mvgam(
  formula = y ~ 1,
  use_lv = TRUE,
  n_lv = 1,
  trend_model = RW(),
  data = df,
  family = gaussian(),
  share_obs_params = TRUE
)
#> Compiling Stan program using cmdstanr
#>
#> Start sampling
#> Running MCMC with 4 parallel chains...
#>
#> ... (per-chain iteration progress omitted for brevity) ...
#> Chain 1 finished in 1.3 seconds.
#> Chain 2 finished in 1.3 seconds.
#> Chain 3 finished in 1.3 seconds.
#> Chain 4 finished in 1.3 seconds.
#>
#> All 4 chains finished successfully.
#> Mean chain execution time: 1.3 seconds.
#> Total execution time: 1.7 seconds.
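Before inspecting the generated Stan code, a quick visual check is possible (assuming the stanfit object stored in fit$model_output, which is also what the bayesplot calls below use): with the sign correction in place, traceplots of the loadings should show all four chains mixing around a single mode rather than splitting into mirrored ones.
# Traceplots of the sign-corrected loadings; the chains should overlap
bayesplot::mcmc_trace(fit$model_output, regex_pars = 'lv_coefs')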
# Notice the sign-flipping in generated quantities
stancode(fit)
#> // Stan model code generated by package mvgam
#> functions {
#> vector rep_each(vector x, int K) {
#> int N = rows(x);
#> vector[N * K] y;
#> int pos = 1;
#> for (n in 1 : N) {
#> for (k in 1 : K) {
#> y[pos] = x[n];
#> pos += 1;
#> }
#> }
#> return y;
#> }
#> }
#> data {
#> int<lower=0> total_obs; // total number of observations
#> int<lower=0> n; // number of timepoints per series
#> int<lower=0> n_lv; // number of dynamic factors
#> int<lower=0> n_series; // number of series
#> int<lower=0> num_basis; // total number of basis coefficients
#> matrix[total_obs, num_basis] X; // mgcv GAM design matrix
#> array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
#> int<lower=0> n_nonmissing; // number of nonmissing observations
#> vector[n_nonmissing] flat_ys; // flattened nonmissing observations
#> matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
#> array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
#> }
#> transformed data {
#> // Number of non-zero lower triangular factor loadings
#>
#> // Ensures identifiability of the model - no rotation of factors
#> int<lower=1> M;
#> M = n_lv * (n_series - n_lv) + n_lv * (n_lv - 1) / 2 + n_lv;
#> }
#> parameters {
#> // raw basis coefficients
#> vector[num_basis] b_raw;
#>
#> // gaussian observation error
#> real<lower=0> sigma_obs;
#>
#> // dynamic factors
#> matrix[n, n_lv] LV_raw;
#>
#> // dynamic factor lower triangle loading coefficients
#> vector[M] L;
#> }
#> transformed parameters {
#> // trends and dynamic factor loading matrix
#> matrix[n, n_series] trend;
#> matrix[n_series, n_lv] lv_coefs_raw = rep_matrix(0, n_series, n_lv);
#>
#> // basis coefficients
#> vector[num_basis] b;
#> b[1 : num_basis] = b_raw[1 : num_basis];
#>
#> // constraints allow identifiability of loadings
#> {
#> int index;
#> index = 0;
#> for (j in 1 : n_lv) {
#> for (i in j : n_series) {
#> index = index + 1;
#> lv_coefs_raw[i, j] = L[index];
#> }
#> }
#> }
#>
#> // derived latent trends
#> for (i in 1 : n) {
#> for (s in 1 : n_series) {
#> trend[i, s] = dot_product(lv_coefs_raw[s, : ], LV_raw[i, : ]);
#> }
#> }
#> }
#> model {
#> // prior for (Intercept)...
#> b_raw[1] ~ student_t(3, 0, 2.5);
#>
#> // priors for dynamic factor loading coefficients
#> L ~ student_t(5, 0, 1);
#>
#> // dynamic factor estimates
#> LV_raw[1, 1 : n_lv] ~ normal(0, 0.1);
#> for (j in 1 : n_lv) {
#> LV_raw[2 : n, j] ~ normal(LV_raw[1 : (n - 1), j], 0.1);
#> }
#>
#> // priors for observation error parameters
#> sigma_obs ~ inv_gamma(1.418, 0.452);
#> {
#> // likelihood functions
#> vector[n_nonmissing] flat_trends;
#> flat_trends = to_vector(trend)[obs_ind];
#> flat_ys ~ normal_id_glm(append_col(flat_xs, flat_trends), 0.0,
#> append_row(b, 1.0), sigma_obs);
#> }
#> }
#> generated quantities {
#> vector[total_obs] eta;
#> matrix[n, n_series] sigma_obs_vec;
#> matrix[n, n_series] mus;
#> matrix[n, n_lv] LV;
#> matrix[n_series, n_lv] lv_coefs;
#> vector[n_lv] penalty;
#> array[n, n_series] real ypred;
#> penalty = rep_vector(100.0, n_lv);
#>
#> // Sign correct factor loadings and factors
#> for (j in 1 : n_lv) {
#> if (lv_coefs_raw[j, j] < 0) {
#> lv_coefs[ : , j] = -1 * lv_coefs_raw[ : , j];
#> LV[ : , j] = -1 * LV_raw[ : , j];
#> } else {
#> lv_coefs[ : , j] = lv_coefs_raw[ : , j];
#> LV[ : , j] = LV_raw[ : , j];
#> }
#> }
#>
#> // posterior predictions
#> eta = X * b;
#> for (s in 1 : n_series) {
#> sigma_obs_vec[1 : n, s] = rep_vector(sigma_obs, n);
#> }
#> for (s in 1 : n_series) {
#> mus[1 : n, s] = eta[ytimes[1 : n, s]] + trend[1 : n, s];
#> ypred[1 : n, s] = normal_rng(mus[1 : n, s], sigma_obs_vec[1 : n, s]);
#> }
#> }
# Back-transform the loadings and plot (the factors are estimated with
# a fixed innovation sd of 0.1, visible in the Stan code above, so the
# raw loadings are rescaled by 0.1)
bayesplot::mcmc_hist(
  fit$model_output,
  regex_pars = 'lv_coefs',
  transformations = function(x) x * 0.1
)
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
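The histograms can be backed up numerically. Here is a minimal sketch using the posterior package (assuming fit$model_output converts via posterior::as_draws_array(), as stanfit objects do): the sign-corrected loadings should show Rhat near 1 and ESS values nothing like the tiny ones that motivated this post.
# Numerical convergence summary for the sign-corrected loadings
draws_fit <- posterior::as_draws_array(fit$model_output)
posterior::summarise_draws(
  posterior::subset_draws(draws_fit, variable = '^lv_coefs\\[', regex = TRUE),
  'rhat', 'ess_bulk', 'ess_tail'
)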
# Now a more flexible factor model that does not use sign-flipping
fit2 <- jsdgam(
  formula = y ~ 1,
  factor_formula = ~ -1,
  n_lv = 1,
  data = df,
  family = gaussian(),
  share_obs_params = TRUE,
  unit = subject,
  species = item
)
#> Compiling Stan program using cmdstanr
#>
#> Start sampling
#> Running MCMC with 4 parallel chains...
#>
#> ... (per-chain iteration progress omitted for brevity) ...
#> Chain 1 finished in 1.4 seconds.
#> Chain 2 finished in 1.4 seconds.
#> Chain 3 finished in 1.4 seconds.
#> Chain 4 finished in 1.4 seconds.
#>
#> All 4 chains finished successfully.
#> Mean chain execution time: 1.4 seconds.
#> Total execution time: 1.8 seconds.
stancode(fit2)
#> // Stan model code generated by package mvgam
#> functions {
#> vector rep_each(vector x, int K) {
#> int N = rows(x);
#> vector[N * K] y;
#> int pos = 1;
#> for (n in 1 : N) {
#> for (k in 1 : K) {
#> y[pos] = x[n];
#> pos += 1;
#> }
#> }
#> return y;
#> }
#> }
#> data {
#> int<lower=0> total_obs; // total number of observations
#> int<lower=0> n; // number of timepoints per series
#> int<lower=0> n_lv; // number of dynamic factors
#> int<lower=0> M; // number of nonzero lower-triangular factor loadings
#> int<lower=0> n_series; // number of series
#> int<lower=0> num_basis; // total number of basis coefficients
#> int<lower=0> num_basis_trend; // number of trend basis coefficients
#> matrix[total_obs, num_basis] X; // mgcv GAM design matrix
#> matrix[n * n_lv, num_basis_trend] X_trend; // trend model design matrix
#> array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
#> array[n, n_lv] int ytimes_trend;
#> int<lower=0> n_nonmissing; // number of nonmissing observations
#> vector[n_nonmissing] flat_ys; // flattened nonmissing observations
#> matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
#> array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
#> }
#> transformed data {
#>
#> }
#> parameters {
#> // raw basis coefficients
#> vector[num_basis] b_raw;
#> vector[num_basis_trend] b_raw_trend;
#>
#> // gaussian observation error
#> real<lower=0> sigma_obs;
#>
#> // dynamic factors
#> matrix[n, n_lv] LV_raw;
#>
#> // factor lower triangle loadings
#> vector[M] L_lower;
#>
#> // factor diagonal loadings
#> vector<lower=0>[n_lv] L_diag;
#> }
#> transformed parameters {
#> // raw latent states
#> vector[n * n_lv] trend_mus;
#> matrix[n, n_series] trend;
#> matrix[n_series, n_lv] lv_coefs = rep_matrix(0, n_series, n_lv);
#> matrix[n, n_lv] LV;
#>
#> // basis coefficients
#> vector[num_basis] b;
#> vector[num_basis_trend] b_trend;
#>
#> // observation model basis coefficients
#> b[1 : num_basis] = b_raw[1 : num_basis];
#>
#> // process model basis coefficients
#> b_trend[1 : num_basis_trend] = b_raw_trend[1 : num_basis_trend];
#> b_trend[1] = 0;
#>
#> // latent process linear predictors
#> trend_mus = X_trend * b_trend;
#>
#> // constraints allow identifiability of loadings
#> {
#> int idx;
#> idx = 0;
#> for (j in 1 : n_lv) {
#> lv_coefs[j, j] = L_diag[j];
#> }
#> for (j in 1 : n_lv) {
#> for (k in (j + 1) : n_series) {
#> idx = idx + 1;
#> lv_coefs[k, j] = L_lower[idx];
#> }
#> }
#> }
#>
#> // raw latent factors
#> LV = LV_raw;
#> for (i in 1 : n) {
#> for (s in 1 : n_series) {
#> trend[i, s] = dot_product(lv_coefs[s, : ], LV[i, : ]);
#> }
#> }
#> }
#> model {
#> // prior for (Intercept)...
#> b_raw[1] ~ student_t(3, 0, 2.5);
#>
#> // priors for factors and loading coefficients
#> L_lower ~ student_t(3, 0, 1);
#> L_diag ~ student_t(3, 0, 1);
#> to_vector(LV_raw) ~ std_normal();
#>
#> // dynamic factor estimates
#>
#> // priors for observation error parameters
#> sigma_obs ~ inv_gamma(1.418, 0.452);
#> {
#> // likelihood functions
#> vector[n_nonmissing] flat_trends;
#> flat_trends = to_vector(trend)[obs_ind];
#> flat_ys ~ normal_id_glm(append_col(flat_xs, flat_trends), 0.0,
#> append_row(b, 1.0), sigma_obs);
#> }
#> }
#> generated quantities {
#> vector[total_obs] eta;
#> matrix[n, n_series] sigma_obs_vec;
#> matrix[n, n_series] mus;
#> vector[n_lv] sigma;
#> vector[n_lv] penalty;
#> array[n, n_series] real ypred;
#> penalty = rep_vector(1.0, n_lv);
#> sigma = rep_vector(1.0, n_lv);
#>
#> // posterior predictions
#> eta = X * b;
#> for (s in 1 : n_series) {
#> sigma_obs_vec[1 : n, s] = rep_vector(sigma_obs, n);
#> }
#> for (s in 1 : n_series) {
#> mus[1 : n, s] = eta[ytimes[1 : n, s]] + trend[1 : n, s];
#> ypred[1 : n, s] = normal_rng(mus[1 : n, s], sigma_obs_vec[1 : n, s]);
#> }
#> }
# Loadings are on the correct scale, but clearly there are
# convergence issues
bayesplot::mcmc_hist(
  fit2$model_output,
  regex_pars = 'lv_coefs'
)
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
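The sign-flipping correction used in the first model can also be applied to these draws after the fact. A minimal sketch (my own post-processing, not an mvgam function): for every posterior draw where the first loading is negative, flip the signs of all loadings and all factor scores together. The products lv_coefs * LV are unchanged, but the loading draws collapse onto a single mode.
# Post-hoc sign alignment of the fit2 draws (a sketch, not mvgam's API)
draws2 <- posterior::as_draws_matrix(fit2$model_output)
load_cols <- grep('^lv_coefs\\[', colnames(draws2))
factor_cols <- grep('^LV\\[', colnames(draws2))
flip <- as.vector(draws2[, load_cols[1]] < 0)
draws2[flip, load_cols] <- -draws2[flip, load_cols]
draws2[flip, factor_cols] <- -draws2[flip, factor_cols]
# The loadings should now be sign-consistent across all chains
bayesplot::mcmc_hist(draws2[, load_cols])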
Created on 2025-04-29 with reprex v2.1.1