PNUT logo

Summary

PNUT derives personalized (individual-level) network features from questionnaire item responses to enable Network-Augmented Machine Learning Utility (NAMU). Specifically, PNUT converts each participant’s item-response profile into an individualized symptom co-activation network and extracts network-informed predictors (e.g., node strength, pairwise edge weights, and graph-level connectivity summaries such as binary/weighted density). These features can be used alongside conventional predictors in machine learning pipelines to incorporate both symptom severity and symptom–symptom structure into NAMU-based modeling.

For additional details on the NAMU framework and its rationale, please refer to the following paper:
Kim, H., Yocum, A., McInnis, M., & Sperry, S. H. (2025, September). Predicting suicidal ideation from depression screening data: A network-augmented machine learning approach. In revision at the Journal of Affective Disorders. https://doi.org/10.31234/osf.io/rqyvx_v2


1 Project Setup

1.1 Load Required Packages

library(dplyr)
library(tidyr)
library(ggplot2)

1.2 Set project parameters

To run PNUT, edit the YAML header:

  • params: input_file - path to your CSV
  • params: id_col - participant ID column name
  • Choose ONE:
    • params: item_cols - list of item column names (recommended), or
    • params: item_regex - pattern to select item columns automatically

If both are provided, PNUT uses item_cols.

# Alias the parameter list from the YAML header, then copy each setting
# into its own top-level variable so later chunks can use it by name.
p <- params

# Input: where the data live and how to read them.
input_file <- p$input_file
input_file_has_headers <- p$input_file_has_headers

# Column selection: participant ID plus the questionnaire items.
id_col <- p$id_col
item_cols <- p$item_cols
item_regex <- p$item_regex

# Preprocessing and feature options.
na_action <- p$na_action
scale_items <- p$scale_items
binary_edge_rule <- p$binary_edge_rule

# Reporting and output.
show_data_glimpse <- p$show_data_glimpse
histogram_bins <- p$histogram_bins
output_file <- p$output_file

# Echo the configured input path into the rendered report.
cat("input_file:", input_file, "\n")
## input_file: data/combined_data.csv

1.3 Global functions

#' Build a co-activation matrix from one response vector.
#'
#' @param x Numeric vector of item responses.
#' @return Square matrix whose (j, k) entry is `x[j] * x[k]`, with every
#'   diagonal entry set to 0.
adj_outer <- function(x) {
  co_activation <- x %o% x
  diag(co_activation) <- 0
  co_activation
}
#' Node strength: the sum of each row of a co-activation matrix.
#'
#' @param A Numeric matrix.
#' @return Numeric vector with one row sum per row of `A`.
strength_from_A <- function(A) {
  rowSums(A, na.rm = FALSE)
}
#' Extract the unique edges (strict upper triangle) of a matrix.
#'
#' @param A Square matrix.
#' @return Vector of the entries above the diagonal, in column-major order:
#'   (1,2), (1,3), (2,3), (1,4), ... Note that this is not the
#'   (1,2), (1,3), ..., (2,3), ... order produced by `combn()`.
edge_vector_from_A <- function(A) {
  above_diagonal <- upper.tri(A, diag = FALSE)
  A[above_diagonal]
}
#' Graph-level connectivity summaries for one co-activation matrix.
#'
#' @param x Numeric vector of item responses (used only by rule "x_gt_0").
#' @param A Co-activation matrix; only its strict upper triangle is used.
#' @param rule How an item pair counts as active for BinaryDensity:
#'   "A_gt_0" (edge weight > 0, the default) or "x_gt_0" (both items > 0).
#' @return Named numeric vector with elements BinaryDensity,
#'   WeightedDensityMean and WeightedDensitySum.
connectivity_summaries <- function(x, A, rule = c("A_gt_0", "x_gt_0")) {
  rule <- match.arg(rule)
  edge_weights <- A[upper.tri(A)]

  active_share <- switch(
    rule,
    A_gt_0 = mean(edge_weights > 0),
    x_gt_0 = {
      is_endorsed <- x > 0
      both_endorsed <- outer(is_endorsed, is_endorsed, "&")
      diag(both_endorsed) <- FALSE
      mean(both_endorsed[upper.tri(both_endorsed)])
    }
  )

  c(
    BinaryDensity = active_share,
    WeightedDensityMean = mean(edge_weights),
    WeightedDensitySum = sum(edge_weights)
  )
}
#' Replace missing values with the mean of the observed values.
#'
#' @param v Numeric vector, possibly containing NA.
#' @return `v` with each NA replaced by `mean(v, na.rm = TRUE)`. A vector
#'   with no observed values is returned unchanged.
impute_mean <- function(v) {
  missing_idx <- is.na(v)
  # With nothing observed, mean(v, na.rm = TRUE) is NaN; filling with that
  # would only swap NA for NaN, so leave the vector as it is.
  if (all(missing_idx)) {
    return(v)
  }
  v[missing_idx] <- mean(v, na.rm = TRUE)
  v
}
#' Replace missing values with the median of the observed values.
#'
#' @param v Numeric vector, possibly containing NA.
#' @return `v` with each NA replaced by `median(v, na.rm = TRUE)`.
impute_median <- function(v) {
  fill_value <- median(v, na.rm = TRUE)
  replace(v, is.na(v), fill_value)
}

1.4 Load data sets

Put your CSV in the same folder as this .Rmd (recommended) or in data/. Update YAML: params: input_file, params: id_col, and params: item_cols (or item_regex).
If the file is not found, PNUT generates a synthetic demo dataset (demo/testing only; not for interpretation). In this template, DPQ010–DPQ080 illustrate the eight PHQ-8 items.

# Read the participant data, or fall back to a synthetic demo dataset when
# the configured file cannot be found.
input_available <- is.character(input_file) && length(input_file) == 1L &&
  !is.na(input_file) && file.exists(input_file)

if (input_available) {
  # A missing header setting falls back to read.csv()'s own default (TRUE).
  has_header <- if (is.null(input_file_has_headers)) {
    TRUE
  } else {
    input_file_has_headers
  }
  dat <- read.csv(input_file, header = has_header)
} else {
  # Make the fallback loud: demo features must never be mistaken for results.
  warning(
    "Input file not found (",
    if (length(input_file) == 1L) as.character(input_file) else "not set",
    "); using a synthetic demo dataset. ",
    "Demo/testing only; not for interpretation.",
    call. = FALSE
  )

  demo_items <- paste0("DPQ0", seq(10, 80, by = 10))

  # Fixed seed and one sample() call per item, in column order, so the demo
  # data are reproducible.
  set.seed(1)
  demo_scores <- lapply(demo_items, function(item) sample(0:3, 120, TRUE))
  names(demo_scores) <- demo_items
  dat <- data.frame(SEQN = 1:120, demo_scores)

  # The demo data define their own ID column. Item columns requested for the
  # real file are kept only if every one of them exists in the demo data.
  id_col <- "SEQN"
  if (length(item_cols) == 0 || !all(item_cols %in% demo_items)) {
    item_cols <- demo_items
  }
}

# Resolve the item columns: an explicit item_cols list wins; otherwise use
# every column whose name matches item_regex.
if (is.null(item_cols) || length(item_cols) == 0) {
  if (is.null(item_regex)) stop("Provide item_cols or item_regex.")
  # The ID column is never an item, even if the pattern happens to match it.
  item_cols <- setdiff(grep(item_regex, names(dat), value = TRUE), id_col)
  if (length(item_cols) == 0) {
    stop(
      "item_regex '", item_regex, "' matched no item columns in the data.",
      call. = FALSE
    )
  }
}

# Validate the selection up front, naming the offending columns, so that
# problems do not surface later as opaque subsetting errors.
if (length(id_col) != 1L || !(id_col %in% names(dat))) {
  stop(
    "id_col must name exactly one column in the data; got: ",
    paste(id_col, collapse = ", "),
    call. = FALSE
  )
}
missing_items <- setdiff(item_cols, names(dat))
if (length(missing_items) > 0) {
  stop(
    "item_cols not found in the data: ",
    paste(missing_items, collapse = ", "),
    call. = FALSE
  )
}
if (id_col %in% item_cols) {
  stop("id_col '", id_col, "' must not also be an item column.", call. = FALSE)
}
# Edge weights and densities are defined over item pairs.
if (length(item_cols) < 2) {
  stop("At least two item columns are required.", call. = FALSE)
}

# Keep only the ID and item columns; drop = FALSE keeps a data frame even
# if the selection ever collapses to a single column.
df <- dat[, c(id_col, item_cols), drop = FALSE]

# Coerce every item to numeric. Factors go through character first so that
# their labels, not their internal integer codes, are parsed.
raw_items <- df[item_cols]
df[item_cols] <- lapply(raw_items, function(x) {
  if (is.factor(x)) x <- as.character(x)
  suppressWarnings(as.numeric(x))
})

# suppressWarnings() hides "NAs introduced by coercion", so report once how
# many observed values could not be parsed and are now treated as missing.
n_unparsed <- vapply(
  item_cols,
  function(cc) sum(!is.na(raw_items[[cc]]) & is.na(df[[cc]])),
  integer(1)
)
if (any(n_unparsed > 0)) {
  warning(
    "Non-numeric item values were converted to NA: ",
    paste0(
      item_cols[n_unparsed > 0], " (", n_unparsed[n_unparsed > 0], ")",
      collapse = ", "
    ),
    call. = FALSE
  )
}

# Optionally standardise each item to mean 0, SD 1.
if (isTRUE(scale_items)) {
  raw_matrix <- as.matrix(df[item_cols])
  scaled_matrix <- scale(raw_matrix)

  # scale() divides by each column's SD, so an item with no variance turns
  # into NaN (0 / 0) for every participant. Such an item carries no
  # information: keep its observed values at the centred value 0 instead.
  item_sd <- attr(scaled_matrix, "scaled:scale")
  flat_cols <- which(!is.na(item_sd) & item_sd == 0)
  if (length(flat_cols) > 0) {
    warning(
      "Items with zero variance were set to 0 after centring: ",
      paste(colnames(raw_matrix)[flat_cols], collapse = ", "),
      call. = FALSE
    )
    for (j in flat_cols) {
      scaled_matrix[!is.na(raw_matrix[, j]), j] <- 0
    }
  }

  df[item_cols] <- as.data.frame(scaled_matrix)
}

# Handle missing data according to na_action:
#   listwise      - drop every row that contains a missing value
#   mean_impute   - fill each item's missing values with that item's mean
#   median_impute - fill each item's missing values with that item's median
# Any other value is rejected.
df_clean <- switch(
  na_action,
  listwise = na.omit(df),
  mean_impute = replace(df, item_cols, lapply(df[item_cols], impute_mean)),
  median_impute = replace(df, item_cols, lapply(df[item_cols], impute_median)),
  stop("Invalid na_action: use listwise / mean_impute / median_impute")
)

# Optional structural preview of the cleaned data.
if (isTRUE(show_data_glimpse)) {
  dplyr::glimpse(df_clean)
}
## Rows: 120
## Columns: 9
## $ SEQN   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
## $ DPQ010 <dbl> 0, 3, 2, 0, 1, 0, 2, 2, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 1, 2, 0, …
## $ DPQ020 <dbl> 1, 3, 2, 2, 3, 2, 0, 1, 1, 0, 2, 2, 1, 2, 1, 0, 3, 1, 3, 0, 3, …
## $ DPQ030 <dbl> 3, 3, 2, 1, 0, 3, 2, 3, 0, 1, 0, 3, 2, 0, 0, 2, 3, 2, 0, 3, 3, …
## $ DPQ040 <dbl> 2, 1, 0, 3, 3, 0, 3, 1, 2, 2, 2, 2, 1, 3, 0, 3, 1, 0, 0, 2, 1, …
## $ DPQ050 <dbl> 3, 0, 2, 2, 2, 3, 1, 0, 0, 1, 2, 3, 2, 1, 3, 2, 0, 2, 3, 3, 2, …
## $ DPQ060 <dbl> 2, 0, 1, 0, 0, 3, 2, 1, 1, 1, 0, 2, 2, 2, 3, 1, 1, 1, 0, 3, 2, …
## $ DPQ070 <dbl> 0, 1, 3, 3, 1, 0, 3, 2, 2, 1, 0, 0, 0, 3, 0, 1, 1, 2, 2, 3, 3, …
## $ DPQ080 <dbl> 3, 3, 1, 2, 0, 3, 0, 2, 3, 2, 1, 0, 0, 1, 3, 0, 1, 1, 1, 3, 1, …

2 Derive Personalized Network Features

If scale_items = TRUE, all network features are computed from standardized item scores (mean 0, SD 1).

2.1 Strength centrality

# Number of items. Deliberately not named `p`, which the setup chunk uses
# as an alias for the parameter list.
n_items <- length(item_cols)
strength_names <- paste0("Strength_", item_cols)
n_used <- nrow(df_clean)
ids <- df_clean[[id_col]]

# Convert the item scores to a plain numeric matrix once, rather than
# extracting a data-frame row on every loop iteration.
item_scores <- unname(as.matrix(df_clean[item_cols]))

# One row per participant, one strength value per item.
strength_mat <- matrix(
  NA_real_,
  nrow = n_used, ncol = n_items,
  dimnames = list(NULL, strength_names)
)

for (i in seq_len(n_used)) {
  # Strength = row sums of the participant's co-activation matrix.
  strength_mat[i, ] <- strength_from_A(adj_outer(item_scores[i, ]))
}

# Attach the participant ID as the first column.
strength_df <- cbind(
  setNames(data.frame(ids), id_col),
  as.data.frame(strength_mat)
)

Note. For each participant, PNUT constructs an individual co-activation matrix \(A\) using the outer product of item responses, \(A_{jk} = x_j x_k\), with the diagonal set to 0 (\(A_{jj}=0\)).
Strength for item \(j\) is then computed as the row-sum of \(A\): \(\mathrm{Strength}_j = \sum_{k \ne j} A_{jk}\).
This is a within-person, response-derived connectivity feature, not a group-estimated correlation/partial-correlation network parameter.

2.2 Edge weights

# All unique item pairs (j < k), one pair per column of pairs_idx.
pairs_idx <- combn(seq_along(item_cols), 2)
first_item <- pairs_idx[1, ]
second_item <- pairs_idx[2, ]
edge_names <- paste0("EW_", item_cols[first_item], "_", item_cols[second_item])

# The edge weight for pair (j, k) is x_j * x_k, so every participant's
# edges can be computed at once as an element-wise product of two column
# selections; no per-participant matrix is needed.
item_scores <- unname(as.matrix(df_clean[item_cols]))
edge_mat <- item_scores[, first_item, drop = FALSE] *
  item_scores[, second_item, drop = FALSE]
storage.mode(edge_mat) <- "double"
dimnames(edge_mat) <- list(NULL, edge_names)

# Attach the participant ID as the first column.
edges_df <- cbind(
  setNames(data.frame(ids), id_col),
  as.data.frame(edge_mat)
)

Note. For each participant, PNUT constructs an individual co-activation matrix \(A\) using the outer product of item responses, \(A_{jk} = x_j x_k\), with the diagonal set to 0.
The edge weight between items \(j\) and \(k\) is then defined as the corresponding off-diagonal entry of \(A\): \(\mathrm{EW}_{jk} = A_{jk} = x_j x_k\) (reported for unique pairs only, i.e., the upper triangle of \(A\)).
These edge weights are within-person, response-derived co-activation features, not correlation/partial-correlation network parameters estimated from group-level data.

2.3 Density

BinaryDensity counts the fraction of “active” item pairs; binary_edge_rule controls whether “active” means co-activation > 0 (A_gt_0) or both items > 0 (x_gt_0).

# One row per participant, one column per graph-level summary.
density_names <- c("BinaryDensity", "WeightedDensityMean", "WeightedDensitySum")
summary_mat <- matrix(
  NA_real_,
  nrow = n_used, ncol = length(density_names),
  dimnames = list(NULL, density_names)
)

# Item scores as a plain numeric matrix, so each row is a response vector.
item_values <- unname(as.matrix(df_clean[item_cols]))

for (i in seq_len(n_used)) {
  x <- item_values[i, ]
  A <- adj_outer(x)
  summary_mat[i, ] <- connectivity_summaries(x, A, rule = binary_edge_rule)
}

# Attach the participant ID as the first column.
id_frame <- setNames(data.frame(ids), id_col)
summary_df <- cbind(id_frame, as.data.frame(summary_mat))

Note. PNUT summarizes each participant’s co-activation matrix \(A\) into graph-level connectivity features. Using the unique edge set (the upper triangle of \(A\), i.e., the \(m = p(p-1)/2\) item pairs for \(p\) items), PNUT reports: 1) BinaryDensity: the proportion of edges that are “active”. Under the default rule (A_gt_0) an edge is active when \(A_{jk} > 0\); under x_gt_0 it is active when both \(x_j > 0\) and \(x_k > 0\). This corresponds to a count-based connectivity notion (i.e., how many item pairs are “active”) used in the NAMU paper. 2) WeightedDensityMean: the mean of the edge weights, \(\frac{1}{m}\sum_{j<k} A_{jk}\). 3) WeightedDensitySum: the sum of the edge weights, \(\sum_{j<k} A_{jk}\).

These density summaries are within-person, response-derived co-activation features, not group-estimated correlation/partial-correlation network parameters.

2.4 Merge features

# strength_df, edges_df and summary_df were all built from the rows of
# df_clean in the same order, so they are combined by position. Joining
# them on the ID would multiply rows and mix features between records
# whenever an ID is duplicated or missing.
stopifnot(
  nrow(strength_df) == nrow(df_clean),
  nrow(edges_df) == nrow(df_clean),
  nrow(summary_df) == nrow(df_clean),
  identical(edges_df[[1]], strength_df[[1]]),
  identical(summary_df[[1]], strength_df[[1]])
)

# Column 1 of each frame is the ID; keep it once.
features_df <- cbind(strength_df, edges_df[-1], summary_df[-1])

# df_clean keeps the row names of the rows it retained from dat, which
# gives an exact row-to-row mapping back to the full data.
row_in_dat <- match(rownames(df_clean), rownames(dat))
stopifnot(!anyNA(row_in_dat))

# Rows of dat that were dropped during cleaning receive NA features.
feature_values <- features_df[-1]
aligned <- feature_values[
  match(seq_len(nrow(dat)), row_in_dat), ,
  drop = FALSE
]
rownames(aligned) <- NULL

out <- cbind(dat, aligned)

3 Quality checks

dup_n should be 0 (no duplicated IDs). miss_rates shows the proportion of missing derived features. This depends on na_action and missingness in the raw items.

# ID checks: number of repeated IDs and number of missing IDs in the output.
id_values <- out[[id_col]]
dup_n <- sum(duplicated(id_values))
na_id <- sum(is.na(id_values))

# Proportion of missing values per derived feature, as a one-column matrix
# with one row per feature.
miss_cols <- c(
  colnames(strength_mat),
  colnames(edge_mat),
  colnames(summary_mat)
)
miss_rates <- as.matrix(colMeans(is.na(out[miss_cols])))

4 Visualization

# Histogram of BinaryDensity across all rows of `out`; the number of bins
# comes from the histogram_bins setting.
ggplot(data = out, mapping = aes(x = BinaryDensity)) +
  geom_histogram(bins = histogram_bins) +
  theme_minimal()

5 Output

# Make sure the destination folder exists before writing.
out_dir <- dirname(output_file)
if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = TRUE)
}

# Save the original data plus the derived features, then confirm the path.
utils::write.csv(out, file = output_file, row.names = FALSE)
cat("Saved:", output_file, "\n")
## Saved: output/PNUT_outputs.csv

6 Reproducibility

# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_4.0.1 tidyr_1.3.1   dplyr_1.1.4  
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        cli_3.6.5          knitr_1.50         rlang_1.1.6       
##  [5] xfun_0.53          purrr_1.2.0        generics_0.1.4     S7_0.2.0          
##  [9] jsonlite_2.0.0     labeling_0.4.3     glue_1.8.0         htmltools_0.5.8.1 
## [13] sass_0.4.10        scales_1.4.0       rmarkdown_2.29     grid_4.5.1        
## [17] evaluate_1.0.5     jquerylib_0.1.4    tibble_3.3.0       fastmap_1.2.0     
## [21] yaml_2.3.10        lifecycle_1.0.4    compiler_4.5.1     RColorBrewer_1.1-3
## [25] pkgconfig_2.0.3    rstudioapi_0.17.1  farver_2.1.2       digest_0.6.37     
## [29] R6_2.6.1           tidyselect_1.2.1   pillar_1.11.0      magrittr_2.0.3    
## [33] bslib_0.9.0        withr_3.0.2        gtable_0.3.6       tools_4.5.1       
## [37] cachem_1.1.0

Citation: Kim, H. (2026). PNUT: Personalized Network Utility Toolkit. GitHub repository.