PNUT derives personalized (individual-level) network features from questionnaire item responses to enable Network-Augmented Machine Learning Utility (NAMU). Specifically, PNUT converts each participant’s item-response profile into an individualized symptom co-activation network and extracts network-informed predictors (e.g., node strength, pairwise edge weights, and graph-level connectivity summaries such as binary/weighted density). These features can be used alongside conventional predictors in machine learning pipelines to incorporate both symptom severity and symptom–symptom structure into NAMU-based modeling.
For additional details on the NAMU framework and its rationale, please
refer to the following paper:
Kim, H.,
Yocum, A., McInnis, M., & Sperry, S. H. (2025, September).
Predicting suicidal ideation from depression screening data: A
network-augmented machine learning approach. In revision at the
Journal of Affective Disorders. https://doi.org/10.31234/osf.io/rqyvx_v2
library(dplyr)
library(tidyr)
library(ggplot2)
To run PNUT, edit the YAML header:
params: input_file — path to your CSV; params: id_col — participant ID column name;
params: item_cols — list of item column names (recommended); or
params: item_regex — pattern to select item columns automatically.
If both are provided, PNUT uses item_cols.
# Unpack the YAML `params` list into plain top-level variables so the rest
# of the document can reference them directly.
p <- params
input_file <- p$input_file
input_file_has_headers <- p$input_file_has_headers  # passed to read.csv(header = ...)
id_col <- p$id_col
item_cols <- p$item_cols    # explicit item column names (takes precedence over item_regex)
item_regex <- p$item_regex  # fallback pattern used to select item columns by name
na_action <- p$na_action    # "listwise" / "mean_impute" / "median_impute" (validated later)
scale_items <- p$scale_items            # if TRUE, items are standardized before networks
binary_edge_rule <- p$binary_edge_rule  # "A_gt_0" or "x_gt_0" (see connectivity_summaries)
output_file <- p$output_file
show_data_glimpse <- p$show_data_glimpse
histogram_bins <- p$histogram_bins
# Echo the resolved input path for the rendered report.
cat("input_file:", input_file, "\n")
## input_file: data/combined_data.csv
# Build a participant's co-activation matrix from an item-response vector:
# A[j, k] = x_j * x_k (outer product), with self-loops removed (diagonal = 0).
adj_outer <- function(x) {
  co_act <- outer(x, x)
  diag(co_act) <- 0
  co_act
}
# Node strength for each item: row sums of the co-activation matrix,
# i.e. Strength_j = sum over k != j of A[j, k].
strength_from_A <- function(A) {
  rowSums(A)
}
# Unique undirected edge weights: the upper triangle of A,
# returned in column-major order.
edge_vector_from_A <- function(A) {
  A[upper.tri(A)]
}
# Graph-level connectivity summaries for one participant.
#
# x    : the participant's item-response vector.
# A    : the participant's co-activation matrix (diagonal assumed 0).
# rule : how an edge counts as "active" for BinaryDensity —
#        "A_gt_0": co-activation weight A[j, k] > 0 (default), or
#        "x_gt_0": both items responded > 0.
#
# Returns a named numeric vector with BinaryDensity (proportion of active
# unique edges), WeightedDensityMean, and WeightedDensitySum (mean/sum of
# the upper-triangle edge weights).
connectivity_summaries <- function(x, A, rule = c("A_gt_0", "x_gt_0")) {
  rule <- match.arg(rule)
  edges <- A[upper.tri(A)]
  if (rule == "A_gt_0") {
    active_prop <- mean(edges > 0)
  } else {
    both_positive <- outer(x > 0, x > 0, "&")
    diag(both_positive) <- FALSE  # ignore self-pairs
    active_prop <- mean(both_positive[upper.tri(both_positive)])
  }
  c(
    BinaryDensity = active_prop,
    WeightedDensityMean = mean(edges),
    WeightedDensitySum = sum(edges)
  )
}
# Replace missing values in v with the mean of the observed values.
impute_mean <- function(v) {
  v[is.na(v)] <- mean(v, na.rm = TRUE)
  v
}
# Replace missing values in v with the median of the observed values.
impute_median <- function(v) {
  v[is.na(v)] <- median(v, na.rm = TRUE)
  v
}
Put your CSV in the same folder as this .Rmd
(recommended) or in data/. Update YAML:
params: input_file, params: id_col, and
params: item_cols (or item_regex).
If the file is not found, PNUT generates a synthetic demo
dataset (demo/testing only; not for interpretation). In this
template, DPQ010–DPQ080 illustrate PHQ-8
items.
if (file.exists(input_file)) {
  # Real-data path: read the user's CSV as configured in the YAML header.
  dat <- read.csv(input_file, header = input_file_has_headers)
} else {
  # Fallback: synthetic PHQ-8-style demo data (demo/testing only; not for
  # interpretation). Seeded so the rendered document is reproducible.
  set.seed(1)
  demo_items <- paste0("DPQ0", 1:8, "0")  # DPQ010 ... DPQ080
  dat <- data.frame(SEQN = 1:120)
  for (nm in demo_items) {
    dat[[nm]] <- sample(0:3, 120, TRUE)
  }
  id_col <- "SEQN"
  # Only override item_cols when the user supplied none.
  if (is.null(item_cols) || length(item_cols) == 0) {
    item_cols <- demo_items
  }
}
# Resolve item columns: explicit item_cols wins; otherwise fall back to
# selecting column names that match item_regex.
if (is.null(item_cols) || length(item_cols) == 0) {
  if (is.null(item_regex)) stop("Provide item_cols or item_regex.")
  item_cols <- grep(item_regex, names(dat), value = TRUE)
}
# Fail fast if the configured columns are absent from the data.
stopifnot(id_col %in% names(dat))
stopifnot(all(item_cols %in% names(dat)))
df <- dat[, c(id_col, item_cols)]
# Coerce every item column to numeric; non-numeric entries become NA
# (warnings suppressed) and are handled by the na_action step below.
for (cc in item_cols) {
  col <- df[[cc]]
  if (is.factor(col)) {
    col <- as.character(col)  # avoid factor-level-to-integer coercion
  }
  df[[cc]] <- suppressWarnings(as.numeric(col))
}
# Optionally standardize items (mean 0, SD 1) before building networks.
if (isTRUE(scale_items)) {
  df[item_cols] <- as.data.frame(scale(as.matrix(df[item_cols])))
}
# Handle missing item responses according to the configured na_action.
df_clean <- switch(
  na_action,
  listwise = na.omit(df),
  mean_impute = {
    tmp <- df
    for (cc in item_cols) {
      tmp[[cc]] <- impute_mean(tmp[[cc]])
    }
    tmp
  },
  median_impute = {
    tmp <- df
    for (cc in item_cols) {
      tmp[[cc]] <- impute_median(tmp[[cc]])
    }
    tmp
  },
  stop("Invalid na_action: use listwise / mean_impute / median_impute")
)
if (isTRUE(show_data_glimpse)) dplyr::glimpse(df_clean)
## Rows: 120
## Columns: 9
## $ SEQN <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
## $ DPQ010 <dbl> 0, 3, 2, 0, 1, 0, 2, 2, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 1, 2, 0, …
## $ DPQ020 <dbl> 1, 3, 2, 2, 3, 2, 0, 1, 1, 0, 2, 2, 1, 2, 1, 0, 3, 1, 3, 0, 3, …
## $ DPQ030 <dbl> 3, 3, 2, 1, 0, 3, 2, 3, 0, 1, 0, 3, 2, 0, 0, 2, 3, 2, 0, 3, 3, …
## $ DPQ040 <dbl> 2, 1, 0, 3, 3, 0, 3, 1, 2, 2, 2, 2, 1, 3, 0, 3, 1, 0, 0, 2, 1, …
## $ DPQ050 <dbl> 3, 0, 2, 2, 2, 3, 1, 0, 0, 1, 2, 3, 2, 1, 3, 2, 0, 2, 3, 3, 2, …
## $ DPQ060 <dbl> 2, 0, 1, 0, 0, 3, 2, 1, 1, 1, 0, 2, 2, 2, 3, 1, 1, 1, 0, 3, 2, …
## $ DPQ070 <dbl> 0, 1, 3, 3, 1, 0, 3, 2, 2, 1, 0, 0, 0, 3, 0, 1, 1, 2, 2, 3, 3, …
## $ DPQ080 <dbl> 3, 3, 1, 2, 0, 3, 0, 2, 3, 2, 1, 0, 0, 1, 3, 0, 1, 1, 1, 3, 1, …
If scale_items = TRUE, all network features are computed
from standardized item scores (mean 0, SD 1).
p <- length(item_cols)
strength_names <- paste0("Strength_", item_cols)
n_used <- nrow(df_clean)
ids <- df_clean[[id_col]]
# One row per participant: node strength of each item in that person's
# co-activation network (vapply guarantees a numeric(p) result per row).
strength_mat <- t(vapply(
  seq_len(n_used),
  function(i) {
    resp <- as.numeric(df_clean[i, item_cols])
    strength_from_A(adj_outer(resp))
  },
  numeric(p)
))
colnames(strength_mat) <- strength_names
strength_df <- cbind(
  setNames(data.frame(ids), id_col),
  as.data.frame(strength_mat)
)
Note. For each participant, PNUT constructs
an individual co-activation matrix \(A\) using the outer product of item
responses, \(A_{jk} = x_j x_k\), with
the diagonal set to 0 (\(A_{jj}=0\)).
Strength for item \(j\) is then
computed as the row-sum of \(A\): \(\mathrm{Strength}_j = \sum_{k \ne j}
A_{jk}\).
This is a within-person, response-derived connectivity feature, not a
group-estimated correlation/partial-correlation network parameter.
# Enumerate unique item pairs (j < k) and name the edge-weight columns.
pairs_idx <- combn(seq_along(item_cols), 2)
edge_names <- paste0("EW_", item_cols[pairs_idx[1, ]], "_", item_cols[pairs_idx[2, ]])
# One row per participant: A[j, k] for every unique pair, extracted via
# matrix indexing with the (j, k) index pairs.
edge_mat <- t(vapply(
  seq_len(n_used),
  function(i) {
    A <- adj_outer(as.numeric(df_clean[i, item_cols]))
    A[t(pairs_idx)]
  },
  numeric(ncol(pairs_idx))
))
colnames(edge_mat) <- edge_names
edges_df <- cbind(
  setNames(data.frame(ids), id_col),
  as.data.frame(edge_mat)
)
Note. For each participant, PNUT constructs
an individual co-activation matrix \(A\) using the outer product of item
responses, \(A_{jk} = x_j x_k\), with
the diagonal set to 0.
The edge weight between items \(j\) and
\(k\) is then defined as the
corresponding off-diagonal entry of \(A\): \(\mathrm{EW}_{jk} = A_{jk} = x_j x_k\)
(reported for unique pairs only, i.e., the upper triangle of \(A\)).
These edge weights are within-person, response-derived co-activation
features, not correlation/partial-correlation network parameters
estimated from group-level data.
BinaryDensity counts the fraction of “active” item pairs;
binary_edge_rule controls whether “active” means
co-activation > 0 (A_gt_0) or both items > 0
(x_gt_0).
# One row per participant: graph-level connectivity summaries of their
# co-activation network, under the configured binary_edge_rule.
summary_names <- c("BinaryDensity", "WeightedDensityMean", "WeightedDensitySum")
summary_mat <- t(vapply(
  seq_len(n_used),
  function(i) {
    resp <- as.numeric(df_clean[i, item_cols])
    connectivity_summaries(resp, adj_outer(resp), rule = binary_edge_rule)
  },
  numeric(3)
))
colnames(summary_mat) <- summary_names
summary_df <- cbind(
  setNames(data.frame(ids), id_col),
  as.data.frame(summary_mat)
)
Note. PNUT summarizes each participant’s co-activation matrix \(A\) into graph-level connectivity features. Using the unique edge set (upper triangle of \(A\)), PNUT reports: 1) BinaryDensity: the proportion of edges that are nonzero (default rule: \(A_{jk} > 0\)). This corresponds to a count-based connectivity notion (i.e., how many item pairs are “active”) used in the NAMU paper. 2) WeightedDensityMean: the mean of the edge weights, \(\frac{1}{m}\sum_{j<k} A_{jk}\). 3) WeightedDensitySum: the sum of the edge weights, \(\sum_{j<k} A_{jk}\).
These density summaries are within-person, response-derived co-activation features, not group-estimated correlation/partial-correlation network parameters.
# Merge the three feature tables on the participant ID, then attach the
# derived features back onto the original data.
features_df <- strength_df %>%
  left_join(edges_df, by = id_col) %>%
  left_join(summary_df, by = id_col)
out <- dat %>%
  left_join(features_df, by = id_col)
dup_n should be 0 (no duplicated IDs).
miss_rates shows the proportion of missing derived
features. This depends on na_action and missingness in the
raw items.
# Quality checks: duplicated or missing participant IDs in the output.
dup_n <- sum(duplicated(out[[id_col]]))
na_id <- sum(is.na(out[[id_col]]))
# Missingness rate of every derived feature column (depends on na_action
# and missingness in the raw items).
miss_cols <- c(colnames(strength_mat),colnames(edge_mat),colnames(summary_mat))
miss_rates <- out %>% summarize(across(all_of(miss_cols), ~mean(is.na(.)))) %>% t()
# Distribution of per-participant binary density across the sample.
ggplot(out, aes(x=BinaryDensity)) +
  geom_histogram(bins=histogram_bins) +
  theme_minimal()
# Write the augmented dataset, creating the output directory if needed.
out_dir <- dirname(output_file)
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
write.csv(out, file = output_file, row.names = FALSE)
cat("Saved:", output_file, "\n")
## Saved: output/PNUT_outputs.csv
# Record the R version and package versions for reproducibility.
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_4.0.1 tidyr_1.3.1 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.5 cli_3.6.5 knitr_1.50 rlang_1.1.6
## [5] xfun_0.53 purrr_1.2.0 generics_0.1.4 S7_0.2.0
## [9] jsonlite_2.0.0 labeling_0.4.3 glue_1.8.0 htmltools_0.5.8.1
## [13] sass_0.4.10 scales_1.4.0 rmarkdown_2.29 grid_4.5.1
## [17] evaluate_1.0.5 jquerylib_0.1.4 tibble_3.3.0 fastmap_1.2.0
## [21] yaml_2.3.10 lifecycle_1.0.4 compiler_4.5.1 RColorBrewer_1.1-3
## [25] pkgconfig_2.0.3 rstudioapi_0.17.1 farver_2.1.2 digest_0.6.37
## [29] R6_2.6.1 tidyselect_1.2.1 pillar_1.11.0 magrittr_2.0.3
## [33] bslib_0.9.0 withr_3.0.2 gtable_0.3.6 tools_4.5.1
## [37] cachem_1.1.0
Citation: Kim, H. (2026). PNUT: Personalized Network Utility Toolkit. GitHub repository.