<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e86379</article-id><article-id pub-id-type="doi">10.2196/86379</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Beyond Area Under the Receiver Operating Characteristic Curve: Evaluating Predictive Performance Metrics Under Class Imbalance in Real-World Clinical Data</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ventura</surname><given-names>Vanessa das Gra&#x00E7;as Jos&#x00E9;</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Andrade</surname><given-names>Claudio Mois&#x00E9;s Valiense de</given-names></name><degrees>BSc, MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Almeida</surname><given-names>Jussara Marques de</given-names></name><degrees>MSc, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Pessoa</surname><given-names>Bruno Porto</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Polanczyk</surname><given-names>Car&#x00ED;si Anne</given-names></name><degrees>MSc, MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nascimento</surname><given-names>Guilherme Fonseca do</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Boersma</surname><given-names>Eric</given-names></name><degrees>MSc, PhD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Vianna</surname><given-names>Heloisa Reniers</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Farah</surname><given-names>Katia de Paula</given-names></name><degrees>MSc, MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rocha</surname><given-names>Leonardo Chaves Dutra da</given-names></name><degrees>MSc, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gon&#x00E7;alves</surname><given-names>Marcos Andr&#x00E9;</given-names></name><degrees>MSc, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Marcolino</surname><given-names>Milena Soriano</given-names></name><degrees>MSc, MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff9">9</xref><xref 
ref-type="aff" rid="aff10">10</xref></contrib></contrib-group><aff id="aff1"><institution>Medical School and University Hospital, Universidade Federal de Minas Gerais</institution><addr-line>Avenida Alfredo Balena, 110</addr-line><addr-line>Belo Horizonte</addr-line><country>Brazil</country></aff><aff id="aff2"><institution>Department of Computer Science, Universidade Federal de Minas Gerais</institution><addr-line>Belo Horizonte</addr-line><country>Brazil</country></aff><aff id="aff3"><institution>Hospital J&#x00FA;lia Kubitschek</institution><addr-line>Belo Horizonte</addr-line><country>Brazil</country></aff><aff id="aff4"><institution>Department of Internal Medicine, Medical School, Universidade Federal do Rio Grande do Sul</institution><addr-line>Porto Alegre</addr-line><country>Brazil</country></aff><aff id="aff5"><institution>Institute for Health Assessment and Translation for Chronic and Neglected Diseases of High RElevance (IATS-CARE)</institution><addr-line>Belo Horizonte</addr-line><country>Brazil</country></aff><aff id="aff6"><institution>Hospital Moinhos de Vento</institution><addr-line>Porto Alegre</addr-line><country>Brazil</country></aff><aff id="aff7"><institution>Erasmus University Medical Center</institution><addr-line>Rotterdam</addr-line><country>The Netherlands</country></aff><aff id="aff8"><institution>Hospital Universit&#x00E1;rio Ci&#x00EA;ncias M&#x00E9;dicas</institution><addr-line>Belo Horizonte</addr-line><country>Brazil</country></aff><aff id="aff9"><institution>Department of Internal Medicine, Medical School, Medical School and University Hospital, Universidade Federal de Minas Gerais</institution><addr-line>Belo Horizonte</addr-line><country>Brazil</country></aff><aff id="aff10"><institution>Telehealth Center, University Hospital, Universidade Federal de Minas Gerais</institution><addr-line>Belo Horizonte</addr-line><country>Brazil</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Toma</surname><given-names>Milan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Abd-Alsabour</surname><given-names>Nadia</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Vanessa das Gra&#x00E7;as Jos&#x00E9; Ventura, MSc, MD, Medical School and University Hospital, Universidade Federal de Minas Gerais, Avenida Alfredo Balena, 110, Belo Horizonte, 30130-100, Brazil, 55 31991314221; <email>nessachesed@yahoo.com.br</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>24</day><month>6</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e86379</elocation-id><history><date date-type="received"><day>01</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>20</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>22</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Vanessa das Gra&#x00E7;as Jos&#x00E9; Ventura, Claudio Mois&#x00E9;s Valiense de Andrade, Jussara Marques de Almeida, Bruno Porto Pessoa, Car&#x00ED;si Anne Polanczyk, Guilherme Fonseca do Nascimento, Eric Boersma, Heloisa Reniers Vianna, Katia de Paula Farah, Leonardo Chaves Dutra da Rocha, Marcos Andr&#x00E9; Gon&#x00E7;alves, Milena Soriano Marcolino. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 24.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e86379"/><abstract><sec><title>Background</title><p>Predictive models increasingly support clinical decision-making, although imbalanced outcome distributions are common in health care datasets and can distort performance evaluation. The area under the receiver operating characteristic curve (AUROC) remains the most frequently reported metric, despite its limited ability to reflect clinically meaningful performance under class imbalance.</p></sec><sec><title>Objective</title><p>This study aimed to examine the influences of metric selection on the clinical interpretation of predictive models in imbalanced real-world health care data.</p></sec><sec sec-type="methods"><title>Methods</title><p>This was a retrospective cohort study, including 17,018 hospitalized patients with COVID-19. Two predictive models using extreme gradient boosting (XGBoost) were developed to predict kidney replacement therapy (KRT) and mortality. 
Model performance was assessed using AUROC, macro-<italic>F</italic><sub>1</sub>-score, class-specific precision and recall, calibration (curve, slope, and intercept), decision curve analysis, and learning curves. Standard rebalancing strategies were applied exclusively to the training data to evaluate their impact on performance.</p></sec><sec sec-type="results"><title>Results</title><p>KRT occurred in 9.5% of patients, and mortality was 18.0%. Although AUROC values were high (0.928 for KRT and 0.945 for mortality), performance in the minority class was substantially lower. For KRT, precision was 0.539 and recall 0.372; for mortality, precision was 0.725 and recall 0.718. Rebalancing strategies were associated with higher recall for the minority class, but this gain was accompanied by a reduction in precision, with minimal impact on AUROC values. As a result, AUROC remained high despite clinically relevant changes in error distribution between false positives and false negatives. The learning curves showed a plateau-like shape, with stable validation performance across all training set sizes for both outcomes.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>AUROC alone is insufficient to evaluate prediction models in imbalanced health care scenarios, even with rebalancing. 
Routine reporting of class-aware metrics, alongside learning curve analysis, is essential to support robust and clinically meaningful evaluation of predictive models, rather than their direct translation into practice.</p></sec></abstract><kwd-group><kwd>predictive model</kwd><kwd>artificial intelligence</kwd><kwd>learning curve</kwd><kwd>area under the receiver operating characteristic curve</kwd><kwd>F1-score</kwd><kwd>performance metrics</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Clinical prediction models are increasingly used in health care to support diagnostic, prognostic, and therapeutic decisions [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Their adoption has expanded with advances in machine learning (ML) and access to large-scale electronic health data, enabling the development of models with the potential to improve risk stratification and personalized care [<xref ref-type="bibr" rid="ref3">3</xref>]. However, the evaluation of these models frequently relies on metrics that may not reflect real-world clinical usefulness, especially when outcome distributions are imbalanced [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>The area under the receiver operating characteristic curve (AUROC) is the most commonly reported metric in clinical prediction research [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Although AUROC is intuitive and threshold-independent, it often overestimates performance in datasets in which one class (eg, survival or absence of disease) predominates, a common characteristic in clinical datasets [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. 
In such settings, AUROC may suggest high discriminative ability while concealing poor sensitivity for minority-class outcomes, such as death or the need for critical interventions [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>For example, risk prediction scores have long been recommended in clinical practice to guide preventive strategies, particularly in cardiovascular disease. The Framingham score, once widely used for predicting 10-year cardiovascular outcomes, illustrates the risk of relying on AUROC-based metrics [<xref ref-type="bibr" rid="ref10">10</xref>]. Although it showed acceptable discrimination (C-statistic: 0.763 for men and 0.793 for women) [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>], the dataset exhibited class imbalance (10.08% women and 18.09% men with outcomes) [<xref ref-type="bibr" rid="ref10">10</xref>], and the score likely performed better for healthy individuals while failing to identify many at-risk patients early, potentially missing opportunities for interventions that could have improved outcomes [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>For this reason, calibration measures are essential complements to discrimination, since high AUROC values do not necessarily guarantee reliable probability estimates or clinical applicability [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. 
More recent cardiovascular risk prediction models, such as SCORE2, incorporated improved calibration across European populations, but their performance reporting still relies heavily on AUROC [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>This illustrates a broader issue: even when calibration is addressed, discrimination metrics alone can mask poor identification of minority outcomes, underscoring the need for comprehensive evaluation strategies. Additionally, although the Hosmer-Lemeshow test is a commonly used goodness-of-fit test for logistic regression models, it is less suitable for ML models due to its sensitivity to sample size and arbitrary grouping of predicted probabilities [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>While these limitations are widely recognized in the ML literature, clinical studies continue to prioritize AUROC in model reporting [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>]. Most discussions on metric limitations remain either theoretical or based on synthetic datasets [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. As a result, model outputs often lack interpretability and applicability for health professionals, limiting their practical relevance for clinical implementation [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. 
There are few applied studies using large real-world clinical datasets that demonstrate, in concrete terms, how metric selection affects the identification of high-risk patients and subsequent clinical decisions [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>Recently, Carriero et al [<xref ref-type="bibr" rid="ref28">28</xref>] reported the challenges posed by imbalanced datasets in predictive modeling, showing that common strategies to deal with class-imbalance issues, such as oversampling and undersampling, may compromise calibration, leading to overestimated risk predictions and systematic bias [<xref ref-type="bibr" rid="ref28">28</xref>]. These findings highlight the need for evaluation strategies that move beyond AUROC and artificial rebalancing, offering instead a comprehensive assessment of model performance that prioritizes clinical reliability and patient safety.</p><p>This study addresses this gap by applying a structured evaluation of predictive model performance in a real-world clinical setting, using a large, multicenter dataset of hospitalized patients with COVID-19 in Brazil. As a case study to illustrate the impact of class imbalance on model evaluation, we developed 2 ML models to predict kidney replacement therapy (KRT) and in-hospital mortality, outcomes with different prevalence levels, and assessed them using metrics that capture different aspects of model performance. Rather than proposing new predictive models, this study focuses on how commonly used performance metrics influence the interpretation of model usefulness in real-world, imbalanced clinical settings. Beyond AUROC, we focused on class-specific precision, recall, and macro-<italic>F</italic><sub>1</sub>-scores, which, although well-established in data science, remain underutilized in clinical contexts. 
Additionally, we critically examine how metric selection influences clinical interpretation in imbalanced scenarios, making our findings relevant not only to data scientists but also to health care professionals. In doing so, this study helps bridge the gap between methodological rigor and clinical applicability in predictive model evaluation.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This was a retrospective cohort study. We collected data on consecutive adult patients (aged 18 years and older) with laboratory-confirmed COVID-19 [<xref ref-type="bibr" rid="ref29">29</xref>], admitted to one of 41 participating hospitals in Brazil from March 2020 to August 2022. Details of the cohort have been published elsewhere [<xref ref-type="bibr" rid="ref30">30</xref>]. Pregnant women; patients undergoing palliative treatment, or with a history of prior KRT or already on KRT upon hospital presentation; and those who were transferred from or to another hospital were excluded from this particular analysis (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><p>Two predictive models were developed and validated: 1 for KRT and 1 for in-hospital mortality. Both outcomes presented imbalanced class distributions, but in different proportions, and were used as case studies. These outcomes were selected due to their clinical relevance and prognostic implications in hospitalized patients with COVID-19 [<xref ref-type="bibr" rid="ref31">31</xref>].</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the patients included in the study. 
KRT: kidney replacement therapy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e86379_fig01.png"/></fig></sec><sec id="s2-2"><title>Data Collection</title><p>Sociodemographic, clinical, and laboratory data; medications; interventions; and outcomes were extracted from medical records by trained researchers using the REDCap (Research Electronic Data Capture) electronic platform [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>], hosted at the Telehealth Center of the University Hospital of the <italic>Universidade Federal de Minas Gerais</italic> [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. An automated data verification algorithm was implemented to ensure data quality, checking for inconsistencies. Any discrepancies were resolved in consultation with the coordinating researchers.</p></sec><sec id="s2-3"><title>Predictors and Outcome Definition</title><p>Candidate predictors were selected based on clinical relevance, prior literature, and data availability (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app2">2</xref>). No automated feature selection was applied, as the study objective was not to maximize predictive performance but to assess how different evaluation metrics behave under identical modeling conditions. The same predictor set was maintained across all experiments to ensure comparability between models and evaluation strategies. KRT was defined as the initiation of dialysis during hospitalization, excluding patients with preexisting chronic dialysis. In-hospital mortality refers to death occurring during hospitalization, as documented in medical records. 
Patients were classified into binary outcome groups for each end point (KRT vs no KRT; death vs survival).</p></sec><sec id="s2-4"><title>The Predictive Models</title><p>Extreme gradient boosting (XGBoost) was chosen due to its strong performance in structured clinical data, ability to capture nonlinear relationships, native handling of missing values, and favorable calibration properties reported in prior studies [<xref ref-type="bibr" rid="ref36">36</xref>-<xref ref-type="bibr" rid="ref38">38</xref>]. Since XGBoost supports missing values natively, no imputation method was used in the primary analysis.</p></sec><sec id="s2-5"><title>Cross-Validation and Modeling Pipeline</title><p>A 10-fold stratified cross-validation strategy was used. In each iteration, 1 fold was held out as the test set, while the remaining 9 folds constituted the training set. Within this training partition, a further split was performed to create a validation subset used exclusively for hyperparameter tuning and model selection. All preprocessing and rebalancing procedures that did not require imputation, which are detailed in the next subsection, were performed strictly within the training data of each fold. The test set remained fully held out and was used only for final performance evaluation, preserving the original data distribution and preventing data leakage.</p><p>Three key hyperparameters were systematically explored: (1) booster (gbtree, gblinear, and dart), which defines the base learner used to build the ensemble; (2) eta (learning rate), which controls the step-size shrinkage during boosting to prevent overfitting by making the learning process more conservative; and (3) max_depth, which controls the maximum depth of individual trees, thereby regulating model complexity. 
The complete grid of values evaluated for each hyperparameter is reported in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><p>After selecting the optimal hyperparameter configuration, the model was retrained on the full training data (training + validation) following standard practice and prior work [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>], as well as the default behavior of widely used libraries such as scikit-learn (GridSearchCV with refit=True). The held-out test fold was then used exclusively for final performance evaluation. This process was repeated across all folds, so that each fold served once as the test set, and the reported results correspond to the average performance across the 10 iterations. This strategy ensures robust performance estimation and minimizes data leakage [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. The overview of the analytical pipeline is presented in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Overview of the analytical pipeline applied within each cross-validation iteration. *booster, eta, max_depth. CV: cross-validation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e86379_fig02.png"/></fig></sec><sec id="s2-6"><title>Handling of Class Imbalance</title><p>To assess the impact of data imbalance on model performance, both oversampling and undersampling techniques were applied exclusively on the training data. For each method, default resampling parameters were used, without additional tuning, following the standard implementation of each algorithm. Due to the intrinsic operational characteristics of certain resampling methods, a fully balanced (1:1) class distribution was not always achieved. 
The final class proportions after resampling are presented in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><p>Oversampling techniques included Random Oversampling [<xref ref-type="bibr" rid="ref43">43</xref>], Adaptive Synthetic [<xref ref-type="bibr" rid="ref44">44</xref>], Synthetic Minority Oversampling Technique (SMOTE) [<xref ref-type="bibr" rid="ref45">45</xref>], BorderlineSMOTE [<xref ref-type="bibr" rid="ref46">46</xref>], SVMSMOTE [<xref ref-type="bibr" rid="ref47">47</xref>], and KMeansSMOTE [<xref ref-type="bibr" rid="ref48">48</xref>], which increase minority-class representation through random duplication or synthetic sample generation [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. Undersampling methods included Random Undersampling [<xref ref-type="bibr" rid="ref44">44</xref>], Redundancy-Based Undersampling [<xref ref-type="bibr" rid="ref39">39</xref>], e2sc-us (Effective, Efficient, and Scalable Confidence-Based-UnderSampling) [<xref ref-type="bibr" rid="ref39">39</xref>], Condensed Nearest Neighbor [<xref ref-type="bibr" rid="ref50">50</xref>], Near Miss 1 [<xref ref-type="bibr" rid="ref51">51</xref>], and Near Miss 2 [<xref ref-type="bibr" rid="ref51">51</xref>], which reduce majority-class instances while attempting to preserve relevant decision boundaries [<xref ref-type="bibr" rid="ref39">39</xref>].</p><p>Because most resampling algorithms cannot handle missing values, the MissForest imputation method was incorporated into the pipeline when required, exclusively within the training data for rebalancing experiments [<xref ref-type="bibr" rid="ref52">52</xref>]. 
Importantly, the imputation model was fitted exclusively on the training partition within each cross-validation fold and subsequently applied to the corresponding validation and test sets, thereby preventing information leakage.</p><p>For each resampling strategy, the entire modeling pipeline, including imputation (when applicable), resampling, and model training, was re-executed within each cross-validation iteration. Hyperparameter tuning was performed from scratch for each resampled dataset, ensuring that model optimization was specific to each data configuration. Details of the hyperparameter search space are provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. No information from the test set was used at any stage of model development, including imputation, resampling, or hyperparameter tuning.</p><p>To enable direct comparison across models, a fixed probability threshold of 0.5 was used for classification. This threshold was selected based on its consistent performance across preliminary analyses. All experiments were conducted using a fixed random seed (random_state=42), applied consistently across all stochastic components of the pipeline, including cross-validation splitting, imputation, resampling procedures, and model initialization, ensuring full reproducibility.</p></sec><sec id="s2-7"><title>Performance Evaluation</title><p>Model performance was assessed using a complementary set of metrics that offer different perspectives, capturing global discrimination, class-specific behavior, calibration, and clinical usefulness (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendices 5</xref> and <xref ref-type="supplementary-material" rid="app6">6</xref>). All metrics were computed on the held-out test fold in each iteration and averaged across folds. 
Specifically, we analyzed global metrics, such as accuracy and AUROC, as well as metrics that are more sensitive to class imbalance, such as macro-<italic>F</italic><sub>1</sub> and per-class precision and recall, including precision and recall across both majority and minority classes and the impact of different decision thresholds on the values of these metrics.</p><p>We first evaluated global performance using accuracy and AUROC. These metrics summarize overall discrimination across all instances but treat all elements equally, regardless of their class, which inherently biases these metrics toward the majority class in imbalanced datasets [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Accuracy represents the proportion of correctly classified instances [<xref ref-type="bibr" rid="ref4">4</xref>], while AUROC quantifies the model&#x2019;s ability to rank positive cases higher than negative ones across all decision thresholds [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. However, what is considered a &#x201C;correct&#x201D; prediction depends on the chosen decision threshold for the risk score, as different thresholds influence the balance between sensitivity and specificity [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>To explicitly capture performance under class imbalance, we additionally reported per-class precision, recall, and <italic>F</italic><sub>1</sub>-score, as well as macro-<italic>F</italic><sub>1</sub>, which assigns equal weight to each class regardless of prevalence [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]. To better characterize performance under class imbalance, we first examined class-specific precision and recall, which directly quantify errors for both minority and majority outcomes. 
Regarding the positive class, recall (sensitivity) reflects the proportion of true cases correctly identified, whereas precision (positive predictive value) reflects the proportion of predicted positives that are true events. The same logic applies to the negative class, where recall corresponds to specificity and precision to negative predictive value [<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>].</p><p>In addition to these class-specific metrics, we evaluated resampling strategies using TPRGap, a bias-oriented measure that quantifies performance disparity between classes as the absolute difference between their true-positive rates. This metric directly captures classifier favoritism toward the majority class, which may persist even when global performance measures remain high [<xref ref-type="bibr" rid="ref56">56</xref>]. Finally, to summarize the trade-off between precision and recall in a single indicator, we reported the <italic>F</italic><sub>1</sub>-score and its macroaveraged form, which assigns equal weight to each class and is therefore robust to outcome imbalance [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref57">57</xref>].</p><p>While this perspective is common in the ML literature, it may be less intuitive for health care professionals, who are generally more familiar with metrics such as sensitivity and specificity. By reporting both precision and recall for each class, we provide a more nuanced and clinically interpretable understanding of model performance, especially relevant in the presence of class imbalance [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. 
This approach enables assessment not only of how well the model identifies patients at risk but also of how confidently it excludes those unlikely to experience the outcome. Therefore, it supports a more comprehensive assessment of predictive usefulness and more informed decision-making in clinical applications.</p><p>Primary analyses were conducted using a default probability threshold of 0.5, consistent with standard binary classification practice. To explore clinically relevant trade-offs between missed events and false alarms, we further evaluated precision-recall behavior across varying decision thresholds using precision-recall curves [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. The precision-recall curve was generated by plotting precision against recall at various decision thresholds [<xref ref-type="bibr" rid="ref54">54</xref>].</p><p>Model calibration was assessed using a plot of predicted probability against observed probability and by testing whether the calibration intercept equals 0 and the slope equals 1. In a well-calibrated model, there is agreement between observed and predicted events, allowing the probability to be interpreted as the confidence in the prediction [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]. In addition, the global accuracy of the model was assessed using the Brier score. The Brier score ranges from 0 to 1, with lower values indicating better probabilistic accuracy [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>Clinical usefulness was assessed through decision curve analysis, which quantifies net benefit across a range of decision thresholds compared with &#x201C;treat-all&#x201D; and &#x201C;treat-none&#x201D; strategies [<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. 
While decision curves assess whether model-guided decisions outperform simple strategies, they do not ensure balanced error distribution or detect bias toward the majority class, reinforcing the need for class-specific performance metrics [<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. Finally, the learning curves were used as a graphical representation of how a model&#x2019;s performance evolves as training data are added [<xref ref-type="bibr" rid="ref60">60</xref>].</p></sec><sec id="s2-8"><title>Risk-of-Bias Assessment and Reporting</title><p>This study adheres to the TRIPOD+AI (Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis + Artificial Intelligence) standards for transparent reporting (<xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>) [<xref ref-type="bibr" rid="ref6">6</xref>]. To ensure methodological rigor, we used the PROBAST+AI (Updated Quality, Risk of Bias, and Applicability Assessment Tool for Prediction Models Using Regression or Artificial Intelligence Methods) to assess risk of bias and applicability (<xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>). The study was considered to have a low risk of bias in all domains (participants, predictors, outcomes, and analysis). However, the lack of external validation of the model should be considered as a point of attention in the domain of analysis. Applicability concerns were judged to be low across all domains [<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>The study was approved by the Brazilian National Research Ethics Committee&#x2014;Comiss&#x00E3;o Nacional de &#x00C9;tica em Pesquisa (CAAE 30350820.5.0000.0008) and by the internal ethics boards of each hospital. 
The requirement for individual informed consent was waived due to the pandemic situation and the analysis of deidentified data, based on chart review only.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>The database included 17,018 patients (median age of 60 years, IQR 37&#x2010;83; 54.6% were men). The outcome distributions were highly imbalanced (<xref ref-type="fig" rid="figure3">Figure 3</xref>). Approximately 9.5% (1617/17,018) of the patients underwent KRT, resulting in an imbalance ratio of 9.5:1. Similarly, 18% (3063/17,018) of the patients died, corresponding to an imbalance ratio of 4.6:1.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Imbalanced outcome class distribution. (A) KRT; (B) in-hospital mortality. KRT: kidney replacement therapy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e86379_fig03.png"/></fig></sec><sec id="s3-2"><title>Prediction of KRT</title><p>The predictive XGBoost model demonstrated high overall performance when considering accuracy (0.910) and AUROC (0.928; <xref ref-type="table" rid="table1">Table 1</xref>). However, due to class imbalance, the other metrics revealed some important observations that would otherwise be overlooked. Notably, the macro-<italic>F</italic><sub>1</sub>-score of 0.695 suggests relatively lower performance, specifically for the minority class (KRT=yes; <xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref>). This is mainly due to the low recall (0.372), indicating that the model struggles to correctly identify a large proportion of actual KRT cases. 
The precision for this class was 0.539, resulting in an <italic>F</italic><sub>1</sub>-score of 0.439 (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Global metrics at different cutoff thresholds for kidney replacement therapy<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cutoff</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom">Macro-<italic>F</italic><sub>1</sub></td><td align="left" valign="bottom">Brier</td></tr></thead><tbody><tr><td align="left" valign="top">50%</td><td align="left" valign="top">0.910 (0.906&#x2010;0.914)</td><td align="left" valign="top">0.928 (0.923&#x2010;0.933)</td><td align="left" valign="top">0.695 (0.682&#x2010;0.708)</td><td align="left" valign="top">0.066 (0.063&#x2010;0.069)</td></tr><tr><td align="left" valign="top">40%</td><td align="left" valign="top">0.907 (0.902&#x2010;0.912)</td><td align="left" valign="top">0.930 (0.923&#x2010;0.937)</td><td align="left" valign="top">0.711 (0.694&#x2010;0.728)</td><td align="left" valign="top">0.062 (0.060&#x2010;0.064)</td></tr><tr><td align="left" valign="top">30%</td><td align="left" valign="top">0.901 (0.896&#x2010;0.906)</td><td align="left" valign="top">0.930 (0.923&#x2010;0.937)</td><td align="left" valign="top">0.731 (0.719&#x2010;0.743)</td><td align="left" valign="top">0.062 (0.060&#x2010;0.064)</td></tr><tr><td align="left" valign="top">20%</td><td align="left" valign="top">0.890 (0.884&#x2010;0.896)</td><td align="left" valign="top">0.930 (0.923&#x2010;0.937)</td><td align="left" valign="top">0.742 (0.729&#x2010;0.755)</td><td align="left" valign="top">0.062 (0.060&#x2010;0.064)</td></tr><tr><td align="left" valign="top">10%</td><td align="left" 
valign="top">0.865 (0.861&#x2010;0.869)</td><td align="left" valign="top">0.930 (0.923&#x2010;0.937)</td><td align="left" valign="top">0.731 (0.723&#x2010;0.739)</td><td align="left" valign="top">0.062 (0.060&#x2010;0.064)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Data are presented as mean (95% CI).</p></fn><fn id="table1fn2"><p><sup>b</sup>AUROC: area under the receiver operating characteristic curve.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Per-class metrics at different cutoff thresholds for kidney replacement therapy<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cutoff</td><td align="left" valign="bottom" colspan="2">Precision</td><td align="left" valign="bottom" colspan="2">Recall</td><td align="left" valign="bottom" colspan="2"><italic>F</italic><sub>1</sub></td></tr><tr><td align="left" valign="top"/><td align="left" valign="bottom">KRT<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom">No KRT</td><td align="left" valign="bottom">KRT</td><td align="left" valign="bottom">No KRT</td><td align="left" valign="bottom">KRT</td><td align="left" valign="bottom">No KRT</td></tr></thead><tbody><tr><td align="left" valign="top">50%</td><td align="left" valign="top">0.539<break/>(0.508&#x2010;0.570)</td><td align="left" valign="top">0.936<break/>(0.934&#x2010;0.938)</td><td align="left" valign="top">0.372<break/>(0.345&#x2010;0.399)</td><td align="left" valign="top">0.966<break/>(0.961&#x2010;0.971)</td><td align="left" valign="top">0.439<break/>(0.415&#x2010;0.463)</td><td align="left" valign="top">0.951<break/>(0.949&#x2010;0.953)</td></tr><tr><td align="left" valign="top">40%</td><td align="left" valign="top">0.511<break/>(0.481&#x2010;0.541)</td><td align="left" 
valign="top">0.942<break/>(0.938&#x2010;0.946)</td><td align="left" valign="top">0.442<break/>(0.406&#x2010;0.478)</td><td align="left" valign="top">0.956<break/>(0.953&#x2010;0.959)</td><td align="left" valign="top">0.474<break/>(0.442&#x2010;0.506)</td><td align="left" valign="top">0.949<break/>(0.946&#x2010;0.952)</td></tr><tr><td align="left" valign="top">30%</td><td align="left" valign="top">0.480<break/>(0.460&#x2010;0.500)</td><td align="left" valign="top">0.953<break/>(0.950&#x2010;0.956)</td><td align="left" valign="top">0.563<break/>(0.536&#x2010;0.590)</td><td align="left" valign="top">0.936<break/>(0.933&#x2010;0.939)</td><td align="left" valign="top">0.518<break/>(0.495&#x2010;0.541)</td><td align="left" valign="top">0.945<break/>(0.942&#x2010;0.948)</td></tr><tr><td align="left" valign="top">20%</td><td align="left" valign="top">0.449<break/>(0.430&#x2010;0.468)</td><td align="left" valign="top">0.967<break/>(0.963&#x2010;0.971)</td><td align="left" valign="top">0.701<break/>(0.668&#x2010;0.734)</td><td align="left" valign="top">0.910<break/>(0.905&#x2010;0.915)</td><td align="left" valign="top">0.547<break/>(0.525&#x2010;0.569)</td><td align="left" valign="top">0.937<break/>(0.934&#x2010;0.940)</td></tr><tr><td align="left" valign="top">10%</td><td align="left" valign="top">0.400<break/>(0.390&#x2010;0.410)</td><td align="left" valign="top">0.981<break/>(0.978&#x2010;0.984)</td><td align="left" valign="top">0.837<break/>(0.817&#x2010;0.857)</td><td align="left" valign="top">0.868<break/>(0.864&#x2010;0.872)</td><td align="left" valign="top">0.542<break/>(0.529&#x2010;0.555)</td><td align="left" valign="top">0.921<break/>(0.918&#x2010;0.924)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Data are presented as mean (95% CI).</p></fn><fn id="table2fn2"><p><sup>b</sup>KRT: kidney replacement therapy.</p></fn></table-wrap-foot></table-wrap><p>The confusion matrices for KRT prediction also highlight this trend, showing that 
lowering the threshold from 50% to 10% enhances sensitivity, evidenced by a 174.5% increase in true positives (from 51 to 140). However, this adjustment also leads to a 284.0% rise in false positives (from 50 to 192; <xref ref-type="fig" rid="figure4">Figure 4A</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>(A) Confusion matrix at different cutoff thresholds to predict kidney replacement therapy. (B) Confusion matrix at different cutoff thresholds to predict in-hospital mortality.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e86379_fig04.png"/></fig><p>Changing the cutoff threshold affected the model&#x2019;s precision, recall, and <italic>F</italic><sub>1</sub> values. In this context, considering the class of interest, which is KRT, and the default cutoff threshold of 50%, the precision was 0.539, recall was 0.372, and <italic>F</italic><sub>1</sub>-score was 0.439 (<xref ref-type="table" rid="table2">Table 2</xref>). Lowering the cutoff threshold to 20% resulted in a precision of 0.449, recall of 0.701, and an improved <italic>F</italic><sub>1</sub>-score of 0.547 (<xref ref-type="table" rid="table2">Table 2</xref>). The data presented in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref> elucidate the trade-off between precision and recall, where an increase in precision usually implies a reduction in recall and vice versa.</p><p>The calibration plot shows systematic deviation from the diagonal, with predicted probabilities falling below the diagonal at higher values and above it at lower values, indicating overconfidence at the extremes (<xref ref-type="fig" rid="figure5">Figure 5A</xref>). 
This pattern is further supported by a calibration slope of 0.60 and an intercept of &#x2212;0.14 (<xref ref-type="fig" rid="figure5">Figure 5B</xref>), indicating both overconfident predictions and a slight global overestimation of risk.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>(A) Calibration curve for kidney replacement therapy. (B) Plot showing the calibration slope and intercept for the kidney replacement therapy task. (C) Calibration curve for death. (D) Plot showing the calibration slope and intercept for death.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e86379_fig05.png"/></fig><p>The precision-recall curve for each class (<xref ref-type="fig" rid="figure6">Figure 6A</xref>) showed that the non-KRT class achieved high precision and recall simultaneously, which is desirable, while the KRT class showed a performance closer to random. In other words, the model is relatively good at recalling non-KRT patients (high specificity), but it struggles to identify the ones who underwent KRT (low sensitivity).</p><p>It is observed that the curve for the class that did not undergo KRT remains close to the upper right corner, indicating that the model can achieve high precision and recall rates simultaneously. Conversely, the curve for the minority class of interest (underwent KRT) approaches the diagonal, suggesting that the model is struggling to balance precision and recall, with performance close to random.</p><p>The decision curve analysis (<xref ref-type="supplementary-material" rid="app10">Multimedia Appendix 10</xref>) indicates that the proposed model generates a positive net benefit for low to moderate decision thresholds, starting at approximately 0.10 and gradually decreasing to zero as the threshold increases to 0.5. 
Consequently, the model exhibits a practical benefit for decision thresholds up to 0.5 (threshold &#x2264;0.5), indicating usefulness in scenarios that tolerate decisions based on relatively moderate predicted probabilities. In contrast, the strategy of treating all cases shows a positive net benefit only at very low thresholds, reaching zero around a threshold of 0.10 and becoming increasingly negative thereafter.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>(A) Precision-recall curves for patients undergoing KRT and not undergoing KRT. (B) Precision-recall curves for death and no death. KRT: kidney replacement therapy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e86379_fig06.png"/></fig></sec><sec id="s3-3"><title>Prediction of In-Hospital Mortality</title><p>The predictive XGBoost model achieved high values of accuracy (0.900) and AUROC (0.945; <xref ref-type="table" rid="table3">Table 3</xref>). However, due to class imbalance, the macro-<italic>F</italic><sub>1</sub>-score of 0.830 indicates lower performance (<xref ref-type="table" rid="table3">Table 3</xref>). 
Specifically, for the minority class (deceased=yes), the precision is 0.725, the recall is 0.718, and the <italic>F</italic><sub>1</sub>-score is 0.721 (<xref ref-type="table" rid="table4">Table 4</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Global metrics at different cutoff thresholds for death<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cutoff</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">Macro-<italic>F</italic><sub>1</sub></td><td align="left" valign="bottom">Brier</td></tr></thead><tbody><tr><td align="left" valign="top">50%</td><td align="left" valign="top">0.900 (0.896&#x2010;0.904)</td><td align="left" valign="top">0.945 (0.939&#x2010;0.951)</td><td align="left" valign="top">0.830 (0.825&#x2010;0.835)</td><td align="left" valign="top">0.072 (0.069&#x2010;0.075)</td></tr><tr><td align="left" valign="top">40%</td><td align="left" valign="top">0.900 (0.895&#x2010;0.905)</td><td align="left" valign="top">0.945 (0.939&#x2010;0.951)</td><td align="left" valign="top">0.837 (0.830&#x2010;0.844)</td><td align="left" valign="top">0.072 (0.069&#x2010;0.075)</td></tr><tr><td align="left" valign="top">30%</td><td align="left" valign="top">0.896 (0.891&#x2010;0.901)</td><td align="left" valign="top">0.945 (0.939&#x2010;0.951)</td><td align="left" valign="top">0.837 (0.829&#x2010;0.845)</td><td align="left" valign="top">0.072 (0.069&#x2010;0.075)</td></tr><tr><td align="left" valign="top">20%</td><td align="left" valign="top">0.893 (0.887&#x2010;0.899)</td><td align="left" valign="top">0.945 (0.939&#x2010;0.951)</td><td align="left" valign="top">0.840 (0.831&#x2010;0.849)</td><td align="left" valign="top">0.072 (0.069&#x2010;0.075)</td></tr><tr><td align="left" 
valign="top">10%</td><td align="left" valign="top">0.878 (0.870&#x2010;0.886)</td><td align="left" valign="top">0.945 (0.939&#x2010;0.951)</td><td align="left" valign="top">0.827 (0.815&#x2010;0.839)</td><td align="left" valign="top">0.072 (0.069&#x2010;0.075)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Data are presented as mean (95% CI).</p></fn><fn id="table3fn2"><p><sup>b</sup>AUROC: area under the receiver operating characteristic curve.</p></fn></table-wrap-foot></table-wrap><p>The confusion matrices for mortality prediction demonstrated a similar precision-recall trade-off (<xref ref-type="fig" rid="figure4">Figure 4B</xref>). Reducing the cutoff threshold from 50% to 10% enhanced recall, with true positives increasing by approximately 29.1% (from 202 to 261), but also resulted in a 113% increase in false positives (from 88 to 188).</p><p>As with KRT, the cutoff threshold affected the precision, recall, and <italic>F</italic><sub>1</sub>-scores. At a 50% cutoff threshold, the precision is 0.725, the recall is 0.718, and the <italic>F</italic><sub>1</sub>-score is 0.721 (<xref ref-type="table" rid="table4">Table 4</xref>). Lowering the threshold to 20% increased the recall (0.880) and improved the <italic>F</italic><sub>1</sub>-score (0.748), while precision decreased to 0.651 (<xref ref-type="table" rid="table4">Table 4</xref> and <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). Similar to KRT, the calibration plot (slope=0.72; intercept=&#x2212;0.16) was not satisfactory, and the Brier score was low (0.072; <xref ref-type="fig" rid="figure5">Figure 5C and D</xref> and <xref ref-type="table" rid="table3">Table 3</xref>).</p><p>The precision-recall curves for each class (<xref ref-type="fig" rid="figure6">Figure 6B</xref>) followed a similar pattern to that observed for KRT, with the non&#x2013;death class exhibiting higher precision and recall than the death class. 
Once again, the model performed well in identifying survivors (high specificity) but demonstrated limited ability to detect patients who died (low sensitivity).</p><p>It is observed that the curve for the non&#x2013;death class remains close to the upper right corner, indicating that the model can achieve high precision and recall rates simultaneously. In contrast, the curve for the death class, which is the minority class and of greater interest, approaches the diagonal (random model).</p><p>Similar to KRT, the decision curve for death shows that the net benefit of the strategy of treating all cases is approximately 0.2, while the proposed model achieves a substantially higher net benefit, around 0.8 (<xref ref-type="supplementary-material" rid="app11">Multimedia Appendix 11</xref>). This behavior demonstrates that indiscriminate intervention quickly becomes inadequate as the decision threshold increases, while the proposed model maintains its practical usefulness over a substantially wider range of thresholds.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Per-class metrics at different cutoff thresholds for death<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cutoff</td><td align="left" valign="bottom" colspan="2">Precision</td><td align="left" valign="bottom" colspan="2">Recall</td><td align="left" valign="bottom" colspan="2"><italic>F</italic><sub>1</sub></td></tr><tr><td align="left" valign="top"/><td align="left" valign="bottom">Death</td><td align="left" valign="bottom">No death</td><td align="left" valign="bottom">Death</td><td align="left" valign="bottom">No death</td><td align="left" valign="bottom">Death</td><td align="left" valign="bottom">No death</td></tr></thead><tbody><tr><td align="left" valign="top">50%</td><td align="left" valign="top">0.725<break/>(0.703&#x2010;0.747)</td><td align="left" 
valign="top">0.938<break/>(0.934&#x2010;0.942)</td><td align="left" valign="top">0.718<break/>(0.710&#x2010;0.726)</td><td align="left" valign="top">0.940<break/>(0.934&#x2010;0.946)</td><td align="left" valign="top">0.721<break/>(0.712&#x2010;0.730)</td><td align="left" valign="top">0.939<break/>(0.937&#x2010;0.941)</td></tr><tr><td align="left" valign="top">40%</td><td align="left" valign="top">0.701<break/>(0.678&#x2010;0.724)</td><td align="left" valign="top">0.949<break/>(0.946&#x2010;0.952)</td><td align="left" valign="top">0.776<break/>(0.768&#x2010;0.784)</td><td align="left" valign="top">0.927<break/>(0.920&#x2010;0.934)</td><td align="left" valign="top">0.736<break/>(0.725&#x2010;0.747)</td><td align="left" valign="top">0.938<break/>(0.935&#x2010;0.941)</td></tr><tr><td align="left" valign="top">30%</td><td align="left" valign="top">0.673<break/>(0.652&#x2010;0.694)</td><td align="left" valign="top">0.958<break/>(0.955&#x2010;0.961)</td><td align="left" valign="top">0.821<break/>(0.811&#x2010;0.831)</td><td align="left" valign="top">0.912<break/>(0.906&#x2010;0.918)</td><td align="left" valign="top">0.739<break/>(0.726&#x2010;0.752)</td><td align="left" valign="top">0.935<break/>(0.931&#x2010;0.939)</td></tr><tr><td align="left" valign="top">20%</td><td align="left" valign="top">0.651<break/>(0.630&#x2010;0.672)</td><td align="left" valign="top">0.971<break/>(0.969&#x2010;0.973)</td><td align="left" valign="top">0.880<break/>(0.872&#x2010;0.888)</td><td align="left" valign="top">0.896<break/>(0.889&#x2010;0.903)</td><td align="left" valign="top">0.748<break/>(0.734&#x2010;0.762)</td><td align="left" valign="top">0.932<break/>(0.928&#x2010;0.936)</td></tr><tr><td align="left" valign="top">10%</td><td align="left" valign="top">0.608<break/>(0.585&#x2010;0.631)</td><td align="left" valign="top">0.980<break/>(0.977&#x2010;0.983)</td><td align="left" valign="top">0.921<break/>(0.910&#x2010;0.932)</td><td align="left" 
valign="top">0.869<break/>(0.861&#x2010;0.877)</td><td align="left" valign="top">0.732<break/>(0.714&#x2010;0.750)</td><td align="left" valign="top">0.921<break/>(0.916&#x2010;0.926)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Data are presented as mean (95% CI).</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>The Influence of Class Imbalance on Prediction</title><p>The model for KRT, which exhibited higher class imbalance, demonstrated superior performance for accuracy and AUROC when compared with the model for mortality. However, when examining the precision and recall for the minority class, the performance was suboptimal, and the KRT model exhibited lower performance than the mortality model. This pattern was also reflected in the macro-<italic>F</italic><sub>1</sub>-score, where the KRT model displayed a more significant drop in performance, further highlighting the impact of class imbalance.</p><p>It is important to highlight that KRT represents a distinct endpoint from mortality. Although both models used the same set of variables, the features and the importance of each feature vary depending on the endpoint (<xref ref-type="supplementary-material" rid="app12">Multimedia Appendices 12</xref> and <xref ref-type="supplementary-material" rid="app13">13</xref>). Therefore, discrepancies in the performance of the KRT and mortality models should not be solely attributed to differences in class balance.</p></sec><sec id="s3-5"><title>Learning Curves Analysis</title><p>The learning curves show a plateau-like shape, with stable validation performance across all training set sizes for both outcomes (<xref ref-type="fig" rid="figure7">Figures 7A and 7B</xref>). 
This pattern suggests limited change in performance as the training set size increases, indicating that model performance does not substantially improve with additional data.</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>(A) Learning curves for different training set sizes for kidney replacement therapy. (B) Learning curves for different training set sizes for death. AUROC: area under the receiver operating characteristic curve; <italic>F</italic><sub>1</sub>: <italic>F</italic><sub>1</sub>-score; KRT: kidney replacement therapy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e86379_fig07.png"/></fig></sec><sec id="s3-6"><title>Impact of Balancing Strategies on Prediction Results and on the Metrics</title><p>When evaluating the impact of class rebalancing strategies, we observe a clear and systematic discrepancy between AUROC and metrics that explicitly account for class-specific behavior. As shown in <xref ref-type="supplementary-material" rid="app12">Multimedia Appendices 12</xref> and <xref ref-type="supplementary-material" rid="app13">13</xref>, AUROC remains consistently high across all experimental conditions, exceeding 0.8 in every scenario, regardless of whether rebalancing is applied or whether minority-class performance substantially improves or deteriorates. This stability may give the misleading impression that rebalancing strategies have limited effect on model behavior.</p><p>However, a closer inspection using alternative metrics reveals a markedly different picture. In particular, TPRGap provides a direct measure of classifier bias induced by class imbalance, capturing disparities in true-positive rates between classes. Using this metric, several rebalancing techniques substantially reduce bias relative to the unbalanced baseline. 
For instance, in the death outcome, TPRGap decreases from 0.230 in the unbalanced setting to 0.043 when using Redundancy-Based Undersampling, indicating a pronounced reduction in class-dependent performance disparity. In contrast, AUROC changes only marginally in the same scenario, from 0.945 to 0.941, failing to reflect this improvement.</p><p>Similar patterns are observed across other class-aware metrics, particularly positive-class precision, recall, and <italic>F</italic><sub>1</sub>-score. These metrics exhibit significant sensitivity to rebalancing strategies, capturing both beneficial and harmful effects on minority-class performance. In several cases, rebalancing leads to meaningful gains in recall at the expense of precision, or vice versa, reflecting trade-offs that are critical in clinical decision-making contexts. Yet, AUROC remains largely unchanged, masking these trade-offs and providing little insight into how the classifier&#x2019;s behavior actually shifts.</p><p>An especially illustrative example arises in the death outcome under the Near Miss 2 undersampling strategy. In this case, the positive-class <italic>F</italic><sub>1</sub>-score drops sharply from 0.721 in the unbalanced model to 0.345, signaling a severe degradation in clinically relevant performance. Despite this substantial decline, AUROC remains comparatively high, decreasing from 0.945 to 0.809. This modest reduction does not adequately reflect the magnitude of the performance loss experienced by the minority class, underscoring the disconnect between AUROC and clinically meaningful outcomes.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In recent years, the rapid expansion of ML applications in health care has led to an increasing number of predictive models being proposed for clinical use. 
However, many of these studies continue to rely primarily, or even exclusively, on AUROC to report model performance, even in highly imbalanced clinical scenarios. Our findings demonstrate the limitations of this practice, showing that high AUROC values may coexist with poor performance for clinically critical minority outcomes.</p><p>Although methodological literature has long acknowledged the limitations of AUROC and accuracy in imbalanced settings, clinical prediction studies frequently continue to emphasize these global metrics. By applying complementary class-specific measures and analyzing the learning curves in a large cohort of over 17,000 hospitalized patients with COVID-19, our study provides practical evidence of how metric selection directly influences clinical interpretation, particularly when outcomes are imbalanced.</p><p>For both KRT and in-hospital mortality, AUROC values suggested excellent discrimination. However, class-specific metrics revealed substantial deficiencies in identifying minority outcomes. The KRT model achieved an AUROC of 0.928. However, recall for the KRT class was only 0.372, meaning that the model failed to identify nearly two-thirds of patients who would require dialysis. In a real clinical scenario, such as a hospital without dialysis services, this could result in missed opportunities for early referrals, with serious consequences for patient care. This discrepancy was captured by the macro-<italic>F</italic><sub>1</sub>-score (0.695), which penalizes imbalanced performance across classes and thus offers a more clinically realistic summary of the model&#x2019;s performance than AUROC alone.</p><p>Calibration analysis further highlighted these limitations. Calibration slopes of 0.60 and 0.72, along with negative intercepts, indicate suboptimal calibration for both outcomes. 
Although Brier scores were low, this likely reflects strong discrimination combined with outcome imbalance, rather than well-calibrated probability estimates. These findings reinforce that no single metric adequately captures model performance in imbalanced clinical settings.</p><p>Precision-recall analysis further highlighted the limitations of AUROC-based evaluation by prioritizing performance on the minority class, which often represents the clinically most relevant outcome [<xref ref-type="bibr" rid="ref57">57</xref>]. While decision curve analysis further illustrates that clinically meaningful usefulness may vary substantially across thresholds [<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>] and demonstrated net benefit across certain thresholds, it did not capture how prediction errors were distributed between classes. By examining class-specific precision and recall across varying thresholds, we directly linked model behavior to real-world clinical trade-offs between underdiagnosis and overdiagnosis. Given that class prevalence directly affects metrics such as precision and recall, particularly in imbalanced settings [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref63">63</xref>], relying on AUROC alone may obscure clinically relevant deficiencies in minority-class performance. These results underscore the importance of reporting complementary, class-aware metrics, as no single metric adequately captures model performance across different clinical contexts. Therefore, metric selection should be guided by the intended clinical application. In high-risk settings, maximizing recall may be preferable to avoid missing cases, even at the expense of increased false positives. In contrast, in resource-limited settings, higher precision may be prioritized to reduce unnecessary interventions. 
These trade-offs cannot be adequately captured by a single metric, reinforcing the need for a multidimensional evaluation approach.</p><p>Learning curve analysis provided an additional and complementary perspective on model performance. In general, learning curves may reflect either progressive improvement with increasing data or early convergence in relatively simple prediction tasks [<xref ref-type="bibr" rid="ref60">60</xref>]. In our study, however, the combination of flat learning curves, persistently low minority-class performance, and stable AUROC values across increasing training set sizes suggests limited gains in performance as more data are added, highlighting that AUROC alone may not capture important aspects of model behavior in imbalanced settings.</p><p>This finding has relevant methodological considerations. Despite consistently high AUROC values (&#x003E;0.92), the lack of substantial improvement in performance with increasing training data suggests that discrimination alone may not fully reflect how model performance evolves, particularly for identifying high-risk patients. In this context, AUROC may reflect stable ranking ability driven by dataset characteristics, such as class imbalance, rather than improvements in clinically relevant performance. Therefore, reliance on AUROC alone may lead to overestimation of model performance and potential underrecognition of high-risk patients.</p><p>Learning curves offer a complementary tool to assess how model performance changes as additional data are incorporated [<xref ref-type="bibr" rid="ref60">60</xref>]. When performance remains relatively stable, caution is warranted in interpreting high discrimination metrics as sufficient evidence of model adequacy. 
Together, these findings reinforce that AUROC alone is insufficient to determine whether a model is suitable for clinical use, particularly in imbalanced scenarios.</p><p>International reporting frameworks such as PROBAST+AI and TRIPOD+AI emphasize comprehensive evaluation of predictive model performance and transparency in results presentation but remain focused on global performance measures of discrimination, calibration, and overall clinical usefulness [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Although these frameworks acknowledge the impacts of class imbalance on outcomes, they do not offer strategies for measuring the redistribution of errors across classes with varying thresholds, nor do they recommend incorporating learning curve analysis into model evaluation [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref16">16</xref>].</p><p>Additionally, according to PROBAST+AI, applicability concerns were considered low, as the study population, predictors, and outcomes are consistent with real-world clinical settings. However, this should not be interpreted as evidence supporting clinical use of the models. Despite this apparent applicability, the models demonstrated important limitations in clinically relevant performance, including suboptimal calibration and limited sensitivity for the minority class. This apparent contradiction highlights a key finding of our study: even when models are developed using appropriate data and aligned with clinical contexts, reliance on conventional metrics such as AUROC may obscure critical weaknesses. Therefore, methodological soundness and contextual relevance alone are insufficient to ensure clinically meaningful performance, reinforcing the need for comprehensive, class-aware evaluation frameworks before considering any potential clinical implementation.</p><p>Our findings provide a methodological contribution that extends beyond these current standards. 
Specifically, we demonstrate that high AUROC values were maintained despite limited changes in performance across increasing training set sizes. This observation suggests that AUROC alone may not reflect whether a model has learned clinically meaningful patterns but rather may capture stable discrimination driven by dataset characteristics such as class imbalance, highlighting the need for more transparent and comprehensive reporting.</p><p>This has important implications: models may comply with current reporting standards while still underperforming on minority outcomes, which are often the most clinically relevant. Because learning curves are rarely reported, this limitation may go unrecognized in many published models. Incorporating learning curve analysis alongside class-specific metrics can therefore enhance transparency and provide a more robust assessment of model performance.</p><p>Resampling techniques, including over- and undersampling, resulted in only modest improvements in minority-class performance and did not resolve the fundamental limitations of AUROC-based evaluation [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref65">65</xref>]. Even when class distributions were artificially modified, AUROC remained largely insensitive to clinically meaningful changes in recall and precision. This reinforces that rebalancing alone cannot compensate for inappropriate performance metrics.</p><p>Our study contributes to the literature by bridging theoretical concerns with practical, real-world application. By evaluating 2 predictive models in a large clinical cohort with varying degrees of outcome imbalance, we demonstrate how metric selection and threshold choice directly influence clinical interpretation and show how these shifts affect clinical decision-making. 
By evaluating per-class precision and recall across multiple cutoffs and visualizing these relationships through precision-recall curves, we make explicit the trade-offs inherent to real-world model deployment.</p><p>These trade-offs are particularly relevant in health care, where underdiagnosing high-risk patients (low recall) may lead to missed interventions, while overdiagnosis (low precision) can result in unnecessary procedures and resource strain [<xref ref-type="bibr" rid="ref66">66</xref>]. In high-stakes settings, such as intensive care units or emergency triage, prioritizing recall may be appropriate, even at the cost of more false positives, to avoid missing patients at risk of deterioration. On the other hand, in resource-constrained environments, higher precision may be preferable. Together, these findings reinforce that no single metric is sufficient and that different clinical contexts require different operating points and different emphases on recall, precision, or their balance, which is effectively summarized by the macro-<italic>F</italic><sub>1</sub>.</p><p>Despite growing methodological awareness [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref64">64</xref>], most applied health care research still relies predominantly on AUROC for model evaluation (<xref ref-type="supplementary-material" rid="app14">Multimedia Appendices 14</xref> and <xref ref-type="supplementary-material" rid="app15">15</xref>) [<xref ref-type="bibr" rid="ref67">67</xref>]. 
For example, DynaMed, a widely used evidence-based clinical reference platform, currently lists 26 predictive models specifically developed for COVID-19&#x2014;1 diagnostic and 25 prognostic, including outcomes such as severe disease progression, thrombosis, intensive care unit admission, KRT, and mortality (<xref ref-type="supplementary-material" rid="app11">Multimedia Appendix 11</xref>) [<xref ref-type="bibr" rid="ref67">67</xref>]. Notably, 88.5% (23/26) of these models primarily report AUROC as the main performance metric [<xref ref-type="bibr" rid="ref67">67</xref>].</p><p>Additionally, some studies report multiple metrics without adequately contextualizing their relevance or the trade-offs involved [<xref ref-type="bibr" rid="ref67">67</xref>-<xref ref-type="bibr" rid="ref69">69</xref>]. Our findings highlight the importance of not only reporting multiple metrics but also interpreting them in relation to clinical context and outcome imbalance.</p><p>Therefore, metric selection and learning curve analysis may substantially influence the clinical interpretation of model performance. Choosing evaluation strategies that account for outcome imbalance and clinical priorities is essential to support more rigorous evaluation before potential clinical implementation of predictive models.</p></sec><sec id="s4-2"><title>Limitations</title><p>The methodology focused on a single algorithm (XGBoost), although the observed patterns are not model-specific and reflect broader issues related to class imbalance and performance evaluation. External validation was not feasible due to data availability; however, this does not affect the central methodological contribution of the study, which concerns the interpretation of model performance rather than the generalizability of a specific model. Therefore, our findings should not be interpreted as supporting the clinical use of the models presented, given the lack of external validation and suboptimal calibration. 
Importantly, the aim of this study was not to develop the best-performing predictive model but to examine how evaluation strategies influence the interpretation of model performance in imbalanced clinical scenarios.</p><p>In prediction tasks with imbalanced outcomes, which are common in health care, reliance on accuracy and AUROC alone may obscure clinically important failures. Complementary metrics, including precision, recall, and macro-<italic>F</italic><sub>1</sub>, provide a more realistic assessment of model performance and should be systematically reported. In addition, learning curve analysis offers insight into a model&#x2019;s learning dynamics and helps explore how model performance evolves as more training data are incorporated. Together, these approaches support a more comprehensive and clinically meaningful evaluation of predictive models, particularly in imbalanced settings, rather than their direct translation into clinical practice.</p></sec></sec></body><back><ack><p>The authors would like to thank the hospitals that are part of this collaboration for supporting this project. They also thank all the clinical staff at those hospitals, who cared for the patients, and all undergraduate students who helped with data collection. The authors declare the use of generative artificial intelligence (GenAI) in the research and writing process. According to the GAIDeT taxonomy (2025), the following tasks were delegated to GenAI tools under full human supervision: assisted linguistic editing and grammatical review. The GenAI tools used were Grammarly, Gemini 1.5, and ChatGPT-5.2. The responsibility for the final manuscript lies entirely with the authors. 
GenAI tools are not listed as authors and do not bear responsibility for the final outcomes.</p></ack><notes><sec><title>Funding</title><p>This study was supported in part by the Minas Gerais State Agency for Research and Development (Funda&#x00E7;&#x00E3;o de Amparo &#x00E0; Pesquisa do Estado de Minas Gerais&#x2013;FAPEMIG) (grant APQ-01154-21), National Institute of Science and Technology for Health Technology Assessment (Instituto de Avalia&#x00E7;&#x00E3;o de Tecnologias em Sa&#x00FA;de&#x2013;IATS)/National Council for Scientific and Technological Development (Conselho Nacional de Desenvolvimento Cient&#x00ED;fico e Tecnol&#x00F3;gico&#x2013;CNPq) (grant 408659/2024-6), and the Center for Innovation and Artificial Intelligence for Health (CI-IA Sa&#x00FA;de), which is funded by the S&#x00E3;o Paulo State Research Support Foundation (FAPESP) (2020/09866-4), FAPEMIG (PPE-00030-21), and UNIMED Belo Horizonte. MSM was partially supported by CNPq (311742/2025-4).</p></sec><sec><title>Data Availability</title><p>The data generated or analyzed during this study are included in this paper and its <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref><xref ref-type="supplementary-material" rid="app2"/><xref ref-type="supplementary-material" rid="app3"/><xref ref-type="supplementary-material" rid="app4"/><xref ref-type="supplementary-material" rid="app5"/><xref ref-type="supplementary-material" rid="app6"/><xref ref-type="supplementary-material" rid="app7"/><xref ref-type="supplementary-material" rid="app8"/><xref ref-type="supplementary-material" rid="app9"/><xref ref-type="supplementary-material" rid="app10"/><xref ref-type="supplementary-material" rid="app11"/><xref ref-type="supplementary-material" rid="app12"/><xref ref-type="supplementary-material" rid="app13"/><xref ref-type="supplementary-material" rid="app14"/><xref ref-type="supplementary-material" rid="app15"/><xref ref-type="supplementary-material" rid="app16"/><xref 
ref-type="supplementary-material" rid="app17"/>-<xref ref-type="supplementary-material" rid="app18">18</xref>. The corresponding author is available to provide additional data regarding this manuscript upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conception and design of the work: VGJV, MAG, MSM</p><p>Data collection: VGJV, BPP, CAP, HRV, MSM</p><p>Data curation: MSM</p><p>Data analysis and interpretation: VGJV, CMVA, JMA, GFN, LCDR, MAG, MSM</p><p>Drafting the paper: VGJV, CMVA, JMA, LCDR, MAG, MSM</p><p>Writing &#x2013; review &#x0026; editing: VGJV, CMVA, JMA, BPP, CAP, GFN, EB, HRV, KPF, LCDR, MAG, MSM</p><p>Project administration: MSM</p><p>Supervision: MSM</p><p>Reading and approving the final version of the manuscript: all authors</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb2">e2sc-us</term><def><p>Effective, Efficient, and Scalable Confidence-Based-UnderSampling</p></def></def-item><def-item><term id="abb3">KRT</term><def><p>kidney replacement therapy</p></def></def-item><def-item><term id="abb4">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb5">PROBAST+AI</term><def><p>Updated Quality, Risk of Bias, and Applicability Assessment Tool for Prediction Models Using Regression or Artificial Intelligence Methods</p></def></def-item><def-item><term id="abb6">REDCap</term><def><p>Research Electronic Data Capture</p></def></def-item><def-item><term id="abb7">SMOTE</term><def><p>Synthetic Minority Over-Sampling Technique</p></def></def-item><def-item><term id="abb8">TRIPOD+AI</term><def><p>Transparent Reporting of a multivariable prediction model for Individual Prognosis or Diagnosis + Artificial Intelligence</p></def></def-item><def-item><term id="abb9">XGBoost</term><def><p>extreme gradient 
boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Evaluation of clinical prediction models (part 1): from development to external validation</article-title><source>BMJ</source><year>2024</year><month>01</month><day>8</day><volume>384</volume><fpage>e074819</fpage><pub-id pub-id-type="doi">10.1136/bmj-2023-074819</pub-id><pub-id pub-id-type="medline">38191193</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name></person-group><source>Clinical Prediction Models: A Practical Approach to Development, Validation, and Updating</source><year>2019</year><access-date>2025-08-22</access-date><publisher-name>Springer Nature</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1007/978-3-030-16399-0">https://doi.org/10.1007/978-3-030-16399-0</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Smeden</surname><given-names>M</given-names> </name><name name-style="western"><surname>Reitsma</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Riley</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Moons</surname><given-names>KG</given-names> 
</name></person-group><article-title>Clinical prediction models: diagnosis versus prognosis</article-title><source>J Clin Epidemiol</source><year>2021</year><month>04</month><volume>132</volume><fpage>142</fpage><lpage>145</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2021.01.009</pub-id><pub-id pub-id-type="medline">33775387</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adhikari</surname><given-names>S</given-names> </name><name name-style="western"><surname>Normand</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Bloom</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shahian</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rose</surname><given-names>S</given-names> </name></person-group><article-title>Revisiting performance metrics for prediction with rare outcomes</article-title><source>Stat Methods Med Res</source><year>2021</year><month>10</month><volume>30</volume><issue>10</issue><fpage>2352</fpage><lpage>2366</lpage><pub-id pub-id-type="doi">10.1177/09622802211038754</pub-id><pub-id pub-id-type="medline">34468239</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cabot</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Ross</surname><given-names>EG</given-names> </name></person-group><article-title>Evaluating prediction model performance</article-title><source>Surgery</source><year>2023</year><month>09</month><volume>174</volume><issue>3</issue><fpage>723</fpage><lpage>726</lpage><pub-id pub-id-type="doi">10.1016/j.surg.2023.05.023</pub-id><pub-id pub-id-type="medline">37419761</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><etal/></person-group><article-title>TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods</article-title><source>BMJ</source><year>2024</year><month>04</month><day>16</day><volume>385</volume><fpage>e078378</fpage><pub-id pub-id-type="doi">10.1136/bmj-2023-078378</pub-id><pub-id pub-id-type="medline">38626948</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cartus</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Samuels</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Cerd&#x00E1;</surname><given-names>M</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>BDL</given-names> </name></person-group><article-title>Outcome class imbalance and rare events: an underappreciated complication for overdose risk prediction modeling</article-title><source>Addiction</source><year>2023</year><month>06</month><volume>118</volume><issue>6</issue><fpage>1167</fpage><lpage>1176</lpage><pub-id pub-id-type="doi">10.1111/add.16133</pub-id><pub-id pub-id-type="medline">36683137</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Paiva</surname><given-names>BBM</given-names> </name><name name-style="western"><surname>Pereira</surname><given-names>PD</given-names> </name><name name-style="western"><surname>de 
Andrade</surname><given-names>CMV</given-names> </name><etal/></person-group><article-title>Potential and limitations of machine meta-learning (ensemble) methods for predicting COVID-19 mortality in a large inhospital Brazilian dataset</article-title><source>Sci Rep</source><year>2023</year><month>03</month><day>1</day><volume>13</volume><issue>1</issue><fpage>3463</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-28579-z</pub-id><pub-id pub-id-type="medline">36859446</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Roemer</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ge</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Comparison of evaluation metrics of deep learning for imbalanced imaging data in osteoarthritis studies</article-title><source>Osteoarthr Cartil</source><year>2023</year><month>09</month><volume>31</volume><issue>9</issue><fpage>1242</fpage><lpage>1248</lpage><pub-id pub-id-type="doi">10.1016/j.joca.2023.05.006</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>D&#x2019;Agostino</surname><given-names>RB</given-names>  <suffix>Sr</suffix></name><name name-style="western"><surname>Vasan</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Pencina</surname><given-names>MJ</given-names> </name><etal/></person-group><article-title>General cardiovascular risk profile for use in primary care: the Framingham Heart Study</article-title><source>Circulation</source><year>2008</year><month>02</month><day>12</day><volume>117</volume><issue>6</issue><fpage>743</fpage><lpage>753</lpage><pub-id 
pub-id-type="doi">10.1161/CIRCULATIONAHA.107.699579</pub-id><pub-id pub-id-type="medline">18212285</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hosmer</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Lemeshow</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sturdivant</surname><given-names>RX</given-names> </name></person-group><source>Applied Logistic Regression</source><access-date>2025-08-22</access-date><edition>1</edition><publisher-name>John Wiley &#x0026; Sons, Inc</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/9781118548387">https://doi.org/10.1002/9781118548387</ext-link></comment><pub-id pub-id-type="doi">10.1002/9781118548387</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Iragorri</surname><given-names>N</given-names> </name><name name-style="western"><surname>Spackman</surname><given-names>E</given-names> </name></person-group><article-title>Assessing the value of screening tools: reviewing the challenges and opportunities of cost-effectiveness analysis</article-title><source>Public Health Rev</source><year>2018</year><volume>39</volume><fpage>17</fpage><pub-id pub-id-type="doi">10.1186/s40985-018-0093-8</pub-id><pub-id pub-id-type="medline">30009081</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arnett</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Blumenthal</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Albert</surname><given-names>MA</given-names> 
</name><etal/></person-group><article-title>2019 ACC/AHA guideline on the primary prevention of cardiovascular disease: a report of the American College of Cardiology/American Heart Association task force on clinical practice guidelines</article-title><source>Circulation</source><year>2019</year><month>09</month><day>10</day><volume>140</volume><issue>11</issue><fpage>e596</fpage><lpage>e646</lpage><pub-id pub-id-type="doi">10.1161/CIR.0000000000000678</pub-id><pub-id pub-id-type="medline">30879355</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>US Preventive Services Task Force</collab><name name-style="western"><surname>Krist</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Davidson</surname><given-names>KW</given-names> </name><etal/></person-group><article-title>Behavioral counseling interventions to promote a healthy diet and physical activity for cardiovascular disease prevention in adults with cardiovascular risk factors: US Preventive Services Task Force recommendation statement</article-title><source>JAMA</source><year>2020</year><month>11</month><day>24</day><volume>324</volume><issue>20</issue><fpage>2069</fpage><lpage>2075</lpage><pub-id pub-id-type="doi">10.1001/jama.2020.21749</pub-id><pub-id pub-id-type="medline">33231670</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rufibach</surname><given-names>K</given-names> </name></person-group><article-title>Use of Brier score to assess binary predictions</article-title><source>J Clin Epidemiol</source><year>2010</year><month>08</month><volume>63</volume><issue>8</issue><fpage>938</fpage><lpage>939</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2009.11.009</pub-id><pub-id pub-id-type="medline">20189763</pub-id></nlm-citation></ref><ref 
id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Damen</surname><given-names>JAA</given-names> </name><name name-style="western"><surname>Kaul</surname><given-names>T</given-names> </name><etal/></person-group><article-title>PROBAST+AI: an updated quality, risk of bias, and applicability assessment tool for prediction models using regression or artificial intelligence methods</article-title><source>BMJ</source><year>2025</year><month>03</month><day>24</day><volume>388</volume><fpage>e082505</fpage><pub-id pub-id-type="doi">10.1136/bmj-2024-082505</pub-id><pub-id pub-id-type="medline">40127903</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hageman</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pennells</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ojeda</surname><given-names>F</given-names> </name><etal/></person-group><article-title>SCORE2 risk prediction algorithms: new models to estimate 10-year risk of cardiovascular disease in Europe</article-title><source>Eur Heart J</source><year>2021</year><month>07</month><day>1</day><volume>42</volume><issue>25</issue><fpage>2439</fpage><lpage>2454</lpage><pub-id pub-id-type="doi">10.1093/eurheartj/ehab309</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Van Calster</surname><given-names>B</given-names> </name><name name-style="western"><surname>McLernon</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>van Smeden</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Wynants</surname><given-names>L</given-names> </name><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name><collab>Topic Group &#x2018;Evaluating diagnostic tests and prediction models&#x2019; of the STRATOS initiative</collab></person-group><article-title>Calibration: the Achilles heel of predictive analytics</article-title><source>BMC Med</source><year>2019</year><month>12</month><day>16</day><volume>17</volume><issue>1</issue><fpage>230</fpage><pub-id pub-id-type="doi">10.1186/s12916-019-1466-7</pub-id><pub-id pub-id-type="medline">31842878</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Krentz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Curcin</surname><given-names>V</given-names> </name></person-group><article-title>Machine learning based prediction models for cardiovascular disease risk using electronic health records data: systematic review and meta-analysis</article-title><source>Eur Heart J Digit Health</source><year>2025</year><month>01</month><volume>6</volume><issue>1</issue><fpage>7</fpage><lpage>22</lpage><pub-id pub-id-type="doi">10.1093/ehjdh/ztae080</pub-id><pub-id pub-id-type="medline">39846062</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Laranjo</surname><given-names>L</given-names> </name><name name-style="western"><surname>Klimis</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Machine-learning versus 
traditional approaches for atherosclerotic cardiovascular risk prognostication in primary prevention cohorts: a systematic review and meta-analysis</article-title><source>Eur Heart J Qual Care Clin Outcomes</source><year>2023</year><month>06</month><day>21</day><volume>9</volume><issue>4</issue><fpage>310</fpage><lpage>322</lpage><pub-id pub-id-type="doi">10.1093/ehjqcco/qcad017</pub-id><pub-id pub-id-type="medline">36869800</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Andersen</surname><given-names>ES</given-names> </name><name name-style="western"><surname>Birk-Korch</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Hansen</surname><given-names>RS</given-names> </name><etal/></person-group><article-title>Monitoring performance of clinical artificial intelligence in health care: a scoping review</article-title><source>JBI Evid Synth</source><year>2024</year><month>12</month><day>1</day><volume>22</volume><issue>12</issue><fpage>2423</fpage><lpage>2446</lpage><pub-id pub-id-type="doi">10.11124/JBIES-24-00042</pub-id><pub-id pub-id-type="medline">39658865</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oettl</surname><given-names>FC</given-names> </name><name name-style="western"><surname>Pareek</surname><given-names>A</given-names> </name><name name-style="western"><surname>Winkler</surname><given-names>PW</given-names> </name><etal/></person-group><article-title>A practical guide to the implementation of AI in orthopaedic research, part 6: how to evaluate the performance of AI research?</article-title><source>J Exp Orthop</source><year>2024</year><month>07</month><volume>11</volume><issue>3</issue><fpage>e12039</fpage><pub-id 
pub-id-type="doi">10.1002/jeo2.12039</pub-id><pub-id pub-id-type="medline">38826500</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hicks</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Str&#x00FC;mke</surname><given-names>I</given-names> </name><name name-style="western"><surname>Thambawita</surname><given-names>V</given-names> </name><etal/></person-group><article-title>On evaluation metrics for medical applications of artificial intelligence</article-title><source>Sci Rep</source><year>2022</year><month>04</month><day>8</day><volume>12</volume><issue>1</issue><fpage>5979</fpage><pub-id pub-id-type="doi">10.1038/s41598-022-09954-8</pub-id><pub-id pub-id-type="medline">35395867</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Megahed</surname><given-names>FM</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Megahed</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>N</given-names> </name><name name-style="western"><surname>Krzywinski</surname><given-names>M</given-names> </name></person-group><article-title>The class imbalance problem</article-title><source>Nat Methods</source><year>2021</year><month>11</month><volume>18</volume><issue>11</issue><fpage>1270</fpage><lpage>1272</lpage><pub-id pub-id-type="doi">10.1038/s41592-021-01302-4</pub-id><pub-id pub-id-type="medline">34654918</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Lever</surname><given-names>J</given-names> </name><name name-style="western"><surname>Krzywinski</surname><given-names>M</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>N</given-names> </name></person-group><article-title>Classification evaluation</article-title><source>Nat Methods</source><year>2016</year><month>08</month><volume>13</volume><issue>8</issue><fpage>603</fpage><lpage>604</lpage><pub-id pub-id-type="doi">10.1038/nmeth.3945</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kelly</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Karthikesalingam</surname><given-names>A</given-names> </name><name name-style="western"><surname>Suleyman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Corrado</surname><given-names>G</given-names> </name><name name-style="western"><surname>King</surname><given-names>D</given-names> </name></person-group><article-title>Key challenges for delivering clinical impact with artificial intelligence</article-title><source>BMC Med</source><year>2019</year><month>10</month><day>29</day><volume>17</volume><issue>1</issue><fpage>195</fpage><pub-id pub-id-type="doi">10.1186/s12916-019-1426-2</pub-id><pub-id pub-id-type="medline">31665002</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kocak</surname><given-names>B</given-names> </name><name name-style="western"><surname>Klontzas</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Stanzione</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Evaluation metrics in medical imaging AI: fundamentals, pitfalls, misapplications, and 
recommendations</article-title><source>Eur J Radiol Artif Intell</source><year>2025</year><month>09</month><volume>3</volume><fpage>100030</fpage><pub-id pub-id-type="doi">10.1016/j.ejrai.2025.100030</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carriero</surname><given-names>A</given-names> </name><name name-style="western"><surname>Luijken</surname><given-names>K</given-names> </name><name name-style="western"><surname>de Hond</surname><given-names>A</given-names> </name><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Van Calster</surname><given-names>B</given-names> </name><name name-style="western"><surname>van Smeden</surname><given-names>M</given-names> </name></person-group><article-title>The harms of class imbalance corrections for machine learning based prediction models: a simulation study</article-title><source>Stat Med</source><year>2025</year><month>02</month><day>10</day><volume>44</volume><issue>3-4</issue><fpage>e10320</fpage><pub-id pub-id-type="doi">10.1002/sim.10320</pub-id><pub-id pub-id-type="medline">39865585</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>Recommendations for national SARS-CoV-2 testing strategies and diagnostic capacities: interim guidance, 25 June 2021</article-title><source>World Health Organization</source><year>2021</year><access-date>2025-08-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://iris.who.int/handle/10665/342002">https://iris.who.int/handle/10665/342002</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marcolino</surname><given-names>MS</given-names> </name><name 
name-style="western"><surname>Ziegelmann</surname><given-names>PK</given-names> </name><name name-style="western"><surname>Souza-Silva</surname><given-names>MVR</given-names> </name><etal/></person-group><article-title>Clinical characteristics and outcomes of patients hospitalized with COVID-19 in Brazil: results from the Brazilian COVID-19 registry</article-title><source>Int J Infect Dis</source><year>2021</year><month>06</month><volume>107</volume><fpage>300</fpage><lpage>310</lpage><pub-id pub-id-type="doi">10.1016/j.ijid.2021.01.019</pub-id><pub-id pub-id-type="medline">33444752</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Kidney health in the COVID-19 pandemic: an umbrella review of meta-analyses and systematic reviews</article-title><source>Front Public Health</source><year>2022</year><volume>10</volume><fpage>963667</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2022.963667</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harris</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>R</given-names> </name><name name-style="western"><surname>Thielke</surname><given-names>R</given-names> </name><name name-style="western"><surname>Payne</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gonzalez</surname><given-names>N</given-names> </name><name name-style="western"><surname>Conde</surname><given-names>JG</given-names> </name></person-group><article-title>Research 
electronic data capture (REDCap)&#x2014;a metadata-driven methodology and workflow process for providing translational research informatics support</article-title><source>J Biomed Inform</source><year>2009</year><month>04</month><volume>42</volume><issue>2</issue><fpage>377</fpage><lpage>381</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2008.08.010</pub-id><pub-id pub-id-type="medline">18929686</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harris</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>R</given-names> </name><name name-style="western"><surname>Minor</surname><given-names>BL</given-names> </name><etal/></person-group><article-title>The REDCap consortium: building an international community of software platform partners</article-title><source>J Biomed Inform</source><year>2019</year><month>07</month><volume>95</volume><fpage>103208</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2019.103208</pub-id><pub-id pub-id-type="medline">31078660</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soriano Marcolino</surname><given-names>M</given-names> </name><name name-style="western"><surname>Minelli Figueira</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pereira Afonso Dos Santos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Silva Cardoso</surname><given-names>C</given-names> </name><name name-style="western"><surname>Luiz Ribeiro</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alkmim</surname><given-names>MB</given-names> </name></person-group><article-title>The experience of a sustainable large scale Brazilian telehealth 
network</article-title><source>Telemed J E Health</source><year>2016</year><month>11</month><volume>22</volume><issue>11</issue><fpage>899</fpage><lpage>908</lpage><pub-id pub-id-type="doi">10.1089/tmj.2015.0234</pub-id><pub-id pub-id-type="medline">27167901</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bicalho</surname><given-names>MAC</given-names> </name><name name-style="western"><surname>Aliberti</surname><given-names>MJR</given-names> </name><name name-style="western"><surname>Delfino-Pereira</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Clinical characteristics and outcomes of COVID-19 patients with preexisting dementia: a large multicenter propensity-matched Brazilian cohort study</article-title><source>BMC Geriatr</source><year>2024</year><month>01</month><day>5</day><volume>24</volume><issue>1</issue><fpage>25</fpage><pub-id pub-id-type="doi">10.1186/s12877-023-04494-w</pub-id><pub-id pub-id-type="medline">38182982</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>XGBoost: a scalable tree boosting system</article-title><conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13-17, 2016</conf-date><conf-loc>San Francisco, CA</conf-loc><fpage>785</fpage><lpage>794</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Shwartz-Ziv</surname><given-names>R</given-names> </name><name name-style="western"><surname>Armon</surname><given-names>A</given-names> </name></person-group><article-title>Tabular data: deep learning is not all you need</article-title><source>Inf Fusion</source><year>2022</year><month>05</month><volume>81</volume><fpage>84</fpage><lpage>90</lpage><pub-id pub-id-type="doi">10.1016/j.inffus.2021.11.011</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>X</given-names> </name><name name-style="western"><surname>Che</surname><given-names>H</given-names> </name></person-group><article-title>Prediction of type 2 diabetes risk and its effect evaluation based on the XGBoost model</article-title><source>Healthcare (Basel)</source><year>2020</year><month>07</month><day>31</day><volume>8</volume><issue>3</issue><fpage>247</fpage><pub-id pub-id-type="doi">10.3390/healthcare8030247</pub-id><pub-id pub-id-type="medline">32751894</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wilimitis</surname><given-names>D</given-names> </name><name name-style="western"><surname>Walsh</surname><given-names>CG</given-names> </name></person-group><article-title>Practical considerations and applied examples of cross-validation for model development and evaluation in health care: tutorial</article-title><source>JMIR AI</source><year>2023</year><month>12</month><day>18</day><volume>2</volume><fpage>e49023</fpage><pub-id 
pub-id-type="doi">10.2196/49023</pub-id><pub-id pub-id-type="medline">38875530</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bradshaw</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Huemann</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rahmim</surname><given-names>A</given-names> </name></person-group><article-title>A guide to cross-validation for artificial intelligence in medical imaging</article-title><source>Radiol Artif Intell</source><year>2023</year><month>07</month><volume>5</volume><issue>4</issue><fpage>e220232</fpage><pub-id pub-id-type="doi">10.1148/ryai.220232</pub-id><pub-id pub-id-type="medline">37529208</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kohavi</surname><given-names>R</given-names> </name></person-group><article-title>A study of cross-validation and bootstrap for accuracy estimation and model selection</article-title><conf-name>14th International Joint Conference on Artificial Intelligence (IJCAI &#x2019;95)</conf-name><conf-date>Aug 20-25, 1995</conf-date><conf-loc>Montreal, Canada</conf-loc><fpage>1137</fpage><lpage>1145</lpage><pub-id pub-id-type="doi">10.5555/1643031.1643047</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Krainski</surname><given-names>ET</given-names> </name><name name-style="western"><surname>Lenzi</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Mart&#x00ED;nez-Minaya</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rue</surname><given-names>H</given-names> </name></person-group><article-title>Automatic cross-validation in structured models: is it time to leave out leave-one-out?</article-title><source>Spat Stat</source><year>2024</year><month>08</month><volume>62</volume><fpage>100843</fpage><pub-id pub-id-type="doi">10.1016/j.spasta.2024.100843</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lemaitre</surname><given-names>G</given-names> </name><name name-style="western"><surname>Nogueira</surname><given-names>F</given-names> </name><name name-style="western"><surname>Aridas</surname><given-names>CK</given-names> </name></person-group><article-title>Imbalanced-learn: a Python toolbox to tackle the curse of imbalanced datasets in machine learning</article-title><source>arXiv</source><comment>Preprint posted online on Sep 21, 2016</comment><pub-id pub-id-type="doi">10.48550/arXiv.1609.06570</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Garcia</surname><given-names>EA</given-names> </name><name name-style="western"><surname>He</surname><given-names>H</given-names> </name><name name-style="western"><surname>Bai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name></person-group><article-title>ADASYN: adaptive synthetic sampling approach for imbalanced learning</article-title><conf-name>2008 IEEE International Joint Conference on Neural Networks (IJCNN 2008)</conf-name><conf-date>Jun 1-8, 2008</conf-date><conf-loc>Hong Kong, 
China</conf-loc><fpage>1322</fpage><lpage>1328</lpage><pub-id pub-id-type="doi">10.1109/IJCNN.2008.4633969</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chawla</surname><given-names>NV</given-names> </name><name name-style="western"><surname>Bowyer</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>LO</given-names> </name><name name-style="western"><surname>Kegelmeyer</surname><given-names>WP</given-names> </name></person-group><article-title>SMOTE: Synthetic Minority Over-sampling Technique</article-title><source>J Artif Intell Res</source><year>2002</year><volume>16</volume><fpage>321</fpage><lpage>357</lpage><pub-id pub-id-type="doi">10.1613/jair.953</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>WY</given-names> </name><name name-style="western"><surname>Mao</surname><given-names>BH</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Huang</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>XP</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>GB</given-names> </name></person-group><article-title>Borderline-SMOTE: a new over-sampling method in imbalanced data sets learning</article-title><source>Advances in Intelligent Computing</source><year>2005</year><access-date>2026-01-28</access-date><publisher-name>Springer Nature</publisher-name><fpage>878</fpage><lpage>887</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://doi.org/10.1007/11538059_91">https://doi.org/10.1007/11538059_91</ext-link></comment><pub-id pub-id-type="doi">10.1007/11538059_91</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>HM</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>EW</given-names> </name><name name-style="western"><surname>Kamei</surname><given-names>K</given-names> </name></person-group><article-title>Borderline over-sampling for imbalanced data classification</article-title><source>Int J Knowledge Eng Soft Data Paradigms</source><year>2011</year><volume>3</volume><issue>1</issue><fpage>4</fpage><pub-id pub-id-type="doi">10.1504/IJKESDP.2011.039875</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Douzas</surname><given-names>G</given-names> </name><name name-style="western"><surname>Bacao</surname><given-names>F</given-names> </name><name name-style="western"><surname>Last</surname><given-names>F</given-names> </name></person-group><article-title>Improving imbalanced learning through a heuristic oversampling method based on k-means and SMOTE</article-title><source>Inf Sci (Ny)</source><year>2018</year><month>10</month><volume>465</volume><fpage>1</fpage><lpage>20</lpage><pub-id pub-id-type="doi">10.1016/j.ins.2018.06.056</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>More</surname><given-names>A</given-names> </name></person-group><article-title>Survey of resampling techniques for improving classification performance in unbalanced datasets</article-title><source>arXiv</source><comment>Preprint posted online on Aug 22, 
2016</comment><pub-id pub-id-type="doi">10.48550/arXiv.1608.06048</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Murphy</surname><given-names>KP</given-names> </name></person-group><source>Machine Learning: A Probabilistic Perspective</source><year>2012</year><publisher-name>MIT Press</publisher-name><pub-id pub-id-type="other">0262018020</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Powers</surname><given-names>DMW</given-names> </name></person-group><article-title>Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation</article-title><source>J Mach Learn Technol</source><year>2020</year><access-date>2026-06-17</access-date><fpage>37</fpage><lpage>63</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://bioinfopublication.org/files/articles/2_1_1_JMLT.pdf">https://bioinfopublication.org/files/articles/2_1_1_JMLT.pdf</ext-link></comment><pub-id pub-id-type="doi">10.9735/2229-3981</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stekhoven</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>B&#x00FC;hlmann</surname><given-names>P</given-names> </name></person-group><article-title>MissForest&#x2014;non-parametric missing value imputation for mixed-type data</article-title><source>Bioinformatics</source><year>2012</year><month>01</month><day>1</day><volume>28</volume><issue>1</issue><fpage>112</fpage><lpage>118</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btr597</pub-id><pub-id pub-id-type="medline">22039212</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sokolova</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lapalme</surname><given-names>G</given-names> </name></person-group><article-title>A systematic analysis of performance measures for classification tasks</article-title><source>Inf Process Manag</source><year>2009</year><month>07</month><volume>45</volume><issue>4</issue><fpage>427</fpage><lpage>437</lpage><pub-id pub-id-type="doi">10.1016/j.ipm.2009.03.002</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Davis</surname><given-names>J</given-names> </name><name name-style="western"><surname>Goadrich</surname><given-names>M</given-names> </name></person-group><article-title>The relationship between precision-recall and ROC curves</article-title><conf-name>Proceedings of the 23rd International Conference on Machine Learning (ICML &#x2019;06)</conf-name><conf-date>Jun 25-29, 2006</conf-date><conf-loc>Pittsburgh, PA</conf-loc><fpage>233</fpage><lpage>240</lpage><pub-id pub-id-type="doi">10.1145/1143844.1143874</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vickers</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Elkin</surname><given-names>EB</given-names> </name></person-group><article-title>Decision curve analysis: a novel method for evaluating prediction models</article-title><source>Med Decis Making</source><year>2006</year><volume>26</volume><issue>6</issue><fpage>565</fpage><lpage>574</lpage><pub-id pub-id-type="doi">10.1177/0272989X06295361</pub-id><pub-id pub-id-type="medline">17099194</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lipton</surname><given-names>ZC</given-names> </name><name name-style="western"><surname>Elkan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Narayanaswamy</surname><given-names>B</given-names> </name></person-group><article-title>Optimal thresholding of classifiers to maximize F1 measure</article-title><source>Mach Learn Knowl Discov Databases</source><year>2014</year><volume>8725</volume><fpage>225</fpage><lpage>239</lpage><pub-id pub-id-type="doi">10.1007/978-3-662-44851-9_15</pub-id><pub-id pub-id-type="medline">26023687</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name><name name-style="western"><surname>Vickers</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>NR</given-names> </name><etal/></person-group><article-title>Assessing the performance of prediction models: a framework for traditional and novel measures</article-title><source>Epidemiology</source><year>2010</year><month>01</month><volume>21</volume><issue>1</issue><fpage>128</fpage><lpage>138</lpage><pub-id pub-id-type="doi">10.1097/EDE.0b013e3181c30fb2</pub-id><pub-id pub-id-type="medline">20010215</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>W</given-names> </name><name name-style="western"><surname>Macheret</surname><given-names>F</given-names> </name><name name-style="western"><surname>Gabriel</surname><given-names>RA</given-names> </name><name 
name-style="western"><surname>Ohno-Machado</surname><given-names>L</given-names> </name></person-group><article-title>A tutorial on calibration measurements and calibration models for clinical prediction models</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>04</month><day>1</day><volume>27</volume><issue>4</issue><fpage>621</fpage><lpage>633</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocz228</pub-id><pub-id pub-id-type="medline">32106284</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alba</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Agoritsas</surname><given-names>T</given-names> </name><name name-style="western"><surname>Walsh</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Discrimination and calibration of clinical prediction models: users&#x2019; guides to the medical literature</article-title><source>JAMA</source><year>2017</year><month>10</month><day>10</day><volume>318</volume><issue>14</issue><fpage>1377</fpage><lpage>1384</lpage><pub-id pub-id-type="doi">10.1001/jama.2017.12126</pub-id><pub-id pub-id-type="medline">29049590</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Viering</surname><given-names>T</given-names> </name><name name-style="western"><surname>Loog</surname><given-names>M</given-names> </name></person-group><article-title>The shape of learning curves: a review</article-title><source>IEEE Trans Pattern Anal Mach Intell</source><year>2023</year><month>06</month><volume>45</volume><issue>6</issue><fpage>7799</fpage><lpage>7819</lpage><pub-id pub-id-type="doi">10.1109/TPAMI.2022.3220744</pub-id><pub-id pub-id-type="medline">36350870</pub-id></nlm-citation></ref><ref 
id="ref61"><label>61</label><nlm-citation citation-type="book"><person-group person-group-type="editor"><name name-style="western"><surname>Simon</surname><given-names>GJ</given-names> </name><name name-style="western"><surname>Aliferis</surname><given-names>C</given-names> </name></person-group><source>Artificial Intelligence and Machine Learning in Health Care and Medical Sciences: Best Practices and Pitfalls</source><year>2024</year><access-date>2026-04-10</access-date><publisher-name>Springer International Publishing</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1007/978-3-031-39355-6">https://doi.org/10.1007/978-3-031-39355-6</ext-link></comment><pub-id pub-id-type="doi">10.1007/978-3-031-39355-6</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ke</surname><given-names>JXC</given-names> </name><name name-style="western"><surname>DhakshinaMurthy</surname><given-names>A</given-names> </name><name name-style="western"><surname>George</surname><given-names>RB</given-names> </name><name name-style="western"><surname>Branco</surname><given-names>P</given-names> </name></person-group><article-title>The effect of resampling techniques on the performances of machine learning clinical risk prediction models in the setting of severe class imbalance: development and internal validation in a retrospective cohort</article-title><source>Discov Artif Intell</source><year>2024</year><volume>4</volume><issue>1</issue><fpage>91</fpage><pub-id pub-id-type="doi">10.1007/s44163-024-00199-0</pub-id><pub-id pub-id-type="medline">39624046</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Dinov</surname><given-names>ID</given-names> </name></person-group><source>Data Science and Predictive 
Analytics: Biomedical and Health Applications Using R</source><year>2018</year><access-date>2026-04-10</access-date><publisher-name>Springer</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://link.springer.com/book/10.1007/978-3-031-17483-4">https://link.springer.com/book/10.1007/978-3-031-17483-4</ext-link></comment></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Welvaars</surname><given-names>K</given-names> </name><name name-style="western"><surname>Oosterhoff</surname><given-names>JHF</given-names> </name><name name-style="western"><surname>van den Bekerom</surname><given-names>MPJ</given-names> </name><name name-style="western"><surname>Doornberg</surname><given-names>JN</given-names> </name><name name-style="western"><surname>van Haarst</surname><given-names>EP</given-names> </name><collab>OLVG Urology Consortium, and the Machine Learning Consortium</collab></person-group><article-title>Implications of resampling data to address the class imbalance problem (IRCIP): an evaluation of impact on performance between classification algorithms in medical data</article-title><source>JAMIA Open</source><year>2023</year><month>07</month><volume>6</volume><issue>2</issue><fpage>ooad033</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooad033</pub-id><pub-id pub-id-type="medline">37266187</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van den Goorbergh</surname><given-names>R</given-names> </name><name name-style="western"><surname>van Smeden</surname><given-names>M</given-names> </name><name name-style="western"><surname>Timmerman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Calster</surname><given-names>B</given-names> 
</name></person-group><article-title>The harm of class imbalance corrections for risk prediction models: illustration and simulation using logistic regression</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>08</month><day>16</day><volume>29</volume><issue>9</issue><fpage>1525</fpage><lpage>1534</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac093</pub-id><pub-id pub-id-type="medline">35686364</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Monaghan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Rahman</surname><given-names>SN</given-names> </name><name name-style="western"><surname>Agudelo</surname><given-names>CW</given-names> </name><etal/></person-group><article-title>Foundational statistical principles in medical research: sensitivity, specificity, positive predictive value, and negative predictive value</article-title><source>Medicina (Kaunas)</source><year>2021</year><month>05</month><day>16</day><volume>57</volume><issue>5</issue><fpage>503</fpage><pub-id pub-id-type="doi">10.3390/medicina57050503</pub-id><pub-id pub-id-type="medline">34065637</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="web"><article-title>Clinical criteria</article-title><source>DynaMed</source><year>2025</year><month>08</month><day>22</day><access-date>2026-05-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.dynamed.com/calculators/#cc-idx">https://www.dynamed.com/calculators/#cc-idx</ext-link></comment></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Riley</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Pate</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Archer</surname><given-names>L</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>GP</given-names> </name><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name></person-group><article-title>Clinical prediction models and the multiverse of madness</article-title><source>BMC Med</source><year>2023</year><month>12</month><day>18</day><volume>21</volume><issue>1</issue><fpage>502</fpage><pub-id pub-id-type="doi">10.1186/s12916-023-03212-y</pub-id><pub-id pub-id-type="medline">38110939</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bozkurt</surname><given-names>C</given-names> </name><name name-style="western"><surname>A&#x015F;uro&#x011F;lu</surname><given-names>T</given-names> </name></person-group><article-title>Mortality prediction of various cancer patients via relevant feature analysis and machine learning</article-title><source>SN Comput Sci</source><year>2023</year><volume>4</volume><issue>3</issue><fpage>264</fpage><pub-id pub-id-type="doi">10.1007/s42979-023-01720-5</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Potential predictors for patients with COVID-19 undergoing kidney replacement therapy.</p><media xlink:href="formative_v10i1e86379_app1.docx" xlink:title="DOCX File, 2585 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Potential predictors for in-hospital mortality in patients with COVID-19.</p><media xlink:href="formative_v10i1e86379_app2.docx" xlink:title="DOCX File, 2589 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Final proportions in the training 
partition after application of each rebalancing strategy.</p><media xlink:href="formative_v10i1e86379_app3.docx" xlink:title="DOCX File, 3585 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Hyperparameters evaluated for optimization using extreme gradient boosting (XGBoost).</p><media xlink:href="formative_v10i1e86379_app4.docx" xlink:title="DOCX File, 2584 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Definitions and characteristics of the metrics frequently used to evaluate performance in predictive models, using both machine learning and statistics terminology.</p><media xlink:href="formative_v10i1e86379_app5.docx" xlink:title="DOCX File, 2587 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Hypothetical confusion matrix and calculation of performance metrics for binary classification.</p><media xlink:href="formative_v10i1e86379_app6.docx" xlink:title="DOCX File, 2489 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Checklist for TRIPOD+AI (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis + Artificial Intelligence).</p><media xlink:href="formative_v10i1e86379_app7.docx" xlink:title="DOCX File, 2588 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Checklist for PROBAST+AI (Updated Quality, Risk of Bias, and Applicability Assessment Tool for Prediction Models Using Regression or Artificial Intelligence Methods).</p><media xlink:href="formative_v10i1e86379_app8.docx" xlink:title="DOCX File, 2096 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Means of performance metrics at different cutoffs for kidney replacement therapy (KRT) and not undergoing KRT.</p><media 
xlink:href="formative_v10i1e86379_app9.png" xlink:title="PNG File, 31 KB"/></supplementary-material><supplementary-material id="app10"><label>Multimedia Appendix 10</label><p>Decision curve for kidney replacement therapy.</p><media xlink:href="formative_v10i1e86379_app10.png" xlink:title="PNG File, 254 KB"/></supplementary-material><supplementary-material id="app11"><label>Multimedia Appendix 11</label><p>Decision curve for death.</p><media xlink:href="formative_v10i1e86379_app11.png" xlink:title="PNG File, 282 KB"/></supplementary-material><supplementary-material id="app12"><label>Multimedia Appendix 12</label><p>Global and per-class metrics for different rebalancing techniques for kidney replacement therapy.</p><media xlink:href="formative_v10i1e86379_app12.docx" xlink:title="DOCX File, 3130 KB"/></supplementary-material><supplementary-material id="app13"><label>Multimedia Appendix 13</label><p>Global and per-class metrics for different rebalancing techniques for death.</p><media xlink:href="formative_v10i1e86379_app13.docx" xlink:title="DOCX File, 2587 KB"/></supplementary-material><supplementary-material id="app14"><label>Multimedia Appendix 14</label><p>Outcomes and evaluation metrics of predictive scores for patients with COVID-19 based on the DynaMed summary.</p><media xlink:href="formative_v10i1e86379_app14.docx" xlink:title="DOCX File, 2585 KB"/></supplementary-material><supplementary-material id="app15"><label>Multimedia Appendix 15</label><p>Outcomes and evaluation metrics of predictive scores for cardiovascular disease based on the DynaMed summary.</p><media xlink:href="formative_v10i1e86379_app15.docx" xlink:title="DOCX File, 2582 KB"/></supplementary-material><supplementary-material id="app16"><label>Multimedia Appendix 16</label><p>Means of performance metrics at different cutoffs for death and no death.</p><media xlink:href="formative_v10i1e86379_app16.png" xlink:title="PNG File, 28 KB"/></supplementary-material><supplementary-material 
id="app17"><label>Multimedia Appendix 17</label><p>Features&#x2019; importance and contribution to the final predictive model of kidney replacement therapy.</p><media xlink:href="formative_v10i1e86379_app17.docx" xlink:title="DOCX File, 2579 KB"/></supplementary-material><supplementary-material id="app18"><label>Multimedia Appendix 18</label><p>Features&#x2019; importance and contribution to the final predictive model of in-hospital mortality.</p><media xlink:href="formative_v10i1e86379_app18.docx" xlink:title="DOCX File, 2579 KB"/></supplementary-material></app-group></back></article>