<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e81039</article-id><article-id pub-id-type="doi">10.2196/81039</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Complication Risk Classification in Children and Adolescents With Type 1 Diabetes: Interpretable Machine Learning Study Based on Saudi Clinical Guidelines</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Fllatah</surname><given-names>Jalilah</given-names></name><degrees>BCS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Banjar</surname><given-names>Haneen</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Computer Science, Faculty of Computing and Information Technology, King Abdulaziz University</institution><addr-line>P.O. 
Box 80200</addr-line><addr-line>Jeddah</addr-line><country>Saudi Arabia</country></aff><aff id="aff2"><institution>Center of Research Excellence in Artificial Intelligence and Data Science, King Abdulaziz University</institution><addr-line>Jeddah</addr-line><country>Saudi Arabia</country></aff><aff id="aff3"><institution>Institute of Genomic Medicine Sciences, King Abdulaziz University</institution><addr-line>Jeddah</addr-line><country>Saudi Arabia</country></aff><aff id="aff4"><institution>Centre of Artificial Intelligence in Precision Medicines, King Abdulaziz University</institution><addr-line>Jeddah</addr-line><country>Saudi Arabia</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Steenstra</surname><given-names>Ivan</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Harper</surname><given-names>Simon</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jalilah Fllatah, BCS, Department of Computer Science, Faculty of Computing and Information Technology, King Abdulaziz University, P.O. Box 80200, Jeddah, 21589, Saudi Arabia, 966 544027109; <email>jalilahfallatah@hotmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>15</day><month>5</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e81039</elocation-id><history><date date-type="received"><day>21</day><month>07</month><year>2025</year></date><date date-type="rev-recd"><day>04</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>07</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Jalilah Fllatah, Haneen Banjar. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 15.5.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e81039"/><abstract><sec><title>Background</title><p>Complication risks in children and adolescents with type 1 diabetes (T1D) can lead to serious health outcomes if not detected early. Despite the availability of clinical data, there remains a gap in interpretable tools that support risk stratification in this age group, particularly in alignment with local clinical guidelines.</p></sec><sec><title>Objective</title><p>The purpose of this study is to develop a clinically interpretable model that classifies the risk levels of T1D complications&#x2014;acute, chronic, and low&#x2014;using real-world data and expert clinical rules derived from the Saudi Diabetes Clinical Practice Guidelines.</p></sec><sec sec-type="methods"><title>Methods</title><p>A pediatric T1D dataset comprising 306 patients was preprocessed through structured cleaning and feature engineering. Risk labels were constructed using Saudi Diabetes Clinical Practice Guidelines&#x2013;derived rules. 
Feature selection was performed using a hybrid approach that combined the SHAP (Shapley Additive Explanations) analysis with exhaustive feature selection. A decision tree model was trained and optimized via cross-validation, using the <italic>F</italic><sub>1</sub>-score as the primary performance metric.</p></sec><sec sec-type="results"><title>Results</title><p>The final model achieved a high mean <italic>F</italic><sub>1</sub>-score of 0.9876 with a low variance of 0.0189, using only 5 clinical features: BMI, hypoglycemia, disease duration, hemoglobin A<sub>1c</sub>, and impaired glucose metabolism. These features were consistently ranked as the most influential. The resulting decision tree offered a transparent logic path, enhancing its clinical interpretability and usability.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study demonstrates that a simple and interpretable model, guided by national clinical guidelines, can effectively predict the risk levels of T1D complications in children and adolescents. Its strong performance, clarity, and reliance on a small number of clinically meaningful features make it a promising candidate for integration into clinical decision support systems. 
This supports a shift toward predictive and personalized diabetes care.</p></sec></abstract><kwd-group><kwd>type 1 diabetes</kwd><kwd>children and adolescents</kwd><kwd>complication risk classification</kwd><kwd>Saudi Diabetes Clinical Practice Guidelines</kwd><kwd>interpretable machine learning</kwd><kwd>predictive modeling</kwd><kwd>P4 medicine</kwd><kwd>clinical decision support systems</kwd><kwd>SHAP analysis</kwd><kwd>Shapley Additive Explanations</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Type 1 diabetes (T1D) is an autoimmune condition in which the body&#x2019;s immune system selectively destroys pancreatic beta cells, resulting in absolute insulin deficiency and lifelong dependence on exogenous insulin. It primarily affects children and adolescents and requires continuous insulin administration, glucose monitoring, precise dietary management, and lifestyle adjustments [<xref ref-type="bibr" rid="ref1">1</xref>]. Recent data from the International Diabetes Federation Diabetes Atlas 2025 indicate a significant rise in the global burden of T1D, with an estimate of 30,000 children and youth worldwide at risk of death due to undiagnosed T1D at onset [<xref ref-type="bibr" rid="ref2">2</xref>]. In 2024, more than 9.5 million people were living with T1D globally, including approximately 1.9 million children and adolescents [<xref ref-type="bibr" rid="ref2">2</xref>]. Within this context, Saudi Arabia is among the countries most affected globally, with 46,469 children and adolescents reported to be living with T1D in 2024 [<xref ref-type="bibr" rid="ref2">2</xref>]. 
This burden is further compounded during adolescence, when hormonal and psychological changes can affect treatment adherence and increase the risk of complications [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Acute complications, such as diabetic ketoacidosis (DKA) and severe hypoglycemia, are among the leading causes of mortality in this age group if not promptly addressed. Approximately 30% of first hospital admissions and initial diagnoses of T1D occur due to DKA [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. DKA occurs as a result of absolute insulin deficiency, which increases lipolysis, leading to uncontrolled hyperglycemia, ketone body production, and metabolic acidosis, and may be fatal if not treated promptly [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Severe hypoglycemia, on the other hand, results from an imbalance between insulin levels and glucose availability, which can cause seizures, loss of consciousness, or sudden deterioration [<xref ref-type="bibr" rid="ref4">4</xref>]. Both acute complications represent life-threatening emergencies and remain a significant challenge contributing to mortality in children and adolescents with T1D [<xref ref-type="bibr" rid="ref1">1</xref>].</p><p>In addition to acute risks, long-term poorly controlled T1D leads to chronic complications, including nephropathy, chronic kidney disease (CKD), and neuropathy [<xref ref-type="bibr" rid="ref3">3</xref>]. These complications may begin during adolescence and progress over time, negatively impacting quality of life and increasing health care and economic burdens [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. 
Clinical evidence underscores that maintaining blood glucose levels near normal, as monitored by hemoglobin A<sub>1c</sub> (HbA<sub>1c</sub>), significantly reduces the long-term incidence of these adverse outcomes [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. This highlights the importance of early risk prediction and advanced therapeutic approaches to mitigate long-term adverse outcomes [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>The medical literature emphasizes that early intervention in chronic conditions, such as T1D, can lead to significantly better outcomes. Accurate risk prediction supports more timely preventive care, reduces hospital admissions, improves patients&#x2019; daily lives, helps health care providers allocate resources effectively, and also strengthens patient and family education [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Despite the abundance of available clinical data, a noticeable gap remains in the availability of interpretable tools that assist clinicians in reliable and meaningful risk assessment, particularly for pediatric populations [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p></sec><sec id="s1-2"><title>Prior Work</title><p>Machine-learning (ML) studies on predicting diabetes complications have made notable advances. Recent reviews indicate significant advancements in ML-based detection, but persistent challenges remain in translating these models into clinical practice, particularly in integrating them into established medical workflows [<xref ref-type="bibr" rid="ref7">7</xref>]. Most work has focused on the adult population with type 2 diabetes, with limited attention on children and adolescents with T1D. 
This focus leaves a gap in pediatric T1D care, where comprehensive multicomplication risk assessment remains limited [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>Jian et al [<xref ref-type="bibr" rid="ref5">5</xref>] developed predictive models that achieved high accuracy using common algorithms, such as random forests and decision trees. However, their work focused on adults and chronic complications without leveraging clinical guidelines. Similarly, Ravaut et al [<xref ref-type="bibr" rid="ref9">9</xref>] applied gradient-boosted models to a large administrative dataset to predict both acute and chronic complications, although the binary outcome approach limits its use for nuanced risk stratification.</p><p>Eid et al [<xref ref-type="bibr" rid="ref10">10</xref>] and Subramanian et al [<xref ref-type="bibr" rid="ref11">11</xref>] focused on acute complications, such as DKA in pediatric patients, but their models were confined to specific outcomes without broader complication profiling or integration of clinical knowledge. Voskergian et al [<xref ref-type="bibr" rid="ref12">12</xref>] employed synthetic electronic health records (EHRs) to predict multiple complications while not incorporating guideline-based features or ensuring interpretability.</p><p>Deep learning techniques, including deep neural networks, convolutional neural networks, and recurrent neural networks, demonstrate high predictive performance across various medical datasets. However, their lack of interpretability often makes them challenging to adopt in clinical practice [<xref ref-type="bibr" rid="ref13">13</xref>]. 
By contrast, ML techniques, such as decision trees and logistic regression, are easier to interpret, making them more appropriate for use in pediatric health care contexts [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>Although interpretation tools, such as SHAP (Shapley Additive Explanations), have improved model explainability [<xref ref-type="bibr" rid="ref9">9</xref>], many existing studies still depend on basic importance scores or filter-based feature selection techniques and seldom contain domain-specific clinical insights [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. This persistent gap highlights the growing need for approaches that provide transparent explanations, as emphasized by Netayawijit et al [<xref ref-type="bibr" rid="ref16">16</xref>], while also integrating clinical expertise directly into the model&#x2019;s logic to ensure relevance [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>To bridge this gap, this study proposes an interpretable model specifically designed to predict T1D complications in children and adolescents. The model is developed using rule-based features extracted directly from the Saudi Diabetes Clinical Practice Guidelines (SDCPG), and it leverages SHAP and exhaustive feature selection (EFS) alongside a decision tree model to achieve interpretable and accurate predictions.</p></sec><sec id="s1-3"><title>Study Objective</title><p>The primary aim of this study is to develop an interpretable predictive model designed to classify complication risk levels&#x2014;low, chronic, and acute&#x2014;among children and adolescents with T1D. By integrating expert clinical rules from the SDCPG with advanced feature selection methods (SHAP and EFS), this study bridges the gap between high predictive accuracy and clinical transparency. 
Ultimately, the study provides a locally aligned tool for the Saudi health care system that supports the P4 medicine (predictive, preventive, personalized, and participatory medicine) framework by offering proactive, evidence-based decision support for diabetes management.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>The methodology aims to develop a multilevel predictive model that categorizes individuals with T1D based on their risk of complications. This classification is built on rules from the SDCPG. The model classifies patients into 3 main categories: acute risk, denoted by critical conditions, such as hypoglycemia and DKA; chronic risk, involving long-term complications, such as foot deformities, CKD, and neuropathy; and low risk, for patients showing no significant warning signs. By integrating expert clinical knowledge with data-driven modeling, the methodology seeks to identify the most consequential features in this classification. The process is structured into 5 main steps: data collection, data preprocessing, feature selection, training and validating the model, and finally evaluating different models to choose the most accurate one, as shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overall workflow of the multilevel predictive model development for type 1 diabetes (T1D) complication risk classification based on Saudi Diabetes Clinical Practice Guidelines (SDCPG).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e81039_fig01.png"/></fig></sec><sec id="s2-2"><title>Dataset</title><p>This study utilizes an open-source dataset titled &#x201C;Dataset on Significant Risk Factors for Type 1 Diabetes,&#x201D; published in 2018 [<xref ref-type="bibr" rid="ref17">17</xref>]. 
The dataset targets children and adolescents in Bangladesh and is designed to explore the major risk factors associated with T1D. It includes 306 participants, with an equal distribution between those diagnosed with T1D and those without, collected through structured surveys from hospitals and diagnostic centers in Dhaka.</p><p>The dataset consists of 22 features covering a broad range of information&#x2014;from demographic characteristics to clinical indicators, such as HbA<sub>1c</sub> and hypoglycemia, in addition to lifestyle factors, comorbidities, and family history of diabetes. The types of data include categorical, temporal, continuous, and multilabel text variables. <xref ref-type="table" rid="table1">Table 1</xref> presents a summary of the dataset features and their corresponding data types. Detailed feature descriptions and categorized values are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of dataset features and data types used for type 1 diabetes (T1D) risk.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Feature</td><td align="left" valign="bottom">Type</td></tr></thead><tbody><tr><td align="left" valign="top">Age</td><td align="left" valign="top">Categorical</td></tr><tr><td align="left" valign="top">Sex</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Residence</td><td align="left" valign="top">Categorical</td></tr><tr><td align="left" valign="top">HbA<sub>1c</sub><sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Height</td><td align="left" valign="top">Continuous</td></tr><tr><td align="left" valign="top">Weight</td><td align="left" valign="top">Continuous</td></tr><tr><td align="left" valign="top">BMI</td><td align="left" 
valign="top">Continuous</td></tr><tr><td align="left" valign="top">Disease duration</td><td align="left" valign="top">Temporal</td></tr><tr><td align="left" valign="top">Comorbidities</td><td align="left" valign="top">Multilabel text</td></tr><tr><td align="left" valign="top">Nutrition status</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Mother education</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Growth in infancy</td><td align="left" valign="top">Categorical</td></tr><tr><td align="left" valign="top">Birth weight</td><td align="left" valign="top">Categorical</td></tr><tr><td align="left" valign="top">Autoantibodies</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Impaired glucose metabolism</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Takes insulin</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Insulin delivery</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Family history of T1D</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Family history of T2D<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Hypoglycemia</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">Pancreatic affected</td><td align="left" valign="top">Binary</td></tr><tr><td align="left" valign="top">T1D diagnosed</td><td align="left" valign="top">Binary</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>HbA<sub>1c</sub>: hemoglobin A<sub>1c</sub>.</p></fn><fn id="table1fn2"><p><sup>b</sup>T2D: type 2 diabetes.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Ethical Considerations</title><p>This study involved a secondary analysis of a publicly available and anonymized dataset 
published by Asaduzzaman et al [<xref ref-type="bibr" rid="ref17">17</xref>] (2018). The dataset contains deidentified records and does not include any personally identifiable information. As no new data were collected and no direct interaction with human participants occurred, institutional review board approval was not required for this analysis. Informed consent was not required, as the dataset is publicly available and contains no identifiable information. All data were handled in accordance with relevant ethical standards, ensuring the privacy and confidentiality of individuals. No compensation was provided, as this study did not involve direct participant recruitment.</p></sec><sec id="s2-4"><title>Data Preprocessing</title><sec id="s2-4-1"><title>Data Preprocessing Overview</title><p>Data preprocessing is the process of preparing raw data by resolving inconsistencies and improving its quality [<xref ref-type="bibr" rid="ref18">18</xref>]. In this study, it involved 3 main steps. The first step focused on cleaning and normalizing the dataset values. The second step concentrated on engineering relevant features and defining the target variable. The final step included splitting the data into training and testing sets. These steps played an essential role in making the data suitable for ML models.</p></sec><sec id="s2-4-2"><title>Data Cleaning</title><p>Data cleaning is the process used to identify and correct errors and inaccuracies in raw data [<xref ref-type="bibr" rid="ref18">18</xref>]. In this study, a comprehensive review of missing values within the dataset was conducted. Text formats were cleaned up, and inconsistent labels were fixed to avoid accidental duplicates or misunderstandings when reading the variables. 
Moreover, column names were standardized to simplify programmatic handling and ensure consistency.</p></sec><sec id="s2-4-3"><title>Feature Engineering</title></sec><sec id="s2-4-4"><title>Feature Engineering Overview</title><p>Feature engineering is an important stage in data preprocessing, where diverse types of features can affect how accurate the predictive model is [<xref ref-type="bibr" rid="ref18">18</xref>]. This stage aims to convert cleaned data into numerical formats aligned with the clinical context, improving the model&#x2019;s ability to detect meaningful patterns and enhance interpretability. In this study, features were categorized into 4 main types: categorical, temporal, continuous, and multilabel text features. Additionally, the dataset did not directly include the T1D risk level as a target variable, so it had to be generated from a set of important clinical variables using rules from the SDCPG. The handling of each feature type is detailed in the following subsections.</p><sec id="s2-4-4-1"><title>Categorical Features</title><p>Categorical features were converted into numerical values using encoding methods relevant to each variable type. Binary variables, such as &#x201C;yes&#x201D; and &#x201C;no,&#x201D; were encoded using binary representation as 1 and 0. For multicategory features, such as age, sex, residence, infant growth, birth weight, HbA<sub>1c</sub> level, and insulin delivery, ordinal encoding was applied in a way that preserved their clinical ordering.</p></sec><sec id="s2-4-4-2"><title>Temporal Features</title><p>The duration of T1D is considered a key factor in assessing the risk of complications. To ensure consistency, all duration values&#x2014;originally recorded in days, weeks, months, or years&#x2014;were converted into a single unit: years, following survival analysis practices outlined by Hosmer et al [<xref ref-type="bibr" rid="ref19">19</xref>]. 
To capture the clinical differences in disease duration, values were categorized as short, medium, long, and exceptionally long and were encoded from 0 to 3, as suggested by Dovc et al [<xref ref-type="bibr" rid="ref20">20</xref>].</p></sec><sec id="s2-4-4-3"><title>Continuous Numerical Features</title><p>To represent nutritional status, BMI was selected as the primary measure, while height and weight were excluded to minimize redundancy and potential multicollinearity. As the dataset did not include exact age values, BMI was categorized according to the dataset&#x2019;s age groups based on World Health Organization (WHO) growth standards [<xref ref-type="bibr" rid="ref21">21</xref>]. These categories&#x2014;underweight, normal weight, overweight, and obese&#x2014;were encoded from 0 to 3, as shown in Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec><sec id="s2-4-4-4"><title>Multilabel Text Features</title><p>The comorbidities feature consisted of unstructured free-text entries describing various health conditions. A tailored function was used to clean the data, standardize terminology, and correct duplicate or inaccurate entries. As a result, 23 common comorbidities were extracted, including heart disease, kidney disorders, hypertension, allergies, and others. Each condition was converted into a unique binary feature using a one-hot encoding technique [<xref ref-type="bibr" rid="ref22">22</xref>], allowing the model to treat them as structured numerical features. This improved the model&#x2019;s ability to capture associations.</p></sec><sec id="s2-4-4-5"><title>Risk Label Construction Based on Clinical Rules</title><p>Since the dataset did not include a direct feature indicating the patient&#x2019;s complication risk level, the target variable was generated by applying a set of clinical if-then rules derived from the SDCPG [<xref ref-type="bibr" rid="ref23">23</xref>]. 
These rules were organized into a knowledge base structured around 3 categories: identification, prevention, and management. Only the identification rules were used at this stage to classify patients into 3 risk levels: acute, chronic, and low. The original guideline used for deriving the clinical rules from the SDCPG is publicly available via the Saudi Health Council [<xref ref-type="bibr" rid="ref24">24</xref>]. Representative clinical identification rules and their corresponding thresholds are summarized in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Representative clinical identification rules derived from the Saudi Diabetes Clinical Practice Guidelines (SDCPG).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Rule ID</td><td align="left" valign="bottom">Clinical concept</td><td align="left" valign="bottom">Clinical threshold</td><td align="left" valign="bottom">Dataset variable(s)</td><td align="left" valign="bottom">Risk category</td><td align="left" valign="bottom">SDCPG source</td></tr></thead><tbody><tr><td align="left" valign="top">HYPO-01</td><td align="left" valign="top">Hypoglycemia risk associated with insulin therapy</td><td align="left" valign="top">Patient receiving insulin therapy</td><td align="left" valign="top">Takes insulin and hypoglycemia</td><td align="left" valign="top">Acute</td><td align="left" valign="top">p. 68</td></tr><tr><td align="left" valign="top">CKD-01</td><td align="left" valign="top">Chronic kidney disease risk</td><td align="left" valign="top">T1D<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> duration &#x2265;5 years</td><td align="left" valign="top">Disease duration</td><td align="left" valign="top">Chronic</td><td align="left" valign="top">p. 
82</td></tr><tr><td align="left" valign="top">NEUR-01</td><td align="left" valign="top">Neuropathy risk factors</td><td align="left" valign="top">High BMI or hypertension</td><td align="left" valign="top">BMI and other disease hypertension</td><td align="left" valign="top">Chronic</td><td align="left" valign="top">p. 81</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>T1D: type 1 diabetes.</p></fn></table-wrap-foot></table-wrap><p>Classification was based on clinical indicators, such as insulin use, hypoglycemia, comorbidities, disease duration, and BMI levels. Because some SDCPG thresholds were not available in the dataset, the rules were implemented using the closest clinical indicators available in the data. To prepare the data for modeling, the labels were encoded as 0 (low), 1 (chronic), and 2 (acute). An initial review of the resulting class distribution revealed a slight imbalance between categories. Although the dataset employed in this study originates from a pediatric population in Bangladesh, the use of rules derived from the SDCPG&#x2014;grounded in internationally recognized clinical indicators&#x2014;ensures both consistency and clinical relevance across diverse pediatric T1D populations. Their integration facilitates alignment with evidence-based medical frameworks and provides a standardized approach for risk stratification.</p></sec></sec></sec><sec id="s2-5"><title>Data Splitting</title><p>The dataset was split into 2 subsets: 80% used for training and 20% for testing. This allowed the model to learn from most of the data and check how well it performs on new, unseen examples. To ensure the class balance remained consistent in both sets, a stratified split was applied. 
This helped maintain the distribution of the target variable fairly and reduced bias in the results.</p></sec><sec id="s2-6"><title>Feature Selection</title><sec id="s2-6-1"><title>Feature Selection Overview</title><p>Following preprocessing and feature engineering, feature selection was used as a key part of the process to reduce the number of input features while optimizing model efficiency and accuracy. The process aimed to identify features that meaningfully contribute to predicting the target variable [<xref ref-type="bibr" rid="ref24">24</xref>]. This step also helped reduce noise, save computing resources, and enhance clinical interpretability.</p><p>Feature selection methods usually fall into 3 groups: filter, wrapper, and embedded. In this study, a hybrid approach was used, starting with an embedded method to reduce the initial list of features, followed by a wrapper method to select the best-performing subset.</p></sec><sec id="s2-6-2"><title>Embedded Method</title><p>Embedded methods work by incorporating feature selection directly into the model training process, allowing the model to evaluate feature importance during training. One popular technique in this category is SHAP. In this study, SHAP was used alongside a random forest model to examine how each input feature contributed to the model&#x2019;s predictions. Features were then ranked based on their average SHAP values to identify those with the most influence [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>SHAP offers clinically meaningful insights by showing how each feature contributes to the model&#x2019;s predictions. Its ability to connect technical outputs with clinical interpretation makes it particularly useful in health care applications. 
The most influential features selected from SHAP were used in the next step, which helped reduce the number of features, improve computational performance, and simplify the model structure.</p></sec><sec id="s2-6-3"><title>Wrapper Method</title><p>Wrapper methods work by assessing how different feature combinations affect model outcomes and choosing the feature set that achieves the highest evaluation score based on a defined performance metric [<xref ref-type="bibr" rid="ref24">24</xref>]. In this study, the exhaustive feature selector algorithm was used to explore all combinations of the top 10 most important features identified in the previous step, resulting in a total of 1013 subsets. Each subset was evaluated using a decision tree model and the <italic>F</italic><sub>1</sub>-weighted score through 5-fold cross-validation (CV). The average performance and standard deviation for each subset were recorded in detail.</p><p>Finally, from the 1013 evaluated subsets, the results were filtered to identify the top 5 feature sets based on the highest cross-validated <italic>F</italic><sub>1</sub> mean (CV <italic>F</italic><sub>1</sub> mean) and the lowest SD (CV <italic>F</italic><sub>1</sub> SD). This enabled the selection of combinations that were not only effective and stable but also interpretable from a clinical perspective. These selected subsets were then carried forward for model training and evaluation.</p></sec></sec><sec id="s2-7"><title>Model Training and Optimization</title><sec id="s2-7-1"><title>Model Training and Optimization Overview</title><p>After identifying the top 5 feature sets, predictive models were built using a decision tree classifier. The decision tree algorithm was chosen for its simplicity, clarity, and interpretability in clinical environments. 
The model training process was followed by hyperparameter optimization to enhance generalization and improve predictive performance.</p></sec><sec id="s2-7-2"><title>Decision Tree Classifier</title><p>The classification models in this study were developed using the decision tree classifier from the Scikit-learn library (developed by Pedregosa et al [<xref ref-type="bibr" rid="ref25">25</xref>]), a widely used ML algorithm known for its ease of interpretation. This method builds a tree-like model by sequentially splitting the dataset based on feature values, creating branches that guide the classification process.</p><p>In this work, the Gini impurity criterion was used to determine the quality of each split. This measure captures the degree of impurity in a node and promotes divisions that enhance the separation between different classes [<xref ref-type="bibr" rid="ref26">26</xref>]. Decision trees are particularly suitable for clinical contexts due to their ability to handle both continuous and categorical features, along with their transparent decision-making logic and interpretable decision paths, which allow health care professionals to trace prediction pathways and support informed decisions [<xref ref-type="bibr" rid="ref27">27</xref>].</p></sec><sec id="s2-7-3"><title>Hyperparameter Optimization</title><p>To improve model performance and reduce overfitting, hyperparameter tuning was conducted using a randomized search CV. This technique explores a specified number of random combinations within a predefined parameter grid. This tuning technique is designed to efficiently optimize model settings without an exhaustive search [<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>The process focused on adjusting the tree depth, the minimum number of samples required to split a node, and the minimum number of samples at a leaf. 
Tuning was performed using 10-fold stratified CV and the weighted <italic>F</italic><sub>1</sub>-score, which balances precision and recall&#x2014;an important factor for medical data [<xref ref-type="bibr" rid="ref29">29</xref>]. Tuning was applied to each feature set independently, retaining the best model from each.</p></sec></sec><sec id="s2-8"><title>Best Model Selection and Evaluation</title><p>To comprehensively evaluate model performance, assessments were conducted in 3 stages: the training set, the test set, and CV to measure consistency and robustness. The evaluation metrics included accuracy, weighted precision, weighted recall, weighted <italic>F</italic><sub>1</sub>-score, and multiclass area under the curve (AUC). The evaluation focused on average performance and stability, using the mean and SD across folds.</p><p>Models were ranked based on their ability to balance performance metrics on the test set, with priority given to strong <italic>F</italic><sub>1</sub> and AUC scores combined with low variability. Finally, the decision path of the highest-performing decision tree model was visualized to demonstrate the interpretability of its decision-making process.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Target Risk Distribution</title><p>The classification of patients into low, chronic, and acute risk categories was guided by clinical rules derived from the SDCPG. The class distribution shows that acute risk constituted the largest group (110/306 patients, 35.9%), followed closely by low risk (101/306 patients, 33%) and chronic risk (95/306 patients, 31%). This fairly balanced distribution enabled model training without requiring specialized class-balancing strategies. The slight increase in acute risk patients is expected, given the age group studied, as children and adolescents are more likely to experience sudden complications, such as hypoglycemia or DKA. 
In contrast, chronic conditions usually take longer to develop and are less common at younger ages [<xref ref-type="bibr" rid="ref3">3</xref>].</p></sec><sec id="s3-2"><title>Feature Selection Outcomes</title><sec id="s3-2-1"><title>SHAP Analysis Results</title><p>SHAP was applied to explore how individual features contributed to the model&#x2019;s predictions. As shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>, the SHAP summary plot highlights the top 10 features with the greatest average impact. The results showed that BMI, hypoglycemia, insulin delivery method, and disease duration were the most influential factors in predicting risk levels. BMI emerged as the top contributor, consistent with findings from previous studies that associate obesity with an increased risk of chronic complications, such as cardiovascular disease and hypertension [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref32">32</xref>]. Moreover, hypoglycemic episodes were strongly associated with acute risk, aligning with prior research that identifies such events as among the most dangerous acute complications [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. The insulin delivery method and disease duration showed moderate importance, reflecting their known influence on glycemic control and complication risk reduction [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. Although HbA<sub>1c</sub>, age, and impaired glucose metabolism had relatively lower SHAP values, they remain clinically relevant, particularly HbA<sub>1c</sub>, which is widely recognized as a key indicator of poor glycemic control [<xref ref-type="bibr" rid="ref37">37</xref>]. 
The exact SHAP values corresponding to <xref ref-type="fig" rid="figure2">Figure 2</xref> are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>SHAP (Shapley Additive Explanations) summary plot showing the top 10 features ranked by their contribution across all 3 complication risk levels. Feature contributions are displayed using grayscale bars for each class: Class 0=low risk (dark), Class 1=chronic risk (medium), and Class 2=acute risk (light), based on the mean absolute SHAP values. HbA<sub>1c</sub>: hemoglobin A<sub>1c</sub>; T1D: type 1 diabetes.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e81039_fig02.png"/></fig></sec><sec id="s3-2-2"><title>EFS Subset Results</title><p>To build on the SHAP results, an EFS process was applied to identify the best-performing subsets among all possible combinations of the top 10 features. As shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, the relationship between the number of features and model performance shows that classification performance (CV <italic>F</italic><sub>1</sub> mean [SD]) improved with more features, particularly between 5 and 7 features. Based on these results, the top 5 feature subsets&#x2014;those with the highest <italic>F</italic><sub>1</sub>-scores&#x2014;were selected for full model training and evaluation, as detailed in <xref ref-type="table" rid="table3">Table 3</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Cross-validation (CV) <italic>F</italic><sub>1</sub> performance (mean [SD]) across feature subsets of varying sizes, selected through exhaustive feature selection (EFS) from the top 10 SHAP-ranked features. The plot highlights that feature sets with 5 to 7 features achieve high and stable performance. 
SHAP: Shapley Additive Explanations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e81039_fig03.png"/></fig><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Details of the 5 selected feature subsets, which serve as the core input for the training and evaluation phase.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ID</td><td align="left" valign="bottom">Feature set</td><td align="left" valign="bottom">Number of features</td><td align="left" valign="bottom">Train <italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">CV<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> <italic>F</italic><sub>1</sub> mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">C409</td><td align="left" valign="top">BMI + hypoglycemia + disease duration + HbA<sub>1c</sub><sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> + impaired glucose metabolism</td><td align="left" valign="top">5</td><td align="left" valign="top">0.991832</td><td align="left" valign="top">0.975111 (0.024141)</td></tr><tr><td align="left" valign="top">C640</td><td align="left" valign="top">BMI + hypoglycemia + disease duration + HbA<sub>1c</sub> + impaired glucose metabolism + insulin delivery</td><td align="left" valign="top">6</td><td align="left" valign="top">0.991832</td><td align="left" valign="top">0.975111 (0.024141)</td></tr><tr><td align="left" valign="top">C670</td><td align="left" valign="top">BMI + hypoglycemia + disease duration + HbA<sub>1c</sub> + impaired glucose metabolism + T1D diagnosed</td><td align="left" valign="top">6</td><td align="left" valign="top">0.991832</td><td align="left" valign="top">0.975111 (0.024141)</td></tr><tr><td align="left" valign="top">C676</td><td align="left" valign="top">BMI + hypoglycemia + disease duration + HbA<sub>1c</sub> + impaired glucose metabolism + takes 
insulin</td><td align="left" valign="top">6</td><td align="left" valign="top">0.991832</td><td align="left" valign="top">0.975111 (0.024141)</td></tr><tr><td align="left" valign="top">C679</td><td align="left" valign="top">BMI + hypoglycemia + disease duration + HbA<sub>1c</sub> + impaired glucose metabolism + age</td><td align="left" valign="top">6</td><td align="left" valign="top">0.991832</td><td align="left" valign="top">0.975111 (0.024141)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>CV: cross-validation.</p></fn><fn id="table3fn2"><p><sup>b</sup>HbA<sub>1c</sub>: hemoglobin A<sub>1c</sub>.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s3-3"><title>Model Evaluation Results</title><p><xref ref-type="table" rid="table4">Table 4</xref> presents a detailed comparison of 5 predictive models developed using the top-performing feature subsets. Evaluations were conducted across training, testing, and CV phases to assess their accuracy and consistency. All models achieved a test <italic>F</italic><sub>1</sub>-score of 0.984, with a CV <italic>F</italic><sub>1</sub> SD of 0.0189, reflecting strong and consistent performance. In addition to performing similarly, all the models shared a common group of 5 core features: BMI, hypoglycemia, disease duration, HbA<sub>1c</sub>, and impaired glucose metabolism. 
Variations between the models were limited to 1 additional feature per model, but these differences did not result in noticeable performance improvements.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Comparison of the performance of 5 predictive models based on the best feature sets (SHAP<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> and EFS<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ID</td><td align="left" valign="bottom">Feature set</td><td align="left" valign="bottom">Number of features</td><td align="left" valign="bottom">Train <italic>F</italic><sub>1</sub></td><td align="left" valign="bottom">Test <italic>F</italic><sub>1</sub></td><td align="left" valign="bottom">CV<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> <italic>F</italic><sub>1</sub> mean (SD)</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">BMI + hypoglycemia + disease duration + HbA<sub>1c</sub><sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup> + impaired glucose metabolism</td><td align="left" valign="top">5</td><td align="left" valign="top">0.99180</td><td align="left" valign="top">0.98387</td><td align="left" valign="top">0.98761 (0.01892)</td><td align="left" valign="top">0.98661</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">BMI + hypoglycemia + disease duration + HbA<sub>1c</sub> + impaired glucose metabolism + insulin delivery</td><td align="left" valign="top">6</td><td align="left" valign="top">0.99180</td><td align="left" valign="top">0.98387</td><td align="left" valign="top">0.98761 (0.01892)</td><td align="left" valign="top">0.98661</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">BMI + 
hypoglycemia + disease duration + HbA<sub>1c</sub> + impaired glucose metabolism + T1D diagnosed</td><td align="left" valign="top">6</td><td align="left" valign="top">0.99180</td><td align="left" valign="top">0.98387</td><td align="left" valign="top">0.98761 (0.01892)</td><td align="left" valign="top">0.98661</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">BMI + hypoglycemia + disease duration + HbA<sub>1c</sub> + impaired glucose metabolism + takes insulin</td><td align="left" valign="top">6</td><td align="left" valign="top">0.99180</td><td align="left" valign="top">0.98387</td><td align="left" valign="top">0.98761 (0.01892)</td><td align="left" valign="top">0.98661</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">BMI + hypoglycemia + disease duration + HbA<sub>1c</sub> + impaired glucose metabolism + age</td><td align="left" valign="top">6</td><td align="left" valign="top">0.99180</td><td align="left" valign="top">0.98387</td><td align="left" valign="top">0.98761 (0.01892)</td><td align="left" valign="top">0.98661</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>SHAP: Shapley Additive Explanations.</p></fn><fn id="table4fn2"><p><sup>b</sup>EFS: exhaustive feature selection.</p></fn><fn id="table4fn3"><p><sup>c</sup>CV: cross-validation.</p></fn><fn id="table4fn4"><p><sup>d</sup>AUC: area under the curve.</p></fn><fn id="table4fn5"><p><sup>e</sup>HbA<sub>1c</sub>: hemoglobin A<sub>1c</sub>.</p></fn></table-wrap-foot></table-wrap><p>The complete evaluation results, including all 1013 feature subsets tested via EFS, and detailed metrics for the top 5 models, are provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendices 4</xref> and <xref ref-type="supplementary-material" rid="app5">5</xref>.</p></sec><sec id="s3-4"><title>Final Model Interpretation</title><p>Although all 5 models demonstrated similar quantitative performance, the first model was 
selected as the final model due to its structural simplicity. It relies on 5 core features instead of 6 while still maintaining high predictive accuracy. Notably, these 5 features&#x2014;hypoglycemic episodes, HbA<sub>1c</sub>, BMI, disease duration, and impaired glucose metabolism&#x2014;were consistently present across all top-performing feature sets. <xref ref-type="fig" rid="figure4">Figure 4</xref> illustrates how the final model makes decisions through a simplified tree that reflects a clinically logical sequence. It begins by checking for hypoglycemic episodes, then evaluates HbA<sub>1c</sub> if episodes are recorded. For patients without hypoglycemia, it moves through BMI, disease duration, and impaired glucose metabolism sequentially to reach a final risk classification.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Final decision tree for classifying complication risk levels in children and adolescents with type 1 diabetes (T1D), using 5 clinical features. Risk levels are color-coded as follows: low (yellow), chronic (green), and acute (purple). Each node represents a binary decision, where the left path indicates &#x201C;true&#x201D; and the right path indicates &#x201C;false.&#x201D;</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e81039_fig04.png"/></fig></sec><sec id="s3-5"><title>Illustrative Examples of Risk Classification</title><p>To demonstrate how the model processes patient information, <xref ref-type="table" rid="table5">Table 5</xref> presents 2 hypothetical patients based on the decision paths shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>.</p><p>For patient A, the model first checks for hypoglycemia. Because hypoglycemia is present, the model follows the right branch of the tree and evaluates HbA<sub>1c</sub>. Since the HbA<sub>1c</sub> level is below 7.5%, the patient is classified as an acute risk. 
For patient B, no history of hypoglycemia is present. The model follows the left branch of the tree and evaluates BMI and metabolic indicators. Because the patient is obese and shows impaired glucose metabolism, the model classifies the patient as a chronic risk.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Hypothetical patient examples illustrating the decision path of the final decision tree model<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Feature</td><td align="left" valign="bottom">Patient A</td><td align="left" valign="bottom">Patient B</td></tr></thead><tbody><tr><td align="left" valign="top">Hypoglycemia</td><td align="left" valign="top">Yes (1)</td><td align="left" valign="top">No (0)</td></tr><tr><td align="left" valign="top">BMI</td><td align="left" valign="top">Normal (1)</td><td align="left" valign="top">Obese (3)</td></tr><tr><td align="left" valign="top">Disease duration</td><td align="left" valign="top">Medium (1)</td><td align="left" valign="top">Long (2)</td></tr><tr><td align="left" valign="top">HbA<sub>1c</sub><sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="top">Less than 7.5% (0)</td><td align="left" valign="top">Less than 7.5% (0)</td></tr><tr><td align="left" valign="top">Impaired glucose metabolism</td><td align="left" valign="top">Yes (1)</td><td align="left" valign="top">Yes (1)</td></tr><tr><td align="left" valign="top">Predicted risk level</td><td align="left" valign="top">Acute</td><td align="left" valign="top">Chronic</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Values in parentheses represent the encoded numerical values used by the decision tree during preprocessing.</p></fn><fn id="table5fn2"><p><sup>b</sup>HbA<sub>1c</sub>: hemoglobin A<sub>1c</sub>.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec 
id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><sec id="s4-1-1"><title>Overall Model Performance</title><p>This study designed a decision tree model capable of classifying complication risk levels in children and adolescents with T1D using only 5 clinical features. The model achieved a high <italic>F</italic><sub>1</sub> mean score of 0.9876 with a low SD of 0.0189, demonstrating both strong predictive accuracy and consistency. These results establish that a small, targeted feature set is adequate to distinguish risk categories effectively, satisfying the study&#x2019;s goal of creating a simple and interpretable predictive model suitable for clinical integration. Furthermore, the decision tree&#x2019;s high performance indicates that interpretable models can effectively operationalize the clinical reasoning embedded in the SDCPG.</p></sec><sec id="s4-1-2"><title>Clinical Interpretation of Selected Features</title><p>Importantly, the selected 5 features (BMI, hypoglycemia, disease duration, HbA<sub>1c</sub>, and impaired glucose metabolism) not only achieved high performance but also aligned closely with clinical understanding. The SHAP analysis confirmed that BMI and hypoglycemia were the most impactful predictors: BMI was primarily associated with chronic and low-risk classification, while hypoglycemia strongly influenced acute risk predictions. Disease duration served as a moderate-level contributor that intersected with both chronic and acute risk categories. 
HbA<sub>1c</sub> and impaired glucose metabolism had relatively lower SHAP values but were consistently present across all high-performing models, indicating that even features with modest SHAP scores can still contribute meaningfully when viewed in combination.</p><p>These selected features are consistent with the SDCPG-derived knowledge base rules summarized in <xref ref-type="table" rid="table2">Table 2</xref>, which link hypoglycemic events with acute complications and link prolonged disease duration and metabolic risk indicators, such as high BMI, with chronic complication risk, including conditions such as CKD and neuropathy. The identification rules explicitly associate insulin therapy with hypoglycemia risk, further supporting the model&#x2019;s reliance on hypoglycemia as a primary indicator of acute risk. BMI similarly reflects metabolic status indicators outlined in the guideline rules.</p><p>In contrast, HbA<sub>1c</sub> and impaired glucose metabolism emerged as additional metabolic predictors identified through the data-driven feature selection process. While several indicators used during label construction, including BMI, hypoglycemia, and disease duration, remained among the most informative predictors, other variables originally included in the rule definitions, such as hypertension, did not appear among the top-ranked features in the SHAP analysis. Overall, the feature selection process did not produce relationships that contradict SDCPG recommendations; rather, it highlighted the most clinically relevant indicators present in the dataset.</p></sec><sec id="s4-1-3"><title>Justification for Feature Reduction and Model Transparency</title><p>Furthermore, although some of the 6-feature models included additional variables, such as insulin delivery, T1D diagnosis confirmation, or patient age&#x2014;features that had notable SHAP values&#x2014;their inclusion did not enhance performance beyond what was achieved with the 5-feature model. 
Several insulin-related variables, including insulin use (takes insulin) and insulin delivery method, were evaluated during the feature selection process; however, their inclusion did not improve model performance compared with the 5-feature model, as shown in <xref ref-type="table" rid="table4">Table 4</xref>. This outcome demonstrates that model simplification does not compromise accuracy and highlights that a smaller set of high-quality features can be just as effective. It reinforces the broader principle that feature quality is more critical than quantity and that adding more variables does not necessarily lead to better results. Such simplification enhances the model&#x2019;s usability in clinical settings, where clarity and transparency are essential. These findings further emphasize SHAP&#x2019;s strength as an interpretable analytical tool that aligns with clinical reasoning and SDCPG recommendations. Additionally, the EFS process validated the consistency of top-performing feature combinations, reinforcing model robustness and minimizing the risk of overfitting.</p></sec></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>To evaluate the contribution of this study, a comparative analysis was conducted with 5 representative ML studies targeting the prediction of diabetes complications. The goal of this comparison is to evaluate how this study differs in methodology, clinical integration, model interpretability, and performance outcomes. <xref ref-type="table" rid="table6">Table 6</xref> presents general information on the selected studies, including their primary objectives, targeted complication types, dataset origin and size, and whether they incorporated formal clinical guidelines. <xref ref-type="table" rid="table7">Table 7</xref> summarizes the modeling approaches used in each study, reported performance metrics, and levels of interpretability. 
Including this study, a total of 6 models are summarized.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>General information on compared studies.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ID</td><td align="left" valign="bottom">Study and year</td><td align="left" valign="bottom">Purpose</td><td align="left" valign="bottom">Complication type</td><td align="left" valign="bottom">Location</td><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">Based on clinical guidelines</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Jian et al [<xref ref-type="bibr" rid="ref5">5</xref>] (2021)</td><td align="left" valign="top">Predict 8 diabetes complications</td><td align="left" valign="top">Chronic</td><td align="left" valign="top">UAE (Ajman)</td><td align="left" valign="top">Structured EHR<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup> (N=884)</td><td align="left" valign="top">No</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Ravaut et al [<xref ref-type="bibr" rid="ref9">9</xref>] (2021)</td><td align="left" valign="top">Predict adverse outcomes</td><td align="left" valign="top">Acute and chronic</td><td align="left" valign="top">Canada (Ontario)</td><td align="left" valign="top">Admin health data (&#x003E;1.5 million)</td><td align="left" valign="top">No</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Eid et al [<xref ref-type="bibr" rid="ref10">10</xref>] (2023)</td><td align="left" valign="top">Predict DKA<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> in pediatric patients</td><td align="left" valign="top">Acute</td><td align="left" valign="top">Saudi Arabia</td><td align="left" valign="top">Structured EHR (N=3737)</td><td align="left" valign="top">No</td></tr><tr><td align="left" valign="top">4</td><td align="left" 
valign="top">Subramanian et al [<xref ref-type="bibr" rid="ref11">11</xref>] (2024)</td><td align="left" valign="top">Predict postdiagnosis DKA</td><td align="left" valign="top">Acute</td><td align="left" valign="top">United States (Texas)</td><td align="left" valign="top">Structured EHR (N=1787)</td><td align="left" valign="top">No</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Voskergian et al [<xref ref-type="bibr" rid="ref12">12</xref>] (2025)</td><td align="left" valign="top">Predict 4 complications</td><td align="left" valign="top">Chronic</td><td align="left" valign="top">Palestine/T&#x00FC;rkiye</td><td align="left" valign="top">Synthetic EHR (~1 million)</td><td align="left" valign="top">No</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">This study (2026)</td><td align="left" valign="top">Classify risk levels in pediatric T1D<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">Acute and chronic</td><td align="left" valign="top">Bangladesh</td><td align="left" valign="top">Open-source pediatric dataset (N=306)</td><td align="left" valign="top">Yes (SDCPG<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup>)</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>EHR: electronic health record.</p></fn><fn id="table6fn2"><p><sup>b</sup>DKA: diabetic ketoacidosis.</p></fn><fn id="table6fn3"><p><sup>c</sup>T1D: type 1 diabetes.</p></fn><fn id="table6fn4"><p><sup>d</sup>SDCPG: Saudi Diabetes Clinical Practice Guidelines.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Summary of modeling approaches and performance in previous work.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study ID</td><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Performance</td><td align="left" 
valign="bottom">Interpretability</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">RF<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup> (<italic>F</italic><sub>1</sub>=97.7%), SVM<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup> (<italic>F</italic><sub>1</sub>=96.6%), DT<sup><xref ref-type="table-fn" rid="table7fn3">c</xref></sup> (<italic>F</italic><sub>1</sub>=95.2%)</td><td align="left" valign="top">Accuracy: 97.8%</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">GBDT<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup></td><td align="left" valign="top">AUC<sup><xref ref-type="table-fn" rid="table7fn5">e</xref></sup>&#x2248;77.7</td><td align="left" valign="top">Low-moderate</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">RF (performed best), DT, kNN<sup><xref ref-type="table-fn" rid="table7fn6">f</xref></sup>, GB<sup><xref ref-type="table-fn" rid="table7fn7">g</xref></sup>, AdaBoost, CN2</td><td align="left" valign="top">AUC=0.98, <italic>F</italic><sub>1</sub>=0.92</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table7fn8">h</xref></sup></td><td align="left" valign="top">AUC=0.80, <italic>F</italic><sub>1</sub>=0.78</td><td align="left" valign="top">High (SHAP<sup><xref ref-type="table-fn" rid="table7fn9">i</xref></sup>)</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">XGBoost (AUC=85%), RF (AUC=83%), AdaBoost (AUC=77%), DT (AUC=80%)</td><td align="left" valign="top">Accuracy: 69%&#x2010;78%</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">DT (5 features), SHAP</td><td align="left" valign="top">AUC&#x2248;0.98, <italic>F</italic><sub>1</sub>=0.98</td><td align="left" 
valign="top">High (rule-based + SHAP)</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>RF: random forest.</p></fn><fn id="table7fn2"><p><sup>b</sup>SVM: support vector machine.</p></fn><fn id="table7fn3"><p><sup>c</sup>DT: decision tree.</p></fn><fn id="table7fn4"><p><sup>d</sup>GBDT: gradient boosted decision tree.</p></fn><fn id="table7fn5"><p><sup>e</sup>AUC: area under the curve.</p></fn><fn id="table7fn6"><p><sup>f</sup>kNN: k-nearest neighbors.</p></fn><fn id="table7fn7"><p><sup>g</sup>GB: gradient boosting.</p></fn><fn id="table7fn8"><p><sup>h</sup>XGBoost: extreme gradient boosting.</p></fn><fn id="table7fn9"><p><sup>i</sup>SHAP: Shapley Additive Explanations.</p></fn></table-wrap-foot></table-wrap><p>A key differentiator of this work is its ability to classify both acute and chronic complication risks through a structured 3-level classification (low, chronic, and acute), whereas most prior studies focused on predicting only a single complication type, typically either acute (eg, DKA) or chronic (eg, CKD). While Ravaut et al [<xref ref-type="bibr" rid="ref9">9</xref>] addressed both complication types, their model still adopted a binary outcome structure, lacking the nuanced stratification offered in this study. Additionally, this study is the only one among the reviewed works to utilize national clinical guidelines (SDCPG) to inform risk labeling, thereby strengthening its clinical alignment.</p><p>Despite using a relatively small open-source dataset (N=306), the proposed model achieved a test <italic>F</italic><sub>1</sub>-score of 0.98 and AUC &#x2248; 0.98, on par with or exceeding the performance of more complex models built on larger datasets. Furthermore, it employs only 5 clinically meaningful features and leverages a decision tree classifier enhanced by SHAP analysis, providing both transparency and clinical explainability. 
This comparison emphasizes that high predictive accuracy can be achieved without sacrificing interpretability, especially when models are designed with clinical context and usability in mind.</p></sec><sec id="s4-3"><title>Clinical Relevance and Usability</title><p>The model demonstrated powerful performance while maintaining a clear and interpretable structure, which makes it a practical choice for clinical use. In addition to its technical strengths, this model, based on SDCPG, ensures consistency with local clinical practice. This alignment enhances its credibility and supports smooth integration into existing clinical systems, increasing its potential for real-world adoption. Its ability to identify complication risks in children and adolescents with T1D ensures early intervention and supports a shift toward preventive care rather than reactive treatment.</p><p>In this context, the model contributes specifically to the predictive principle of P4 medicine by enabling the early identification of complication risk in this population. While other principles of P4 medicine&#x2014;such as preventive, personalized, and participatory strategies&#x2014;require further integration, this model offers a foundational predictive tool to support future enhancements. It not only anticipates complications but also adapts to individual clinical profiles and offers transparent decision paths that can be shared with both patients and health care providers.</p></sec><sec id="s4-4"><title>Limitations</title><p>While the model yielded encouraging outcomes, several important limitations should be considered. The dataset used in this study was published in 2018 and was derived from a single center, with a relatively small sample of 306 pediatric patients, which may affect the model&#x2019;s ability to generalize, especially in rare or borderline presentations. 
Since the model was not externally validated, the findings may not fully translate to real-world settings, which could limit how well the model performs across broader populations.</p><p>A further limitation is that the risk labels were generated using SDCPG-derived rules rather than independently observed clinical outcomes, such as confirmed DKA or nephropathy. Therefore, the presented model performance reflects adherence to guideline-based classification logic, rather than prospective prediction of clinical complications. Future longitudinal investigations are required to validate the proposed risk classifications using real-world clinical outcomes.</p><p>Additionally, the clinical data originated from a Bangladeshi cohort, whereas the risk classification rules were derived from the SDCPG. Although this supports guideline portability, differences in health care infrastructure, clinical practices, or population characteristics may influence generalizability. Some clinical features were simplified, such as using age groups instead of exact values, and certain variables were inconsistently recorded or structured, which may reduce predictive precision in pediatric patients. Moreover, the study did not compare its model against an established clinical risk scoring system or clinical decision support system baseline tool, as no standardized benchmark tool was available for this specific population and use case.</p><p>Finally, although the model uses common clinical features, such as HbA<sub>1c</sub>, disease duration, and BMI, inconsistencies in data documentation and system integration across health care institutions could pose challenges for its direct implementation into clinical decision support system platforms.</p></sec><sec id="s4-5"><title>Future Directions</title><p>Future work should aim to validate the model using external datasets from diverse populations to assess its generalizability. 
Additionally, exploring the integration of other P4 medicine elements, such as personalized treatment pathways and participatory tools, could enhance the model&#x2019;s utility. Collaboration with health care institutions to embed the model into existing EHR systems and collect feedback from clinicians on usability will be critical for real-world applications and iterative refinement.</p></sec><sec id="s4-6"><title>Conclusions</title><p>This study developed a clinically meaningful approach to classifying complication risks in children and adolescents with T1D, based on a set of clinical rules extracted from the SDCPG, to build a model that balances accuracy with interpretability. Using a hybrid feature selection technique that combines SHAP and EFS with a decision tree model, the model achieved consistently high performance using only 5 clinical indicators. These results suggest that effective risk classification can be achieved without complex systems, making the model a practical candidate for clinical use. Its transparency makes it easier for health care teams to trust and apply.</p><p>Future work should focus on external validation to test its generalizability and on expanding the dataset to improve robustness. Further exploration of the model&#x2019;s integration within EHR systems and alignment with the broader principles of P4 medicine is also warranted. This study specifically addresses the predictive component, laying the foundation for future work that could incorporate preventive, personalized, and participatory dimensions to enhance diabetes care for children and adolescents.</p></sec></sec></body><back><ack><p>The authors would like to acknowledge the support of the KAU Endowment (WAQF) and the Deanship of Scientific Research (DSR) at King Abdulaziz University.</p><p>The authors also acknowledge the publicly available dataset provided by Asaduzzaman et al [<xref ref-type="bibr" rid="ref17">17</xref>], which served as the foundation for this study. 
The dataset was used in accordance with its terms of use.</p><p>Generative artificial intelligence tools (ChatGPT, OpenAI; web version) were used to assist with language refinement, translation improvement, and stylistic editing of the manuscript. All scientific content, data analyses, interpretations of results, and conclusions were developed and critically reviewed by the authors.</p></ack><notes><sec><title>Funding</title><p>The project was funded by KAU Endowment (WAQF) at King Abdulaziz University, Jeddah, Saudi Arabia. The authors, therefore, acknowledge with thanks WAQF and the DSR for financial support.</p></sec><sec><title>Data Availability</title><p>The dataset used in this study is publicly available from Asaduzzaman et al [<xref ref-type="bibr" rid="ref17">17</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: JF</p><p>Data curation: JF</p><p>Formal analysis: JF</p><p>Methodology: JF</p><p>Writing &#x2013; original draft: JF</p><p>Supervision: HB</p><p>Writing &#x2013; review and editing: HB</p><p>Both authors approved the final version of the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb2">CKD</term><def><p>chronic kidney disease</p></def></def-item><def-item><term id="abb3">CV</term><def><p>cross-validation</p></def></def-item><def-item><term id="abb4">DKA</term><def><p>diabetic ketoacidosis</p></def></def-item><def-item><term id="abb5">EFS</term><def><p>exhaustive feature selection</p></def></def-item><def-item><term id="abb6">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb7"><italic>F</italic><sub>1</sub></term><def><p><italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall)</p></def></def-item><def-item><term id="abb8">HbA<sub>1c</sub></term><def><p>hemoglobin 
A<sub>1c</sub></p></def></def-item><def-item><term id="abb9">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb10">P4 medicine</term><def><p>predictive, preventive, personalized, and participatory medicine</p></def></def-item><def-item><term id="abb11">SDCPG</term><def><p>Saudi Diabetes Clinical Practice Guidelines</p></def></def-item><def-item><term id="abb12">SHAP</term><def><p>Shapley Additive Explanations</p></def></def-item><def-item><term id="abb13">T1D</term><def><p>type 1 diabetes</p></def></def-item><def-item><term id="abb14">WHO</term><def><p>World Health Organization</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ling</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Lemos</surname><given-names>JRN</given-names> </name><name name-style="western"><surname>Hirani</surname><given-names>K</given-names> </name><name name-style="western"><surname>von Herrath</surname><given-names>M</given-names> </name></person-group><article-title>Type 1 diabetes: immune pathology and novel therapeutic approaches</article-title><source>Diabetol Int</source><year>2024</year><month>10</month><volume>15</volume><issue>4</issue><fpage>761</fpage><lpage>776</lpage><pub-id pub-id-type="doi">10.1007/s13340-024-00748-z</pub-id><pub-id pub-id-type="medline">39469552</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="report"><article-title>Type 1 diabetes estimates in children and adults</article-title><year>2022</year><access-date>2026-03-03</access-date><publisher-name>International Diabetes Federation</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://diabetesatlas.org/resources/idf-diabetes-atlas-reports/type-1-diabetes-estimates-in-children-and-adults/">https://diabetesatlas.org/resources/idf-diabetes-atlas-reports/type-1-diabetes-estimates-in-children-and-adults/</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Melmed</surname><given-names>S</given-names> </name><name name-style="western"><surname>Auchus</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Goldfine</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Rosen</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Kopp</surname><given-names>PA</given-names> </name></person-group><source>Williams Textbook of Endocrinology</source><year>2024</year><edition>15</edition><publisher-name>Elsevier</publisher-name><pub-id pub-id-type="other">9780323932301</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abraham</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Karges</surname><given-names>B</given-names> </name><name name-style="western"><surname>Dovc</surname><given-names>K</given-names> </name><etal/></person-group><article-title>ISPAD clinical practice consensus guidelines 2022: assessment and management of hypoglycemia in children and adolescents with diabetes</article-title><source>Pediatr Diabetes</source><year>2022</year><month>12</month><volume>23</volume><issue>8</issue><fpage>1322</fpage><lpage>1340</lpage><pub-id pub-id-type="doi">10.1111/pedi.13443</pub-id><pub-id pub-id-type="medline">36537534</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Jian</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Pasquier</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sagahyroon</surname><given-names>A</given-names> </name><name name-style="western"><surname>Aloul</surname><given-names>F</given-names> </name></person-group><article-title>A machine learning approach to predicting diabetes complications</article-title><source>Healthcare (Basel)</source><year>2021</year><month>12</month><day>9</day><volume>9</volume><issue>12</issue><fpage>1712</fpage><pub-id pub-id-type="doi">10.3390/healthcare9121712</pub-id><pub-id pub-id-type="medline">34946438</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>American Diabetes Association Professional Practice Committee</collab></person-group><article-title>6. Glycemic goals and hypoglycemia: standards of care in diabetes&#x2014;2024</article-title><source>Diabetes Care</source><year>2024</year><month>01</month><day>1</day><volume>47</volume><issue>Supplement_1</issue><fpage>S111</fpage><lpage>S125</lpage><pub-id pub-id-type="doi">10.2337/dc24-S006</pub-id><pub-id pub-id-type="medline">38078586</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scheideman</surname><given-names>AF</given-names> </name><name name-style="western"><surname>Shao</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Zelada</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Machine learning to diagnose complications of diabetes</article-title><source>J Diabetes Sci Technol</source><year>2025</year><month>11</month><volume>19</volume><issue>6</issue><fpage>1650</fpage><lpage>1670</lpage><pub-id 
pub-id-type="doi">10.1177/19322968251365245</pub-id><pub-id pub-id-type="medline">40932163</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cveticanin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Arsenovic</surname><given-names>M</given-names> </name></person-group><article-title>Prediction models for diabetes in children and adolescents: a review</article-title><source>Appl Sci</source><year>2025</year><volume>15</volume><issue>6</issue><fpage>2906</fpage><pub-id pub-id-type="doi">10.3390/app15062906</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ravaut</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sadeghi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>KK</given-names> </name><etal/></person-group><article-title>Predicting adverse outcomes due to diabetes complications with machine learning using administrative health data</article-title><source>NPJ Digit Med</source><year>2021</year><month>02</month><day>12</day><volume>4</volume><issue>1</issue><fpage>24</fpage><pub-id pub-id-type="doi">10.1038/s41746-021-00394-8</pub-id><pub-id pub-id-type="medline">33580109</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eid</surname><given-names>WM</given-names> </name><name name-style="western"><surname>Alharthi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Aslam</surname><given-names>N</given-names> </name><name name-style="western"><surname>Abdur rab</surname><given-names>IU</given-names> </name><name 
name-style="western"><surname>Madani</surname><given-names>A</given-names> </name></person-group><article-title>Predicting diabetic ketoacidosis in pediatric patients using machine learning</article-title><source>F1000Res</source><year>2023</year><volume>12</volume><fpage>611</fpage><pub-id pub-id-type="doi">10.12688/f1000research.130042.1</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Subramanian</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sonabend</surname><given-names>R</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>I</given-names> </name></person-group><article-title>A machine learning model for risk stratification of postdiagnosis diabetic ketoacidosis hospitalization in pediatric type 1 diabetes: retrospective study</article-title><source>JMIR Diabetes</source><year>2024</year><volume>9</volume><fpage>e53338</fpage><pub-id pub-id-type="doi">10.2196/53338</pub-id><pub-id pub-id-type="medline">39110490</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Voskergian</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bakir-Gungor</surname><given-names>B</given-names> </name><name name-style="western"><surname>Yousef</surname><given-names>M</given-names> </name></person-group><article-title>Engineering novel features for diabetes complication prediction using synthetic electronic health records</article-title><source>Front Genet</source><year>2025</year><volume>16</volume><fpage>1451290</fpage><pub-id pub-id-type="doi">10.3389/fgene.2025.1451290</pub-id><pub-id pub-id-type="medline">40309033</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Mora</surname><given-names>T</given-names> </name><name name-style="western"><surname>Roche</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rodr&#x00ED;guez-S&#x00E1;nchez</surname><given-names>B</given-names> </name></person-group><article-title>Predicting the onset of diabetes-related complications after a diabetes diagnosis with machine learning algorithms</article-title><source>Diabetes Res Clin Pract</source><year>2023</year><month>10</month><volume>204</volume><fpage>110910</fpage><pub-id pub-id-type="doi">10.1016/j.diabres.2023.110910</pub-id><pub-id pub-id-type="medline">37722566</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rivetti</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hursh</surname><given-names>BE</given-names> </name><name name-style="western"><surname>Miraglia Del Giudice</surname><given-names>E</given-names> </name><name name-style="western"><surname>Marzuillo</surname><given-names>P</given-names> </name></person-group><article-title>Acute and chronic kidney complications in children with type 1 diabetes mellitus</article-title><source>Pediatr Nephrol</source><year>2023</year><month>05</month><volume>38</volume><issue>5</issue><fpage>1449</fpage><lpage>1458</lpage><pub-id pub-id-type="doi">10.1007/s00467-022-05689-w</pub-id><pub-id pub-id-type="medline">35896816</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fregoso-Aparicio</surname><given-names>L</given-names> </name><name name-style="western"><surname>Noguez</surname><given-names>J</given-names> </name><name name-style="western"><surname>Montesinos</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Garc&#x00ED;a-Garc&#x00ED;a</surname><given-names>JA</given-names> </name></person-group><article-title>Machine learning and deep learning predictive models for type 2 diabetes: a systematic review</article-title><source>Diabetol Metab Syndr</source><year>2021</year><month>12</month><day>20</day><volume>13</volume><issue>1</issue><fpage>148</fpage><pub-id pub-id-type="doi">10.1186/s13098-021-00767-9</pub-id><pub-id pub-id-type="medline">34930452</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Netayawijit</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chansanam</surname><given-names>W</given-names> </name><name name-style="western"><surname>Sorn-In</surname><given-names>K</given-names> </name></person-group><article-title>Interpretable machine learning framework for diabetes prediction: integrating SMOTE balancing with SHAP explainability for clinical decision support</article-title><source>Healthcare (Basel)</source><year>2025</year><month>10</month><day>14</day><volume>13</volume><issue>20</issue><fpage>2588</fpage><pub-id pub-id-type="doi">10.3390/healthcare13202588</pub-id><pub-id pub-id-type="medline">41154264</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Asaduzzaman</surname><given-names>S</given-names> </name><name name-style="western"><surname>Masud</surname><given-names>FA</given-names> </name><name name-style="western"><surname>Bhuiyan</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ahmed</surname><given-names>K</given-names> </name><name name-style="western"><surname>Paul</surname><given-names>BK</given-names> </name><name name-style="western"><surname>Rahman</surname><given-names>S</given-names> 
</name></person-group><article-title>Dataset on significant risk factors for type 1 diabetes: a Bangladeshi perspective</article-title><source>Data Brief</source><year>2018</year><month>12</month><volume>21</volume><fpage>700</fpage><lpage>708</lpage><pub-id pub-id-type="doi">10.1016/j.dib.2018.10.018</pub-id><pub-id pub-id-type="medline">30666315</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alasadi</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Bhaya</surname><given-names>WS</given-names> </name></person-group><article-title>Review of data preprocessing techniques in data mining</article-title><source>J Eng Appl Sci</source><year>2017</year><volume>12</volume><issue>16</issue><fpage>4102</fpage><lpage>4107</lpage><pub-id pub-id-type="doi">10.3923/jeasci.2017.4102.4107</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hosmer</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Lemeshow</surname><given-names>S</given-names> </name><name name-style="western"><surname>May</surname><given-names>S</given-names> </name></person-group><source>Applied Survival Analysis: Regression Modeling of Time-to-Event Data</source><year>2008</year><access-date>2026-04-21</access-date><edition>2</edition><publisher-name>John Wiley &#x0026; Sons, Inc</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://download.e-bookshelf.de/download/0000/5709/18/L-G-0000570918-0002357449.pdf">https://download.e-bookshelf.de/download/0000/5709/18/L-G-0000570918-0002357449.pdf</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Dovc</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lanzinger</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cardona-Hernandez</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Association of achieving time in range clinical targets with treatment modality among youths with type 1 diabetes</article-title><source>JAMA Netw Open</source><year>2023</year><month>02</month><day>1</day><volume>6</volume><issue>2</issue><fpage>e230077</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.0077</pub-id><pub-id pub-id-type="medline">36808243</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>Growth reference data for 5-19 years: BMI-for-age (5-19 years)</article-title><source>World Health Organization</source><access-date>2025-06-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/tools/growth-reference-data-for-5to19-years/indicators/bmi-for-age">https://www.who.int/tools/growth-reference-data-for-5to19-years/indicators/bmi-for-age</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>VK</given-names> </name><name name-style="western"><surname>Maurya</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Mani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yadav</surname><given-names>RS</given-names> </name></person-group><article-title>Machine learning method using position-specific mutation based classification outperforms one hot coding for disease severity prediction in haemophilia 
&#x201C;A&#x201D;</article-title><source>Genomics</source><year>2020</year><month>11</month><volume>112</volume><issue>6</issue><fpage>5122</fpage><lpage>5128</lpage><pub-id pub-id-type="doi">10.1016/j.ygeno.2020.09.020</pub-id><pub-id pub-id-type="medline">32927010</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="report"><article-title>Saudi diabetes clinical practice guidelines (SDCPG)</article-title><year>2021</year><access-date>2025-05-26</access-date><publisher-name>Saudi Health Council</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://shc.gov.sa/ar/Centers/NDC/Activities/Documents/Specialists/Saudi%20Diabetes%20Clinical%20Practice%20Guidelines.pdf">https://shc.gov.sa/ar/Centers/NDC/Activities/Documents/Specialists/Saudi%20Diabetes%20Clinical%20Practice%20Guidelines.pdf</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chandrashekar</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sahin</surname><given-names>F</given-names> </name></person-group><article-title>A survey on feature selection methods</article-title><source>Comput Electr Eng</source><year>2014</year><month>01</month><volume>40</volume><issue>1</issue><fpage>16</fpage><lpage>28</lpage><pub-id pub-id-type="doi">10.1016/j.compeleceng.2013.11.024</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pedregosa</surname><given-names>F</given-names> </name><name name-style="western"><surname>Varoquaux</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gramfort</surname><given-names>A</given-names> </name><name name-style="western"><surname>Michel</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Thirion</surname><given-names>B</given-names> </name><name name-style="western"><surname>Grisel</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Scikit-learn: machine learning in Python</article-title><source>J Mach Learn Res</source><year>2011</year><access-date>2026-04-21</access-date><volume>12</volume><fpage>2825</fpage><lpage>2830</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf?source=post_page">https://www.jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf?source=post_page</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Quinlan</surname><given-names>JR</given-names> </name></person-group><article-title>Induction of decision trees</article-title><source>Mach Learn</source><year>1986</year><month>03</month><volume>1</volume><issue>1</issue><fpage>81</fpage><lpage>106</lpage><pub-id pub-id-type="doi">10.1007/BF00116251</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bennett</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Hauser</surname><given-names>K</given-names> </name></person-group><article-title>Artificial intelligence framework for simulating clinical decision-making: a Markov decision process approach</article-title><source>Artif Intell Med</source><year>2013</year><month>01</month><volume>57</volume><issue>1</issue><fpage>9</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1016/j.artmed.2012.12.003</pub-id><pub-id pub-id-type="medline">23287490</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Bergstra</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>Y</given-names> </name></person-group><article-title>Random search for hyper-parameter optimization</article-title><source>J Mach Learn Res</source><year>2012</year><access-date>2026-04-21</access-date><volume>13</volume><issue>1</issue><fpage>281</fpage><lpage>305</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf">https://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saito</surname><given-names>T</given-names> </name><name name-style="western"><surname>Rehmsmeier</surname><given-names>M</given-names> </name></person-group><article-title>The precision-recall plot is more informative than the ROC plot when evaluating binary classifiers on imbalanced datasets</article-title><source>PLoS One</source><year>2015</year><volume>10</volume><issue>3</issue><fpage>e0118432</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0118432</pub-id><pub-id pub-id-type="medline">25738806</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salle</surname><given-names>L</given-names> </name><name name-style="western"><surname>Julla</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Aguayo</surname><given-names>GA</given-names> </name><etal/></person-group><article-title>1458-P: cardiovascular risk is higher in people with type 1 diabetes living with overweight or obesity&#x2014;insights from the SFDT1 
cohort</article-title><source>Diabetes</source><year>2024</year><month>06</month><day>14</day><volume>73</volume><issue>Supplement_1</issue><pub-id pub-id-type="doi">10.2337/db24-1458-P</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Petty</surname><given-names>LD</given-names> </name><name name-style="western"><surname>Soto-Pedre</surname><given-names>E</given-names> </name><name name-style="western"><surname>McCrimmon</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Pearson</surname><given-names>ER</given-names> </name></person-group><article-title>Body mass index&#x2019;s influence on arterial hypertension in type 1 diabetes - a brief report from IMI-SOPHIA study</article-title><source>J Diabetes Complications</source><year>2024</year><month>06</month><volume>38</volume><issue>6</issue><fpage>108747</fpage><pub-id pub-id-type="doi">10.1016/j.jdiacomp.2024.108747</pub-id><pub-id pub-id-type="medline">38643555</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>DeBoer</surname><given-names>MD</given-names> </name></person-group><article-title>Obesity, systemic inflammation, and increased risk for cardiovascular disease and diabetes among adolescents: a need for screening tools to target interventions</article-title><source>Nutrition</source><year>2013</year><month>02</month><volume>29</volume><issue>2</issue><fpage>379</fpage><lpage>386</lpage><pub-id pub-id-type="doi">10.1016/j.nut.2012.07.003</pub-id><pub-id pub-id-type="medline">23022122</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cryer</surname><given-names>PE</given-names> 
</name></person-group><article-title>Hypoglycemia in type 1 diabetes mellitus</article-title><source>Endocrinol Metab Clin North Am</source><year>2010</year><month>09</month><volume>39</volume><issue>3</issue><fpage>641</fpage><lpage>654</lpage><pub-id pub-id-type="doi">10.1016/j.ecl.2010.05.003</pub-id><pub-id pub-id-type="medline">20723825</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ly</surname><given-names>TT</given-names> </name><name name-style="western"><surname>Maahs</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Rewers</surname><given-names>A</given-names> </name><etal/></person-group><article-title>ISPAD Clinical Practice Consensus Guidelines 2014. Assessment and management of hypoglycemia in children and adolescents with diabetes</article-title><source>Pediatr Diabetes</source><year>2014</year><month>09</month><volume>15</volume><issue>Suppl 20</issue><fpage>180</fpage><lpage>192</lpage><pub-id pub-id-type="doi">10.1111/pedi.12174</pub-id><pub-id pub-id-type="medline">25040141</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Doyle</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Weinzimer</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Steffen</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Ahern</surname><given-names>JAH</given-names> </name><name name-style="western"><surname>Vincent</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tamborlane</surname><given-names>WV</given-names> </name></person-group><article-title>A randomized, prospective trial comparing the efficacy of continuous subcutaneous insulin infusion with 
multiple daily injections using insulin glargine</article-title><source>Diabetes Care</source><year>2004</year><month>07</month><volume>27</volume><issue>7</issue><fpage>1554</fpage><lpage>1558</lpage><pub-id pub-id-type="doi">10.2337/diacare.27.7.1554</pub-id><pub-id pub-id-type="medline">15220227</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Donaghue</surname><given-names>KC</given-names> </name><name name-style="western"><surname>Marcovecchio</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Wadwa</surname><given-names>RP</given-names> </name><etal/></person-group><article-title>ISPAD Clinical Practice Consensus Guidelines 2018: microvascular and macrovascular complications in children and adolescents</article-title><source>Pediatr Diabetes</source><year>2018</year><month>10</month><volume>19</volume><issue>Suppl 27</issue><fpage>262</fpage><lpage>274</lpage><pub-id pub-id-type="doi">10.1111/pedi.12742</pub-id><pub-id pub-id-type="medline">30079595</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>The Diabetes Control and Complications Trial Research Group</collab></person-group><article-title>The effect of intensive treatment of diabetes on the development and progression of long-term complications in insulin-dependent diabetes mellitus</article-title><source>N Engl J Med</source><year>1993</year><month>09</month><day>30</day><volume>329</volume><issue>14</issue><fpage>977</fpage><lpage>986</lpage><pub-id pub-id-type="doi">10.1056/NEJM199309303291401</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Detailed overview of dataset features and categorized values used for type 1 diabetes.</p><media 
xlink:href="formative_v10i1e81039_app1.pdf" xlink:title="PDF File, 125 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>BMI classification thresholds used to encode nutritional status across four child and adolescent age groups based on the World Health Organization Growth Standards.</p><media xlink:href="formative_v10i1e81039_app2.pdf" xlink:title="PDF File, 104 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Shapley additive explanations values for the top 10 features corresponding to Figure 2.</p><media xlink:href="formative_v10i1e81039_app3.xlsx" xlink:title="XLSX File, 9 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Exhaustive feature subsets (n=1013) with cross-validated <italic>F</italic><sub>1</sub> performance metrics.</p><media xlink:href="formative_v10i1e81039_app4.xlsx" xlink:title="XLSX File, 74 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Detailed evaluation metrics for the top 5 predictive models (train/test/CV <italic>F</italic><sub>1</sub>, area under the curve).</p><media xlink:href="formative_v10i1e81039_app5.xlsx" xlink:title="XLSX File, 9 KB"/></supplementary-material></app-group></back></article>