<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e75121</article-id>
      <article-id pub-id-type="pmid">41032359</article-id>
      <article-id pub-id-type="doi">10.2196/75121</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Machine Learning–Enhanced Surveillance for Surgical Site Infections in Patients Undergoing Colon Surgery: Model Development and Evaluation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Potla</surname>
            <given-names>Ravi Teja</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Pant</surname>
            <given-names>Dewank</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Celik</surname>
            <given-names>Ugur</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Center for Clinical and Translational Sciences</institution>
            <institution>University of Massachusetts Chan Medical School</institution>
            <addr-line>55 Lake Avenue North</addr-line>
            <addr-line>Worcester, MA, 01655</addr-line>
            <country>United States</country>
            <phone>1 7744554679</phone>
            <email>ugur_celik@student.uml.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3555-695X</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Feifan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0881-6365</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Kobayashi</surname>
            <given-names>Kimiyoshi</given-names>
          </name>
          <degrees>MD, MBA</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-1796-7023</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Ellison III</surname>
            <given-names>Richard T</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1335-9832</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Guilarte-Walker</surname>
            <given-names>Yurima</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3610-0452</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Mack</surname>
            <given-names>Deborah Ann</given-names>
          </name>
          <degrees>RN</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-7142-2204</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Shi</surname>
            <given-names>Qiming</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5829-4345</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Zai</surname>
            <given-names>Adrian</given-names>
          </name>
          <degrees>MD, MPH, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2972-6839</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Center for Clinical and Translational Sciences</institution>
        <institution>University of Massachusetts Chan Medical School</institution>
        <addr-line>Worcester, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Population and Quantitative Health Sciences</institution>
        <institution>Division of Health Informatics and Implementation Science</institution>
        <institution>University of Massachusetts Chan Medical School</institution>
        <addr-line>Worcester, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Medicine</institution>
        <institution>University of Massachusetts Chan Medical School</institution>
        <addr-line>Worcester, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>UMass Memorial Medical Center</institution>
        <addr-line>Worcester, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Infection Control Department</institution>
        <institution>UMass Memorial Medical Center</institution>
        <addr-line>Worcester, MA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Ugur Celik <email>ugur_celik@student.uml.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>1</day>
        <month>10</month>
        <year>2025</year>
      </pub-date>
      <volume>9</volume>
      <elocation-id>e75121</elocation-id>
      <history>
        <date date-type="received">
          <day>1</day>
          <month>4</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>28</day>
          <month>5</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>18</day>
          <month>6</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>19</day>
          <month>8</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Ugur Celik, Feifan Liu, Kimiyoshi Kobayashi, Richard T Ellison III, Yurima Guilarte-Walker, Deborah Ann Mack, Qiming Shi, Adrian Zai. Originally published in JMIR Formative Research (https://formative.jmir.org), 01.10.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2025/1/e75121" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Surgical site infections (SSIs) are one of the most common health care–associated infections, accounting for nearly 20% of all health care–associated infections in hospitalized patients. SSIs are associated with longer hospital stays, increased readmission rates, higher health care costs, and a mortality rate twice that of patients without infections.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to develop and evaluate machine learning (ML) models for augmenting SSI surveillance after colon surgery with the goal of improving the efficiency of infection control practices by prioritizing patients at high risk.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We conducted a retrospective study using data from 1508 patients undergoing colon surgery treated between 2018 and 2023 at a single academic medical center. Of these 1508 patients, 66 (4.4%) developed SSIs as adjudicated by infection control practitioners following Centers for Disease Control and Prevention National Healthcare Safety Network criteria. Data included 78 structured variables (eg, demographics, comorbidities, vital signs, laboratory tests, medications, and operative details) and 2 features derived from unstructured clinical notes using natural language processing. ML models—logistic regression, random forest, and Extreme Gradient Boosting (XGBoost)—were trained using stratified 80/20 train-test splits. Class imbalance was addressed using cost-sensitive learning and the synthetic minority oversampling technique. Model performance was evaluated using precision, recall, <italic>F</italic><sub>1</sub>-score, area under the receiver operating characteristic curve, and Brier scores for calibration.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Of the 1508 patients, those who developed SSIs had longer hospital stays (mean 8.1, SD 6.8 days vs mean 6.3, SD 10.5 days; <italic>P</italic>&lt;.001), higher rates of an American Society of Anesthesiologists score of 3 (52/66, 79% vs 653/1442, 45.3%; <italic>P</italic>&lt;.001), and elevated white blood cell counts (51/66, 77% vs 734/1442, 50.9%; <italic>P</italic>&lt;.001). XGBoost achieved the best overall performance with an area under the receiver operating characteristic curve of 0.788, precision of 50%, recall of 38%, and Brier score of 0.035. Random forest yielded perfect precision (100%) but lower recall (23%), with a Brier score of 0.034. Logistic regression showed the highest recall (46%) but the lowest precision (10%), with a Brier score of 0.139. Feature importance analysis using Shapley additive explanations (SHAP) values revealed that the top predictors included recovery duration (SHAP=1.18), SSI keyword frequency (SHAP=1.12), patient age (SHAP=1.12), and American Society of Anesthesiologists score (SHAP=0.94), with natural language processing–derived features ranking among the top 10.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>ML models can augment traditional SSI surveillance by improving early identification of patients at high risk. The XGBoost model offered the best trade-off between discrimination and calibration, suggesting its utility in clinical workflows. Incorporating structured and unstructured electronic health record data enhances model accuracy and clinical relevance, supporting scalable and efficient infection control practices.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>surgical site infection</kwd>
        <kwd>machine learning</kwd>
        <kwd>surveillance</kwd>
        <kwd>electronic health records</kwd>
        <kwd>natural language processing</kwd>
        <kwd>colon surgery</kwd>
        <kwd>risk prediction</kwd>
        <kwd>Extreme Gradient Boosting</kwd>
        <kwd>XGBoost</kwd>
        <kwd>random forest</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Surgical site infections (SSIs) are a significant category of health care–associated infections, representing a serious challenge to health care systems worldwide. SSIs are estimated to account for nearly 20% of all health care–associated infections among hospitalized patients [<xref ref-type="bibr" rid="ref1">1</xref>]. In the United States, SSIs occur in approximately 2% to 4% of patients undergoing inpatient surgical procedures [<xref ref-type="bibr" rid="ref2">2</xref>]. SSIs can lead to severe complications, including increased morbidity, extended hospital stays, higher readmission rates, and increased health care costs, ultimately impacting patient outcomes and putting a strain on health care resources [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. The financial burden of SSIs is substantial, with associated costs reaching billions of dollars annually. SSIs are the third most costly type of health care–acquired infection, with an estimated cost of US $20,785 per patient case [<xref ref-type="bibr" rid="ref6">6</xref>]. Patients with SSIs have a mortality rate twice that of patients without infections [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
        <p>The current surveillance process for SSIs is resource intensive, requiring manual chart reviews by infection control practitioners to monitor surgical procedures and screen for potential infections. This manual surveillance is time-consuming and labor intensive and detracts from direct patient care [<xref ref-type="bibr" rid="ref8">8</xref>]. The Centers for Disease Control and Prevention (CDC) National Healthcare Safety Network (NHSN) has established comprehensive guidelines to support the systematic tracking and identification of SSIs; yet, the manual workload remains significant [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
        <p>Several studies have applied machine learning (ML) and natural language processing (NLP) to improve the detection and prediction of SSIs. These approaches have shown promise in automating surveillance and improving accuracy by using structured and unstructured clinical data. However, previous research has largely focused on retrospective detection rather than prospective prediction, relied on limited data sources, or failed to integrate both structured and unstructured data [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>].</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>To address these limitations, our study introduces an ML-based surveillance tool designed to enhance SSI monitoring. By integrating structured data and clinical notes, our approach improves detection efficiency and accuracy. Automating surveillance through ML reduces the burden on infection control practitioners, allowing them to focus on patients at high risk and critical tasks, ultimately improving patient outcomes and optimizing resource allocation.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study was reviewed by the University of Massachusetts Chan Medical School (UMass Chan) Institutional Review Board and deemed exempt under institutional guidelines for quality improvement. As a retrospective analysis of existing electronic health record (EHR) data with no patient contact, informed consent was not required. The institutional review board confirmed that secondary use of clinical data did not necessitate additional consent. All data were deidentified in accordance with HIPAA (Health Insurance Portability and Accountability Act) and analyzed within a secure, access-controlled environment at UMass Chan. Only authorized personnel had access to the data. No identifiable patient information appears in images or supplementary materials; all figures are fully anonymized.</p>
      </sec>
      <sec>
        <title>Data Extraction and Preparation</title>
        <p>Data for this study were extracted from the UMass Chan Data Lake, which is a copy of the Epic Clarity EHR system at the hospital and refreshes weekly. This data repository comprises 2.5 million unique patients since November 2017. We focused on patients who underwent colon surgeries between 2018 and 2023, identified using Current Procedural Terminology (CPT) codes according to the NHSN guidelines. Our cohort included 1508 procedures, with 66 (4.4%) confirmed SSI cases.</p>
      </sec>
      <sec>
        <title>Cohort Inclusion and Exclusion Criteria</title>
        <p>We identified all adult patients (aged ≥18 years) who underwent colon surgery at the University of Massachusetts Memorial Medical Center between January 1, 2018, and December 31, 2023, using CPT procedure codes per NHSN guidelines. Patients were included if they were aged ≥18 years at the time of surgery, underwent elective or urgent colon resection procedures (CPT codes 44140-44160 and 44204-44208), and had available postoperative follow-up data for at least 30 days. We excluded patients with noncolon procedures or combined multiorgan resections, those with missing critical EHR data (eg, American Society of Anesthesiologists [ASA] score, surgery date, or outcome label), and patients who were deceased before completion of the 30-day postoperative surveillance window. After applying these criteria, 1508 unique procedures remained for analysis.</p>
      </sec>
      <sec>
        <title>Outcome Labeling and Gold Standard</title>
        <p>All SSI labels were assigned by the hospital’s infection prevention and control (IPC) team according to CDC NHSN criteria. Each postoperative patient who underwent colon surgery is manually reviewed daily by IPC specialists, who screen charts, microbiology reports, wound assessments, and nursing notes against the NHSN definitions. Ambiguous cases are flagged for discussion at a weekly consensus meeting with at least 2 IPC specialists and a supervising epidemiologist, and final SSI determinations are made through consensus. The IPC department also conducts monthly peer audit exercises to ensure consistency with NHSN standards.</p>
        <p>To ensure clinical relevance, infection control nurses provided input during the term selection process for clinical note analysis and validated the model’s outputs against real-world clinical scenarios. The dataset included 78 structured variables, such as demographics, medications, laboratory test results, and medical histories (<xref ref-type="table" rid="table1">Table 1</xref>). In addition, 2 variables—SSI keyword count and SSI negation term count—were derived from unstructured clinical notes using NLP techniques.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Candidate predictors used to train machine learning models in a retrospective cohort study of postoperative surgical site infection (SSI) after colon surgery at the University of Massachusetts Memorial Medical Center (Worcester, Massachusetts, United States) from January 1, 2018, to December 31, 2023.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="800"/>
            <thead>
              <tr valign="top">
                <td>Data domain</td>
                <td>Data points</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Demographics</td>
                <td>Gender, race, ethnicity, age, alcohol usage, and smoking status</td>
              </tr>
              <tr valign="top">
                <td>Comorbidities</td>
                <td>Obesity, cancer, diabetes, immunological disease, depression, dementia, anemia, heart failure, AIDS, and alcohol</td>
              </tr>
              <tr valign="top">
                <td>Encounters</td>
                <td>Total stay days and inpatient stay</td>
              </tr>
              <tr valign="top">
                <td>Laboratory tests</td>
                <td>Hemoglobin, culture, white blood cell count, and C-reactive protein</td>
              </tr>
              <tr valign="top">
                <td>Medications</td>
                <td>Antibiotics, immunosuppressants, and steroids</td>
              </tr>
              <tr valign="top">
                <td>Patient</td>
                <td>Patient ID and patient encounter ID</td>
              </tr>
              <tr valign="top">
                <td>Surgery details</td>
                <td>Surgery class, surgery procedure code, physician ID, number of procedures, surgery duration, surgery recovery duration, department, room number, wound status, anesthesia type, ASA<sup>a</sup> score, incision closure, and SSI</td>
              </tr>
              <tr valign="top">
                <td>Vital signs</td>
                <td>BMI</td>
              </tr>
              <tr valign="top">
                <td>NLP<sup>b</sup>-derived features</td>
                <td>SSI_Keyword and SSI_Negation</td>
              </tr>
              <tr valign="top">
                <td>SSI keywords</td>
                <td>“Fever,” “nausea,” “vomiting,” “pain,” “tenderness,” “odynophagia,” “dysphagia,” “hypotension,” “jaundice,” “dysuria,” “abscess,” and “infection”</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>ASA: American Society of Anesthesiologists.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>NLP: natural language processing.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Unstructured Data Processing</title>
        <p>To process the unstructured clinical notes, we used the spaCy library (Explosion AI) [<xref ref-type="bibr" rid="ref14">14</xref>] for tokenization and named entity recognition to identify relevant keywords associated with SSIs, such as “redness,” “swelling,” “drainage,” and “purulent discharge.” Negation detection was conducted using the NegEx algorithm [<xref ref-type="bibr" rid="ref15">15</xref>], which allowed us to determine whether an identified keyword was negated in the context of the clinical note. For example, “no signs of infection” or “denies fever” would be identified as negated terms. These NLP tools enabled us to accurately quantify the occurrence of SSI-related terms and their context within the notes.</p>
        <p>Infection-related terms were first extracted from the CDC NHSN SSI surveillance criteria and then reviewed and finalized by IPC nurses—who conduct daily chart surveillance—to ensure relevance to our colon surgery population. No formal statistical testing was conducted on the term list. All SSI-related keywords were curated and validated by the IPC nurses based on daily practice. While transformer-based embeddings (eg, BioBERT) may capture richer linguistic patterns, we deferred their use because our IT department could not approve deployment of large language models in the current infrastructure. These contextual methods will be explored in follow-up work.</p>
      </sec>
      <sec>
        <title>Data Processing and Feature Engineering</title>
        <p>Data preprocessing involved imputing missing values in numerical columns using column-wise means, one-hot encoding categorical variables, and validating data types for compatibility with ML algorithms. The dataset was processed using the Python pandas library [<xref ref-type="bibr" rid="ref16">16</xref>] and then deployed to a secure workspace, the Platform for Learning Health System environment at UMass Chan, for further analysis [<xref ref-type="bibr" rid="ref17">17</xref>].</p>
        <p>After median imputation of numerical variables and one-hot encoding of categorical variables, the dataset expanded to 150 features in total: the original 78 numerical inputs, 2 NLP-derived counts, and 70 dummy variables from one-hot encoding. For the logistic regression model, we then generated all second-degree polynomial interaction terms, yielding 11,325 features in the final matrix used for model fitting.</p>
        <p>To better understand the relative impact of different imbalance-handling techniques, we conducted an ablation study on all 3 models (logistic regression, random forest, and Extreme Gradient Boosting [XGBoost]). Three strategies were compared: cost-sensitive approach only by applying class weights (class_weight=<italic>balanced</italic> for tree and linear models and scale_pos_weight for XGBoost), oversampling only via 1:1 random upsampling of the minority (SSI) class in the training set, and oversampling combined with cost-sensitive approaches. Pipelines for each strategy were retrained and evaluated on the same held-out test set (n=302), and metrics including precision, recall, <italic>F</italic><sub>1</sub>-score (for the SSI class), area under the receiver operating characteristic curve (AUC-ROC), and Brier score were recorded.</p>
      </sec>
      <sec>
        <title>Model Development</title>
        <p>The dataset was split into training (80%) and validation (20%) sets using stratified sampling to maintain the class distribution. To address class imbalance, we applied cost-sensitive learning [<xref ref-type="bibr" rid="ref18">18</xref>] and the synthetic minority oversampling technique (SMOTE) [<xref ref-type="bibr" rid="ref19">19</xref>]. SMOTE works by creating synthetic examples of the minority class by interpolating between existing instances, thereby balancing the class distribution in the training set.</p>
        <p>We developed 3 ML models: logistic regression [<xref ref-type="bibr" rid="ref20">20</xref>], random forest [<xref ref-type="bibr" rid="ref21">21</xref>], and XGBoost [<xref ref-type="bibr" rid="ref22">22</xref>]. Each model was chosen based on specific strengths—logistic regression for interpretability, random forest for handling complex interactions, and XGBoost for efficient structured data analysis.</p>
        <p>Logistic regression was chosen due to its simplicity and interpretability, which allows health care professionals to understand the relationships between features and outcomes, making it useful for clinical decision-making. Random forest was selected for its ability to handle complex feature interactions and its robustness in managing missing data, making it effective for capturing nonlinear patterns in the dataset [<xref ref-type="bibr" rid="ref23">23</xref>]. XGBoost was included for its high predictive performance and efficiency, particularly with structured datasets, and its ability to handle imbalanced data effectively through boosting techniques.</p>
        <p>For logistic regression, we used L2 regularization [<xref ref-type="bibr" rid="ref24">24</xref>] to prevent overfitting and applied polynomial features (adding interaction terms between features) to capture potential nonlinear relationships in the data. For random forest, we optimized the number of trees, maximum depth, and minimum samples per leaf to ensure robustness. We used the same feature set for all models, including the polynomial features, to ensure a fair comparison. For XGBoost, we tuned hyperparameters such as learning rate, maximum depth, and regularization parameters through grid search using cross-validation [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      </sec>
      <sec>
        <title>Model Calibration</title>
        <p>To assess the reliability of each model’s predicted probabilities in a clinical context, we conducted both quantitative and visual calibration analyses on the held-out test set. We computed the Brier score (using scikit-learn’s [Google Summer of Code] brier_score_loss function), which measures the mean squared difference between predicted probabilities and observed outcomes (lower=better calibration). We also generated calibration (reliability) plots via scikit-learn’s calibration_curve function, dividing predictions into 10 equal-width bins, and for each bin, we plotted the mean predicted probability against the observed SSI rate, overlaying the 45° line to represent perfect calibration. All calibration calculations were conducted in Python (version 3.8; Python Software Foundation) using scikit-learn (version 1.0).</p>
      </sec>
      <sec>
        <title>Model Evaluation</title>
        <p>Because SSIs represent only approximately 4% of cases, overall accuracy can be misleading. Therefore, we report precision, recall (sensitivity), <italic>F</italic><sub>1</sub>-score, and AUC-ROC as our primary performance metrics for the minority class, with accuracy included only for completeness.</p>
        <p>To assess performance stability, we performed 5-fold stratified cross-validation on the training set and report each metric’s mean and SD.</p>
        <p>Performance metrics, including precision, sensitivity (recall), specificity, <italic>F</italic><sub>1</sub>-score, and AUC-ROC, were calculated to comprehensively evaluate model performance, especially given the imbalanced nature of the dataset. Sensitivity and specificity help assess the model’s ability to correctly identify positive and negative cases, respectively, whereas the AUC-ROC offers an overall measure of discriminative performance. The final models were validated on the reserved 20% of the dataset to assess their generalizability.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Study Population and Characteristics</title>
        <p>A total of 1508 patients met the inclusion criteria (n=66, 4.4% with SSIs and n=1442, 95.6% without SSIs). <xref ref-type="table" rid="table2">Table 2</xref> presents the comprehensive characteristics of the study population comparing patients with and without SSIs. The table is organized into demographic, clinical, surgical, and medication-related variables to facilitate interpretation (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <p>Demographically, patients with SSIs were significantly older (mean age 61.1, SD 8.9 years, <italic>P</italic>=.01) than those without SSIs (mean age 58.5, SD 15.4 years), with most SSI cases (38/66, 58%) occurring in the category of 61 to 80 years. A higher proportion of female individuals developed SSIs (43/66, 65% vs 765/1442, 53.1% in the non-SSI group), although this difference did not reach statistical significance (<italic>P</italic>=.07).</p>
        <p>The ASA score [<xref ref-type="bibr" rid="ref26">26</xref>] differed significantly between groups (<italic>P</italic>&lt;.001), with a substantially higher percentage of patients with SSIs having an ASA score of 3 (52/66, 79% vs 653/1442, 45.3%), indicating greater preoperative risk and comorbidity burden. Patients who developed SSIs showed a higher prevalence of several comorbidities, including diabetes (17/66, 26% vs 265/1442, 18.4%; <italic>P</italic>=.18), depression (17/66, 26% vs 296/1442, 20.5%; <italic>P</italic>=.18), anemia (24/66, 36% vs 423/1442, 29.3%; <italic>P</italic>=.28), hypertension (36/66, 55% vs 655/1442, 45.4%; <italic>P</italic>=.18), and chronic kidney disease (9/66, 14% vs 123/1442, 8.5%; <italic>P</italic>=.23), although these differences individually did not reach statistical significance.</p>
        <p>Surgical and procedural characteristics revealed important differences. Patients who developed SSIs underwent longer surgeries (mean duration 262.5, SD 137.3 min vs 229.9, SD 103.6 min), had a significantly higher number of procedures (mean 1.7, SD 1.1 vs 1.4, SD 0.7; <italic>P</italic>=.02), and experienced substantially longer hospital stays (mean 8.1, SD 6.8 days vs 6.3, SD 10.5 days; <italic>P</italic>&lt;.001). Wound classification showed significant variation, with contaminated wounds being more common in the SSI group (27/66, 41% vs 397/1442, 27.5%; <italic>P</italic>=.03) and clean-contaminated wounds being less common (24/66, 36% vs 799/1442, 55.4%; <italic>P</italic>=.004).</p>
        <p>Laboratory and medication factors also demonstrated significant associations with SSI development. Abnormal white blood cell (WBC) counts, defined as WBC counts of &gt;11 before surgery, were significantly more common in patients with SSIs (51/66, 77% vs 734/1442, 50.9%; <italic>P</italic>&lt;.001). Steroid use, categorized as receiving any steroid within 12 months before surgery, was significantly higher in the SSI group (55/66, 83% vs 846/1442, 58.7%; <italic>P</italic>&lt;.001). All patients who developed SSIs had received antibiotics.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Bivariate comparisons of patient, operative, laboratory, and medication characteristics by 30-day National Healthcare Safety Network–defined surgical site infection (SSI) status in a retrospective single-center cohort of patients undergoing colon surgery at the University of Massachusetts Memorial Medical Center (Worcester, Massachusetts, United States) from 2018 to 2023. Only variables with <italic>P</italic>&lt;.05 are shown.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="460"/>
            <col width="240"/>
            <col width="160"/>
            <col width="140"/>
            <thead>
              <tr valign="top">
                <td>Variable</td>
                <td>No SSI (n=1442)</td>
                <td>SSI (n=66)</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Age (y), mean (SD)</td>
                <td>58.5 (15.4)</td>
                <td>61.1 (8.9)</td>
                <td>.01</td>
              </tr>
              <tr valign="top">
                <td>ASA<sup>a</sup> score of 3, n (%)</td>
                <td>653 (45.3)</td>
                <td>52 (78.8)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>Contaminated wound, n (%)</td>
                <td>397 (27.5)</td>
                <td>27 (40.9)</td>
                <td>.03</td>
              </tr>
              <tr valign="top">
                <td>Length of stay (d), mean (SD)</td>
                <td>6.3 (10.5)</td>
                <td>8.1 (6.8)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>WBC<sup>b</sup> flag—high, n (%)</td>
                <td>734 (50.9)</td>
                <td>51 (77.3)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>Steroid use, n (%)</td>
                <td>846 (58.7)</td>
                <td>55 (83.3)</td>
                <td>&lt;.001</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>ASA: American Society of Anesthesiologists.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>WBC: white blood cell.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Imbalance Handling and Ablation Study Results</title>
        <p>To evaluate the effectiveness of different approaches for handling class imbalance in our dataset, we performed an ablation study that compares three strategies: cost-sensitive learning, random oversampling, and a combination of both techniques. Each strategy was applied to all three ML models (logistic regression, random forest, and XGBoost) and evaluated on the same held-out set to ensure fair comparison. The results of this ablation study are presented in <xref ref-type="table" rid="table3">Table 3</xref>, which shows the performance metrics for each model and strategy combination.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Ablation study comparing class imbalance–handling strategies—cost-sensitive learning, random oversampling, and their combination—for predicting 30-day National Healthcare Safety Network–defined surgical site infections after colon surgery in a retrospective single-center cohort (University of Massachusetts Memorial Medical Center, Worcester, Massachusetts, United States; 2018-2023).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="350"/>
            <col width="150"/>
            <col width="120"/>
            <col width="100"/>
            <col width="130"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Model and strategy</td>
                <td>Precision (%)</td>
                <td>Recall (%)</td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>AUC-ROC<sup>a</sup></td>
                <td>Brier score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="7">
                  <bold>Logistic regression</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cost-sensitive learning</td>
                <td>9.84</td>
                <td>46.15</td>
                <td>0.162</td>
                <td>0.709</td>
                <td>0.143</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Oversampling only</td>
                <td>10.34</td>
                <td>46.15</td>
                <td>0.169</td>
                <td>0.704</td>
                <td>0.140</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Oversampling+cost-sensitive learning</td>
                <td>10.34</td>
                <td>46.15</td>
                <td>0.169</td>
                <td>0.704</td>
                <td>0.140</td>
              </tr>
              <tr valign="top">
                <td colspan="7">
                  <bold>Random forest</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cost-sensitive learning</td>
                <td>100</td>
                <td>15.38</td>
                <td>0.267</td>
                <td>0.776</td>
                <td>0.032</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Oversampling only</td>
                <td>100</td>
                <td>15.38</td>
                <td>0.267</td>
                <td>0.777</td>
                <td>0.031</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Oversampling+cost-sensitive learning</td>
                <td>100</td>
                <td>15.38</td>
                <td>0.267</td>
                <td>0.777</td>
                <td>0.031</td>
              </tr>
              <tr valign="top">
                <td colspan="7">
                  <bold>XGBoost<sup>b</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cost-sensitive learning</td>
                <td>57.14</td>
                <td>30.77</td>
                <td>0.400</td>
                <td>0.719</td>
                <td>0.036</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Oversampling only</td>
                <td>41.67</td>
                <td>38.46</td>
                <td>0.400</td>
                <td>0.758</td>
                <td>0.043</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Oversampling+cost-sensitive learning</td>
                <td>41.67</td>
                <td>38.46</td>
                <td>0.400</td>
                <td>0.758</td>
                <td>0.043</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>AUC-ROC: area under the receiver operating characteristic curve.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Performance and Calibration</title>
        <p><xref ref-type="table" rid="table4">Table 4</xref> summarizes each model’s precision, recall, <italic>F</italic><sub>1</sub>-score, and AUC-ROC for SSI detection as means and SDs from 5-fold cross-validation alongside Brier scores on the held-out test set. Accuracy was high for all models but less informative given the 4.4% (66/1508) SSI rate. Confusion matrices for all three models on the held-out test set are shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <p>In terms of discrimination, XGBoost achieved the highest AUC-ROC (0.788), followed by random forest (0.778) and logistic regression (0.706; <xref rid="figure2" ref-type="fig">Figure 2</xref>). Regarding calibration, random forest and XGBoost both exhibited low Brier scores (0.034 and 0.035, respectively), indicating well-calibrated probability estimates, whereas logistic regression’s higher Brier score (0.139) reflects moderate miscalibration (<xref rid="figure3" ref-type="fig">Figure 3</xref>).</p>
        <p>Among the 3 models, XGBoost demonstrated the highest AUC-ROC score (0.788) and <italic>F</italic><sub>1</sub>-score (0.43), suggesting the best overall discriminative ability and balance between precision and recall. The random forest model showed perfect precision but lower recall, indicating that it was highly conservative in predicting SSIs. Logistic regression had the highest recall but the lowest precision, suggesting that it was more liberal in flagging potential SSI cases but at the cost of many false positives.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Model performance for predicting 30-day National Healthcare Safety Network–defined surgical site infections after colon surgery in a retrospective single-center cohort (University of Massachusetts Memorial Medical Center, Worcester, Massachusetts, United States; 2018-2023)<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="180"/>
            <col width="190"/>
            <col width="170"/>
            <col width="170"/>
            <col width="190"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td>Precision (%), mean (SD)</td>
                <td>Recall (%), mean (SD)</td>
                <td><italic>F</italic><sub>1</sub>-score, mean (SD)</td>
                <td>AUC-ROC<sup>b</sup>, mean (SD)</td>
                <td>Brier score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>10.7 (3.9)</td>
                <td>56.0 (18.5)</td>
                <td>0.18 (0.06)</td>
                <td>0.775 (0.059)</td>
                <td>0.139</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>80.0 (40.0)</td>
                <td>11.1 (6.8)</td>
                <td>0.19 (0.11)</td>
                <td>0.756 (0.046)</td>
                <td>0.034</td>
              </tr>
              <tr valign="top">
                <td>XGBoost<sup>c</sup></td>
                <td>40.0 (17.0)</td>
                <td>20.5 (8.6)</td>
                <td>0.27 (0.11)</td>
                <td>0.735 (0.059)</td>
                <td>0.035</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Five-fold stratified cross-validation results are shown; Brier scores were computed on the 20% held-out test set.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>AUC-ROC: area under the receiver operating characteristic curve.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Confusion matrices on the 20% held-out test set for the logistic regression, random forest, and Extreme Gradient Boosting (XGBoost) models predicting 30-day National Healthcare Safety Network–defined surgical site infections (SSIs) after colon surgery in a retrospective single-center cohort (University of Massachusetts Memorial Medical Center, Worcester, Massachusetts, United States; 2018-2023; N=1508 with 66/1508, 4.4% SSIs).</p>
          </caption>
          <graphic xlink:href="formative_v9i1e75121_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Receiver operating characteristic (ROC) curves on the 20% held-out test set for the logistic regression, random forest, and Extreme Gradient Boosting (XGBoost) models trained to predict 30-day National Healthcare Safety Network–defined surgical site infections following colon surgery in a retrospective single-center cohort (University of Massachusetts Memorial Medical Center, Worcester, Massachusetts, United States; 2018-2023). Area under the ROC curve values summarize discrimination. AUC: area under the curve.</p>
          </caption>
          <graphic xlink:href="formative_v9i1e75121_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Calibration (reliability) curves on the 20% held-out test set for logistic regression, random forest, and Extreme Gradient Boosting (XGBoost) predicting 30-day National Healthcare Safety Network–defined surgical site infections after colon surgery in a retrospective single-center cohort (University of Massachusetts Memorial Medical Center, Worcester, Massachusetts, United States; 2018-2023). Predictions were binned into 10 equal-width bins; the dashed 45° line indicates perfect calibration.</p>
          </caption>
          <graphic xlink:href="formative_v9i1e75121_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Statistical Comparison of Area Under the Curve</title>
        <p>To formally assess whether observed area under the curve (AUC) differences were significant, we conducted a paired bootstrap analysis with 1000 resamples of the test set. The 95% CIs and <italic>P</italic> values are shown in <xref ref-type="table" rid="table5">Table 5</xref>. All intervals overlapped, and both pairwise <italic>P</italic> values exceeded .05, indicating no statistically significant differences between XGBoost and the other models.</p>
        <p>Because all intervals overlapped substantially and both <italic>P</italic> values exceeded .05, we conclude that the small observed AUC differences were not statistically significant. These results support our interpretation that XGBoost and random forest performed equivalently in discriminating SSI risk and that XGBoost’s edge over logistic regression did not reach significance under bootstrap testing.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Statistical comparison of model areas under the curve (AUCs) via paired bootstrap (1000 resamples) on the 20% held-out test set in a retrospective single-center study predicting 30-day National Healthcare Safety Network–defined surgical site infections after colon surgery (University of Massachusetts Memorial Medical Center, Worcester, Massachusetts, United States; 2018-2023).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="400"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AUC (95% CI)</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LR<sup>a</sup></td>
                <td>0.706 (0.538-0.850)</td>
                <td>—<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>RF<sup>c</sup></td>
                <td>0.778 (0.588-0.935)</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>XGB<sup>d</sup></td>
                <td>0.788 (0.607-0.933)</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td>XGB vs LR</td>
                <td>—</td>
                <td>.50</td>
              </tr>
              <tr valign="top">
                <td>XGB vs RF</td>
                <td>—</td>
                <td>.79</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>LR: logistic regression.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>Not applicable.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>RF: random forest.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>XGB: Extreme Gradient Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Feature Importance (Shapley Additive Explanations Analysis)</title>
        <p><xref ref-type="table" rid="table6">Table 6</xref> lists the top 10 features by mean absolute Shapley additive explanations (SHAP) value for the XGBoost model, demonstrating the influence of both structured EHR variables and our NLP-derived metrics on SSI risk prediction.</p>
        <p>Feature importance analysis revealed that the most predictive factors for SSI risk included ASA score, patient age, wound classification (particularly contaminated wounds), steroid use, and laboratory indicators such as WBC count. The NLP-derived features (SSI keyword count and negation term count) also contributed significantly to the models’ predictive performance, highlighting the value of incorporating unstructured clinical notes in SSI risk assessment. Notably, ssi_keyword (rank 2) and ssi_negated (rank 7) were among the top predictors—on par with established clinical features such as age and ASA score.</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Top 10 predictors of 30-day National Healthcare Safety Network–defined surgical site infections (SSIs) after colon surgery based on mean absolute Shapley additive explanations (SHAP) values from the final Extreme Gradient Boosting model trained on the retrospective single-center cohort (University of Massachusetts Memorial Medical Center, Worcester, Massachusetts, United States; 2018-2023)<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="400"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Rank</td>
                <td>Feature</td>
                <td>Mean absolute SHAP value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>surgery_minutes_in_recovery</td>
                <td>1.1761</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>ssi_keyword</td>
                <td>1.1217</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>surgery_patient_age</td>
                <td>1.1188</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>asa_score</td>
                <td>0.9354</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>surgery_minutes_in_or</td>
                <td>0.7716</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>average_bmi</td>
                <td>0.7397</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>ssi_negated</td>
                <td>0.7311</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>surgery_physician_id_E8779</td>
                <td>0.4223</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>length_of_stay</td>
                <td>0.3408</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td>patient_gender_female</td>
                <td>0.3100</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>Predictors include structured electronic health record features (eg, American Society of Anesthesiologists score, operative times, BMI, and length of stay) and natural language processing–derived features from clinical notes (SSI keyword and negation counts).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study aimed to develop and evaluate ML models for the early prediction of SSIs following colon surgery using both structured EHR data and unstructured clinical notes. Our main findings show that ML models, particularly XGBoost, can effectively augment traditional surveillance practices by providing well-calibrated, discriminative risk predictions that prioritize patients at high risk. Among the models tested, XGBoost demonstrated the best balance between precision and recall (AUC-ROC=0.788; precision=50%; recall=38%), whereas random forest achieved perfect precision at the cost of low sensitivity (recall=23%).</p>
        <p>A more detailed analysis reveals that each model has specific strengths that may suit different clinical priorities. The XGBoost model provides a practical compromise between sensitivity and specificity, which is ideal for resource-limited infection control teams aiming to balance workload and risk. Random forest’s high precision makes it suitable for contexts in which false positives must be minimized, whereas logistic regression offers higher recall but risks overburdening staff due to its lower precision. Our findings are consistent with those of previous literature in emphasizing the importance of model calibration for real-world implementation; the low Brier scores for both XGBoost and random forest suggest reliable probability estimates that can guide clinical triage. Furthermore, the integration of NLP-derived features—such as SSI keyword frequency and negation detection—significantly improved predictive performance, with these variables ranking among the top 10 predictors by SHAP value. This supports previous work [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>] on the added value of unstructured clinical data in infection surveillance, but our study differs by using these data prospectively—leveraging information available before infection onset—rather than relying on retrospective documentation after SSIs have already occurred.</p>
        <p>Previous studies have also demonstrated strong performance in SSI identification using ML but with important methodological differences. One study using NLP achieved a sensitivity and positive predictive value of 97% for SSI detection [<xref ref-type="bibr" rid="ref29">29</xref>] but relied on postinfection clinical notes, limiting its utility for early intervention. Another study applied logistic regression and tree-based models to preoperative blood test results, achieving an AUC of 86% [<xref ref-type="bibr" rid="ref30">30</xref>], although it used a more balanced dataset and focused narrowly on laboratory values. In contrast, our approach emphasizes prospective prediction using only preinfection data and incorporates a broader set of features—including operative and demographic characteristics and unstructured clinical text—to support earlier, more comprehensive risk assessment and real-time clinical deployment. This multimodal integration increases the generalizability and interpretability of our model, making it better suited for prospective deployment within existing clinical workflows.</p>
        <p>To support clinical adoption, we propose embedding the model’s SSI risk scores directly into the EHR’s surveillance dashboard. Each morning, the model generates risk scores for all patients undergoing colon surgery, automatically flagging those above a configurable threshold (eg, the top 10%). Flagged patients appear on an IPC nurse worklist for prioritized chart review, focusing on vital signs, wound assessments, and microbiology. The threshold can be adjusted based on operational capacity—for example, flagging the top 5% during busy periods or expanding to 15% during lower volume. Nurse adjudications (SSI vs no SSI) are fed back into the system to support model retraining and recalibration over time. This workflow focuses human effort on high-risk cases, enhances surveillance efficiency, and remains adaptable to fluctuating staffing or patient volumes.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our evaluation showed that SMOTE was more effective than cost-sensitive learning in improving model performance for the minority class, increasing recall by approximately 15% while maintaining similar precision. However, both approaches still exhibited some bias toward the majority class, particularly the random forest model. The severe class imbalance in our dataset (only 66/1508, 4.4% were SSI cases) presents a significant challenge for model development and evaluation. Although we used techniques to address this imbalance, the models’ performance in identifying positive cases remained suboptimal, as evidenced by the modest recall values. Future work could explore advanced synthetic generation methods [<xref ref-type="bibr" rid="ref31">31</xref>] to address severe class imbalance.</p>
        <p>Due to the severe class imbalance, feature importance scores—particularly from tree-based models—may be driven primarily by patterns in the majority (non-SSI) class. Although we applied both SMOTE and cost-sensitive learning to mitigate this imbalance before model training, readers should interpret importance rankings with caution. In future work, we plan to explore class-specific importance measures (eg, SHAP values stratified by outcome) to obtain a more balanced view of predictors for the minority class.</p>
        <p>In addition, the use of retrospective data from a single health care center may introduce biases related to specific patient populations and clinical practices, potentially limiting the generalizability of the results. The predictive factors identified in our specific setting might not hold the same importance in other health care environments with different patient demographics, surgical practices, or infection control protocols.</p>
        <p>Furthermore, our approach to processing unstructured clinical notes was limited to keyword counting and negation detection. More sophisticated NLP techniques such as embedding-based methods or transformer models might yield better feature extraction and, ultimately, enhance predictive performance [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref34">34</xref>].</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study demonstrates that ML models can enhance SSI surveillance by helping clinicians prioritize patients at high risk. Our ML-based tool, which integrates structured EHR data and unstructured clinical notes, offers a scalable approach to improve monitoring efficiency.</p>
        <p>Among the models evaluated, XGBoost provided the best balance of precision, recall, and calibration, although each model presents unique strengths suited to different clinical needs. By triaging patients based on predicted risk, the tool can reduce manual workload and support more timely, targeted interventions to improve patient outcomes.</p>
        <p>While this pilot study shows proof of concept, broader validation is needed to ensure generalizability and clinical utility. We plan to partner with other institutions using the Observational Medical Outcomes Partnership Common Data Model and federated learning to enable privacy-preserving, cross-site model training. Performance will be assessed through calibration, discrimination, and operational impact metrics.</p>
        <p>Beyond technical refinement, this work underscores the potential of artificial intelligence–driven tools to transform infection surveillance from reactive monitoring to proactive, risk-based care. Future efforts should focus on handling class imbalance, improving NLP feature extraction, and ensuring model reliability through post hoc calibration and multisite validation.</p>
        <p>Ultimately, this research advances the responsible and scalable integration of artificial intelligence into clinical workflows to support more targeted, efficient infection prevention.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Detailed cohort characteristics and model performance metrics, including confusion matrices and receiver operating characteristic curve plots for each machine learning model.</p>
        <media xlink:href="formative_v9i1e75121_app1.docx" xlink:title="DOCX File, 21 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ASA</term>
          <def>
            <p>American Society of Anesthesiologists</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">AUC-ROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CDC</term>
          <def>
            <p>Centers for Disease Control and Prevention</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CPT</term>
          <def>
            <p>Current Procedural Terminology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">HIPAA</term>
          <def>
            <p>Health Insurance Portability and Accountability Act</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">IPC</term>
          <def>
            <p>infection prevention and control</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NHSN</term>
          <def>
            <p>National Healthcare Safety Network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">SHAP</term>
          <def>
            <p>Shapley additive explanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">SMOTE</term>
          <def>
            <p>synthetic minority oversampling technique</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">SSI</term>
          <def>
            <p>surgical site infection</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">UMass Chan</term>
          <def>
            <p>University of Massachusetts Chan Medical School</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">WBC</term>
          <def>
            <p>white blood cell</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">XGBoost</term>
          <def>
            <p>Extreme Gradient Boosting</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors thank the infection prevention and control staff at the University of Massachusetts Memorial Medical Center (Melissa Dabkowski and Maria Del Pilar Diaz Padro) for their invaluable support and collaboration, as well as the University of Massachusetts Chan Medical School IT department for their essential technical assistance. Their contributions were vital to the success of this project. This work was funded by The Doctors Company Foundation and the University of Massachusetts Center for Clinical and Translational Science Pilot Project Program (grant UL1-TR001453).</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The datasets generated or analyzed during this study are not publicly available because they consist of individual-level electronic health record data (structured tables and free-text clinical notes) that include protected health information and, therefore, cannot be shared publicly under US HIPAA (Health Insurance Portability and Accountability Act) regulations and the terms of our institutional review board approval. Deidentified, aggregate summary outputs underlying the manuscript (eg, model performance tables) are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Battles</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Farr</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Weinberg</surname>
              <given-names>DA</given-names>
            </name>
          </person-group>
          <article-title>From research to nationwide implementation: the impact of AHRQ's HAI prevention program</article-title>
          <source>Med Care</source>
          <year>2014</year>
          <month>02</month>
          <volume>52</volume>
          <issue>2 Suppl 1</issue>
          <fpage>S91</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1097/MLR.0000000000000037</pub-id>
          <pub-id pub-id-type="medline">24430273</pub-id>
          <pub-id pub-id-type="pii">00005650-201402001-00015</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Awad</surname>
              <given-names>SS</given-names>
            </name>
          </person-group>
          <article-title>Adherence to surgical care improvement project measures and post-operative surgical site infections</article-title>
          <source>Surg Infect (Larchmt)</source>
          <year>2012</year>
          <month>08</month>
          <volume>13</volume>
          <issue>4</issue>
          <fpage>234</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.1089/sur.2012.131</pub-id>
          <pub-id pub-id-type="medline">22913334</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Merkow</surname>
              <given-names>RP</given-names>
            </name>
            <name name-style="western">
              <surname>Ju</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>MV</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Ko</surname>
              <given-names>CY</given-names>
            </name>
            <name name-style="western">
              <surname>Bilimoria</surname>
              <given-names>KY</given-names>
            </name>
          </person-group>
          <article-title>Underlying reasons associated with hospital readmission following surgery in the United States</article-title>
          <source>JAMA</source>
          <year>2015</year>
          <month>02</month>
          <day>03</day>
          <volume>313</volume>
          <issue>5</issue>
          <fpage>483</fpage>
          <lpage>95</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2014.18614</pub-id>
          <pub-id pub-id-type="medline">25647204</pub-id>
          <pub-id pub-id-type="pii">2107788</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Lissovoy</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Fraeman</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hutchins</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Vaughn</surname>
              <given-names>BB</given-names>
            </name>
          </person-group>
          <article-title>Surgical site infection: incidence and impact on hospital utilization and treatment costs</article-title>
          <source>Am J Infect Control</source>
          <year>2009</year>
          <month>06</month>
          <volume>37</volume>
          <issue>5</issue>
          <fpage>387</fpage>
          <lpage>97</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ajic.2008.12.010</pub-id>
          <pub-id pub-id-type="medline">19398246</pub-id>
          <pub-id pub-id-type="pii">S0196-6553(09)00073-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sparling</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Ryckman</surname>
              <given-names>FC</given-names>
            </name>
            <name name-style="western">
              <surname>Schoettker</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Byczkowski</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Helpling</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mandel</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Panchanathan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kotagal</surname>
              <given-names>UR</given-names>
            </name>
          </person-group>
          <article-title>Financial impact of failing to prevent surgical site infections</article-title>
          <source>Qual Manag Health Care</source>
          <year>2007</year>
          <volume>16</volume>
          <issue>3</issue>
          <fpage>219</fpage>
          <lpage>25</lpage>
          <pub-id pub-id-type="doi">10.1097/01.QMH.0000281058.99929.ea</pub-id>
          <pub-id pub-id-type="medline">17627217</pub-id>
          <pub-id pub-id-type="pii">00019514-200707000-00005</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Magill</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Edwards</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Bamberg</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Beldavs</surname>
              <given-names>ZG</given-names>
            </name>
            <name name-style="western">
              <surname>Dumyati</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Kainer</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Lynfield</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Maloney</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>McAllister-Hollod</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nadle</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ray</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>LE</given-names>
            </name>
            <name name-style="western">
              <surname>Fridkin</surname>
              <given-names>SK</given-names>
            </name>
          </person-group>
          <article-title>Multistate point-prevalence survey of health care-associated infections</article-title>
          <source>N Engl J Med</source>
          <year>2014</year>
          <month>03</month>
          <day>27</day>
          <volume>370</volume>
          <issue>13</issue>
          <fpage>1198</fpage>
          <lpage>208</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24670166"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/NEJMoa1306801</pub-id>
          <pub-id pub-id-type="medline">24670166</pub-id>
          <pub-id pub-id-type="pmcid">PMC4648343</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Podgorny</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Berríos-Torres</surname>
              <given-names>SI</given-names>
            </name>
            <name name-style="western">
              <surname>Bratzler</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Dellinger</surname>
              <given-names>EP</given-names>
            </name>
            <name name-style="western">
              <surname>Greene</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nyquist</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Saiman</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yokoe</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Maragakis</surname>
              <given-names>LL</given-names>
            </name>
            <name name-style="western">
              <surname>Kaye</surname>
              <given-names>KS</given-names>
            </name>
          </person-group>
          <article-title>Strategies to prevent surgical site infections in acute care hospitals: 2014 update</article-title>
          <source>Infect Control Hosp Epidemiol</source>
          <year>2014</year>
          <month>06</month>
          <volume>35</volume>
          <issue>6</issue>
          <fpage>605</fpage>
          <lpage>27</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24799638"/>
          </comment>
          <pub-id pub-id-type="doi">10.1086/676022</pub-id>
          <pub-id pub-id-type="medline">24799638</pub-id>
          <pub-id pub-id-type="pmcid">PMC4267723</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mangram</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Horan</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Pearson</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Silver</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Jarvis</surname>
              <given-names>WR</given-names>
            </name>
          </person-group>
          <article-title>Guideline for prevention of surgical site infection, 1999. Hospital Infection Control Practices Advisory Committee</article-title>
          <source>Infect Control Hosp Epidemiol</source>
          <year>1999</year>
          <month>04</month>
          <volume>20</volume>
          <issue>4</issue>
          <fpage>250</fpage>
          <lpage>78; quiz 279-80</lpage>
          <pub-id pub-id-type="doi">10.1086/501620</pub-id>
          <pub-id pub-id-type="medline">10219875</pub-id>
          <pub-id pub-id-type="pii">ICHE7230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="web">
          <article-title>Surgical site infection event (SSI)</article-title>
          <source>Centers for Disease Control and Prevention</source>
          <access-date>2024-01-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/nhsn/pdfs/pscmanual/9pscssicurrent.pdf">https://www.cdc.gov/nhsn/pdfs/pscmanual/9pscssicurrent.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sanger</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>van Ramshorst</surname>
              <given-names>GH</given-names>
            </name>
            <name name-style="western">
              <surname>Mercan</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hartzler</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Armstrong</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Lordon</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lober</surname>
              <given-names>WB</given-names>
            </name>
            <name name-style="western">
              <surname>Evans</surname>
              <given-names>HL</given-names>
            </name>
          </person-group>
          <article-title>A prognostic model of surgical site infection using daily clinical wound assessment</article-title>
          <source>J Am Coll Surg</source>
          <year>2016</year>
          <month>08</month>
          <volume>223</volume>
          <issue>2</issue>
          <fpage>259</fpage>
          <lpage>70.e2</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/27188832"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jamcollsurg.2016.04.046</pub-id>
          <pub-id pub-id-type="medline">27188832</pub-id>
          <pub-id pub-id-type="pii">S1072-7515(16)30145-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC4961603</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meeks</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Lally</surname>
              <given-names>KP</given-names>
            </name>
            <name name-style="western">
              <surname>Carrick</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Lew</surname>
              <given-names>DF</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Doyle</surname>
              <given-names>PD</given-names>
            </name>
            <name name-style="western">
              <surname>Kao</surname>
              <given-names>LS</given-names>
            </name>
          </person-group>
          <article-title>Compliance with guidelines to prevent surgical site infections: as simple as 1-2-3?</article-title>
          <source>Am J Surg</source>
          <year>2011</year>
          <month>01</month>
          <volume>201</volume>
          <issue>1</issue>
          <fpage>76</fpage>
          <lpage>83</lpage>
          <pub-id pub-id-type="doi">10.1016/j.amjsurg.2009.07.050</pub-id>
          <pub-id pub-id-type="medline">20573335</pub-id>
          <pub-id pub-id-type="pii">S0002-9610(10)00059-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ridgeway</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Charlet</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kafatos</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pearson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Coello</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Infection of the surgical site after arthroplasty of the hip</article-title>
          <source>J Bone Joint Surg Br</source>
          <year>2005</year>
          <month>06</month>
          <volume>87</volume>
          <issue>6</issue>
          <fpage>844</fpage>
          <lpage>50</lpage>
          <pub-id pub-id-type="doi">10.1302/0301-620X.87B6.15121</pub-id>
          <pub-id pub-id-type="medline">15911671</pub-id>
          <pub-id pub-id-type="pii">87-B/6/844</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Koek</surname>
              <given-names>MB</given-names>
            </name>
            <name name-style="western">
              <surname>Hopmans</surname>
              <given-names>TE</given-names>
            </name>
            <name name-style="western">
              <surname>Soetens</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Wille</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Geerlings</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Vos</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>van Benthem</surname>
              <given-names>BH</given-names>
            </name>
            <name name-style="western">
              <surname>de Greeff</surname>
              <given-names>SC</given-names>
            </name>
          </person-group>
          <article-title>Adhering to a national surgical care bundle reduces the risk of surgical site infections</article-title>
          <source>PLoS One</source>
          <year>2017</year>
          <month>09</month>
          <day>06</day>
          <volume>12</volume>
          <issue>9</issue>
          <fpage>e0184200</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0184200"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0184200</pub-id>
          <pub-id pub-id-type="medline">28877223</pub-id>
          <pub-id pub-id-type="pii">PONE-D-17-26109</pub-id>
          <pub-id pub-id-type="pmcid">PMC5587118</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Honnibal</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>spaCy 2: natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing</article-title>
          <source>Sentometrics Research</source>
          <year>2017</year>
          <month>01</month>
          <access-date>2025-08-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://sentometrics-research.com/publication/72/">https://sentometrics-research.com/publication/72/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>WW</given-names>
            </name>
            <name name-style="western">
              <surname>Bridewell</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Hanbury</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>GF</given-names>
            </name>
            <name name-style="western">
              <surname>Buchanan</surname>
              <given-names>BG</given-names>
            </name>
          </person-group>
          <article-title>A simple algorithm for identifying negated findings and diseases in discharge summaries</article-title>
          <source>J Biomed Inform</source>
          <year>2001</year>
          <month>10</month>
          <volume>34</volume>
          <issue>5</issue>
          <fpage>301</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(01)91029-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1006/jbin.2001.1029</pub-id>
          <pub-id pub-id-type="medline">12123149</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(01)91029-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McKinney</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Data structures for statistical computing in python</article-title>
          <source>Proceedings of the 9th Python in Science Conference</source>
          <year>2010</year>
          <conf-name>SciPy 2010</conf-name>
          <conf-date>June 28-July 3, 2010</conf-date>
          <conf-loc>Austin, TX</conf-loc>
          <pub-id pub-id-type="doi">10.25080/majora-92bf1922-00a</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zai</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Guilarte-Walker</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Langlois</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Coleman</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Soni</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>McManus</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Luzuriaga</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Implementing virtual desktops for clinical research at an academic health center: a case report</article-title>
          <source>JAMIA Open</source>
          <year>2024</year>
          <month>08</month>
          <day>28</day>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>ooae083</fpage>
          <pub-id pub-id-type="doi">10.1093/jamiaopen/ooae083</pub-id>
          <pub-id pub-id-type="medline">39206281</pub-id>
          <pub-id pub-id-type="pii">ooae083</pub-id>
          <pub-id pub-id-type="pmcid">PMC11357573</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elkan</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>The foundations of cost-sensitive learning</article-title>
          <source>Proceedings of the 17th International Joint Conference on Artificial Intelligence</source>
          <year>2001</year>
          <conf-name>IJCAI'01</conf-name>
          <conf-date>August 4-10, 2001</conf-date>
          <conf-loc>Seattle, WA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chawla</surname>
              <given-names>NV</given-names>
            </name>
            <name name-style="western">
              <surname>Bowyer</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>LO</given-names>
            </name>
            <name name-style="western">
              <surname>Kegelmeyer</surname>
              <given-names>WP</given-names>
            </name>
          </person-group>
          <article-title>SMOTE: synthetic minority over-sampling technique</article-title>
          <source>J Artif Intell Res</source>
          <year>2002</year>
          <month>06</month>
          <day>01</day>
          <volume>16</volume>
          <fpage>321</fpage>
          <lpage>57</lpage>
          <pub-id pub-id-type="doi">10.1613/JAIR.953</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hosmer</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Lemeshow</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sturdivant</surname>
              <given-names>RX</given-names>
            </name>
          </person-group>
          <source>Applied Logistic Regression</source>
          <year>2013</year>
          <publisher-loc>Hoboken, NJ</publisher-loc>
          <publisher-name>Wiley</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Breiman</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Random forests</article-title>
          <source>Mach Learn</source>
          <year>2001</year>
          <month>10</month>
          <volume>45</volume>
          <fpage>5</fpage>
          <lpage>32</lpage>
          <pub-id pub-id-type="doi">10.1023/a:1010933404324</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Guestrin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>XGBoost: a scalable tree boosting system</article-title>
          <source>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</source>
          <year>2016</year>
          <conf-name>KDD '16</conf-name>
          <conf-date>August 13-17, 2016</conf-date>
          <conf-loc>San Francisco, CA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dietterich</surname>
              <given-names>TG</given-names>
            </name>
          </person-group>
          <article-title>Ensemble methods in machine learning</article-title>
          <source>Proceedings of the First International Workshop on Multiple Classifier Systems</source>
          <year>2000</year>
          <conf-name>MCS 2000</conf-name>
          <conf-date>June 21-23, 2000</conf-date>
          <conf-loc>Cagliari, Italy</conf-loc>
          <pub-id pub-id-type="doi">10.1007/3-540-45014-9_1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>AY</given-names>
            </name>
          </person-group>
          <article-title>Feature selection, L1 vs. L2 regularization, and rotational invariance</article-title>
          <source>Proceedings of the Twenty-First International Conference on Machine Learning</source>
          <year>2004</year>
          <conf-name>ICML '04</conf-name>
          <conf-date>July 4-8, 2004</conf-date>
          <conf-loc>Banff, AB</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bergstra</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Random search for hyper-parameter optimization</article-title>
          <source>J Mach Learn Res</source>
          <year>2012</year>
          <volume>13</volume>
          <fpage>281</fpage>
          <lpage>305</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Doyle</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Garmon</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>American Society of Anesthesiologists Classification (ASA Class)</article-title>
          <source>StatPearls</source>
          <year>2025</year>
          <access-date>2025-05-29</access-date>
          <publisher-loc>Treasure Island (FL)</publisher-loc>
          <publisher-name>StatPearls Publishing</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ncbi.nlm.nih.gov/books/NBK441940/">https://www.ncbi.nlm.nih.gov/books/NBK441940/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Larson</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Naessens</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Habermann</surname>
              <given-names>EB</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Detection of surgical site infection utilizing automated feature generation in clinical notes</article-title>
          <source>J Healthc Inform Res</source>
          <year>2019</year>
          <month>09</month>
          <volume>3</volume>
          <issue>3</issue>
          <fpage>267</fpage>
          <lpage>82</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31728432"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s41666-018-0042-9</pub-id>
          <pub-id pub-id-type="medline">31728432</pub-id>
          <pub-id pub-id-type="pmcid">PMC6855398</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>da Silva</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Ten Caten</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Dos Santos</surname>
              <given-names>RP</given-names>
            </name>
            <name name-style="western">
              <surname>Fogliatto</surname>
              <given-names>FS</given-names>
            </name>
            <name name-style="western">
              <surname>Hsuan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Predicting the occurrence of surgical site infections using text mining and machine learning</article-title>
          <source>PLoS One</source>
          <year>2019</year>
          <month>12</month>
          <day>13</day>
          <volume>14</volume>
          <issue>12</issue>
          <fpage>e0226272</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0226272"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0226272</pub-id>
          <pub-id pub-id-type="medline">31834905</pub-id>
          <pub-id pub-id-type="pii">PONE-D-19-12149</pub-id>
          <pub-id pub-id-type="pmcid">PMC6910696</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirukumaran</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Zaman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rubery</surname>
              <given-names>PT</given-names>
            </name>
            <name name-style="western">
              <surname>Calabria</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ricciardi</surname>
              <given-names>BF</given-names>
            </name>
            <name name-style="western">
              <surname>Bakhsh</surname>
              <given-names>WR</given-names>
            </name>
            <name name-style="western">
              <surname>Kautz</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing for the identification of surgical site infections in orthopaedics</article-title>
          <source>J Bone Joint Surg Am</source>
          <year>2019</year>
          <month>12</month>
          <day>18</day>
          <volume>101</volume>
          <issue>24</issue>
          <fpage>2167</fpage>
          <lpage>74</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31596819"/>
          </comment>
          <pub-id pub-id-type="doi">10.2106/JBJS.19.00661</pub-id>
          <pub-id pub-id-type="medline">31596819</pub-id>
          <pub-id pub-id-type="pmcid">PMC7002080</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mandagani</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Coleman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zahid</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pugel Ehlers</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Basu Roy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>De Cock</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Machine learning models for surgical site infection prediction</article-title>
          <source>Proceedings of the American Medical Informatics Association Knowledge Discovery and Data Mining Working Group Symposium</source>
          <year>2016</year>
          <conf-name>AMIA KDDMWG 2016</conf-name>
          <conf-date>November 12-16, 2016</conf-date>
          <conf-loc>Chicago, IL</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mohammadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gorde</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jajodia</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Data synthesis based on generative adversarial networks</article-title>
          <source>Proc VLDB Endow</source>
          <year>2018</year>
          <month>06</month>
          <volume>11</volume>
          <issue>10</issue>
          <fpage>1071</fpage>
          <lpage>83</lpage>
          <pub-id pub-id-type="doi">10.14778/3231751.3231757</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on October 11, 2018</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>40</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31501885"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>WH</given-names>
            </name>
            <name name-style="western">
              <surname>Jindi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Publicly available clinical BERT embeddings</article-title>
          <source>Proceedings of the 2nd Clinical Natural Language Processing Workshop</source>
          <year>2019</year>
          <conf-name>ClinicalNLP 2019</conf-name>
          <conf-date>June 7, 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/w19-1909</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
