<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i1e54044</article-id>
      <article-id pub-id-type="pmid">38986131</article-id>
      <article-id pub-id-type="doi">10.2196/54044</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Predictive Model for Extended-Spectrum β-Lactamase–Producing Bacterial Infections Using Natural Language Processing Technique and Open Data in Intensive Care Unit Environment: Retrospective Observational Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Tang</surname>
            <given-names>Rui</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Ito</surname>
            <given-names>Genta</given-names>
          </name>
          <degrees>BPharm</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5821-4242</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Yada</surname>
            <given-names>Shuntaro</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6209-1054</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Wakamiya</surname>
            <given-names>Shoko</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9371-1340</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Aramaki</surname>
            <given-names>Eiji</given-names>
          </name>
          <degrees>Prof Dr</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Information Science</institution>
            <institution>Nara Institute of Science and Technology</institution>
            <addr-line>8916-5 Takayama-cho</addr-line>
            <addr-line>Ikoma City, 630-0192</addr-line>
            <country>Japan</country>
            <phone>81 0743725204</phone>
            <email>aramaki@is.naist.jp</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0201-3609</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Information Science</institution>
        <institution>Nara Institute of Science and Technology</institution>
        <addr-line>Ikoma City</addr-line>
        <country>Japan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Eiji Aramaki <email>aramaki@is.naist.jp</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>10</day>
        <month>7</month>
        <year>2024</year>
      </pub-date>
      <volume>8</volume>
      <elocation-id>e54044</elocation-id>
      <history>
        <date date-type="received">
          <day>8</day>
          <month>11</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>25</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>3</day>
          <month>3</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>29</day>
          <month>5</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Genta Ito, Shuntaro Yada, Shoko Wakamiya, Eiji Aramaki. Originally published in JMIR Formative Research (https://formative.jmir.org), 10.07.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2024/1/e54044" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Machine learning has advanced medical event prediction, mostly using private data. The public MIMIC-3 (Medical Information Mart for Intensive Care III) data set, which contains detailed data on over 40,000 intensive care unit patients, stands out as it can help develop better models including structured and textual data.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to build and test a machine learning model using the MIMIC-3 data set to determine the effectiveness of information extracted from electronic medical record text using a named entity recognition tool, specifically QuickUMLS, for predicting important medical events. Using the prediction of extended-spectrum β-lactamase (ESBL)–producing bacterial infections as an example, this study shows how open data sources and simple technology can be useful for making clinically meaningful predictions.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The MIMIC-3 data set, including demographics, vital signs, laboratory results, and textual data, such as discharge summaries, was used. This study specifically targeted patients diagnosed with <italic>Klebsiella pneumoniae</italic> or <italic>Escherichia coli</italic> infection. Predictions were based on ESBL-producing bacterial standards and the minimum inhibitory concentration criteria. Both the structured data and extracted patient histories were used as predictors. In total, 2 models, an L1-regularized logistic regression model and a LightGBM model, were evaluated using the receiver operating characteristic area under the curve (ROC-AUC) and the precision-recall curve area under the curve (PR-AUC).</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Of 46,520 MIMIC-3 patients, 4046 were identified with bacterial cultures indicating the presence of <italic>K pneumoniae</italic> or <italic>E coli</italic>. After excluding patients who lacked discharge summary text, 3614 patients remained. The L1-penalized model, with variables from only the structured data, displayed a ROC-AUC of 0.646 and a PR-AUC of 0.307. The LightGBM model, combining structured and textual data, achieved a ROC-AUC of 0.707 and a PR-AUC of 0.369. Key contributors to the LightGBM model included patient age, duration since hospital admission, and specific medical history such as diabetes. The structured data-based model showed improved performance compared to the reference models. Performance was further improved when textual medical history was included. Compared to other models predicting drug-resistant bacteria, the results of this study ranked in the middle. Some misidentifications, potentially due to the limitations of QuickUMLS, may have affected the accuracy of the model.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study successfully developed a predictive model for ESBL-producing bacterial infections using the MIMIC-3 data set, yielding results consistent with existing literature. This model stands out for its transparency and reliance on open data and open named entity recognition technology. The performance of the model was enhanced using textual information. With advancements in natural language processing tools such as BERT and GPT, the extraction of medical data from text holds substantial potential for future model optimization.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>predictive modeling</kwd>
        <kwd>MIMIC-3 dataset</kwd>
        <kwd>natural language processing</kwd>
        <kwd>NLP</kwd>
        <kwd>QuickUMLS</kwd>
        <kwd>named entity recognition</kwd>
        <kwd>ESBL-producing bacterial infections</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>In recent years, machine learning techniques have been used to build models to predict various medical events such as drug-resistant bacterial infections [<xref ref-type="bibr" rid="ref1">1</xref>] and unscheduled hospital readmissions [<xref ref-type="bibr" rid="ref2">2</xref>]. Most of these studies have used private data sets to build their prediction models, limiting the replicability and generalizability of the findings owing to accessibility constraints.</p>
      <p>In contrast, MIMIC-3 (Medical Information Mart for Intensive Care III) is a large publicly available electronic medical record (EMR) data set that contains comprehensive clinical data of more than 40,000 patients admitted to intensive care units (ICUs), thereby serving as a valuable resource for the development and evaluation of machine learning models for predicting various medical events [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. While previous studies have mainly used structured data such as patient background (eg, age and sex), laboratory results, and vital signs to predict medical events, MIMIC-3 is unique in that it also includes textual data. In a database consisting of a single medical institution, the records of visits to other medical institutions are not structured and may be difficult to trace. However, the text of an EMR may contain records of visits to other hospitals, and if information from such texts can be extracted, even a database obtained from a single medical facility might facilitate the tracking of past medical history. This may also improve the accuracy of predicting medical events because past medical history is often important in predicting such events.</p>
      <p>Information extraction from medical texts can be a complex task because of the specialized terminology and abundant abbreviations used. One common method to extract information from medical texts is to use named entity recognition (NER), which is a subtask of information extraction that seeks to locate and classify named entities in text into predefined categories, such as the names of diseases, drugs, and medical conditions. For example, QuickUMLS matches strings of text to Unified Medical Language System (UMLS) concepts and extracts concept unique identifiers (CUIs) from the text [<xref ref-type="bibr" rid="ref6">6</xref>]. The UMLS is a comprehensive resource of biomedical terms and concepts that allows QuickUMLS to extract medical information effectively and quickly. QuickUMLS uses a method called “approximate string matching,” which finds UMLS concepts in texts that are either the same or very close to the string in the text.</p>
      <p>Among the various medical events that are meaningful to predict, this study focused on predicting infections caused by a type of antibiotic-resistant bacteria known as extended-spectrum β-lactamase (ESBL)–producing bacteria. They are a significant global health concern because of their resistance to commonly used antibiotics [<xref ref-type="bibr" rid="ref7">7</xref>]. The incidence of ESBL-producing bacteria has been reported to have increased from 1997 to 2011 in the United States [<xref ref-type="bibr" rid="ref8">8</xref>]. The timely and accurate prediction of ESBL-producing bacterial infections can help initiate appropriate antimicrobial therapy, improve patient outcomes, and minimize the spread of antibiotic resistance.</p>
      <p>In this study, using MIMIC-3 as a data source, we constructed and evaluated a machine learning model to predict whether <italic>Escherichia coli</italic> and <italic>Klebsiella pneumoniae</italic> in specimens collected from a patient were suspected of producing ESBLs, based on structured data and patient history information extracted by applying QuickUMLS to EMR text. This study aimed to build a model that makes clinically meaningful predictions using open data sources and open NER technology.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Ethical Considerations</title>
        <p>The establishment of the MIMIC-3 database was approved by the institutional review boards (IRBs) of Beth Israel Deaconess Medical Center and the Massachusetts Institute of Technology [<xref ref-type="bibr" rid="ref4">4</xref>]. Under the Common Rule (45 CFR 46), records-based research using identifiable private information that is publicly available is exempt from IRB review. Since MIMIC-3 is publicly available and deidentified, the secondary analysis in this study is exempt from IRB review. As a result, this study did not undergo an IRB review.</p>
        <p>One of the authors (GI) received the necessary training in the use of the MIMIC-3 data set, obtained permission to use the data, and conducted this study in compliance with the PhysioNet Credentialed Health Data Use Agreement 1.5.0.</p>
      </sec>
      <sec>
        <title>Data Source</title>
        <p>In this study, we used the MIMIC-3 data set, a publicly available large-scale EMR data set. MIMIC-3 contains comprehensive clinical data from over 40,000 patients admitted to ICUs at the Beth Israel Deaconess Medical Center in Boston, Massachusetts, between 2001 and 2012. The data set includes various types of clinical information such as demographics, vital signs, laboratory results, medications, diagnoses, and free-text data in the form of nursing notes, radiology reports, and discharge summaries [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>].</p>
      </sec>
      <sec>
        <title>Study Population</title>
        <p>This study included patients with <italic>K pneumoniae</italic> or <italic>E coli</italic> detected in bacterial culture tests conducted during hospitalization and detailed in the MIMIC-3. Patients without a summary text at discharge were excluded from this study.</p>
      </sec>
      <sec>
        <title>Outcome Variable</title>
        <p>The outcome variable is a binary variable of whether <italic>K pneumoniae</italic> or <italic>E coli</italic> in the specimen showed a minimum inhibitory concentration ≥8 µg/mL for cefpodoxime or a minimum inhibitory concentration ≥2 µg/mL for ceftazidime or ceftriaxone as a result of the bacterial culture test (liquid microdilution method). This criterion corresponds to the screening criteria for ESBL-producing bacteria according to the Clinical and Laboratory Standards Institute M100-S25 [<xref ref-type="bibr" rid="ref9">9</xref>].</p>
      </sec>
      <sec>
        <title>Predictor Variables</title>
        <p>The predictor variables were broadly classified into variables extracted from the structured tables and those related to the patient’s history extracted from the text data.</p>
        <p>The variables extracted from the structured table were patient age at the time of specimen collection, sex, number of days between admission and specimen collection, admission type, previous location of the patient before arrival at the hospital, and specimen type collected for the bacterial culture test.</p>
        <p>The medical history variables extracted from the discharge summary text were preprocessed using the following procedure: (1) from the discharge summary, we extracted paragraphs beginning with the string “Past medical history,” “Past Medical History,” or “PAST MEDICAL HISTORY”; (2) QuickUMLS was applied to the extracted paragraphs to extract CUIs with a Jaccard similarity coefficient of 0.7 or higher; and (3) dummy variables with and without each CUI were used for medical history.</p>
      </sec>
      <sec>
        <title>Model Development and Evaluation</title>
        <p>First, we constructed an L1-regularized logistic regression model using only the variables extracted from the structured table. Subsequently, we used the variables derived from the structured table and those related to medical history extracted from the text to build either an L1-regularized logistic regression model or a LightGBM model. We chose L1-regularized logistic regression and LightGBM to build our predictive model primarily for 2 reasons. First, logistic regression is a straightforward linear model used for binary classification, while LightGBM is a more complex model that can handle both linear and nonlinear patterns. This combination allows us to cover a broad range of data behaviors. Second, we used L1 regularization with logistic regression to help manage the model’s complexity by selecting important features. For reference, we constructed a model that judges all cases as positive, one that judges all cases as negative, and one that judges randomly according to the ratio of positive to negative cases.</p>
        <p>The models were constructed and evaluated using stratified group 5-fold cross-validation with patient ID as a group variable. This method ensures an equal class distribution in each fold, while samples from the same patient are not split across different folds. The receiver operating characteristic area under the curve (ROC-AUC) and precision-recall curve area under the curve (PR-AUC) were used to evaluate the performance of each model. The ROC-AUC evaluates the model’s ability to distinguish between classes, whereas the PR-AUC focuses on the model’s performance in terms of precision and recall, which are particularly valuable when dealing with imbalanced data sets [<xref ref-type="bibr" rid="ref10">10</xref>]. These were calculated for each fold, and the average of the values from each fold was used as the ROC-AUC and PR-AUC of that model.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Study Population</title>
        <p>The total number of patients registered in the MIMIC-3 database was 46,520, of which 4046 underwent bacterial culture tests, and <italic>K pneumoniae</italic> or <italic>E coli</italic> were detected. After excluding 432 patients without a summary text on discharge, the final study population consisted of 3614 patients. Of the specimens collected from this study’s population, 5272 specimens were positive for <italic>K pneumoniae</italic> or <italic>E coli</italic>, which were the targets of this model.</p>
      </sec>
      <sec>
        <title>Patient Characteristics</title>
        <p>Patient characteristics are summarized in <xref ref-type="table" rid="table1">Table 1</xref>, which shows no notable differences in mean age between negative and positive ESBL screening patients, although there was a slightly higher proportion of older patients (n=1160, 27.3%) aged 80 years or older among negative patients. The sex distribution was slightly higher for females (n=2477, 58.4%) in the negative group and almost the same in the positive group. The average time from admission to specimen collection was 6.1 (SD 11.9) days for negative patients and 12.1 (SD 18.4) days for positive patients. Before arrival at the hospital, the previous location of the patient did not differ markedly between the negative and positive patients. Specimens tested for bacterial growth showed that negative patients had slightly more urine specimens and slightly fewer sputum specimens than positive patients. However, these differences were not statistically significant.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Patient characteristics.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="410"/>
            <col width="0"/>
            <col width="280"/>
            <col width="0"/>
            <col width="280"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Characteristics</td>
                <td colspan="2">Patients with ESBL<sup>a</sup> screening–negative specimen (N=4242)</td>
                <td>Patients with ESBL screening–positive specimen (N=1030)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>Age (years)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>&#60;20 years, n (%)</td>
                <td colspan="2">10 (0.24)</td>
                <td colspan="2">0 (0)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>≥20 to &#60;40 years, n (%)</td>
                <td colspan="2">196 (4.62)</td>
                <td colspan="2">64 (6.21)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>≥40 to &#60;60 years, n (%)</td>
                <td colspan="2">941 (22.18)</td>
                <td colspan="2">275 (26.70)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>≥60 to &#60;80 years, n (%)</td>
                <td colspan="2">1935 (45.62)</td>
                <td colspan="2">530 (51.46)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>≥80 years, n (%)</td>
                <td colspan="2">1160 (27.35)</td>
                <td colspan="2">161 (15.63)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Mean (SD)</td>
                <td colspan="2">68.8 (15.2)</td>
                <td colspan="2">65.2 (14.5)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Sex, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Male</td>
                <td colspan="2">1765 (41.61)</td>
                <td colspan="2">530 (51.46)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Female</td>
                <td colspan="2">2477 (58.39)</td>
                <td colspan="2">500 (48.54)</td>
              </tr>
              <tr valign="top">
                <td colspan="3">Number of days from the admission to specimen collection, mean (SD)</td>
                <td colspan="2">6.1 (11.9)</td>
                <td>12.1 (18.4)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Admission type, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Emergency</td>
                <td colspan="2">3741 (88.19)</td>
                <td colspan="2">927 (90)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Elective</td>
                <td colspan="2">384 (9.05)</td>
                <td colspan="2">76 (7.38)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Urgent</td>
                <td colspan="2">114 (2.69)</td>
                <td colspan="2">27 (2.62)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Newborn</td>
                <td colspan="2">3 (0.07)</td>
                <td colspan="2">0 (0)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Previous location of the patient before arriving at the hospital, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Emergency department admission</td>
                <td colspan="2">2092 (49.31)</td>
                <td colspan="2">453 (43.98)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Clinic referral or premature</td>
                <td colspan="2">920 (21.69)</td>
                <td colspan="2">213 (20.68)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Transfer from hospital or extramural</td>
                <td colspan="2">648 (15.28)</td>
                <td colspan="2">208 (20.19)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Physician referral or normal delivery</td>
                <td colspan="2">526 (12.40)</td>
                <td colspan="2">116 (11.26)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Transfer from a skilled nursing facility</td>
                <td colspan="2">40 (0.94)</td>
                <td colspan="2">29 (2.82)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Transfer from other health care facility</td>
                <td colspan="2">15 (0.35)</td>
                <td colspan="2">11 (1.07)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Information not available</td>
                <td colspan="2">1 (0.02)</td>
                <td colspan="2">0 (0)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Specimen tested for bacterial growth, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Urine</td>
                <td colspan="2">2395 (56.46)</td>
                <td colspan="2">431 (41.84)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sputum</td>
                <td colspan="2">637 (15.02)</td>
                <td colspan="2">245 (23.79)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Blood culture</td>
                <td colspan="2">568 (13.39)</td>
                <td colspan="2">134 (13.01)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Swab</td>
                <td colspan="2">197 (4.64)</td>
                <td colspan="2">72 (6.99)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Bronchoalveolar lavage</td>
                <td colspan="2">78 (1.84)</td>
                <td colspan="2">18 (1.75)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Other</td>
                <td colspan="2">367 (8.65)</td>
                <td colspan="2">130 (12.62)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>ESBL: extended-spectrum β-lactamase.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Performance</title>
        <p>The ROC-AUC of the L1-penalized model constructed using variables extracted from a structured table was 0.646 and the PR-AUC was 0.307 (<xref ref-type="table" rid="table2">Table 2</xref>). When variables extracted from the structured table and past medical history variables extracted from the discharge summary were used to construct the L1-penalized model, the ROC-AUC and PR-AUC were 0.653 and 0.335, respectively (<xref ref-type="table" rid="table2">Table 2</xref> and <xref rid="figure1" ref-type="fig">Figure 1</xref>). The ROC-AUC of the LightGBM model was 0.707 and the PR-AUC was 0.369 (<xref ref-type="table" rid="table2">Table 2</xref> and <xref rid="figure2" ref-type="fig">Figure 2</xref>). In the reference random classification model, the ROC-AUC and PR-AUC were 0.501 and 0.275, respectively (<xref ref-type="table" rid="table2">Table 2</xref>). The models that predicted all cases as positive or all cases as negative each had a ROC-AUC of 0.500 and an undefined PR-AUC (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Model performance.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="520"/>
            <col width="180"/>
            <col width="270"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Predictor variables and model</td>
                <td>ROC<sup>a</sup>-AUC<sup>b</sup></td>
                <td>PR<sup>c</sup>-AUC</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="4">
                  <bold>Only from the structured table</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>L1-regularized logistic regression</td>
                <td>0.646</td>
                <td>0.307</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>From the structured table and the discharge summary text</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>L1-regularized logistic regression</td>
                <td>0.653</td>
                <td>0.335</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LightGBM</td>
                <td>0.707</td>
                <td>0.369</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>None</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Random<sup>d</sup></td>
                <td>0.501</td>
                <td>0.275</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All positive</td>
                <td>0.500</td>
                <td>—<sup>e</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>All negative</td>
                <td>0.500</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>ROC: receiver operating characteristic.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>AUC: area under the curve.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>PR: precision-recall curve.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>Random model judges randomly according to the ratio of positive to negative cases.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Mean ROC curve and mean PR curve of L1-regularized logistic regression with variables from the structured table and the discharge summary text. The AUC value is presented as the mean (SD) across 5 folds. AUC: area under the curve; PR: precision-recall curve; ROC: receiver operating characteristic.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e54044_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Mean ROC curve and mean PR curve of LightGBM with variables from the structured table and the discharge summary text. The AUC value is presented as the mean (SD) across 5 folds. AUC: area under the curve; PR: precision-recall curve; ROC: receiver operating characteristic.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e54044_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Model Specification</title>
        <p>Several key variables related to patient attributes and medical history are highlighted in the feature importance data for the LightGBM model. Specifically, “age at admission” has an importance score of 262, emphasizing its significant predictive value in the model. The “days from hospitalization to specimen collection” feature also plays a critical role with a score of 245.2. Medical conditions such as end-stage renal disease (ESRD) and <italic>Pseudomonas</italic> infection are also pertinent, with ESRD assigned an importance of 64.4, although the specific score for <italic>Pseudomonas</italic> infection is not listed in the top features shown. The data further include morbid obesity and diabetes mellitus (DM), which are integral for understanding patient outcomes, though their importance scores need to be specified from the full list. Additional features such as “fright,” “secondary,” “trachy,” and “severed” are less prominent, with “fright” having an importance of 28.7, indicating its relatively lower but still notable influence on the model’s predictions.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Compared with the reference random classification model, we observed an improvement in both the ROC-AUC and PR-AUC of our model, built solely on predictor variables from the structured table data. The performance of the model was further enhanced by adding features extracted from text-based medical histories.</p>
        <p>A systematic review reported that the ROC-AUCs of existing predictive models without text for drug-resistant bacteria ranged from 0.48 to 0.93 [<xref ref-type="bibr" rid="ref1">1</xref>]. The performance of our model was neither outstanding nor disappointing; it was somewhere in between. Key features contributing to the model’s performance included age; number of days since hospital admission; and medical history of end-stage renal disease, <italic>Pseudomonas</italic> infection, obesity, and DM. These have been noted in previous studies as risk factors for ESBL-producing bacteria and other drug-resistant bacteria, and it seems reasonable to assume that they are important factors for predicting ESBL-producing bacterial infections [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        <p>While our study’s predictors are consistent with known risk factors for drug-resistant infections, the retrospective nature of our analysis means we might not have fully accounted for all biases inherent in such data. Therefore, our findings do not imply causal relationships and should be interpreted with caution, particularly in clinical applications.</p>
        <p>Features from the discharge summary text improved prediction accuracy; however, the feature “fright,” which appears to be unimportant for predicting drug-resistant bacteria, also appeared among the important features. This may be because of the limitations of NER using QuickUMLS. That is, “fright” may have been incorrectly identified because it contains the string “right.” Patients with a long medical history are more likely to have common words such as “right” in their medical history; because a long medical history may itself be associated with an increased risk of ESBL-producing bacterial infection, this may have resulted in a higher feature importance. The limited accuracy of the history extraction by NER may have led to the extraction of invalid features and prevented the model from reaching a high level of performance.</p>
        <p>Another concern is the frequent use of abbreviations in discharge summaries. For example, “DM” is an abbreviation for diabetes mellitus. QuickUMLS performs NER based on string similarity; therefore, although the abbreviation “DM” and the full term “diabetes mellitus” refer to the same concept, the strings are not highly similar, making extraction difficult.</p>
        <p>Furthermore, the extraction by QuickUMLS of words that might seem less meaningful at first glance, such as “secondary,” “trachy,” and “severed,” is another challenge. Specifically, “secondary” is associated with “neoplasm metastasis” (CUI: C0027627), “trachy” with “tracheotomy procedure” (CUI: C0040591), and “severed” with “severing” (CUI: C1306232). However, these words were not always used in the discharge summaries to indicate their corresponding CUIs.</p>
        <p>The data used in our study were obtained from MIMIC-3, a single-institution database, and the extraction of information outside the patient’s institution, such as medical history, was possible only through textual information. Medical histories were extracted from the discharge summary texts; however, information such as antibiotic usage history, which is a risk factor for ESBL, may not have been detailed in the discharge summary, leading to potential underextraction.</p>
        <p>Additionally, while the MIMIC-3 database is open and makes our study results more reproducible, the reliability of its data cannot be fully guaranteed. This means that although our model shows trends similar to previous studies, suggesting it has some validity, the possibility that the model could be invalid cannot be dismissed.</p>
        <p>In summary, although feature extraction from the discharge summary texts using QuickUMLS improved the accuracy of predicting ESBL-producing bacterial infections, incomplete data and difficulties in extracting information from the text may have prevented us from extracting all the data required for ESBL prediction. These obstacles may have contributed to the suboptimal performance of the proposed model.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In conclusion, we constructed a model that predicts ESBL-producing bacterial infections with accuracy comparable to that of previous studies using the publicly available MIMIC-3 data set.</p>
        <p>Because our model was constructed using open data and open NER technology, it exhibited a high level of transparency. We believe that this model serves as a valuable reference for future studies in this field.</p>
        <p>By extracting information from the text, we enhanced the performance of our model. We posit that if we can extract data from the text with even higher precision, we may be able to further improve the performance of our model.</p>
        <p>The advent of transformer-based models, such as BERT and GPT, has led to notable improvements in medical natural language processing tasks [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Given the rise in natural language processing techniques, we believe that further applications for the extraction of information from medical texts, such as those used in our study, are promising.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CUI</term>
          <def>
            <p>concept unique identifier</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">DM</term>
          <def>
            <p>diabetes mellitus</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EMR</term>
          <def>
            <p>electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">ESBL</term>
          <def>
            <p>extended-spectrum β-lactamase</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">ESRD</term>
          <def>
            <p>end-stage renal disease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ICU</term>
          <def>
            <p>intensive care unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">IRB</term>
          <def>
            <p>institutional review board</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">MIMIC-3</term>
          <def>
            <p>Medical Information Mart for Intensive Care III</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">PR-AUC</term>
          <def>
            <p>precision-recall curve area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">ROC-AUC</term>
          <def>
            <p>receiver operating characteristic area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">UMLS</term>
          <def>
            <p>Unified Medical Language System</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Japan Science and Technology Agency’s Core Research for Evolutional Science and Technology (JST CREST, JPMJCR22N1, Japan).</p>
      <p>We would like to thank Editage [<xref ref-type="bibr" rid="ref18">18</xref>] and ChatGPT for English language proofreading.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>This study used the MIMIC-3 database, which is publicly available and contains deidentified data from ICU patients. Access to the data requires the completion of a data use agreement and a training course, available at PhysioNet [<xref ref-type="bibr" rid="ref19">19</xref>]. The data used in our research adheres to the terms of this agreement, ensuring patient privacy and confidentiality. Researchers can obtain the MIMIC-3 data set by following the specified access procedure on PhysioNet.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>GI designed this study, performed the data analysis, discussed the results, and drafted and completed this paper. SY, SW, and EA discussed the results and commented on this paper. EA supervised this study. All the authors have reviewed and approved the final paper.</p>
      </fn>
      <fn fn-type="conflict">
        <p>GI is an employee at Shionogi &#38; Co. There was no involvement of Shionogi &#38; Co in the publication process. The other authors declare no conflicts of interest.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Machine learning in predicting antimicrobial resistance: a systematic review and meta-analysis</article-title>
          <source>Int J Antimicrob Agents</source>
          <year>2022</year>
          <volume>60</volume>
          <issue>5-6</issue>
          <fpage>106684</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijantimicag.2022.106684</pub-id>
          <pub-id pub-id-type="medline">36279973</pub-id>
          <pub-id pub-id-type="pii">S0924-8579(22)00211-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>LY</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>XC</given-names>
            </name>
            <name name-style="western">
              <surname>Nejatian</surname>
              <given-names>NP</given-names>
            </name>
            <name name-style="western">
              <surname>Nasir-Moin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Abidin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Eaton</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Riina</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Laufer</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Punjabi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Miceli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>NC</given-names>
            </name>
            <name name-style="western">
              <surname>Orillac</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Schnurman</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Livia</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kurland</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Neifert</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dastagirzada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kondziolka</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cheung</surname>
              <given-names>ATM</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Flores</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Costa</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Aphinyanaphongs</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Oermann</surname>
              <given-names>EK</given-names>
            </name>
          </person-group>
          <article-title>Health system-scale language models are all-purpose prediction engines</article-title>
          <source>Nature</source>
          <year>2023</year>
          <volume>619</volume>
          <issue>7969</issue>
          <fpage>357</fpage>
          <lpage>362</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37286606"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-023-06160-y</pub-id>
          <pub-id pub-id-type="medline">37286606</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-023-06160-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC10338337</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III Clinical Database (version 1.4)</article-title>
          <source>PhysioNet</source>
          <year>2016</year>
          <access-date>2024-06-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://physionet.org/content/mimiciii/1.4/">https://physionet.org/content/mimiciii/1.4/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AEW</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <volume>3</volume>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.35"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goldberger</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Amaral</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Glass</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hausdorff</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Ivanov</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
            <name name-style="western">
              <surname>Mietus</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>CK</given-names>
            </name>
            <name name-style="western">
              <surname>Stanley</surname>
              <given-names>HE</given-names>
            </name>
          </person-group>
          <article-title>PhysioBank, PhysioToolkit, and PhysioNet: components of a new research resource for complex physiologic signals</article-title>
          <source>Circulation</source>
          <year>2000</year>
          <volume>101</volume>
          <issue>23</issue>
          <fpage>E215</fpage>
          <lpage>E220</lpage>
          <pub-id pub-id-type="doi">10.1161/01.cir.101.23.e215</pub-id>
          <pub-id pub-id-type="medline">10851218</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soldaini</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Goharian</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>QuickUMLS: a fast, unsupervised approach for medical concept extraction</article-title>
          <source>MedIR Workshop</source>
          <year>2016</year>
          <publisher-loc>USA</publisher-loc>
          <publisher-name>SIGIR</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Castanheira</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Simner</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Bradford</surname>
              <given-names>PA</given-names>
            </name>
          </person-group>
          <article-title>Extended-spectrum β-lactamases: an update on their characteristics, epidemiology and detection</article-title>
          <source>JAC Antimicrob Resist</source>
          <year>2021</year>
          <volume>3</volume>
          <issue>3</issue>
          <fpage>dlab092</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34286272"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jacamr/dlab092</pub-id>
          <pub-id pub-id-type="medline">34286272</pub-id>
          <pub-id pub-id-type="pii">dlab092</pub-id>
          <pub-id pub-id-type="pmcid">PMC8284625</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McDanel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schweizer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Crabb</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Samore</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Khader</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Blevins</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Diekema</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chiang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Nair</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Perencevich</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Incidence of extended-spectrum β-lactamase (ESBL)-producing Escherichia coli and Klebsiella infections in the United States: a systematic literature review</article-title>
          <source>Infect Control Hosp Epidemiol</source>
          <year>2017</year>
          <volume>38</volume>
          <issue>10</issue>
          <fpage>1209</fpage>
          <lpage>1215</lpage>
          <pub-id pub-id-type="doi">10.1017/ice.2017.156</pub-id>
          <pub-id pub-id-type="medline">28758612</pub-id>
          <pub-id pub-id-type="pii">S0899823X17001568</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="book">
          <source>Performance standards for antimicrobial susceptibility testing; 25th informational supplement</source>
          <year>2015</year>
          <publisher-loc>Wayne, PA</publisher-loc>
          <publisher-name>Clinical and Laboratory Standards Institute</publisher-name>
          <fpage>94</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Saito</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Rehmsmeier</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The precision-recall plot is more informative than the ROC plot when evaluating binary classifiers on imbalanced datasets</article-title>
          <source>PLoS One</source>
          <year>2015</year>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>e0118432</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0118432"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0118432</pub-id>
          <pub-id pub-id-type="medline">25738806</pub-id>
          <pub-id pub-id-type="pii">PONE-D-14-26790</pub-id>
          <pub-id pub-id-type="pmcid">PMC4349800</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>McGregor</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Strauss</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Standiford</surname>
              <given-names>HC</given-names>
            </name>
            <name name-style="western">
              <surname>Hebden</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>JG</given-names>
            </name>
          </person-group>
          <article-title>Risk factors for colonization with extended-spectrum beta-lactamase-producing bacteria and intensive care unit admission</article-title>
          <source>Emerg Infect Dis</source>
          <year>2007</year>
          <volume>13</volume>
          <issue>8</issue>
          <fpage>1144</fpage>
          <lpage>1149</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/17953083"/>
          </comment>
          <pub-id pub-id-type="doi">10.3201/eid1308.070071</pub-id>
          <pub-id pub-id-type="medline">17953083</pub-id>
          <pub-id pub-id-type="pmcid">PMC2828082</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lautenbach</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Bilker</surname>
              <given-names>WB</given-names>
            </name>
            <name name-style="western">
              <surname>Edelstein</surname>
              <given-names>PH</given-names>
            </name>
            <name name-style="western">
              <surname>Fishman</surname>
              <given-names>NO</given-names>
            </name>
          </person-group>
          <article-title>Extended-spectrum beta-lactamase-producing Escherichia coli and Klebsiella pneumoniae: risk factors for infection and impact of resistance on outcomes</article-title>
          <source>Clin Infect Dis</source>
          <year>2001</year>
          <volume>32</volume>
          <issue>8</issue>
          <fpage>1162</fpage>
          <lpage>1171</lpage>
          <pub-id pub-id-type="doi">10.1086/319757</pub-id>
          <pub-id pub-id-type="medline">11283805</pub-id>
          <pub-id pub-id-type="pii">CID000853</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Narayanan</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Vinarov</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bucek</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mathew</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chaudhry</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brunetti</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Relationship between multidrug-resistant Enterobacterales and obesity in older adults</article-title>
          <source>Infect Drug Resist</source>
          <year>2021</year>
          <volume>14</volume>
          <fpage>2527</fpage>
          <lpage>2532</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34234480"/>
          </comment>
          <pub-id pub-id-type="doi">10.2147/IDR.S317014</pub-id>
          <pub-id pub-id-type="medline">34234480</pub-id>
          <pub-id pub-id-type="pii">317014</pub-id>
          <pub-id pub-id-type="pmcid">PMC8255648</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Su</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Riggi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lindholm</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Marrone</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Carrero</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lundborg</surname>
              <given-names>CS</given-names>
            </name>
          </person-group>
          <article-title>Association of kidney function with infections by multidrug-resistant organisms: an electronic medical record analysis</article-title>
          <source>Sci Rep</source>
          <year>2018</year>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>13372</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-018-31612-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-018-31612-1</pub-id>
          <pub-id pub-id-type="medline">30190585</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-018-31612-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC6127257</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maia</surname>
              <given-names>MDO</given-names>
            </name>
            <name name-style="western">
              <surname>da Silveira</surname>
              <given-names>CDG</given-names>
            </name>
            <name name-style="western">
              <surname>Gomes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fernandes</surname>
              <given-names>SES</given-names>
            </name>
            <name name-style="western">
              <surname>Bezerra de Santana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>de Oliveira</surname>
              <given-names>DQ</given-names>
            </name>
            <name name-style="western">
              <surname>Amorim</surname>
              <given-names>FFP</given-names>
            </name>
            <name name-style="western">
              <surname>Neves</surname>
              <given-names>FDAR</given-names>
            </name>
            <name name-style="western">
              <surname>Amorim</surname>
              <given-names>FF</given-names>
            </name>
          </person-group>
          <article-title>Multidrug-resistant bacteria on critically ill patients with sepsis at hospital admission: risk factors and effects on hospital mortality</article-title>
          <source>Infect Drug Resist</source>
          <year>2023</year>
          <volume>16</volume>
          <fpage>1693</fpage>
          <lpage>1704</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36992963"/>
          </comment>
          <pub-id pub-id-type="doi">10.2147/IDR.S401754</pub-id>
          <pub-id pub-id-type="medline">36992963</pub-id>
          <pub-id pub-id-type="pii">401754</pub-id>
          <pub-id pub-id-type="pmcid">PMC10042244</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Scales</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tanwani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Payne</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Seneviratne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gamble</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Babiker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schärli</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Agüera y Arcas</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tomasev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Large language models encode clinical knowledge</article-title>
          <source>Nature</source>
          <year>2023</year>
          <volume>620</volume>
          <issue>7972</issue>
          <fpage>172</fpage>
          <lpage>180</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37438534"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="medline">37438534</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10396962</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Idnay</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Nestor</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Soroush</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Elias</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Durrett</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rousseau</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Evaluating large language models on medical evidence summarization</article-title>
          <source>NPJ Digit Med</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>158</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-023-00896-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-023-00896-7</pub-id>
          <pub-id pub-id-type="medline">37620423</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-023-00896-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC10449915</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <source>Editage</source>
          <access-date>2024-06-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.editage.jp/">https://www.editage.jp/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <source>PhysioNet</source>
          <access-date>2024-06-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://physionet.org/">https://physionet.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
