<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i1e59267</article-id>
      <article-id pub-id-type="pmid">38924784</article-id>
      <article-id pub-id-type="doi">10.2196/59267</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluating ChatGPT-4’s Accuracy in Identifying Final Diagnoses Within Differential Diagnoses Compared With Those of Physicians: Experimental Study for Diagnostic Cases</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Rodman</surname>
            <given-names>Adam</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhang</surname>
            <given-names>Chaofan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Hirosawa</surname>
            <given-names>Takanobu</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Diagnostic and Generalist Medicine</institution>
            <institution>Dokkyo Medical University</institution>
            <addr-line>880 Kitakobayashi</addr-line>
            <addr-line>Mibu-cho, Shimotsuga</addr-line>
            <addr-line>Tochigi, 321-0293</addr-line>
            <country>Japan</country>
            <phone>81 282861111</phone>
            <email>hirosawa@dokkyomed.ac.jp</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3573-8203</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Harada</surname>
            <given-names>Yukinori</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6042-7397</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Mizuta</surname>
            <given-names>Kazuya</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-8822-7127</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Sakamoto</surname>
            <given-names>Tetsu</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9104-8891</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Tokumasu</surname>
            <given-names>Kazuki</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9513-6864</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Shimizu</surname>
            <given-names>Taro</given-names>
          </name>
          <degrees>MD, MPH, MBA, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3788-487X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Diagnostic and Generalist Medicine</institution>
        <institution>Dokkyo Medical University</institution>
        <addr-line>Tochigi</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of General Medicine</institution>
        <institution>Okayama University Graduate School of Medicine, Dentistry and Pharmaceutical Sciences</institution>
        <addr-line>Okayama</addr-line>
        <country>Japan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Takanobu Hirosawa <email>hirosawa@dokkyomed.ac.jp</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>26</day>
        <month>6</month>
        <year>2024</year>
      </pub-date>
      <volume>8</volume>
      <elocation-id>e59267</elocation-id>
      <history>
        <date date-type="received">
          <day>8</day>
          <month>4</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>24</day>
          <month>4</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>28</day>
          <month>4</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>4</day>
          <month>5</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Takanobu Hirosawa, Yukinori Harada, Kazuya Mizuta, Tetsu Sakamoto, Kazuki Tokumasu, Taro Shimizu. Originally published in JMIR Formative Research (https://formative.jmir.org), 26.06.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2024/1/e59267" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The potential of artificial intelligence (AI) chatbots, particularly ChatGPT with GPT-4 (OpenAI), in assisting with medical diagnosis is an emerging research area. However, it is not yet clear how well AI chatbots can evaluate whether the final diagnosis is included in differential diagnosis lists.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to assess the capability of GPT-4 in identifying the final diagnosis from differential-diagnosis lists and to compare its performance with that of physicians for case report series.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used a database of differential-diagnosis lists from case reports in the <italic>American Journal of Case Reports</italic>, corresponding to final diagnoses. These lists were generated by 3 AI systems: GPT-4, Google Bard (currently Google Gemini), and Large Language Models by Meta AI 2 (LLaMA2). The primary outcome was focused on whether GPT-4’s evaluations identified the final diagnosis within these lists. None of these AIs received additional medical training or reinforcement. For comparison, 2 independent physicians also evaluated the lists, with any inconsistencies resolved by another physician.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The 3 AIs generated a total of 1176 differential diagnosis lists from 392 case descriptions. GPT-4’s evaluations concurred with those of the physicians in 966 out of 1176 lists (82.1%). The Cohen κ coefficient was 0.63 (95% CI 0.56-0.69), indicating a fair to good agreement between GPT-4 and the physicians’ evaluations.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>GPT-4 demonstrated a fair to good agreement in identifying the final diagnosis from differential-diagnosis lists, comparable to physicians for case report series. Its ability to compare differential diagnosis lists with final diagnoses suggests its potential to aid clinical decision-making support through diagnostic feedback. While GPT-4 showed a fair to good agreement for evaluation, its application in real-world scenarios and further validation in diverse clinical environments are essential to fully understand its utility in the diagnostic process.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>decision support system</kwd>
        <kwd>diagnostic errors</kwd>
        <kwd>diagnostic excellence</kwd>
        <kwd>diagnosis</kwd>
        <kwd>large language model</kwd>
        <kwd>LLM</kwd>
        <kwd>natural language processing</kwd>
        <kwd>GPT-4</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>diagnoses</kwd>
        <kwd>physicians</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
        <kwd>chatbots</kwd>
        <kwd>medical diagnosis</kwd>
        <kwd>assessment</kwd>
        <kwd>decision-making support</kwd>
        <kwd>application</kwd>
        <kwd>applications</kwd>
        <kwd>app</kwd>
        <kwd>apps</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Diagnostic Error and Feedback</title>
        <p>A well-developed diagnostic process is fundamental to medicine. Diagnostic errors [<xref ref-type="bibr" rid="ref1">1</xref>], which include missed, incorrect, or delayed diagnoses [<xref ref-type="bibr" rid="ref2">2</xref>], result in severe misdiagnosis-related harm, affecting up to 795,000 patients annually in the United States [<xref ref-type="bibr" rid="ref3">3</xref>]. These errors often stem from a failure to correctly identify an underlying condition [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Enhancing the diagnostic process is crucial, with diagnostic feedback playing a key role [<xref ref-type="bibr" rid="ref6">6</xref>]. The feedback enables physicians to assess their diagnostic accuracy and adjust their subsequent clinical decisions accordingly [<xref ref-type="bibr" rid="ref7">7</xref>]. Common diagnostic feedback methods include self-reflection [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], peer review [<xref ref-type="bibr" rid="ref1">1</xref>], and clinical decision support systems (CDSSs), which aim to enhance decision-making at the point of care [<xref ref-type="bibr" rid="ref10">10</xref>]. Unlike the retrospective nature of self and peer review processes, feedback from CDSSs is provided in real-time [<xref ref-type="bibr" rid="ref11">11</xref>], offering immediate support and guidance during the diagnostic process. This timely feedback is particularly advantageous in fast-paced clinical settings where timely decision-making is critical.</p>
      </sec>
      <sec>
        <title>CDSSs and Artificial Intelligence</title>
        <p>CDSSs are categorized into 2 main types: knowledge-based and nonknowledge-based systems [<xref ref-type="bibr" rid="ref10">10</xref>]. Knowledge-based CDSSs rely on established medical knowledge including clinical guidelines, expert protocols, and information on drug interactions. In contrast, nonknowledge-based systems, particularly those using artificial intelligence (AI), leverage advanced algorithms, machine learning, and statistical pattern recognition. Unlike their rule-based counterparts, these systems adapt over time, continuously refining their insights and recommendations. The rapid integration of AI into CDSSs highlights the growing importance of advanced technologies in health care [<xref ref-type="bibr" rid="ref12">12</xref>]. In recent years, generative AI through large language models (LLMs) has been reshaping health care, offering improvements in diagnostic accuracy, treatment planning, and patient care [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. AI systems, emulating human cognition, continuously learn from new data [<xref ref-type="bibr" rid="ref15">15</xref>]. They assist health care professionals by analyzing complex patient data, thereby enhancing clinical decision-making and patient outcomes [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
      </sec>
      <sec>
        <title>Growing Importance of Generative AI</title>
        <p>In this context of rapidly integrating AI into CDSSs, generative AIs have marked a new era in digital health. LLMs are advanced AI algorithms trained on extensive textual data, enabling them to process and generate human-like text, thereby providing valuable insights to medical diagnostics. Several generative AI tools are now available to the public, including Bard (currently Gemini) by Google [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], LLM Meta AI 2 (LLaMA2) by Meta AI [<xref ref-type="bibr" rid="ref18">18</xref>], and ChatGPT, developed by OpenAI [<xref ref-type="bibr" rid="ref19">19</xref>]. These AI tools, which use LLMs, have successfully passed national medical licensing exams without specific training or reinforcement [<xref ref-type="bibr" rid="ref20">20</xref>], demonstrating their potential in medical diagnostics. Among these, ChatGPT stands out as one of the most extensively researched generative AI applications in health care [<xref ref-type="bibr" rid="ref21">21</xref>]. Specifically, in diagnostics, a recent study has shown that these generative AI systems, particularly ChatGPT with GPT-4, demonstrate excellent diagnostic capability when answering clinical vignette questions [<xref ref-type="bibr" rid="ref22">22</xref>]. Additionally, other studies, including our own, have assessed AI systems’ performance in one aspect of the diagnostic process, generating differential diagnosis lists [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. While broader studies compare a variety of state-of-the-art models, our analysis focuses on the distinct capabilities and impacts of these specific tools within medical diagnostics.</p>
      </sec>
      <sec>
        <title>Generative AI Systems in the Diagnostic Process</title>
        <p>The diagnostic process involves collecting clinical information, forming a differential diagnosis, and refining it through continuous feedback [<xref ref-type="bibr" rid="ref26">26</xref>]. This feedback consists of patient outcomes, test results, and final diagnoses [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. Similar to traditional CDSSs, generative AI systems can enhance this feedback loop [<xref ref-type="bibr" rid="ref29">29</xref>]. However, a gap previously existed in the systematic comparison of differential diagnoses with final diagnoses through a feedback loop [<xref ref-type="bibr" rid="ref27">27</xref>]. Given this background, it remains less explored how effectively these AI systems integrate their feedback into clinical workflow. To address this gap, exploring how generative AI systems provide feedback by comparing final diagnoses with differential-diagnosis lists represents a straightforward and viable first step. This study used differential diagnosis lists to assess diagnostic accuracy. This approach was chosen to mimic a key aspect of the clinical decision-making process, where physicians often narrow down a broad list of potential diagnoses to determine the most likely one. This method reflects a critical use case for AI in health care, potentially speeding up and refining diagnostic accuracy. In our previous short communication, we reported that the fourth generation ChatGPT (GPT-4) showed very good agreement with physicians in evaluating the lists for a limited number of case reports published from our General Internal Medicine (GIM) department [<xref ref-type="bibr" rid="ref30">30</xref>]. Building on this research, this study focused on assessing the capability of GPT-4 in identifying the final diagnosis from differential-diagnosis lists for comprehensive case report series, compared with those of physicians. Furthermore, this research aimed to demonstrate the role of generative AI, particularly GPT-4, in enhancing the diagnostic learning cycle through effective feedback mechanisms.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>We conducted an experimental study using GPT-4 and the differential-diagnosis lists generated by 3 AI systems from inputted case descriptions. The research was conducted at the Department of Diagnostic and Generalist Medicine (GIM), Dokkyo Medical University, Tochigi, Japan. Our research methodology encompassed preparing a data set for differential-diagnosis lists and the corresponding final diagnoses, assessing these lists using GPT-4, and having physicians evaluate the lists. <xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates this study flow.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Study flowchart of inclusion of case reports, generation of differential-diagnosis lists, and evaluation of the lists. LLaMA2: LLM Meta AI 2.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e59267_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Since we used a database extracted from published case reports, obtaining ethical approval was not applicable.</p>
      </sec>
      <sec>
        <title>Database of Differential-Diagnosis Lists and Final Diagnoses</title>
        <p>We used our data set from a previous study (TH, YH, KM, T Sakamoto, KT, T Shimizu. Diagnostic performance of generative artificial intelligences for a series of complex case reports. unpublished data, November 2023). From the PubMed search, we identified a total of 557 case reports. We excluded the nondiagnosed cases (130 cases) and the pediatric cases, aged younger than 10 years (35 cases). The exclusion criteria were based on the previous research for CDSS [<xref ref-type="bibr" rid="ref31">31</xref>]. After the exclusion, we included 392 case reports. The case reports were refined into case descriptions to focus on the diagnosis. The authors typically defined the final diagnoses. Given the case descriptions and a systematic prompt as input, 3 generative AI systems—GPT-4, Google Bard (currently Google Gemini), and LLaMA2 chatbot—generated the top 10 differential-diagnosis lists. The AI systems used were not trained for any additional medical use or reinforced. The main investigator (TH) conducted the entire process, with validation provided by another investigator (YH). Through this process, this data set included differential diagnosis lists corresponding to case descriptions and final diagnoses from case reports in the <italic>American Journal of Case Reports</italic>. Detailed lists of differential diagnoses and their final diagnoses are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>GPT-4 Assessment of the Differential-Diagnosis Lists</title>
        <p>In selecting the generative AI systems for evaluation, we focused on GPT-4 due to its distinct architectural frameworks and widespread use in the field of health care research. GPT-4, developed by OpenAI, is notable for its advanced natural language processing capabilities and extensive training data set, making it particularly relevant for health care [<xref ref-type="bibr" rid="ref32">32</xref>]. We used the August 3 version and September 25 version of GPT-4 to evaluate differential diagnosis lists. The access date was from September 11, 2023, to October 6, 2023. A structured prompt was crafted to ascertain whether GPT-4 could identify the final diagnosis within a list and its position if present. The prompt required direct copying and pasting of the final diagnoses and differential diagnosis lists from our data set. We assessed the inclusion of the final diagnosis in the list (Yes=1, No=0) and its position. The prompt was selected through a preliminary investigation. To ensure unbiased output, each session was isolated by deactivating chat history and training controls and restarting GPT-4 before every new evaluation. We obtained a single output from GPT-4 for each differential diagnosis list. The details of this structured prompt in this study are expounded in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
      </sec>
      <sec>
        <title>Physician Assessment of the Differential-Diagnosis Lists</title>
        <p>For comparison, 2 independent physicians (KM and T Sakamoto) also evaluated the differential diagnosis lists. The presence of the final diagnosis within the differential diagnosis lists was marked with a 1 or 0. A “1” was marked when the lists precisely and acceptably identified the final diagnosis [<xref ref-type="bibr" rid="ref33">33</xref>], further ranking it from 1 to 10 based on its placement. A “0” indicated its absence. Discrepancies between the evaluations of the 2 physicians were resolved by another physician (KT). Notably, the physicians were blinded to which AI generated the lists they assessed. We selected 3 independent physicians, specializing in GIM. Selection was based on expertise in diagnostic processes and familiarity with AI technologies in health care. All physicians underwent a brief guidance session to familiarize themselves with the evaluation criteria and objectives of the study to ensure consistent assessment standards.</p>
      </sec>
      <sec>
        <title>Outcome</title>
        <p>The primary outcome was defined as the κ coefficient for interrater agreement between GPT-4 and the physicians’ evaluations for the differential-diagnosis lists generated by 3 AI systems including GPT-4, Google Bard (currently Google Gemini), and LLaMA2 chatbot. The secondary outcomes were defined as the κ coefficients for interrater agreement between GPT-4 and the physicians’ evaluations for the differential diagnosis lists generated by each AI system. Additionally, another secondary outcome was defined as the ranking patterns between GPT-4’s evaluation and that of physicians.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>Analytical procedures were conducted using R (version 4.2.2; The R Foundation for Statistical Computing). The agreement between different evaluations was quantified using the Cohen κ coefficient through the irr package in R. Agreement strength was categorized as per Cohen κ benchmarks: values under 0.40 indicated poor agreement; values between 0.41 and 0.75 showed fair to good agreement; and values ranging from 0.75 to 1.00 denoted very good agreement [<xref ref-type="bibr" rid="ref34">34</xref>]. The 95% CIs were used to quantify uncertainty. Additionally, we compared ranking patterns between GPT-4’s evaluation and that of physicians [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overall Evaluation</title>
        <p>This study involved 3 generative AI systems—GPT-4, Google Bard (currently Google Gemini), and LLaMA2 chatbot—outputting differential-diagnosis lists for 392 case descriptions, resulting in a total of 1176 lists. In 825 lists where physicians included a final diagnosis, GPT-4 matched 636 lists and did not match 189 lists. Conversely, in 351 lists where physicians did not include a final diagnosis, GPT-4 matched 330 lists and did not match 21 lists. In total, GPT-4’s evaluations matched the physicians’ evaluations in 966 out of 1176 lists (82.1%). The Cohen κ coefficient was 0.63 (95% CI 0.56-0.69), indicating a fair to good agreement between GPT-4 and the physicians’ evaluations. GPT-4 omitted the final diagnosis in 16.1% (n=189) of cases, contrasting with physicians’ evaluations that included these diagnoses. <xref ref-type="table" rid="table1">Table 1</xref> shows the extent to which GPT-4’s evaluations concurred with the physicians’ evaluations. <xref ref-type="table" rid="table2">Table 2</xref> details the κ coefficient for interrater agreement between GPT-4 and the physicians’ evaluations. The representative input used in GPT-4’s evaluations is illustrated in <xref rid="figure2" ref-type="fig">Figure 2</xref>, and the corresponding output is shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>. The resulting data set is shown in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>GPT-4’s evaluations concurred with the physicians’ evaluations.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="120"/>
            <col width="140"/>
            <col width="340"/>
            <thead>
              <tr valign="top">
                <td>Variables</td>
                <td colspan="2">GPT-4</td>
                <td>Total (N=1176)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Matched</td>
                <td>Did not match</td>
                <td>
                  <break/>
                </td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Inclusion of final diagnosis</td>
                <td>636</td>
                <td>189</td>
                <td>825</td>
              </tr>
              <tr valign="top">
                <td>Noninclusion of final diagnosis</td>
                <td>330</td>
                <td>21</td>
                <td>351</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>κ coefficient for interrater agreement between GPT-4 and the physicians’ evaluations for the differential diagnosis lists.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="230"/>
            <col width="220"/>
            <col width="300"/>
            <thead>
              <tr valign="top">
                <td>Differential-diagnosis lists generator</td>
                <td>Cohen κ coefficient (95% CI)</td>
                <td>Strength of agreement [<xref ref-type="bibr" rid="ref34">34</xref>]</td>
                <td>Number of differential-diagnosis lists</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>All</td>
                <td>0.63 (0.56-0.69)</td>
                <td>Fair to good</td>
                <td>1176</td>
              </tr>
              <tr valign="top">
                <td>GPT-4</td>
                <td>0.47 (0.39-0.56)</td>
                <td>Fair to good</td>
                <td>392</td>
              </tr>
              <tr valign="top">
                <td>Google Bard<sup>a</sup></td>
                <td>0.67 (0.52-0.73)</td>
                <td>Fair to good</td>
                <td>392</td>
              </tr>
              <tr valign="top">
                <td>LLaMA2 chatbot<sup>b</sup></td>
                <td>0.63 (0.52-0.73)</td>
                <td>Fair to good</td>
                <td>392</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Currently Google Gemini.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>LLaMA2: LLM Meta AI 2.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The representative input generated for GPT-4 to evaluate whether the final diagnosis was included in the differential diagnosis list.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e59267_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>From the input (Figure 2), GPT-4 generated the representative output of its evaluation.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e59267_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Evaluation of Each Generative AI</title>
        <p>The κ coefficients for differential-diagnosis lists generated by GPT-4, Google Bard (currently Google Gemini), and LLaMA2 chatbot were 0.47 (95% CI 0.39-0.56), 0.67 (95% CI 0.52-0.73), and 0.63 (95% CI 0.52-0.73), respectively. All κ coefficients indicated a fair to good agreement between GPT-4 and the physicians’ evaluations.</p>
      </sec>
      <sec>
        <title>Comparison of Ranking Patterns Between GPT-4 and Physicians</title>
        <p>Both GPT-4’s evaluation and that of physicians showed a general trend of decreasing frequency as the rank increases. <xref rid="figure4" ref-type="fig">Figure 4</xref> shows the comparisons of ranking patterns between GPT-4 and physicians.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Comparison of ranking patterns in evaluations by GPT-4 and physicians.</p>
          </caption>
          <graphic xlink:href="formative_v8i1e59267_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Evaluation Between Physicians</title>
        <p>Physicians’ evaluations (KM and T Sakamoto) for the differential diagnosis lists showed very good agreement, with concordance in 88.8% (n=1044) of cases. The κ coefficient was 0.75 (95% CI 0.46-0.99).</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Results</title>
        <p>This experimental study highlights several key findings. First, GPT-4’s evaluations matched those of physicians in more than 82% (n/N=966/1176) of the cases, demonstrating fair to good agreement according to κ coefficient values. These results imply that GPT-4’s accuracy in identifying the final diagnosis within differential-diagnosis lists is comparable to that of physicians. Unlike traditional CDSSs, generative AI systems, including GPT-4, are capable of performing multiple roles in the diagnostic process including formulating and assessing differential diagnoses. These capabilities highlight GPT-4’s potential to streamline diagnostics in clinical settings by expediting diagnostic feedback [<xref ref-type="bibr" rid="ref36">36</xref>]. Our study design focuses on GPT-4’s ability to refine and validate pre-existing diagnostic considerations as supplementary tools for medical diagnostics. This scenario is akin to real-world clinical settings where generative AI systems could verify and support physicians’ final diagnostic decisions. By assessing the AI’s accuracy in this context, we can better understand its potential role and limitations in practical medical applications. Furthermore, in medical education, generative AI tools, like GPT-4, can offer students valuable self-learning opportunities. They provide timely feedback in the form of final diagnoses [<xref ref-type="bibr" rid="ref37">37</xref>], enabling students to cross-reference with reliable sources for verification [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
        <p>Second, GPT-4 failed to identify the final diagnosis in 16% (n/N=189/1176) of differential-diagnosis lists, even though these diagnoses were recognized by the evaluating physicians. Notably, although the physicians achieved very good agreement with each other, GPT-4 did not reach similar levels of concordance. This discrepancy highlights potential areas for improving the system’s ability to interpret and analyze complex medical data. This discrepancy arises primarily from GPT-4’s reliance on textual patterns and word associations within the provided differential diagnosis lists. Unlike physicians, who use a comprehensive medical knowledge base and clinical experience, generative AI systems like GPT-4 are inherently limited by their reliance on existing data patterns and textual associations. To mitigate these discrepancies, continuous development in generative AI systems for health care is needed. Additionally, future research should focus on enhancing the medical training of these systems. This will enhance the generative AI systems’ diagnostic feedback, making it more adaptable to real clinical settings.</p>
        <p>Third, regarding the rank at which the final diagnosis was found in the differential-diagnosis list, both GPT-4 and physicians exhibited a trend of decreasing frequency as the rank increased. This suggests GPT-4’s diagnosis ranking shows a similar trend to physicians’ diagnosis ranking. Moreover, all 3 generative AI systems, including GPT-4, Google Bard (currently Google Gemini), and LLaMA2 chatbot, prioritized the most likely diagnoses at the top of the list, leading to a natural decrease in frequency as less-probable diagnoses are ranked lower. Therefore, generative AI systems showed the potential not only to generate differential diagnosis lists for clinical cases but also to evaluate these lists as feedback.</p>
        <p>Fourth, an examination of the differential diagnosis lists generated by 3 different AI systems showed overlap in the 95% CIs for the κ coefficients across the 3 AI platforms. One might hypothesize that GPT-4 would exhibit improved performance when evaluating differential-diagnosis lists it generated itself. However, the observed results may stem from the inherent variability in generative AI outputs including GPT-4. This inherent variability underscores the challenge of maintaining a consistent standard of accuracy and reliability in the outputs from generative AI systems. Even when evaluating differential-diagnosis lists generated by itself, GPT-4’s performance did not markedly surpass its performance on lists generated by other AI systems. Additionally, the observed performance differences may be partially due to version inconsistencies. The generation of differential diagnosis lists used an earlier version of GPT-4 (March 24). Subsequent evaluations used later versions (August 3 and September 25). Different versions of generative AI systems can exhibit varied capabilities and outputs, potentially impacting the accuracy and consistency of diagnostic evaluations. This highlights the need for ongoing updates and version alignment in clinical AI applications to maintain reliability.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has several limitations. First, GPT-4’s role was limited to identifying the final diagnosis within the differential diagnosis list. The current binary evaluation method has not been a well-established approach to evaluating diagnostic performance by other CDSSs. Another study used a 5-grade level of accuracy for a varying number of differentials [<xref ref-type="bibr" rid="ref39">39</xref>]. Investigating more complex outcomes, such as quantitative evaluations and additional clinical suggestions, might yield different results. Second, our inputs to GPT-4 consisted only of the final diagnoses and the differential diagnosis list, without the case descriptions that generated these lists. Further research should examine what types of input enhance AI systems’ performance the most. Third, there was a nonnegligible risk associated with generative AI systems, including GPT-4, regarding their capacity to inadvertently learn from and replicate the information contained in publicly available case reports. Fourth, the data set was sourced from a single case reports journal and generated by 3 AI systems. Future research would benefit from using real-world scenarios [<xref ref-type="bibr" rid="ref40">40</xref>]. Expanding the data set to include a more diverse range of AI systems is also advisable.</p>
        <p>Regarding limitations for generative AI systems, like GPT-4, there is currently no approval for their use as CDSSs. Furthermore, GPT-4 operates as a fee-based application, which could potentially limit its accessibility to the wider public. Additionally, the reliability of generative AI systems can vary based on the input data they were trained on. If they are not exposed to diverse clinical scenarios during their training, they may not be as effective in real-world diagnostic situations [<xref ref-type="bibr" rid="ref41">41</xref>]. Moreover, while AI tools can assist, they do not replace the nuanced judgments and decision-making processes of human physicians [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. Additionally, the rapid evolution of AI means that our findings may become outdated as Google Bard and LLaMA2 were updated to the new LLMs, Google Gemini and LLaMA3, respectively [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Finally, overreliance on AI without critical review could lead to diagnostic errors [<xref ref-type="bibr" rid="ref45">45</xref>].</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>In our previous study involving GPT-4 [<xref ref-type="bibr" rid="ref30">30</xref>], we observed a very good agreement with physicians in identifying final diagnoses within the differential-diagnosis lists, achieving a 95.9% agreement rate (236 out of 246 lists; κ=0.86). In contrast, this study demonstrated a fair to good agreement rate of 82.1% (966/1176 lists; κ=0.63). Despite using the same evaluation methods in both studies, the observed decrease in the agreement can be attributed to several factors: the source of case reports (GIM-published vs a broader range of case reports), the generators of differential diagnoses (physicians, GPT-3/GPT-4 vs GPT-4/Google Bard [currently Gemini]/LLaMA2 chatbot), and the volume of lists assessed (246 lists vs 1176 lists).</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>Future studies should explore the potential of integrating GPT-4 and similar AI systems into real-world clinical settings. This could involve developing interfaces that allow these AI systems to interact directly with electronic health records, providing real-time diagnostic feedback to physicians. Additionally, research could focus on tailoring these AI systems for specialized medical fields, where their ability to process vast amounts of data could significantly aid in complex case analysis. Another vital area for future research is the ethical implications of AI in medicine [<xref ref-type="bibr" rid="ref43">43</xref>], particularly in patient data privacy, AI decision transparency, and the impact of AI-assisted diagnostics on physician-patient relationships.</p>
        <p>Furthermore, future research should also investigate the optimal use of AI technologies, including the exploration of both chatbot interfaces and application programming interface functionalities. A more detailed examination of application programming interface settings, such as adjustable parameters including temperature and Top P, could be invaluable. This investigation would provide clearer guidelines on when and how to use different AI tools effectively, considering both scientific evidence and effectiveness.</p>
        <p>Moreover, our future research will focus on refining the evaluation of AI-generated differential diagnoses by incorporating more sophisticated and validated psychometric methods as the next diagnostic step. We propose to adopt methodologies for assessing the quality of differential diagnoses. This approach will allow us not only to compare AI-generated outputs with those from physicians but also to treat it as a form of Turing test—evaluating whether AI can match or surpass human performance in diagnostic tasks without being distinguishable from them [<xref ref-type="bibr" rid="ref46">46</xref>].</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>GPT-4 demonstrated a fair to good agreement in identifying the final diagnosis from differential-diagnosis lists, comparable to physicians for case report series. By reliably identifying diagnoses, GPT-4 can provide on-time feedback by comparing final diagnoses with differential-diagnosis lists. Therefore, this study suggests that generative AI systems have the potential to assist physicians in the diagnostic process by providing reliable and efficient feedback, thereby contributing to improved clinical decision-making and medical education. However, it is imperative to recognize that these findings are based on experimental studies. Real-world scenarios could present unique challenges, and further validations in diverse clinical environments are essential before broad implementation can be recommended.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>The differential-diagnosis lists generated by 3 artificial intelligence systems used in this study and the final diagnoses.</p>
        <media xlink:href="formative_v8i1e59267_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 938 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Structured prompt used in this study.</p>
        <media xlink:href="formative_v8i1e59267_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 44 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Formed data set used in this study.</p>
        <media xlink:href="formative_v8i1e59267_app3.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 149 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CDSS</term>
          <def>
            <p>clinical decision support system</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">GIM</term>
          <def>
            <p>general internal medicine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LLaMA2</term>
          <def>
            <p>LLM Meta AI 2</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This research was funded by the Japan Society for the Promotion of Science (JSPS) KAKENHI (grant 22K10421). This study was conducted using resources from the Department of Diagnostics and Generalist Medicine at Dokkyo Medical University.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>TH, YH, KM, T Sakamoto, KT, and T Shimizu contributed to the study concept and design. TH performed the statistical analyses. TH contributed to the drafting of the manuscript. YH, KM, T Sakamoto, KT, and T Shimizu contributed to the critical revision of the manuscript for relevant intellectual content. All the authors have read and approved the final version of the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Balogh</surname>
              <given-names>EP</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Ball</surname>
              <given-names>JR</given-names>
            </name>
          </person-group>
          <source>Improving Diagnosis in Health Care</source>
          <year>2015</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>National Academies Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Graber</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic errors in medicine: a case of neglect</article-title>
          <source>Jt Comm J Qual Patient Saf</source>
          <year>2005</year>
          <volume>31</volume>
          <issue>2</issue>
          <fpage>106</fpage>
          <lpage>113</lpage>
          <pub-id pub-id-type="doi">10.1016/s1553-7250(05)31015-4</pub-id>
          <pub-id pub-id-type="medline">15791770</pub-id>
          <pub-id pub-id-type="pii">S1553-7250(05)31015-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Newman-Toker</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Nassery</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Schaffer</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Yu-Moe</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Clemens</surname>
              <given-names>GD</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Saber Tehrani</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Fanai</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hassoon</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Siegal</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Burden of serious harms from diagnostic error in the USA</article-title>
          <source>BMJ Qual Saf</source>
          <year>2024</year>
          <volume>33</volume>
          <issue>2</issue>
          <fpage>109</fpage>
          <lpage>120</lpage>
          <pub-id pub-id-type="doi">10.1136/bmjqs-2021-014130</pub-id>
          <pub-id pub-id-type="medline">37460118</pub-id>
          <pub-id pub-id-type="pii">bmjqs-2021-014130</pub-id>
          <pub-id pub-id-type="pmcid">PMC10792094</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Graber</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Franklin</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gordon</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic error in internal medicine</article-title>
          <source>Arch Intern Med</source>
          <year>2005</year>
          <volume>165</volume>
          <issue>13</issue>
          <fpage>1493</fpage>
          <lpage>1499</lpage>
          <pub-id pub-id-type="doi">10.1001/archinte.165.13.1493</pub-id>
          <pub-id pub-id-type="medline">16009864</pub-id>
          <pub-id pub-id-type="pii">165/13/1493</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schiff</surname>
              <given-names>GD</given-names>
            </name>
            <name name-style="western">
              <surname>Hasan</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Abrams</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cosby</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lambert</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Elstein</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Hasler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kabongo</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Krosnjar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Odwazny</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wisniewski</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>McNutt</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic error in medicine: analysis of 583 physician-reported errors</article-title>
          <source>Arch Intern Med</source>
          <year>2009</year>
          <volume>169</volume>
          <issue>20</issue>
          <fpage>1881</fpage>
          <lpage>1887</lpage>
          <pub-id pub-id-type="doi">10.1001/archinternmed.2009.333</pub-id>
          <pub-id pub-id-type="medline">19901140</pub-id>
          <pub-id pub-id-type="pii">169/20/1881</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Connor</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Dhaliwal</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Five strategies for clinicians to advance diagnostic excellence</article-title>
          <source>BMJ</source>
          <year>2022</year>
          <volume>376</volume>
          <fpage>e068044</fpage>
          <pub-id pub-id-type="doi">10.1136/bmj-2021-068044</pub-id>
          <pub-id pub-id-type="medline">35172968</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>AND</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>The path to diagnostic excellence includes feedback to calibrate how clinicians think</article-title>
          <source>JAMA</source>
          <year>2019</year>
          <volume>321</volume>
          <issue>8</issue>
          <fpage>737</fpage>
          <lpage>738</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2019.0113</pub-id>
          <pub-id pub-id-type="medline">30735239</pub-id>
          <pub-id pub-id-type="pii">2724792</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mamede</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>HG</given-names>
            </name>
            <name name-style="western">
              <surname>Penaforte</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>Effects of reflective practice on the accuracy of medical diagnoses</article-title>
          <source>Med Educ</source>
          <year>2008</year>
          <volume>42</volume>
          <issue>5</issue>
          <fpage>468</fpage>
          <lpage>475</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1365-2923.2008.03030.x</pub-id>
          <pub-id pub-id-type="medline">18412886</pub-id>
          <pub-id pub-id-type="pii">MED3030</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mamede</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>HG</given-names>
            </name>
          </person-group>
          <article-title>Reflection in medical diagnosis: a literature review</article-title>
          <source>Health Prof Educ</source>
          <year>2017</year>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>15</fpage>
          <lpage>25</lpage>
          <pub-id pub-id-type="doi">10.1016/j.hpe.2017.01.003</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sutton</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Pincock</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Baumgart</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Sadowski</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Fedorak</surname>
              <given-names>RN</given-names>
            </name>
            <name name-style="western">
              <surname>Kroeker</surname>
              <given-names>KI</given-names>
            </name>
          </person-group>
          <article-title>An overview of clinical decision support systems: benefits, risks, and strategies for success</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <volume>3</volume>
          <fpage>17</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-020-0221-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-020-0221-y</pub-id>
          <pub-id pub-id-type="medline">32047862</pub-id>
          <pub-id pub-id-type="pii">221</pub-id>
          <pub-id pub-id-type="pmcid">PMC7005290</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rubins</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>McCoy</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Dutta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McEvoy</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Patterson</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Zuccotti</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Real-time user feedback to support clinical decision support system improvement</article-title>
          <source>Appl Clin Inform</source>
          <year>2022</year>
          <volume>13</volume>
          <issue>5</issue>
          <fpage>1024</fpage>
          <lpage>1032</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36288748"/>
          </comment>
          <pub-id pub-id-type="doi">10.1055/s-0042-1757923</pub-id>
          <pub-id pub-id-type="medline">36288748</pub-id>
          <pub-id pub-id-type="pmcid">PMC9605820</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haug</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Drazen</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence and machine learning in clinical medicine, 2023</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <volume>388</volume>
          <issue>13</issue>
          <fpage>1201</fpage>
          <lpage>1208</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMra2302038</pub-id>
          <pub-id pub-id-type="medline">36988595</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Utility of ChatGPT in clinical practice</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e48568</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e48568/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48568</pub-id>
          <pub-id pub-id-type="medline">37379067</pub-id>
          <pub-id pub-id-type="pii">v25i1e48568</pub-id>
          <pub-id pub-id-type="pmcid">PMC10365580</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alowais</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Alghamdi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Alsuhebany</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Alqahtani</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Alshaya</surname>
              <given-names>AI</given-names>
            </name>
            <name name-style="western">
              <surname>Almohareb</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Aldairem</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Alrashed</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bin Saleh</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Badreldin</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Al Yami</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Al Harbi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Albekairy</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>Revolutionizing healthcare: the role of artificial intelligence in clinical practice</article-title>
          <source>BMC Med Educ</source>
          <year>2023</year>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>689</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-023-04698-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-023-04698-z</pub-id>
          <pub-id pub-id-type="medline">37740191</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-023-04698-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC10517477</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dennehy</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Conboy</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mikalef</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in information systems research: a systematic literature review and research agenda</article-title>
          <source>Int J Inf Manage</source>
          <year>2021</year>
          <volume>60</volume>
          <fpage>102383</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijinfomgt.2021.102383</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Patrizio</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Google Gemini (formerly Bard)</article-title>
          <source>TechTarget</source>
          <year>2024</year>
          <month>03</month>
          <access-date>2024-06-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.techtarget.com/searchenterpriseai/definition/Google-Bard">https://www.techtarget.com/searchenterpriseai/definition/Google-Bard</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pichai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hassabis</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Introducing Gemini: our largest and most capable AI model</article-title>
          <source>Google</source>
          <year>2023</year>
          <month>12</month>
          <day>06</day>
          <access-date>2024-06-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://blog.google/technology/ai/google-gemini-ai/#sundar-note">https://blog.google/technology/ai/google-gemini-ai/#sundar-note</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Touvron</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stone</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Albert</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Almahairi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Babaei</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Llama 2: open foundation and fine-tuned chat models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online July 18, 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2307.09288</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Achiam</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Adler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 Technical Report</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online March 15, 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gaur</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sai</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chamola</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Guizani</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rodrigues</surname>
              <given-names>JJPC</given-names>
            </name>
          </person-group>
          <article-title>Generative AI for transformative healthcare: a comprehensive study of emerging models, applications, case studies, and limitations</article-title>
          <source>IEEE Access</source>
          <year>2024</year>
          <volume>12</volume>
          <fpage>31078</fpage>
          <lpage>31106</lpage>
          <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3367715</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cascella</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Montomoli</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bellini</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Bignami</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the feasibility of ChatGPT in healthcare: an analysis of multiple clinical and research scenarios</article-title>
          <source>J Med Syst</source>
          <year>2023</year>
          <volume>47</volume>
          <issue>1</issue>
          <fpage>33</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36869927"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10916-023-01925-4</pub-id>
          <pub-id pub-id-type="medline">36869927</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10916-023-01925-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC9985086</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Bressem</surname>
              <given-names>KK</given-names>
            </name>
            <name name-style="western">
              <surname>Busch</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Nebelung</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Truhn</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Comparative analysis of multimodal large language model performance on clinical vignette questions</article-title>
          <source>JAMA</source>
          <year>2024</year>
          <volume>331</volume>
          <issue>15</issue>
          <fpage>1320</fpage>
          <lpage>1321</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2023.27861</pub-id>
          <pub-id pub-id-type="medline">38497956</pub-id>
          <pub-id pub-id-type="pii">2816270</pub-id>
          <pub-id pub-id-type="pmcid">PMC10949144</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirosawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kawamura</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Harada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mizuta</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tokumasu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kaji</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Suzuki</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT-generated differential diagnosis lists for complex case-derived clinical vignettes: diagnostic accuracy evaluation</article-title>
          <source>JMIR Med Inform</source>
          <year>2023</year>
          <volume>11</volume>
          <fpage>e48808</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2023/1/e48808/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48808</pub-id>
          <pub-id pub-id-type="medline">37812468</pub-id>
          <pub-id pub-id-type="pii">v11i1e48808</pub-id>
          <pub-id pub-id-type="pmcid">PMC10594139</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirosawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mizuta</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Harada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Comparative evaluation of diagnostic accuracy between Google Bard and physicians</article-title>
          <source>Am J Med</source>
          <year>2023</year>
          <month>11</month>
          <volume>136</volume>
          <issue>11</issue>
          <fpage>1119</fpage>
          <lpage>1123.e18</lpage>
          <pub-id pub-id-type="doi">10.1016/j.amjmed.2023.08.003</pub-id>
          <pub-id pub-id-type="medline">37643659</pub-id>
          <pub-id pub-id-type="pii">S0002-9343(23)00536-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kanjee</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Crowe</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rodman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title>
          <source>JAMA</source>
          <year>2023</year>
          <volume>330</volume>
          <issue>1</issue>
          <fpage>78</fpage>
          <lpage>80</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id>
          <pub-id pub-id-type="medline">37318797</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Price</surname>
              <given-names>RB</given-names>
            </name>
            <name name-style="western">
              <surname>Vlahcevic</surname>
              <given-names>ZR</given-names>
            </name>
          </person-group>
          <article-title>Logical principles in differential diagnosis</article-title>
          <source>Ann Intern Med</source>
          <year>1971</year>
          <volume>75</volume>
          <issue>1</issue>
          <fpage>89</fpage>
          <lpage>95</lpage>
          <pub-id pub-id-type="doi">10.7326/0003-4819-75-1-89</pub-id>
          <pub-id pub-id-type="medline">4933297</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fernandez Branson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Graber</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Lane</surname>
              <given-names>KP</given-names>
            </name>
            <name name-style="western">
              <surname>Grieser</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Landis-Lewis</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cooke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Upadhyay</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Mondoux</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zwaan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Olson</surname>
              <given-names>APJ</given-names>
            </name>
          </person-group>
          <article-title>Improving diagnostic performance through feedback: the diagnosis learning cycle</article-title>
          <source>BMJ Qual Saf</source>
          <year>2021</year>
          <volume>30</volume>
          <issue>12</issue>
          <fpage>1002</fpage>
          <lpage>1009</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://qualitysafety.bmj.com/lookup/pmidlookup?view=long&#38;pmid=34417335"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjqs-2020-012456</pub-id>
          <pub-id pub-id-type="medline">34417335</pub-id>
          <pub-id pub-id-type="pii">bmjqs-2020-012456</pub-id>
          <pub-id pub-id-type="pmcid">PMC8606468</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rosner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zwaan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Olson</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Imagining the future of diagnostic performance feedback</article-title>
          <source>Diagnosis (Berl)</source>
          <year>2023</year>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>31</fpage>
          <lpage>37</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.degruyter.com/document/doi/10.1515/dx-2022-0055"/>
          </comment>
          <pub-id pub-id-type="doi">10.1515/dx-2022-0055</pub-id>
          <pub-id pub-id-type="medline">36378520</pub-id>
          <pub-id pub-id-type="pii">dx-2022-0055</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Filiberto</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Leeds</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Loftus</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Editorial: machine learning in clinical decision-making</article-title>
          <source>Front Digit Health</source>
          <year>2021</year>
          <volume>3</volume>
          <fpage>784495</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34870273"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fdgth.2021.784495</pub-id>
          <pub-id pub-id-type="medline">34870273</pub-id>
          <pub-id pub-id-type="pmcid">PMC8636718</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mizuta</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hirosawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Harada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT-4 evaluate whether a differential diagnosis list contains the correct diagnosis as accurately as a physician?</article-title>
          <source>Diagnosis (Berl)</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.1515/dx-2024-0027</pub-id>
          <pub-id pub-id-type="medline">38465399</pub-id>
          <pub-id pub-id-type="pii">dx-2024-0027</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Graber</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Mathew</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Performance of a web-based clinical diagnosis support system for internists</article-title>
          <source>J Gen Intern Med</source>
          <year>2008</year>
          <volume>23</volume>
          <issue>S1</issue>
          <fpage>37</fpage>
          <lpage>40</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1007/s11606-007-0271-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-007-0271-8</pub-id>
          <pub-id pub-id-type="medline">18095042</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nori</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>McKinney</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Carignan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Horvitz</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Capabilities of GPT-4 on medical challenge problems</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online March 20, 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.13375</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krupat</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wormwood</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartzstein</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Richards</surname>
              <given-names>JB</given-names>
            </name>
          </person-group>
          <article-title>Avoiding premature closure and reaching diagnostic accuracy: some key predictive factors</article-title>
          <source>Med Educ</source>
          <year>2017</year>
          <month>11</month>
          <volume>51</volume>
          <issue>11</issue>
          <fpage>1127</fpage>
          <lpage>1137</lpage>
          <pub-id pub-id-type="doi">10.1111/medu.13382</pub-id>
          <pub-id pub-id-type="medline">28857266</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fleiss</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Levin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Paik</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Statistical Methods for Rates and Proportions</source>
          <year>2003</year>
          <publisher-loc>Hoboken, NJ</publisher-loc>
          <publisher-name>John Wiley &#38; Sons</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Webber</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Moffat</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zobel</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A similarity measure for indefinite rankings</article-title>
          <source>ACM Trans Inf Syst</source>
          <year>2010</year>
          <month>11</month>
          <day>23</day>
          <volume>28</volume>
          <issue>4</issue>
          <fpage>1</fpage>
          <lpage>38</lpage>
          <pub-id pub-id-type="doi">10.1145/1852102.1852106</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hattie</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Timperley</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>The power of feedback</article-title>
          <source>Rev Educ Res</source>
          <year>2007</year>
          <volume>77</volume>
          <issue>1</issue>
          <fpage>81</fpage>
          <lpage>112</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/abs/10.3102/003465430298487"/>
          </comment>
          <pub-id pub-id-type="doi">10.3102/003465430298487</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chamberland</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Setrakian</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>St-Onge</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bergeron</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mamede</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>HG</given-names>
            </name>
          </person-group>
          <article-title>Does providing the correct diagnosis as feedback after self-explanation improve medical students diagnostic performance?</article-title>
          <source>BMC Med Educ</source>
          <year>2019</year>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>194</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-019-1638-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-019-1638-3</pub-id>
          <pub-id pub-id-type="medline">31185971</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-019-1638-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC6558772</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abd-Alrazaq</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>AlSaad</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Alhuwail</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Healy</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Latifi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Aziz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Damseh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Alabed Alrazak</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sheikh</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Large language models in medical education: opportunities, challenges, and future directions</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e48291</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48291/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48291</pub-id>
          <pub-id pub-id-type="medline">37261894</pub-id>
          <pub-id pub-id-type="pii">v9i1e48291</pub-id>
          <pub-id pub-id-type="pmcid">PMC10273039</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bond</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Weaver</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Levick</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Giuliano</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Graber</surname>
              <given-names>ML</given-names>
            </name>
          </person-group>
          <article-title>Differential diagnosis generators: an evaluation of currently available computer programs</article-title>
          <source>J Gen Intern Med</source>
          <year>2012</year>
          <volume>27</volume>
          <issue>2</issue>
          <fpage>213</fpage>
          <lpage>219</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/21789717"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-011-1804-8</pub-id>
          <pub-id pub-id-type="medline">21789717</pub-id>
          <pub-id pub-id-type="pmcid">PMC3270234</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Painter</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hayhoe</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Riboli-Sasco</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>El-Osta</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Online symptom checkers: recommendations for a vignette-based clinical evaluation standard</article-title>
          <source>J Med Internet Res</source>
          <year>2022</year>
          <volume>24</volume>
          <issue>10</issue>
          <fpage>e37408</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2022/10/e37408/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/37408</pub-id>
          <pub-id pub-id-type="medline">36287594</pub-id>
          <pub-id pub-id-type="pii">v24i10e37408</pub-id>
          <pub-id pub-id-type="pmcid">PMC9647454</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wiens</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Saria</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sendak</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>VX</given-names>
            </name>
            <name name-style="western">
              <surname>Doshi-Velez</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Heller</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kale</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Saeed</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ossorio</surname>
              <given-names>PN</given-names>
            </name>
            <name name-style="western">
              <surname>Thadaney-Israni</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Goldenberg</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Do no harm: a roadmap for responsible machine learning for health care</article-title>
          <source>Nat Med</source>
          <year>2019</year>
          <volume>25</volume>
          <issue>9</issue>
          <fpage>1337</fpage>
          <lpage>1340</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-019-0548-6</pub-id>
          <pub-id pub-id-type="medline">31427808</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-019-0548-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Karches</surname>
              <given-names>KE</given-names>
            </name>
          </person-group>
          <article-title>Against the iDoctor: why artificial intelligence should not replace physician judgment</article-title>
          <source>Theor Med Bioeth</source>
          <year>2018</year>
          <volume>39</volume>
          <issue>2</issue>
          <fpage>91</fpage>
          <lpage>110</lpage>
          <pub-id pub-id-type="doi">10.1007/s11017-018-9442-3</pub-id>
          <pub-id pub-id-type="medline">29992371</pub-id>
          <pub-id pub-id-type="pii">10.1007/s11017-018-9442-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
          <article-title>Ethics and governance of artificial intelligence for health: WHO guidance</article-title>
          <source>World Health Organization</source>
          <year>2021</year>
          <access-date>2024-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.who.int/publications/i/item/9789240029200">https://www.who.int/publications/i/item/9789240029200</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="web">
          <article-title>Build the future of AI with Meta Llama 3</article-title>
          <source>Meta AI</source>
          <year>2024</year>
          <access-date>2024-05-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://llama.meta.com/llama3/">https://llama.meta.com/llama3/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Passi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Vorvoreanu</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Overreliance on AI literature review</article-title>
          <source>Microsoft</source>
          <year>2022</year>
          <access-date>2024-06-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.microsoft.com/en-us/research/publication/overreliance-on-ai-literature-review/">https://www.microsoft.com/en-us/research/publication/overreliance-on-ai-literature-review/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Saygin</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Cicekli</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Akman</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Turing test: 50 years later</article-title>
          <source>Minds Mach</source>
          <year>2000</year>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>463</fpage>
          <lpage>518</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1023/A:1011288000451"/>
          </comment>
          <pub-id pub-id-type="doi">10.1023/A:1011288000451</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
