<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v10i1e90673</article-id>
      <article-id pub-id-type="pmid">41941721</article-id>
      <article-id pub-id-type="doi">10.2196/90673</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Performance of DeepSeek V3, DeepSeek R1, ChatGPT 4o, and ChatGPT o1 on the National Health Professional and Technical Qualification Examination (Intermediate Level) in China: Comparative Analysis</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Xin</surname>
            <given-names>Kai</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Yazici</surname>
            <given-names>Ramiz</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Xue</surname>
            <given-names>Jipeng</given-names>
          </name>
          <degrees>BM</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-8802-2455</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Shitong</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-3568-4049</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Jinan</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-4317-7268</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Guo</surname>
            <given-names>Xiaogang</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6754-1432</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Shen</surname>
            <given-names>Jie</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8116-0473</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Qiwen</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>First Affiliated Hospital Zhejiang University</institution>
            <addr-line>Qingchun Road</addr-line>
            <addr-line>Shangcheng District</addr-line>
            <addr-line>Hangzhou, Zhejiang, 310000</addr-line>
            <country>China</country>
            <phone>86 13732250743</phone>
            <email>wangqiwen@zju.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4255-3960</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>First Affiliated Hospital Zhejiang University</institution>
        <addr-line>Hangzhou, Zhejiang</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Hangzhou City University</institution>
        <addr-line>Hangzhou, Zhejiang</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Qiwen Wang <email>wangqiwen@zju.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>6</day>
        <month>4</month>
        <year>2026</year>
      </pub-date>
      <volume>10</volume>
      <elocation-id>e90673</elocation-id>
      <history>
        <date date-type="received">
          <day>1</day>
          <month>1</month>
          <year>2026</year>
        </date>
        <date date-type="rev-request">
          <day>19</day>
          <month>1</month>
          <year>2026</year>
        </date>
        <date date-type="rev-recd">
          <day>15</day>
          <month>3</month>
          <year>2026</year>
        </date>
        <date date-type="accepted">
          <day>17</day>
          <month>3</month>
          <year>2026</year>
        </date>
      </history>
      <copyright-statement>©Jipeng Xue, Shitong Wang, Jinan Yang, Xiaogang Guo, Jie Shen, Qiwen Wang. Originally published in JMIR Formative Research (https://formative.jmir.org), 06.04.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2026/1/e90673" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>In recent years, large language models (LLMs) have undergone swift cycles of refinement and iteration. However, in the realm of clinical medicine, different LLMs' capability of logical reasoning and disease diagnosis needs further investigation.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The aim of our study was to evaluate the performance of 4 different LLMs in the National Health Professional and Technical Qualification Examination in China.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>A total of 398 multiple-choice questions of 5 different question types were integrated within the examination with respect to the diagnosis or care of cases. These questions were categorized into different cardiology subspecialties and different clinical disciplines. DeepSeek V3 and R1 were accessed through an application programming interface, while ChatGPT 4o and o1 were queried via their public chat-based interface. We offered the same prompts instructing LLMs to assume the role of a physician and provide answers with explanations at the beginning of each conversation. We assessed different LLMs’ performance by the accuracy in the responses to the multiple-choice questions. For the first 3 examination sections, the McNemar test was used to compare the accuracy among the models, with post hoc pairwise comparisons performed using the partitions of chi-square method and Bonferroni correction (significance set at <italic>P</italic>&#60;.008). For the fourth section involving partial credit scoring, one-way ANOVA was performed to compare the mean scores among the models, with statistical significance set at <italic>P</italic>&#60;.05.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Both DeepSeek V3 and R1 showed superior performance in the first 3 sections of the Chinese National Health Professional and Technical Qualification Examination, achieving an overall performance of 93% and 93.6%, respectively. ChatGPT 4o and o1 achieved accuracies of 73.3% and 69%, respectively (all <italic>P</italic>&#60;.001 compared with DeepSeek V3). For the fourth section, the performance of all 4 LLMs markedly declined compared to their results in the preceding sections. Particularly, in the section of gastroenterology and hematology, DeepSeek V3 achieved the highest accuracy, while R1 ranked first in cardiology and neurology. ChatGPT o1 achieved the highest accuracy in the topic of coronary artery disease, with no statistical significance.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>DeepSeek V3 and R1 showed remarkable potential in facilitating clinical decision-making in the Chinese professional examination, with both outperforming ChatGPT 4o and o1. Nonetheless, future research should continue evaluating their economic efficiency and susceptibility to hallucination.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language models</kwd>
        <kwd>DeepSeek</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>support clinical decision-making</kwd>
        <kwd>cardiology</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>In recent years, large language models (LLMs) such as Anthropic 2024, Google Gemini 2024 [<xref ref-type="bibr" rid="ref1">1</xref>], and OpenAI 2024 [<xref ref-type="bibr" rid="ref2">2</xref>] have undergone swift refinement and iteration. Since its launch by OpenAI, ChatGPT has held significant potential across various facets of the medical field, including medical documentation, scientific writing, and medical education [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Numerous studies have demonstrated its potential applications in health care, particularly in cardiology, due to its ability to advance the management of long-term heart conditions [<xref ref-type="bibr" rid="ref6">6</xref>], provide medical advice on acute cardiac events [<xref ref-type="bibr" rid="ref7">7</xref>], answer clinical cardiac questions [<xref ref-type="bibr" rid="ref8">8</xref>], interpret cardiac diagnostic tests [<xref ref-type="bibr" rid="ref9">9</xref>], and design an individualized therapeutic strategy [<xref ref-type="bibr" rid="ref10">10</xref>]. Specifically, Wang et al [<xref ref-type="bibr" rid="ref11">11</xref>] revealed that ChatGPT was proficient in specific medical tasks such as discharge summarization and group learning within the Chinese linguistic paradigm. Another investigation showed that both ChatGPT-3.5 and GPT-4 can successfully achieve average scores that exceed the admission benchmark on the master’s degree entrance examination in clinical medicine [<xref ref-type="bibr" rid="ref12">12</xref>], with accuracies of only 48% and 68%, respectively. However, Sarangi et al [<xref ref-type="bibr" rid="ref13">13</xref>] illustrated that ChatGPT-4 has limitations in processing radiology anatomy. While these studies suggest that ChatGPT may have potential proficiency in logical reasoning and disease diagnosis, considering its financial cost and underperformance on image-based questions, its performance warrants further evaluation.</p>
      <p>In addition to proprietary systems, open-source frameworks are achieving substantial breakthroughs in capability development, actively narrowing the performance divide with their closed-source counterparts such as the newly published DeepSeek MoE. Launched in January 2025, DeepSeek’s DeepThink (R1), an open-source LLM [<xref ref-type="bibr" rid="ref14">14</xref>], is different from proprietary models as it fosters a sustained learning environment by integrating publicly accessible open-source datasets, which may in turn improve its ability to adapt to the continuously evolving domains of medical expertise and scientific analysis [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Moreover, compared to proprietary LLMs, DeepSeek R1 offered free-tier access and reduced financial costs, making artificial intelligence more accessible for smaller institutions [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. In terms of performance on mathematics and science problems, DeepSeek-R1 demonstrates proficiency rivaling that of the ChatGPT-o1 model, released in September 2024 by OpenAI, whose reasoning models were considered industry leaders [<xref ref-type="bibr" rid="ref21">21</xref>]. However, in the clinical medicine domain, DeepSeek-R1’s capabilities of logical reasoning and disease diagnosis warrant further investigation.</p>
      <p>The National Health Professional and Technical Qualification Examination (intermediate level) in China is a government-organized assessment, and passing the examination demonstrates the requisite competence to assume corresponding levels of professional and technical responsibilities. This examination is designed to evaluate the clinical acumen, depth of knowledge, diagnostic competence, and clinical decision-making expertise of resident physicians who have chosen cardiology as their practice area and are aiming at the promotion to fellows. The examination consists of 5 types of questions, namely, A1 (knowledge-based multiple choice), A2 (case-based multiple choice), A3/A4 (case-group-based multiple choice), B (matching), and X (multiple choice), totaling 398 multiple-choice questions (MCQs) distributed across 4 parts: basic concepts, relevant expertise, foundational professional knowledge, and professional practical skills. In the sections of basic concepts and relevant expertise, the examination encompasses several distinct branches of internal medicine covering the foundational concepts and standard clinical practices in medicine, such as respiratory medicine, cardiology, gastroenterology, hematology, nephrology, infectious diseases, neurology, rheumatology and immunology, endocrinology, and emergency medicine. In the section on foundational professional knowledge, a comprehensive overview of 10 prevalent cardiac conditions is provided, covering in-depth, cardiology-specific expertise in heart failure, arrhythmias, cardiac arrest and sudden cardiac death, congenital cardiovascular diseases, hypertension, coronary artery disease, valvular heart disease, infective endocarditis, myocardial diseases, and pericardial disorders. For the section on professional practical skills, a series of multiple-answer questions is set within a simulated clinical scenario. 
To eliminate the need for analyzing pictures or other visual forms, we established a dataset comprising questions and options only in text format.</p>
      <p>In this study, we aimed to investigate whether different types or fields of questions would influence LLMs’ performance in the Chinese linguistic paradigm. Delving deeper, we aimed to evaluate the efficacy and reliability of different LLMs’ decision-making ability, offering insights and practical recommendations of their possible role in facilitating clinical decision-making. We selected the most recent LLMs, GPT-4o and GPT-o1 in the GPT family of models, as they represent the proprietary systems that were released in May 2024 and September 2024, respectively. In contrast to GPT’s proprietary framework, we selected the open-access models DeepSeek V3 and R1 [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] launched in January 2025 to compare their accuracy, robustness, and limitations.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Chinese National Health Professional and Technical Qualification Examination (Intermediate Level) Knowledge Datasets</title>
        <p>We created an examination dataset of the National Health Professional and Technical Qualification Examination (intermediate level) containing questions extracted from the book <italic>Cardiovascular Medicine: Synchronized Exercises and Comprehensive Mock Examinations</italic> [<xref ref-type="bibr" rid="ref23">23</xref>] to test the performance of different LLMs (<xref rid="figure1" ref-type="fig">Figure 1</xref>). We randomly selected 398 queries from the dataset across various medical fields (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) under 5 different types: A1 (knowledge-based multiple choice), A2 (case-based multiple choice), A3/A4 (case-group-based multiple choice), B (matching), and X (multiple choice). The composition of these four sections is presented in <xref ref-type="table" rid="table1">Table 1</xref>. Except for the X-type questions, each question presented 5 answer options, with only 1 correct answer. Meanwhile, for the X-type questions, there were 5-11 options across various questions. All questions were composed and presented in Chinese, with no English inclusion or explanation.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Workflow illustration of this study.</p>
          </caption>
          <graphic xlink:href="formative_v10i1e90673_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Dataset of the National Health Professional and Technical Qualification Examination in China.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="590"/>
            <col width="380"/>
            <thead>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Questions, n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">
                  <bold>Parts of the examination (N=398)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Basic Concepts</td>
                <td>100</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Relevant Expertise</td>
                <td>100</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Foundational Professional Knowledge</td>
                <td>100</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Professional Practical Skills</td>
                <td>98</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Format of the examination (N=398)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>A1-type</td>
                <td>89</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>A2-type</td>
                <td>78</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>A3/A4-type</td>
                <td>88</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>B-type</td>
                <td>45</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>X-type</td>
                <td>98</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Clinical disciplines in the section of Basic Concepts and Relevant Expertise (n=200)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Respiratory Medicine</td>
                <td>34</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cardiology</td>
                <td>32</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gastroenterology</td>
                <td>24</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hematology</td>
                <td>17</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Nephrology</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Infectious diseases</td>
                <td>31</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Neurology</td>
                <td>19</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Rheumatology and immunology</td>
                <td>8</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Endocrinology</td>
                <td>15</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Emergency medicine</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>Cardiology subspecialties in the section of Foundational Professional Knowledge (n=100)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Heart failure</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Arrhythmias</td>
                <td>25</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cardiac arrest and sudden cardiac death</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Congenital cardiovascular diseases</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hypertension</td>
                <td>12</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Coronary artery disease</td>
                <td>17</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Valvular heart disease</td>
                <td>3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Infective endocarditis</td>
                <td>3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Myocardial diseases</td>
                <td>15</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Pericardial disorders</td>
                <td>13</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>LLM Testing</title>
        <p>In this comparative study, we tested 398 MCQs selected from the dataset with 4 different LLMs, namely, DeepSeek V3, DeepSeek R1, ChatGPT 4o, and ChatGPT o1 by manually entering the questions. To assess the performance of DeepSeek V3 and R1, we used the application programming interface (API) provided by SiliconFlow [<xref ref-type="bibr" rid="ref24">24</xref>], a cloud service platform, due to usage limitations on the official server. For the evaluation of ChatGPT 4o and o1, we obtained them through the official chat user interface (UI). Temperature settings are crucial in the usage of LLMs, as they directly influence the randomness of the generated content. In this study, the temperature for DeepSeek V3 and R1 was set at the typical value of 0.7. Regarding the ChatGPT chat UI, we were unable to find direct control over temperature settings; thus, these 2 models were evaluated under default configurations, which could have introduced variability or systematic bias. The responses were generated by different LLMs between February 21 and February 28, 2025.</p>
        <p>Questions were run independently without additional instructions during the conversation. To enhance contextual connections capabilities, we offered the same prompts instructing LLMs to assume the role of a physician and provide answers with explanations at the beginning of each conversation [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>] (Table S1 of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Answers and explanations generated by LLMs were meticulously documented using Word and cross-referenced with the correct answers to ensure precise evaluation of examination performance.</p>
        <p>For the first three parts (basic concepts, relevant expertise, foundational professional knowledge), we evaluated different models’ performance by calculating the accuracy rates (percentage of correct answers out of the total). For the final part of professional practical skills, each question was assigned one point; correct answers received full credit, while incorrect ones received none. For partially correct responses, scores were awarded proportionally based on the number of accurate options selected. To compare the performance of different models in various types or fields of questions, the questions from the first three sections were categorized into different segments based on their types and subject domains, as mentioned before, and then analyzed.</p>
      </sec>
      <sec>
        <title>Cost-Effectiveness Analysis</title>
        <p>For DeepSeek V3 and R1, costs were calculated on a pay-per-token basis using the official pricing published on the SiliconFlow website (as of February 2025): ¥2 per million input tokens and ¥8 per million output tokens for DeepSeek V3; ¥4 per million input tokens and ¥16 per million output tokens for DeepSeek R1 [<xref ref-type="bibr" rid="ref24">24</xref>]. During the study, the applicable exchange rate was US $1=¥7.52. Total input and output tokens for each model were obtained from the API response logs.</p>
        <p>For ChatGPT 4o and o1, both models were queried via the public chat UI. We estimated expenditure based on the monthly subscription fee required for o1 access (ChatGPT Plus, US $20 per month, approximately ¥150.59), which includes up to 50 o1 prompts per month according to OpenAI’s policy at the time of the study. ChatGPT 4o is included in the same subscription tier with no separate prompt limit.</p>
      </sec>
      <sec>
        <title>Data Analysis</title>
        <p>All data for this study were collected using Microsoft Excel for Mac 16.95, and the accuracy was analyzed using SPSS software (version 30.0; IBM Corp). For the first three sections, that is, basic concepts, relevant expertise and foundational professional knowledge, we performed the McNemar test to examine the performance among the models. For post hoc multiple comparisons of accuracy rates across multiple groups, we used partitions of chi-square method. To control the risk of type I error, statistical significance was set at <italic>P</italic>&#60;.008 according to the Bonferroni correction α'= α/(k*(k-1)/2). For the professional practical skills, one-way ANOVA was used to compare the performance of the 4 LLMs in processing real-world clinical cases, with statistical significance set at <italic>P</italic>&#60;.05.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>As this study was limited to medical state examination questions and publicly available results, no research involving human participants was conducted. Ethics approval was therefore not required.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overview of Different LLMs’ Performance in the Examination</title>
        <p>As illustrated in the <xref ref-type="table" rid="table2">Table 2</xref> and <xref rid="figure2" ref-type="fig">Figure 2</xref>A, for the first three parts of the examination, DeepSeek V3, DeepSeek R1, ChatGPT 4o, and ChatGPT o1 showed accuracy of 93%, 93.6%, 73.3%, and 69%, respectively (<italic>χ</italic><sup>2</sup><sub>3</sub>=102.9; <italic>P</italic>&#60;.001). Compared with ChatGPT 4o and ChatGPT o1, DeepSeek R1 demonstrated better performance (DeepSeek R1 vs ChatGPT 4o, <italic>χ</italic><sup>2</sup><sub>1</sub>=45.1, <italic>P</italic>&#60;.001; DeepSeek R1 vs ChatGPT o1, <italic>χ</italic><sup>2</sup><sub>1</sub>= 60.1; <italic>P</italic>&#60;.001) among the 4 LLMs, and DeepSeek V3 ranked second (DeepSeek V3 vs ChatGPT 4o <italic>χ</italic><sup>2</sup><sub>1</sub>=41.4; <italic>P</italic>&#60;.001; DeepSeek V3 vs ChatGPT o1 <italic>χ</italic><sup>2</sup><sub>1</sub>=56.1; <italic>P</italic>&#60;.001), with no statistically significant differences between R1 and V3 (<italic>χ</italic><sup>2</sup><sub>1</sub>=0.1; <italic>P</italic>=.74).</p>
        <p>Regarding each individual section, in both the basic concepts and relevant expertise sections, ChatGPT 4o and o1 showed lower accuracy of 66% and 58% for the basic concepts part and 71% and 66% for the relevant expertise part, respectively—all with statistical significance compared with the 2 DeepSeek models (<italic>P</italic>&#60;.008). In the section of foundational professional knowledge, the two models of ChatGPT, 4o and o1, showed a moderate increase in accuracy to 82% and 83%, respectively, compared with the first two sections, and showed no statistical significance when the 4 models were compared with each other (<italic>χ</italic><sup>2</sup><sub>3</sub>=11.3; <italic>P</italic>=.01).</p>
        <p>Additionally, for the section on professional practical skills, as illustrated in <xref ref-type="table" rid="table2">Table 2</xref> and <xref rid="figure2" ref-type="fig">Figure 2</xref>B, although the two DeepSeek models achieved worse performance than they did in the first three sections, DeepSeek V3 and R1 ranked first and second, with scores of 64.63 and 64.3, respectively, while the two models of ChatGPT ranked third and last, with 4o’s score at 47.98 and o1’s at 46.29. These differences were also statistically significant compared with DeepSeek V3 (<italic>P</italic>=.003; <italic>P</italic>&#60;.001) and DeepSeek R1 (<italic>P</italic>=.003; <italic>P</italic>&#60;.001), respectively.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Performance of the different large language models in the examination.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="370"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>DeepSeek V3</td>
                <td>DeepSeek R1</td>
                <td>ChatGPT 4o</td>
                <td>ChatGPT o1</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>Correct answers in the first 3 sections, n (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Basic Concepts (n=100)</td>
                <td>94 (94)</td>
                <td>94 (94)</td>
                <td>66 (66)</td>
                <td>58 (58)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Relevant Expertise (n=100)</td>
                <td>92 (92)</td>
                <td>93 (93)</td>
                <td>71 (71)</td>
                <td>66 (66)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Foundational Professional Knowledge (n=100)</td>
                <td>93 (93)</td>
                <td>94 (94)</td>
                <td>82 (82)</td>
                <td>83 (83)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Total (n=300)</td>
                <td>279 (93)</td>
                <td>281 (93.6)</td>
                <td>220 (73.3)</td>
                <td>207 (69)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Mean scores on the fourth section (n=98, each scored 0-1 point with partial credit)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Professional Practical Skills</td>
                <td>64.63</td>
                <td>64.3</td>
                <td>47.98</td>
                <td>46.29</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Comparisons among DeepSeek V3, DeepSeek R1, ChatGPT 4o, and ChatGPT o1. (A) Performance on the first three sections. Pairwise model comparisons were performed using McNemar test with Bonferroni correction. (**<italic>P</italic>&#60;.008; ***<italic>P</italic>&#60;.001). (B) Performance on the fourth section of the multiple choice questions. Pairwise model comparisons were performed using one-way ANOVA (***<italic>P</italic>&#60;.001).</p>
          </caption>
          <graphic xlink:href="formative_v10i1e90673_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Performance of LLMs in Various Topics</title>
        <p><xref ref-type="table" rid="table3">Table 3</xref> presents the percentage of correct answers for each field for each question answered by various LLMs. The topics in <xref ref-type="table" rid="table3">Table 3</xref> were manually sorted into 10 clinical disciplines, which contained different branches of internal medicine. Among these topics, DeepSeek V3 achieved the highest accuracy in two topics, that is, gastroenterology (91.7%) and hematology (94.1%), with no statistical significance compared with DeepSeek R1 (<italic>χ</italic><sup>2</sup><sub>1</sub>=1.3; <italic>P</italic>=.26; <italic>χ</italic><sup>2</sup><sub>1</sub>=0.4; <italic>P</italic>=.54). DeepSeek R1 achieved the highest accuracy in cardiology (96.9%) and neurology (100%). In the topics of gastroenterology, hematology, nephrology, neurology, rheumatology and immunology, endocrinology, and emergency medicine, the performance of the 4 LLMs did not illustrate statistically significant differences (<italic>χ</italic><sup>2</sup><sub>1</sub>=9.4, <italic>P</italic>=.02; <italic>χ</italic><sup>2</sup><sub>1</sub>=1.1, <italic>P</italic>=.77; <italic>χ</italic><sup>2</sup><sub>1</sub>=3.9, <italic>P</italic>=.27; <italic>χ</italic><sup>2</sup><sub>1</sub>=11.4, <italic>P</italic>=.01; <italic>P</italic>&#62;.99; <italic>χ</italic><sup>2</sup><sub>1</sub>=2.5, <italic>P</italic>=.48; <italic>χ</italic><sup>2</sup><sub>1</sub>=2.3, <italic>P</italic>=.50, respectively). 
Besides, statistical significance was observed when ChatGPT 4o was compared with DeepSeek V3 and R1 in the fields of respiratory medicine (<italic>χ</italic><sup>2</sup><sub>1</sub>=14.8, <italic>P</italic>&#60;.001; <italic>χ</italic><sup>2</sup><sub>1</sub>=11.6, <italic>P</italic>&#60;.001), cardiology (<italic>χ</italic><sup>2</sup><sub>1</sub>=9.1, <italic>P</italic>=.002; <italic>χ</italic><sup>2</sup><sub>1</sub>=11.7, <italic>P</italic>&#60;.001), and infectious diseases (<italic>χ</italic><sup>2</sup><sub>1</sub>=14.8, <italic>P</italic>&#60;.001; <italic>χ</italic><sup>2</sup><sub>1</sub>=14.8, <italic>P</italic>&#60;.001). As for ChatGPT o1, statistical significance was also observed in respiratory medicine (<italic>χ</italic><sup>2</sup><sub>1</sub>=14.8, <italic>P</italic>&#60;.001; <italic>χ</italic><sup>2</sup><sub>1</sub>=11.6, <italic>P</italic>&#60;.001), cardiology (<italic>χ</italic><sup>2</sup><sub>1</sub>=15.8, <italic>P</italic>&#60;.001; <italic>χ</italic><sup>2</sup><sub>1</sub>=18.8, <italic>P</italic>&#60;.001), and infectious diseases (<italic>χ</italic><sup>2</sup><sub>1</sub>=14.9, <italic>P</italic>&#60;.001; <italic>χ</italic><sup>2</sup><sub>1</sub>=14.9, <italic>P</italic>&#60;.001) in comparison with DeepSeek V3 and R1 (<xref rid="figure3" ref-type="fig">Figures 3</xref>A-C).</p>
        <p>Meanwhile, for the third section of foundational professional knowledge, 100 questions were categorized into 10 prevalent cardiology subspecialties. As presented in <xref ref-type="table" rid="table3">Table 3</xref> and <xref rid="figure3" ref-type="fig">Figures 3</xref>B-D, the performance of ChatGPT 4o and o1, in the fields of arrhythmias, hypertension, myocardial diseases, and pericardial disorders, was poorer than that of DeepSeek V3 and R1, while it was of no statistical significance (<italic>χ</italic><sup>2</sup><sub>1</sub>=9.3, <italic>P</italic>=.03; <italic>χ</italic><sup>2</sup><sub>1</sub>=1.5, <italic>P</italic>=.68; <italic>χ</italic><sup>2</sup><sub>1</sub>=4.3, <italic>P</italic>=.23; <italic>χ</italic><sup>2</sup><sub>1</sub>=2.1, <italic>P</italic>=.56). Particularly, it is noteworthy that in the field of coronary artery disease, the accuracy of ChatGPT o1 was 88.2% and ranked the highest even when no statistical significance was observed compared with DeepSeek V3 and R1 (<italic>χ</italic><sup>2</sup><sub>1</sub>=0.8, <italic>P</italic>=.37; <italic>χ</italic><sup>2</sup><sub>1</sub>=0.2, <italic>P</italic>=.63).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Correct answers by clinical discipline in the Basic Concepts and Relevant Expertise section and by cardiology subspecialty in the Foundational Professional Knowledge section.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="350"/>
            <col width="0"/>
            <col width="160"/>
            <col width="0"/>
            <col width="160"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="2">DeepSeek V3, n (%)</td>
                <td colspan="2">DeepSeek R1, n (%)</td>
                <td colspan="2">ChatGPT 4o, n (%)</td>
                <td>ChatGPT o1, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="10">
                  <bold>Basic Concepts and Relevant Expertise section (n=200)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Respiratory medicine (n=34)</td>
                <td colspan="2">30 (88.2)</td>
                <td colspan="2">30 (88.2)</td>
                <td colspan="2">17 (50)</td>
                <td colspan="2">17 (50)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cardiology (n=32)</td>
                <td colspan="2">30 (93.8)</td>
                <td colspan="2">31 (96.9)</td>
                <td colspan="2">20 (62.5)</td>
                <td colspan="2">16 (50)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gastroenterology (n=24)</td>
                <td colspan="2">22 (91.7)</td>
                <td colspan="2">21 (87.5)</td>
                <td colspan="2">18 (75)</td>
                <td colspan="2">14 (58.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hematology (n=17)</td>
                <td colspan="2">16 (94.1)</td>
                <td colspan="2">15 (88.2)</td>
                <td colspan="2">14 (82.3)</td>
                <td colspan="2">15 (88.2)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Nephrology (n=10)</td>
                <td colspan="2">9 (90)</td>
                <td colspan="2">9 (90)</td>
                <td colspan="2">8 (80)</td>
                <td colspan="2">7 (70)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Infectious diseases (n=31)</td>
                <td colspan="2">30 (96.8)</td>
                <td colspan="2">30 (96.8)</td>
                <td colspan="2">17 (54.8)</td>
                <td colspan="2">17 (54.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Neurology (n=19)</td>
                <td colspan="2">18 (94.7)</td>
                <td colspan="2">19 (100)</td>
                <td colspan="2">13 (68.4)</td>
                <td colspan="2">13 (68.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Rheumatology and immunology (n=8)</td>
                <td colspan="2">8 (100)</td>
                <td colspan="2">8 (100)</td>
                <td colspan="2">8 (100)</td>
                <td colspan="2">8 (100)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Endocrinology (n=15)</td>
                <td colspan="2">13 (86.7)</td>
                <td colspan="2">13 (86.7)</td>
                <td colspan="2">12 (80)</td>
                <td colspan="2">10 (66.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Emergency medicine (n=10)</td>
                <td colspan="2">9 (90)</td>
                <td colspan="2">9 (90)</td>
                <td colspan="2">9 (90)</td>
                <td colspan="2">7 (70)</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Cardiology subspecialty in the Foundational Professional Knowledge section (n=100)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Heart failure (n=10)</td>
                <td colspan="2">10 (100)</td>
                <td colspan="2">10 (100)</td>
                <td colspan="2">10 (100)</td>
                <td colspan="2">8 (80)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Arrhythmias (n=25)</td>
                <td colspan="2">24 (96)</td>
                <td colspan="2">25 (100)</td>
                <td colspan="2">19 (76)</td>
                <td colspan="2">21 (84)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Congenital cardiovascular diseases (n=1)</td>
                <td colspan="2">1 (100)</td>
                <td colspan="2">1 (100)</td>
                <td colspan="2">1 (100)</td>
                <td colspan="2">1 (100)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cardiac arrest and sudden cardiac death (n=1)</td>
                <td colspan="2">1 (100)</td>
                <td colspan="2">1 (100)</td>
                <td colspan="2">1 (100)</td>
                <td colspan="2">1 (100)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hypertension (n=12)</td>
                <td colspan="2">10 (83.3)</td>
                <td colspan="2">10 (83.3)</td>
                <td colspan="2">10 (83.3)</td>
                <td colspan="2">8 (66.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Coronary artery disease (n=17)</td>
                <td colspan="2">13 (76.5)</td>
                <td colspan="2">14 (82.4)</td>
                <td colspan="2">12 (70.6)</td>
                <td colspan="2">15 (88.2)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Valvular heart disease (n=3)</td>
                <td colspan="2">3 (100)</td>
                <td colspan="2">3 (100)</td>
                <td colspan="2">3 (100)</td>
                <td colspan="2">3 (100)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Infective endocarditis (n=3)</td>
                <td colspan="2">3 (100)</td>
                <td colspan="2">3 (100)</td>
                <td colspan="2">3 (100)</td>
                <td colspan="2">3 (100)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Myocardial diseases (n=15)</td>
                <td colspan="2">14 (93.3)</td>
                <td colspan="2">14 (93.3)</td>
                <td colspan="2">11 (73.3)</td>
                <td colspan="2">11 (73.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Pericardial disorders (n=13)</td>
                <td colspan="2">13 (100)</td>
                <td colspan="2">13 (100)</td>
                <td colspan="2">12 (92.3)</td>
                <td colspan="2">12 (92.3)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Comparisons among DeepSeek V3, DeepSeek R1, ChatGPT 4o, and ChatGPT o1. (A) and (C) show the performance on the topics of clinical disciplines, and (B) and (D) show the performance on the topics of cardiology subspecialties. Pairwise model comparisons were performed using McNemar test with Bonferroni correction (**<italic>P</italic>&#60;.008; ***<italic>P</italic>&#60;.001).</p>
          </caption>
          <graphic xlink:href="formative_v10i1e90673_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Performance of LLMs in Different Question Types</title>
        <p>As mentioned before, questions in basic concepts, relevant expertise, and foundational professional knowledge were separated into 4 types, namely, A1 (knowledge-based multiple choice), A2 (case-based multiple choice), A3/A4 (case-group-based multiple choice), and B (matching). As illustrated in <xref ref-type="table" rid="table4">Table 4</xref> and <xref rid="figure4" ref-type="fig">Figure 4</xref>, DeepSeek V3 and R1 showed distinct capability in answering various question types, even though they did not show a comparable difference (<italic>χ</italic><sup>2</sup><sub>1</sub>=0.06, <italic>P</italic>=.80; <italic>χ</italic><sup>2</sup><sub>1</sub>=0.09, <italic>P</italic>=.75; <italic>χ</italic><sup>2</sup><sub>1</sub>=0.2, <italic>P</italic>=.70; <italic>P</italic>&#62;.99). Specifically, DeepSeek R1 ranked the highest in A1-type (91%) and A3/4-type (96.6%), while DeepSeek V3 ranked first in A2 (93.6%). Except for A1-type questions (ChatGPT 4o vs DeepSeek V3, <italic>χ</italic><sup>2</sup><sub>1</sub>=6.6; <italic>P</italic>=.01), ChatGPT 4o and o1 demonstrated an equal weakness in the other 3 question types, with statistically significant differences compared with DeepSeek V3 and R1 (<italic>P</italic>&#60;.001).</p>
        <p>To make it more transparent and demonstrate the performance differences among models, the representative questions and their responses made by the LLMs were chosen and are presented in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Performance of the 4 large language models by question format in the first 3 sections.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="380"/>
            <col width="160"/>
            <col width="160"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Question format</td>
                <td>DeepSeek V3, n (%)</td>
                <td>DeepSeek R1, n (%)</td>
                <td>ChatGPT 4o, n (%)</td>
                <td>ChatGPT o1, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>A1-type (n=89)</td>
                <td>80 (89.9)</td>
                <td>81 (91)</td>
                <td>67 (75.3)</td>
                <td>61 (68.5)</td>
              </tr>
              <tr valign="top">
                <td>A2-type (n=78)</td>
                <td>73 (93.6)</td>
                <td>72 (92.3)</td>
                <td>56 (71.8)</td>
                <td>55 (70.5)</td>
              </tr>
              <tr valign="top">
                <td>A3/4-type (n=88)</td>
                <td>84 (95.5)</td>
                <td>85 (96.6)</td>
                <td>68 (77.3)</td>
                <td>64 (72.7)</td>
              </tr>
              <tr valign="top">
                <td>B-type (n=45)</td>
                <td>43 (95.6)</td>
                <td>43 (95.6)</td>
                <td>28 (62.2)</td>
                <td>27 (60)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Comparisons among DeepSeek V3, DeepSeek R1, ChatGPT 4o, and ChatGPT o1. (A) and (B) show the performance on the topics of different question types. Pairwise model comparisons were performed using McNemar test with Bonferroni correction (**<italic>P</italic>&#60;.008; ***<italic>P</italic>&#60;.001).</p>
          </caption>
          <graphic xlink:href="formative_v10i1e90673_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Cost-Effectiveness</title>
        <p>In total, there were approximately 251,190 input tokens used to prompt DeepSeek V3 and R1. The responses made by DeepSeek V3 contained 22,530 output tokens, while for R1, there were about 48,575 output tokens when compiling all the responses. Based on the token counts recorded during API queries, the total cost for DeepSeek V3 was approximately ¥0.68, and for DeepSeek R1 ¥1.78. For ChatGPT 4o and o1, the estimated cost using the monthly subscription model was ¥37.65 for the 8-day data collection period. Under these specific access conditions, the expense of using ChatGPT models was approximately 55-fold higher than using DeepSeek V3 and 21-fold higher than using DeepSeek R1.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>Our study aimed to evaluate and compare the performance of four LLMs—DeepSeek V3, DeepSeek R1, ChatGPT 4o, and ChatGPT o1—in answering medical questions within a Chinese-language context. As demonstrated above, our findings indicated that DeepSeek V3 and R1 showed comparable and superior overall performance compared to both ChatGPT models across multiple question types and clinical disciplines. In the first 3 sections, DeepSeek R1 achieved the best performance, slightly exceeding DeepSeek V3. Additionally, DeepSeek V3 and R1 achieved the highest accuracy in the greatest number of clinical topics. Regarding performance across different question types, DeepSeek R1 performed exceptionally well on A1 and A3/4 types, while DeepSeek V3 performed better on A2 type. Notably, ChatGPT o1, which was thought to be comparable on reasoning tasks to DeepSeek R1 and better than ChatGPT 4o, performed the poorest across most question types among the 4 LLMs. However, all models showed a notable performance decline in realistic clinical case simulations (professional practical skills section).</p>
      <p>Our findings demonstrated substantial accordance with previous research. Xu et al [<xref ref-type="bibr" rid="ref27">27</xref>] reported that in ophthalmology, DeepSeek R1, with overall accuracy of 86.2% on Chinese MCQs, showed superior performance in Chinese complex reasoning tasks compared to Gemini 2.0 Pro, OpenAI o1 and o3-mini. Similarly, Mikhail et al [<xref ref-type="bibr" rid="ref28">28</xref>] illustrated that DeepSeek R1 had comparable performance with ChatGPT o1 at reduced cost. However, DeepSeek R1 outperformed ChatGPT-4 on pediatric MCQs [<xref ref-type="bibr" rid="ref19">19</xref>]. Integrating previous research outcomes with our new evidence, LLMs, in particular, DeepSeek V3 and R1, show extensive medical knowledge under the given settings.</p>
      <p>However, both in our research and prior studies [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], LLMs demonstrated exceptional performance primarily in MCQs. Although these questions were designed to assess examinees' mastery of clinical knowledge, most can be effortlessly answered through mere memorization. MCQs failed to mirror the complexity and depth inherent in real-world clinical judgments, which require gathering and evaluating diverse data to reach evidence-based clinical decisions. To assess the usage of LLMs in an autonomous, real-world context, rigorous testing with authentic data and within practical, real-life conditions is essential [<xref ref-type="bibr" rid="ref29">29</xref>]. Hence, the fourth section of professional practical skills was designed to simulate a real-world clinical case. As evidenced in our study, all 4 LLMs showed marked performance decline compared to their results in the preceding three sections, highlighting their evident limitations in confronting X-type questions (multiple choices) involving real-world clinical case analyses. Tordjman et al [<xref ref-type="bibr" rid="ref30">30</xref>] reported that for text-based cases without answer choices, DeepSeek R1 (0.36) performed similarly to ChatGPT o1 (0.32), reflecting the underperformance of the LLMs on open-ended questions. Consequently, while we firmly believe that LLMs have immense potential to revolutionize clinical decision-making in the future, their limitations in more realistic clinical contexts make us skeptical about their suitability at this stage.</p>
      <p>Synthesizing the findings from previous research with the outcomes of our study, we raised the following inquiry: what factors enable DeepSeek to surpass ChatGPT in examinations within a Chinese linguistic framework? The DeepSeek team reported in their article [<xref ref-type="bibr" rid="ref14">14</xref>] that to train a user-friendly model that can produce clear and coherent chains of thought, they designed a pipeline constructed in 4 stages. These 4 stages incorporate the following components: cold start, reasoning-oriented reinforcement learning, rejection sampling and supervised fine-tuning, and reinforcement learning for all scenarios [<xref ref-type="bibr" rid="ref14">14</xref>]. These 4 unique stages may be the elements that set DeepSeek R1 apart from the multitude of LLMs in reasoning tasks, elevating it to a distinct level of excellence. Moreover, despite the absence of publicly available details regarding the precise proportion of Chinese and English corpora used in DeepSeek R1’s training process, we found that its early version, DeepSeek V2, contained 1.12 times more Chinese tokens than English tokens [<xref ref-type="bibr" rid="ref31">31</xref>]. It is reasonable to infer that DeepSeek, a Chinese company, likely prioritizes Chinese corpora over English materials in training its LLMs; thus, the superior performance of DeepSeek V3 and R1 in the examination can be partly attributed to this factor.</p>
      <p>Beyond the aforementioned discussion, we found several limitations requiring attention. First, in this study, costs of different LLMs were not calculated precisely, and we are yet to establish a comprehensive framework to assess the costs associated with different models. This may contribute to the underestimation of the actual token usage and associated costs. Second, during the interaction process, we provided a prompt requesting each LLM to furnish a detailed analysis for each question. Nonetheless, we were unable to devise a suitable methodology to thoroughly examine these analyses, which warranted further in-depth investigation. Third, as previously reported, LLMs may generate nonsensical or untrue content in relation to certain sources, which is called hallucinations [<xref ref-type="bibr" rid="ref32">32</xref>]. The occurrence of such inaccuracies in clinical applications may lead to significant economic repercussions and, more gravely, the loss of life [<xref ref-type="bibr" rid="ref33">33</xref>]. Additionally, we included only one image-based question. However, as illustrated by Sarangi et al [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>], even the performance of 4 LLMs, that is, Bing, Claude, ChatGPT, and Perplexity, varied in responding to MCQs based on radiology cases, while all failed to perform remarkably well when compared with residents. Meanwhile, even though we compiled our own question database to avoid the risk of dataset contamination noted in prior studies (eg, Mikhail et al [<xref ref-type="bibr" rid="ref28">28</xref>]), we cannot exclude the possibility that this material was included in the training corpora of the evaluated LLMs. Consequently, high accuracy in this study should not be equated with robust clinical decision-making ability. Last, the temperature for DeepSeek V3 and R1 was set to 0.7, while ChatGPT 4o and o1 were evaluated under default configuration. 
Thus, the comparison between models accessed via API and those evaluated through chat-based interfaces would have introduced systematic bias in output variability and accuracy [<xref ref-type="bibr" rid="ref12">12</xref>]. Therefore, we have to admit that our findings reflect comparative performance under these specific experimental conditions rather than inherent model superiority.</p>
      <p>In the future, we will concentrate on addressing the limitations mentioned above to further evaluate the disparities in the economic efficiency and the dissemination of erroneous information among various LLMs.</p>
      <p>On these 398 questions comprising 5 different question types and 10 fields of different clinical disciplines, DeepSeek V3 and R1 demonstrated comparable performance, both surpassing ChatGPT 4o and o1 within the Chinese linguistic environment under the chosen experimental conditions. Consequently, they showed remarkable potential in facilitating clinical decision-making. Nonetheless, continued research is needed to evaluate their economic efficiency and hallucination.</p>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Questions and representative answers given by different large language models.</p>
        <media xlink:href="formative_v10i1e90673_app1.docx" xlink:title="DOCX File , 542 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">MCQ</term>
          <def>
            <p>multiple-choice question</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">UI</term>
          <def>
            <p>user interface</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <notes>
      <sec>
        <title>Funding</title>
        <p>This work was supported by National Natural Science Foundation of China (82270531).</p>
      </sec>
    </notes>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>JX and QW contributed to overall study design and manuscript preparation. JX and SW contributed to technical support and data analysis. JX, QW, and JS contributed to manuscript writing, preparation, review, revision, and submission and approved the final manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Gemini Team</collab>
            <name name-style="western">
              <surname>Anil</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Borgeaud</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Alayrac</surname>
              <given-names>JB</given-names>
            </name>
            <collab>et al</collab>
          </person-group>
          <article-title>Gemini: a family of highly capable multimodal models</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on May 9, 2025</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2312.11805</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>OpenAI</collab>
            <name name-style="western">
              <surname>Achiam</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Adler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <collab>et al</collab>
          </person-group>
          <article-title>GPT-4 technical report</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on March 4, 2024</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Else</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Abstracts written by ChatGPT fool scientists</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>01</month>
          <volume>613</volume>
          <issue>7944</issue>
          <fpage>423</fpage>
          <pub-id pub-id-type="doi">10.1038/d41586-023-00056-7</pub-id>
          <pub-id pub-id-type="medline">36635510</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-00056-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sallam</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title>
          <source>Healthcare (Basel)</source>
          <year>2023</year>
          <month>03</month>
          <day>19</day>
          <volume>11</volume>
          <issue>6</issue>
          <fpage>887</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare11060887"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id>
          <pub-id pub-id-type="medline">36981544</pub-id>
          <pub-id pub-id-type="pii">healthcare11060887</pub-id>
          <pub-id pub-id-type="pmcid">PMC10048148</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bubeck</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Petro</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <month>03</month>
          <day>30</day>
          <volume>388</volume>
          <issue>13</issue>
          <fpage>1233</fpage>
          <lpage>1239</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmsr2214184</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dimitriadis</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Alkagiet</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tsigkriki</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kleitsioti</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sidiropoulos</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Efstratiou</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Askalidi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tsaousidis</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Siarkos</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Giannakopoulou</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mavrogianni</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zarifis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Koulaouzidis</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and patients with heart failure</article-title>
          <source>Angiology</source>
          <year>2025</year>
          <month>09</month>
          <volume>76</volume>
          <issue>8</issue>
          <fpage>796</fpage>
          <lpage>801</lpage>
          <pub-id pub-id-type="doi">10.1177/00033197241238403</pub-id>
          <pub-id pub-id-type="medline">38451243</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Scquizzato</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Semeraro</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Swindell</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Simpson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Angelini</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gazzato</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sajjad</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Bignami</surname>
              <given-names>EG</given-names>
            </name>
            <name name-style="western">
              <surname>Landoni</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Keeble</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Mion</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Testing ChatGPT ability to answer laypeople questions about cardiac arrest and cardiopulmonary resuscitation</article-title>
          <source>Resuscitation</source>
          <year>2024</year>
          <month>01</month>
          <volume>194</volume>
          <fpage>110077</fpage>
          <pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.110077</pub-id>
          <pub-id pub-id-type="medline">38081504</pub-id>
          <pub-id pub-id-type="pii">S0300-9572(23)00813-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Harskamp</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>De Clercq</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT as an AI-assisted decision support tool in medicine: a proof-of-concept study for interpreting symptoms and management of common cardiac conditions (AMSTELHEART-2)</article-title>
          <source>Acta Cardiol</source>
          <year>2024</year>
          <month>05</month>
          <volume>79</volume>
          <issue>3</issue>
          <fpage>358</fpage>
          <lpage>366</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/10.1080/00015385.2024.2303528?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/00015385.2024.2303528</pub-id>
          <pub-id pub-id-type="medline">38348835</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fijačko</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Prosen</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Abella</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Metličar</surname>
              <given-names>Š</given-names>
            </name>
            <name name-style="western">
              <surname>Štiglic</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Can novel multimodal chatbots such as Bing Chat Enterprise, ChatGPT-4 Pro, and Google Bard correctly interpret electrocardiogram images?</article-title>
          <source>Resuscitation</source>
          <year>2023</year>
          <month>12</month>
          <volume>193</volume>
          <fpage>110009</fpage>
          <pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.110009</pub-id>
          <pub-id pub-id-type="medline">37884222</pub-id>
          <pub-id pub-id-type="pii">S0300-9572(23)00324-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Motaganahalli</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the clinical decision-making ability of large language models using MKSAP-19 cardiology questions</article-title>
          <source>JACC Adv</source>
          <year>2023</year>
          <month>11</month>
          <volume>2</volume>
          <issue>9</issue>
          <fpage>100658</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2772-963X(23)00647-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jacadv.2023.100658</pub-id>
          <pub-id pub-id-type="medline">38938709</pub-id>
          <pub-id pub-id-type="pii">S2772-963X(23)00647-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC11198637</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Dou</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Performance and exploration of ChatGPT in medical examination, records and education in Chinese: pave the way for medical AI</article-title>
          <source>Int J Med Inform</source>
          <year>2023</year>
          <month>09</month>
          <volume>177</volume>
          <fpage>105173</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105173</pub-id>
          <pub-id pub-id-type="medline">37549499</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(23)00191-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Shahjalal</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zhuang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on Chinese master's degree entrance examination in clinical medicine</article-title>
          <source>PLoS One</source>
          <year>2024</year>
          <volume>19</volume>
          <issue>4</issue>
          <fpage>e0301702</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0301702"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0301702</pub-id>
          <pub-id pub-id-type="medline">38573944</pub-id>
          <pub-id pub-id-type="pii">PONE-D-24-06964</pub-id>
          <pub-id pub-id-type="pmcid">PMC10994287</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarangi</surname>
              <given-names>PK</given-names>
            </name>
            <name name-style="western">
              <surname>Datta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Panda</surname>
              <given-names>BB</given-names>
            </name>
            <name name-style="western">
              <surname>Panda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mondal</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Evaluating ChatGPT-4's performance in identifying radiological anatomy in FRCR part 1 examination questions</article-title>
          <source>Indian J Radiol Imaging</source>
          <year>2025</year>
          <month>04</month>
          <volume>35</volume>
          <issue>2</issue>
          <fpage>287</fpage>
          <lpage>294</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.thieme-connect.com/DOI/DOI?10.1055/s-0044-1792040"/>
          </comment>
          <pub-id pub-id-type="doi">10.1055/s-0044-1792040</pub-id>
          <pub-id pub-id-type="medline">40297110</pub-id>
          <pub-id pub-id-type="pii">IJRI-24-7-3882</pub-id>
          <pub-id pub-id-type="pmcid">PMC12034419</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>DeepSeek-AI</collab>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on January 4, 2026</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2501.12948</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Temsah</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Alhasan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Altamimi</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Jamal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Eyadhy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Malki</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Temsah</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>DeepSeek in healthcare: revealing opportunities and steering challenges of a new open-source artificial intelligence frontier</article-title>
          <source>Cureus</source>
          <year>2025</year>
          <month>02</month>
          <volume>17</volume>
          <issue>2</issue>
          <fpage>e79221</fpage>
          <pub-id pub-id-type="doi">10.7759/cureus.79221</pub-id>
          <pub-id pub-id-type="medline">39974299</pub-id>
          <pub-id pub-id-type="pmcid">PMC11836063</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>OpenP5: an open-source platform for developing, training, and evaluating LLM-based recommender systems</article-title>
          <year>2024</year>
          <month>06</month>
          <day>14</day>
          <conf-name>SIGIR's 24</conf-name>
          <conf-date>July 14-18</conf-date>
          <conf-loc>Washington, DC, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3626772.3657883</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dreyer</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>China made waves with Deepseek, but its real ambition is AI-driven industrial innovation</article-title>
          <source>Nature</source>
          <year>2025</year>
          <month>02</month>
          <volume>638</volume>
          <issue>8051</issue>
          <fpage>609</fpage>
          <lpage>611</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-025-00460-1</pub-id>
          <pub-id pub-id-type="medline">39966638</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-025-00460-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gibney</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>China's cheap, open AI model DeepSeek thrills scientists</article-title>
          <source>Nature</source>
          <year>2025</year>
          <month>02</month>
          <volume>638</volume>
          <issue>8049</issue>
          <fpage>13</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-025-00229-6</pub-id>
          <pub-id pub-id-type="medline">39849139</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-025-00229-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mondillo</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Colosimo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Perrotta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Frattolillo</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Masino</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Comparative evaluation of advanced AI reasoning models in pediatric clinical decision support: ChatGPT O1 vs. DeepSeek-R1</article-title>
          <source>MedRxiv</source>
          <comment>Preprint posted online on January 28, 2025</comment>
          <pub-id pub-id-type="doi">10.1101/2025.01.27.25321169</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kayaalp</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Prill</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sezgin</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Cong</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Królikowska</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hirschmann</surname>
              <given-names>MT</given-names>
            </name>
          </person-group>
          <article-title>DeepSeek versus ChatGPT: Multimodal artificial intelligence revolutionizing scientific discovery. From language editing to autonomous content generation-Redefining innovation in research and practice</article-title>
          <source>Knee Surg Sports Traumatol Arthrosc</source>
          <year>2025</year>
          <month>05</month>
          <day>12</day>
          <volume>33</volume>
          <issue>5</issue>
          <fpage>1553</fpage>
          <lpage>1556</lpage>
          <pub-id pub-id-type="doi">10.1002/ksa.12628</pub-id>
          <pub-id pub-id-type="medline">39936363</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gibney</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Scientists flock to DeepSeek: how they're using the blockbuster AI model</article-title>
          <source>Nature. Online ahead of print</source>
          <year>2025</year>
          <month>01</month>
          <day>29</day>
          <pub-id pub-id-type="doi">10.1038/d41586-025-00275-0</pub-id>
          <pub-id pub-id-type="medline">39881178</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-025-00275-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>DeepSeek-AI</collab>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Xue</surname>
              <given-names>B</given-names>
            </name>
            <collab>et al</collab>
          </person-group>
          <article-title>DeepSeek-V3 Technical Report</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 18, 2025</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2412.19437</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <source>Cardiovascular Medicine: Synchronized Exercises and Comprehensive Mock Examinations</source>
          <year>2023</year>
          <month>11</month>
          <publisher-loc>Beijing, China</publisher-loc>
          <publisher-name>The People's Health Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <source>Siliconflow</source>
          <access-date>2025-10-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cloud.siliconflow.cn/">https://cloud.siliconflow.cn/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <source>AI Short</source>
          <access-date>2025-03-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.aishort.top/en/">https://www.aishort.top/en/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <source>AI Prompts Community</source>
          <access-date>2025-03-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://prompts.chat/">https://prompts.chat/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>DeepSeek-R1 outperforms Gemini 2.0 Pro, OpenAI o1, and o3-mini in bilingual complex ophthalmology reasoning</article-title>
          <source>Adv Ophthalmol Pract Res</source>
          <year>2025</year>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>189</fpage>
          <lpage>195</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2667-3762(25)00029-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.aopr.2025.05.001</pub-id>
          <pub-id pub-id-type="medline">40678192</pub-id>
          <pub-id pub-id-type="pii">S2667-3762(25)00029-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC12269606</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikhail</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Farah</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Milad</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nassrallah</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Mihalache</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Milad</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Antaki</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Balas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Popovic</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Feo</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Muni</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Keane</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Duval</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Performance of DeepSeek-R1 in ophthalmology: an evaluation of clinical decision-making and cost-effectiveness</article-title>
          <source>Br J Ophthalmol</source>
          <year>2025</year>
          <month>08</month>
          <day>20</day>
          <volume>109</volume>
          <issue>9</issue>
          <fpage>976</fpage>
          <lpage>981</lpage>
          <pub-id pub-id-type="doi">10.1136/bjo-2025-327360</pub-id>
          <pub-id pub-id-type="medline">40701781</pub-id>
          <pub-id pub-id-type="pii">bjo-2025-327360</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hager</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Jungmann</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Holland</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bhagat</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hubrecht</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Knauer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vielhauer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Makowski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Braren</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kaissis</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rueckert</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title>
          <source>Nat Med</source>
          <year>2024</year>
          <month>09</month>
          <volume>30</volume>
          <issue>9</issue>
          <fpage>2613</fpage>
          <lpage>2622</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id>
          <pub-id pub-id-type="medline">38965432</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-024-03097-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC11405275</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tordjman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yuce</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fauveau</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Mei</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hadjadj</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bolger</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Almansour</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Horst</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Parihar</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Geahchan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Meribout</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yatim</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Robson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Deyer</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Taouli</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Fayad</surname>
              <given-names>ZA</given-names>
            </name>
            <name name-style="western">
              <surname>Mei</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Comparative benchmarking of the DeepSeek large language model on medical tasks and clinical reasoning</article-title>
          <source>Nat Med</source>
          <year>2025</year>
          <month>08</month>
          <volume>31</volume>
          <issue>8</issue>
          <fpage>2550</fpage>
          <lpage>2555</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-025-03726-3</pub-id>
          <pub-id pub-id-type="medline">40267969</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-025-03726-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>DeepSeek-AI</collab>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>DeepSeek-V2: a strong, economical, and efficient mixture-of-experts language model</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on June 19, 2024</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2405.04434</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Challenges in building intelligent open-domain dialog systems</article-title>
          <source>ACM Trans Inf Syst</source>
          <year>2020</year>
          <month>04</month>
          <day>09</day>
          <volume>38</volume>
          <issue>3</issue>
          <fpage>1</fpage>
          <lpage>32</lpage>
          <pub-id pub-id-type="doi">10.1145/3383123</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Powles</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hodson</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Google DeepMind and healthcare in an age of algorithms</article-title>
          <source>Health Technol (Berl)</source>
          <year>2017</year>
          <volume>7</volume>
          <issue>4</issue>
          <fpage>351</fpage>
          <lpage>367</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29308344"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s12553-017-0179-1</pub-id>
          <pub-id pub-id-type="medline">29308344</pub-id>
          <pub-id pub-id-type="pii">179</pub-id>
          <pub-id pub-id-type="pmcid">PMC5741783</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarangi</surname>
              <given-names>PK</given-names>
            </name>
            <name name-style="western">
              <surname>Datta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Swarup</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Panda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nayak</surname>
              <given-names>DSK</given-names>
            </name>
            <name name-style="western">
              <surname>Malik</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Datta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mondal</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Radiologic decision-making for imaging in pulmonary embolism: accuracy and reliability of large language models-Bing, Claude, ChatGPT, and Perplexity</article-title>
          <source>Indian J Radiol Imaging</source>
          <year>2024</year>
          <month>10</month>
          <volume>34</volume>
          <issue>4</issue>
          <fpage>653</fpage>
          <lpage>660</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.thieme-connect.com/DOI/DOI?10.1055/s-0044-1787974"/>
          </comment>
          <pub-id pub-id-type="doi">10.1055/s-0044-1787974</pub-id>
          <pub-id pub-id-type="medline">39318561</pub-id>
          <pub-id pub-id-type="pii">IJRI-24-1-3319</pub-id>
          <pub-id pub-id-type="pmcid">PMC11419749</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarangi</surname>
              <given-names>PK</given-names>
            </name>
            <name name-style="western">
              <surname>Narayan</surname>
              <given-names>RK</given-names>
            </name>
            <name name-style="western">
              <surname>Mohakud</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Vats</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sahani</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mondal</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Assessing the capability of ChatGPT, Google Bard, and Microsoft Bing in solving radiology case vignettes</article-title>
          <source>Indian J Radiol Imaging</source>
          <year>2024</year>
          <month>04</month>
          <volume>34</volume>
          <issue>2</issue>
          <fpage>276</fpage>
          <lpage>282</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.thieme-connect.com/DOI/DOI?10.1055/s-0043-1777746"/>
          </comment>
          <pub-id pub-id-type="doi">10.1055/s-0043-1777746</pub-id>
          <pub-id pub-id-type="medline">38549897</pub-id>
          <pub-id pub-id-type="pii">IJRI-23-9-2963</pub-id>
          <pub-id pub-id-type="pmcid">PMC10972658</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
