<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e77357</article-id>
      <article-id pub-id-type="pmid">41411646</article-id>
      <article-id pub-id-type="doi">10.2196/77357</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Performance of DeepSeek-R1, ChatGPT (GPT-o3-mini), and Gemini 2.0 Flash on German Medical Multiple-Choice Questions: Comparative Evaluation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Stone</surname>
            <given-names>Alicia</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mosca</surname>
            <given-names>Lucia</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sharma</surname>
            <given-names>Priyanshu</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Meyer</surname>
            <given-names>Annika</given-names>
          </name>
          <degrees>Dr med</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Anesthesiology and Operative Intensive Care</institution>
            <institution>Faculty of Medicine and University Hospital</institution>
            <institution>University Hospital Cologne</institution>
            <addr-line>Kerpener Str. 62</addr-line>
            <addr-line>Cologne, 50937</addr-line>
            <country>Germany</country>
            <email>annika.meyer1@uk-koeln.de</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8411-8799</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Karay</surname>
            <given-names>Yassin</given-names>
          </name>
          <degrees>Dr rer med</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-6380-158X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Steinbicker</surname>
            <given-names>Andrea U</given-names>
          </name>
          <degrees>Prof Dr Med</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5237-961X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Streichert</surname>
            <given-names>Thomas</given-names>
          </name>
          <degrees>Prof Dr Med</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6588-720X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Overbeek</surname>
            <given-names>Remco</given-names>
          </name>
          <degrees>Dr med</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4046-0234</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Anesthesiology and Operative Intensive Care</institution>
        <institution>Faculty of Medicine and University Hospital</institution>
        <institution>University Hospital Cologne</institution>
        <addr-line>Cologne</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Dean’s Office for Student Affairs</institution>
        <institution>Faculty of Medicine</institution>
        <institution>University Hospital Cologne</institution>
        <addr-line>Cologne</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Institute for Clinical Chemistry</institution>
        <institution>Faculty of Medicine and University Hospital</institution>
        <institution>University Hospital Cologne</institution>
        <addr-line>Cologne</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Annika Meyer <email>annika.meyer1@uk-koeln.de</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>18</day>
        <month>12</month>
        <year>2025</year>
      </pub-date>
      <volume>9</volume>
      <elocation-id>e77357</elocation-id>
      <history>
        <date date-type="received">
          <day>12</day>
          <month>5</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>26</day>
          <month>9</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>16</day>
          <month>10</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>16</day>
          <month>10</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Annika Meyer, Yassin Karay, Andrea U Steinbicker, Thomas Streichert, Remco Overbeek. Originally published in JMIR Formative Research (https://formative.jmir.org), 18.12.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2025/1/e77357" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Despite the transformative potential of artificial intelligence (AI)–based chatbots in medicine, their implementation is hindered by data privacy and security concerns. DeepSeek offers a conceivable solution through its capability for local offline operations. However, as of 2025, it remains unclear whether DeepSeek can achieve an accuracy comparable to that of conventional, cloud-based AI chatbots.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to evaluate whether DeepSeek, an AI-based chatbot capable of offline operation, achieves answer accuracy on German medical multiple-choice questions (MCQs) comparable to that of leading chatbots (ie, ChatGPT and Gemini), thereby assessing its potential as a privacy-preserving alternative for clinical use.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>A total of 200 interdisciplinary MCQs from the German Progress Test Medicine were administered to ChatGPT (GPT-o3-mini), DeepSeek (DeepSeek-R1), and Gemini (Gemini 2.0 Flash). Accuracy was defined as the proportion of correctly solved questions. Overall differences among the 3 models were tested with the Cochran Q test, while pairwise comparisons were conducted using the McNemar test. Subgroup analyses were performed by medical domain (Fisher exact test) and question length (Wilcoxon rank-sum test). An a priori power analysis indicated a minimum sample size of 195 questions.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>All 3 chatbots surpassed the conventional passing threshold of 60%, with accuracies of 96% (192/200) for DeepSeek, 94% (188/200) for Gemini, and 92.5% (185/200) for ChatGPT. The overall difference among models was not statistically significant (<italic>P</italic>=.10) nor were pairwise comparisons. However, incorrect responses were significantly associated with longer question length for DeepSeek (<italic>P</italic>=.049) and ChatGPT (<italic>P</italic>=.04) but not for Gemini. No significant differences in performance were observed across clinical versus preclinical domains or medical specialties (all <italic>P</italic>&#62;.05).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Overall, DeepSeek demonstrates outstanding performance on German medical MCQs comparable to the widely used chatbots ChatGPT and Gemini. Similar to ChatGPT, DeepSeek’s performance declined with increasing question length, highlighting verbosity as a persistent challenge for large language models. While DeepSeek’s offline capability and lower operational costs are advantageous, its safe and reliable application in clinical contexts requires further investigation.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>ChatGPT</kwd>
        <kwd>Gemini</kwd>
        <kwd>DeepSeek</kwd>
        <kwd>large language model</kwd>
        <kwd>chatbots</kwd>
        <kwd>artificial intelligence</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>In recent years, artificial intelligence (AI) has experienced a remarkable surge in development and adoption [<xref ref-type="bibr" rid="ref1">1</xref>]. Although applications in medicine are not entirely new, with substantial investments in health care–related AI already initiated nearly a decade ago, the capabilities of AI have advanced considerably [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. These developments offer opportunities for research, automation of routine tasks, and diagnostic support while simultaneously raising persistent ethical, governance, and regulatory challenges [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      <p>A major catalyst for the rapid uptake of AI has been the publication of AI-based chatbots on the World Wide Web, which have substantially lowered barriers related to the usability and accessibility of sophisticated AI systems [<xref ref-type="bibr" rid="ref4">4</xref>]. Their applications in medicine and research are increasingly under investigation. For example, ChatGPT (OpenAI), introduced in 2022, demonstrates strong performance in medical state examinations, can generate medical reports and radiology documentation, and supports medical programming [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. In addition, these chatbots exhibit considerable linguistic fluency and empathy in response to patients’ inquiries, although answer quality can vary across medical specialties [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>].</p>
      <p>Despite their promising capabilities, relevant concerns regarding data privacy and security impede the clinical adoption of online AI-based chatbots in hospitals [<xref ref-type="bibr" rid="ref4">4</xref>]. One proposed solution is the recently introduced offline-capable chatbot, DeepSeek (Hangzhou DeepSeek Artificial Intelligence Basic Technology Research Co, Ltd) [<xref ref-type="bibr" rid="ref10">10</xref>]. Notably, initial analyses already suggest that DeepSeek excels in various benchmarks, such as mathematics-based assessments, surpassing previous chatbot iterations [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Developed by a small team of computer scientists in 2025 [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>], DeepSeek gained immediate prominence due to its efficient computational design that requires fewer graphics processing units. Operational costs are thereby reduced, and user fees and the carbon footprint are lowered [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. Moreover, its partially open-source nature might further encourage ongoing innovation of AI [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p>
      <p>However, empirical evidence on the accuracy of answers on medical multiple-choice questions (MCQs) compared with established chatbots remains scarce, underscoring the need for systematic evaluation [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Therefore, this study aimed to systematically evaluate DeepSeek’s performance in applying medical knowledge by comparing it with ChatGPT and Gemini (Google DeepMind) on 200 MCQs from the German Progress Test Medicine (PTM).</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Berlin Progress Test</title>
        <p>The PTM used in this study is a knowledge test designed at the Charité in Berlin for students of human medicine. It is taken by approximately 12,000 students from 19 universities in Germany, Austria, and Switzerland throughout their medical studies. The PTM consists of 200 interdisciplinary MCQs at the graduate level and is designed to provide students with objective feedback on their personal growth in knowledge over the course of their studies. The questions cover a broad spectrum of domains, including internal medicine, surgery, pediatrics, obstetrics and gynecology, psychiatry, anesthesiology, radiology, laboratory medicine, and the basic sciences (eg, anatomy, physiology, and biochemistry), ensuring balanced representation of both clinical and preclinical content [<xref ref-type="bibr" rid="ref16">16</xref>]. By covering both clinical and preclinical domains, the test provides a balanced and integrated assessment of medical knowledge, which was the rationale for its selection in this study. Moreover, the test results and, in particular, the knowledge gained per semester have proven to be suitable criteria for predicting academic success with regard to the German state examinations [<xref ref-type="bibr" rid="ref17">17</xref>]. For this study, we used the 51st PTM, published in October 2024, which had a mean discrimination index of 0.45 and a Cronbach α of 0.98 [<xref ref-type="bibr" rid="ref16">16</xref>]. Each question was categorized by a physician according to the subject area and study phase (eg, clinical phase and preclinical phase).</p>
      </sec>
      <sec>
        <title>Data Collection</title>
        <p>To address our research question, we evaluated the performance of 3 chatbots, ChatGPT (GPT-o3-mini), DeepSeek (DeepSeek-R1), and Gemini (Gemini 2.0 Flash), between February 21, 2025, and March 4, 2025, using the 200 MCQs from the PTM 51 (published in October 2024). These chatbots were selected because they represent 3 leading approaches to large language models in medicine: ChatGPT as the widely used benchmark, Gemini as a major proprietary competitor, and DeepSeek as a novel offline-capable alternative with potential privacy advantages. The specific versions were chosen because they were the most recent publicly accessible releases and free of cost at the time of data collection, reflecting the default user-facing performance available in February 2025.</p>
        <p>All questions were used with formal permission, presented in German, and included without modification or exclusion. The number of the included 200 questions was determined based on a sample size calculation for the McNemar test, assuming a Cohen <italic>d</italic> of 0.28 and a statistical power of 80%, yielding a required sample size of 195. In accordance with the findings of Alfertshofer et al [<xref ref-type="bibr" rid="ref18">18</xref>], the word count of each question was subsequently determined.</p>
        <p>Each unaltered question was entered into the publicly accessible default web-based interface of each chatbot without supplementary prompting or user-directed modifications. Browsing and integrated tools were left enabled, reflecting the standard user-facing functionality of each system. To avoid memory effects or response contamination, each question was submitted in a separate, newly initiated chat session. On occasions where the chatbot failed to generate a response initially, the query was reinitiated.</p>
        <p>In addition to the quantitative analyses, 2 physicians independently conducted an exploratory qualitative review of discrepancies between the officially defined correct answers and the responses generated by the chatbots. Potential reasons for discrepancies were derived through independent review and subsequent discussion, without the application of a predefined coding framework, aimed at providing illustrative examples of chatbot limitations.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>Statistical analyses were performed using “R” (version 2025.09.1+401; R Foundation for Statistical Computing) [<xref ref-type="bibr" rid="ref19">19</xref>]. Sample size calculations were performed with the <italic>pwr</italic> package [<xref ref-type="bibr" rid="ref20">20</xref>]. Data wrangling and analysis were carried out using <italic>rio</italic> [<xref ref-type="bibr" rid="ref21">21</xref>], <italic>tidyverse</italic> [<xref ref-type="bibr" rid="ref22">22</xref>], <italic>gtsummary</italic> [<xref ref-type="bibr" rid="ref23">23</xref>], <italic>rstatix</italic> [<xref ref-type="bibr" rid="ref24">24</xref>], <italic>labelled</italic> [<xref ref-type="bibr" rid="ref25">25</xref>], and <italic>fastDummies</italic> [<xref ref-type="bibr" rid="ref26">26</xref>], while data visualization was accomplished using the <italic>tidyverse</italic> [<xref ref-type="bibr" rid="ref22">22</xref>], <italic>RColorBrewer</italic> [<xref ref-type="bibr" rid="ref27">27</xref>], and <italic>cowplot</italic> packages [<xref ref-type="bibr" rid="ref28">28</xref>]. ChatGPT was used to facilitate statistical programming, while all AI-generated content was critically reviewed by a human. Categorical variables were summarized by absolute and relative frequencies, whereas continuous variables were described using medians and IQRs. Normality was assessed using the Shapiro-Wilk test. Differences in chatbot performance were evaluated using the McNemar test for paired categorical data with Bonferroni correction and the Fisher exact test for unpaired categorical data, while the Wilcoxon rank-sum test was applied for continuous data. A binomial test was applied to test the performance of the chatbots against the threshold of 60%. A <italic>P</italic> value of &#60;.05 was considered statistically significant (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app2">2</xref>).</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>As this study was limited to medical state examination questions and publicly available results, no research involving human participants was conducted. In accordance with the guidelines of the ethics committee of the University of Cologne, ethics approval was therefore not required [<xref ref-type="bibr" rid="ref29">29</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Question Characteristics</title>
        <p>A total of 200 original medical progress test questions were used to evaluate the 3 chatbots, with a median length of 55 (IQR 40-74) words. Most (177/200, 88.5%) questions focused on clinical knowledge, with nearly one-quarter (47/200, 23.5%) specifically addressing internal medicine (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Comparison of DeepSeek, Gemini, and ChatGPT in the Progress Test Medicine.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="80"/>
            <col width="100"/>
            <col width="70"/>
            <col width="50"/>
            <col width="70"/>
            <col width="70"/>
            <col width="0"/>
            <col width="70"/>
            <col width="60"/>
            <col width="70"/>
            <col width="70"/>
            <col width="0"/>
            <col width="70"/>
            <col width="50"/>
            <col width="70"/>
            <col width="70"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Characteristic</td>
                <td>All questions (N=200)</td>
                <td colspan="5">DeepSeek</td>
                <td colspan="5">Gemini</td>
                <td colspan="4">ChatGPT</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>True (n=192)</td>
                <td>False (n=8)</td>
                <td><italic>P</italic> value</td>
                <td>Q value</td>
                <td colspan="2">True (n=188)</td>
                <td>False (n=12)</td>
                <td><italic>P</italic> value</td>
                <td>Q value</td>
                <td colspan="2">True (n=185)</td>
                <td>False (n=15)</td>
                <td><italic>P</italic> value</td>
                <td>Q value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">Word count, median (IQR)</td>
                <td>55 (40, 74)</td>
                <td>54 (39, 73)</td>
                <td>70 (60, 83)</td>
                <td>.049<sup>a</sup></td>
                <td>0.098</td>
                <td colspan="2">54 (39, 74)</td>
                <td>65 (56, 74)</td>
                <td>.12<sup>a</sup></td>
                <td>0.175</td>
                <td colspan="2">53 (39, 73)</td>
                <td>64 (55, 87)</td>
                <td>.04<sup>a</sup></td>
                <td>0.082</td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Phase, n (%)</bold>
                </td>
                <td>.60<sup>b</sup></td>
                <td colspan="4">0.72</td>
                <td>.15<sup>b</sup></td>
                <td colspan="4">0.175</td>
                <td>&#62;.99<sup>b</sup></td>
                <td>&#62;.99</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Clinical phase</td>
                <td>177 (89)</td>
                <td>169 (88)</td>
                <td>8 (100)</td>
                <td/>
                <td/>
                <td colspan="2">168 (89)</td>
                <td>9 (75)</td>
                <td/>
                <td/>
                <td colspan="2">163 (88)</td>
                <td>14 (93)</td>
                <td/>
                <td/>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Preclinical phase</td>
                <td>23 (12)</td>
                <td>23 (12)</td>
                <td>0 (0)</td>
                <td/>
                <td/>
                <td colspan="2">20 (11)</td>
                <td>3 (25)</td>
                <td/>
                <td/>
                <td colspan="2">22 (12)</td>
                <td>1 (6.7)</td>
                <td/>
                <td/>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Specialty, n (%)</bold>
                </td>
                <td>&#62;.99<sup>b</sup></td>
                <td colspan="4">&#62;.99</td>
                <td>.49<sup>b</sup></td>
                <td colspan="4">.498</td>
                <td>.15<sup>b</sup></td>
                <td>0.22</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Internal medicine</td>
                <td>47 (24)</td>
                <td>45 (23)</td>
                <td>2 (25)</td>
                <td/>
                <td/>
                <td colspan="2">46 (24)</td>
                <td>1 (8.3)</td>
                <td/>
                <td/>
                <td colspan="2">45 (24)</td>
                <td>2 (13)</td>
                <td/>
                <td/>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Surgery</td>
                <td>24 (12)</td>
                <td>23 (12)</td>
                <td>1 (13)</td>
                <td/>
                <td/>
                <td colspan="2">23 (12)</td>
                <td>1 (8.3)</td>
                <td/>
                <td/>
                <td colspan="2">20 (11)</td>
                <td>4 (27)</td>
                <td/>
                <td/>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Others</td>
                <td>129 (65)</td>
                <td>124 (65)</td>
                <td>5 (63)</td>
                <td/>
                <td/>
                <td colspan="2">119 (63)</td>
                <td>10 (83)</td>
                <td/>
                <td/>
                <td colspan="2">120 (65)</td>
                <td>9 (60)</td>
                <td/>
                <td/>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Wilcoxon rank-sum test.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Fisher exact test.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Accuracy of the Chatbots</title>
        <p>All chatbot models significantly exceeded the predefined performance threshold of 60% (<italic>P</italic>&#60;.001 for all comparisons). Accuracy was 96% (95% CI 92.9%-100%) for DeepSeek, 94% (95% CI 90.5%-100%) for Gemini, and 92.5% (95% CI 88.7%-100%) for ChatGPT. Accuracy differences among the 3 chatbots were small and not statistically significant, and there were no significant differences in pairwise comparisons after Bonferroni adjustment (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>; <xref rid="figure1" ref-type="fig">Figures 1</xref>A and 1B). DeepSeek correctly answered MCQs with a median word count of 54 (IQR 39-73), whereas incorrectly answered questions had a significantly higher median word count of 70 (IQR 60-83; <italic>P</italic>=.049). A similar pattern was also observed for ChatGPT (<italic>P</italic>=.04) but not for Gemini (<xref rid="figure1" ref-type="fig">Figure 1</xref>C). No significant differences in accuracy were found across medical specialties or clinical or preclinical categorization (<xref rid="figure1" ref-type="fig">Figures 1</xref>A and 1B; <xref ref-type="table" rid="table1">Table 1</xref>).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Comparison of ChatGPT, Gemini, and DeepSeek in answering German-language medical multiple-choice questions. (A) Shows the performance categorized by clinical and preclinical phases. (B) Shows the performance on internal medicine questions. The dashed blue line indicates the conventional passing threshold of 60% in the German Progress Test Medicine. (C) Shows a box plot illustrating the relationship between question word count and accuracy.</p>
          </caption>
          <graphic xlink:href="formative_v9i1e77357_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Exploratory Qualitative Assessment of Chatbot Answers</title>
        <p>Despite their high accuracy, all 3 chatbots occasionally produced highly plausible but incorrect explanations. In the context of large language models, such errors are commonly referred to as <italic>hallucinations</italic>, meaning the confident generation of factually incorrect or fabricated information that is not supported by the input or external knowledge [<xref ref-type="bibr" rid="ref30">30</xref>]. For example, in one case, Gemini fabricated an incorrect response label for a multiple-choice answer. In another case, both DeepSeek and ChatGPT proposed a diagnosis that was not among the available options. The clinical scenario described recurrent morning stiffness with enlargement of the distal interphalangeal joints and proximal interphalangeal joints but without redness or swelling. Despite the absence of <italic>arthrosis</italic> from the listed answer choices, both DeepSeek and ChatGPT chose it as the most likely cause. In contrast, the correct answer provided by the questionnaire’s designer was rheumatoid arthritis.</p>
        <p>Moreover, some discrepancies among the chatbots appeared to mirror inconsistencies in the medical literature itself [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. For example, in explaining metamizole-induced agranulocytosis, ChatGPT emphasized antibody-mediated granulocyte destruction, whereas DeepSeek and Gemini attributed the condition to direct toxic effects on the bone marrow.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this evaluation of 200 German-language medical MCQs spanning both preclinical and clinical domains, DeepSeek, ChatGPT, and Gemini achieved very high overall accuracies (92.5%-96%), indicating that an offline‐deployable model, such as DeepSeek, can match leading chatbots in core medical reasoning tasks [<xref ref-type="bibr" rid="ref10">10</xref>]. Moreover, the mean difficulty index of the PTM 51 was 0.35, corresponding to an average student accuracy of 35% on the same questions [<xref ref-type="bibr" rid="ref16">16</xref>]. Thus, all 3 chatbots substantially outperformed the comparison group of medical students. Notably, performance for both DeepSeek and ChatGPT declined with increasing question length, underscoring that verbose prompts remain a persistent challenge and echoing previous observations that longer MCQs amplify opportunities for error [<xref ref-type="bibr" rid="ref18">18</xref>]. Sporadic yet plausible <italic>hallucinations</italic> and <italic>out‐of‐options</italic> answers—well‐documented safety concerns in large language models—were observed across all 3 chatbots [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      </sec>
      <sec>
        <title>High Performance of AI-Based Chatbots in MCQs</title>
        <p>Compared to earlier ChatGPT versions on the PTM, our results illustrate rapid progress in recent years [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. Similarly, studies reporting improvements from GPT-3.5 (58%) to GPT-4 (81%) in medical state examinations mirror our findings [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. Moreover, Alfertshofer et al [<xref ref-type="bibr" rid="ref36">36</xref>] analyzed 1200 medical licensing MCQs and identified question length and language as key determinants of accuracy, mirroring our finding that verbosity negatively impacts performance. Within German-language contexts, Friederichs et al [<xref ref-type="bibr" rid="ref34">34</xref>] found that ChatGPT answered roughly two-thirds of PTM items correctly and outperformed early-year medical students, consistent with our result that all 3 chatbots surpassed conventional pass thresholds on the PTM 51 item pool. Notably, the accuracies reported in our study exceed those of earlier PTM estimates, underscoring the rapid capability gains since 2023.</p>
        <p>Our results also fit into a broader international landscape of benchmarking studies that have consistently reported high but context-dependent performance of generative AI in medical MCQs. DeepSeek-R1, for example, achieved accuracies of 97% on English and Chinese licensing items [<xref ref-type="bibr" rid="ref1">1</xref>]; 92% to 95% on the Chinese National Medical Licensing Examination, with significant and stable advantages over ChatGPT [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]; and more than 90% in oncology [<xref ref-type="bibr" rid="ref10">10</xref>] and microbiology [<xref ref-type="bibr" rid="ref11">11</xref>]. In contrast, head-to-head comparisons on the United States Medical Licensing Examination found DeepSeek slightly inferior to ChatGPT-o1 (92% vs 95%) [<xref ref-type="bibr" rid="ref3">3</xref>], while in ophthalmology board-style examinations, ChatGPT o1 Pro (83.4%) clearly outperformed DeepSeek-R1 (72.5%) [<xref ref-type="bibr" rid="ref4">4</xref>]. Conversely, in pediatric board preparation questions, DeepSeek-R1 reached 98% accuracy, markedly surpassing ChatGPT-4 (82.7%) [<xref ref-type="bibr" rid="ref5">5</xref>]. These findings underline that apparent global accuracy masks substantial domain-specific variability, where some models excel in pediatrics or oncology, while others dominate in ophthalmology. Importantly, several studies found minimal overlap in the specific questions missed by different models, suggesting complementary rather than uniform knowledge gaps [<xref ref-type="bibr" rid="ref5">5</xref>].</p>
        <p>Language-specific effects further add nuance to these comparisons. In bilingual ophthalmology MCQs, DeepSeek performed better in Chinese (86.2%) than in English (80.8%), while Gemini and OpenAI models showed weaker robustness across languages [<xref ref-type="bibr" rid="ref8">8</xref>]. Similar findings in multiyear Chinese National Medical Licensing Examination evaluations confirm DeepSeek’s consistent advantage in Chinese [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref6">6</xref>], supporting the view that training corpus composition can strongly shape performance across languages. This resonates with our observation that even in German-language testing, performance is influenced not only by specialty but also by linguistic and structural features of the questions.</p>
        <p>Taken together, previous literature and our findings converge on 4 themes. First, the overall performance of state-of-the-art chatbots on medical MCQs is now consistently at or above medical student thresholds, often exceeding 90%. Second, accuracy varies by language, with models such as DeepSeek particularly advantaged in Chinese settings. Third, accuracy differs by domain, with ophthalmology and pediatrics illustrating opposite outcomes across models. Finally, our demonstration that verbosity predicts chatbot errors echoes the hypothesis by Alfertshofer et al [<xref ref-type="bibr" rid="ref18">18</xref>] that longer questions amplify opportunities for error [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        <p>Ophthalmology-specific studies likewise found DeepSeek on par with, or superior to, contemporaneous versions of ChatGPT and Gemini [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>], although 1 study found it lagged behind ChatGPT on pediatric MCQs, suggesting specialty-dependent effects or influences of question format [<xref ref-type="bibr" rid="ref39">39</xref>]. Indeed, we observed word count to be a critical determinant of chatbot accuracy on medical MCQs, supporting the hypothesis by Alfertshofer et al [<xref ref-type="bibr" rid="ref18">18</xref>] that longer questions create more opportunities for error.</p>
      </sec>
      <sec>
        <title>Implications for Safety and Transparency</title>
        <p>Consistent with the literature, hallucinations remained a relevant obstacle for the chatbots in addressing medical MCQs [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. Thus, DeepSeek’s <italic>think-aloud</italic> feature, where it exposes intermediate reasoning steps, may help end users detect hallucinations, overgeneralization, and dataset biases, provided these outputs are critically reviewed [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. In addition, this feature might facilitate the scrutiny of DeepSeek’s novel or out-of-scope reasoning.</p>
        <p>However, transparency does not inherently mitigate the risk of generating unsafe content. After all, the literature suggests that DeepSeek produces unsafe responses 10 times more often than ChatGPT [<xref ref-type="bibr" rid="ref42">42</xref>]. While DeepSeek is released under a permissive license with publicly available model weights, sometimes described as partially open source [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>], the very notion of openness in generative AI remains contested. Many so-called open models are only <italic>open-weight</italic>, sharing parameters but withholding training and fine-tuning data, a practice termed <italic>open-washing</italic> [<xref ref-type="bibr" rid="ref43">43</xref>]. This highlights that openness is best understood as a graded and multidimensional property rather than a binary state.</p>
        <p>Furthermore, DeepSeek’s operation within government‐regulated frameworks [<xref ref-type="bibr" rid="ref14">14</xref>] highlights the tension between transparency and regulatory compliance. Thus, despite the potential to save time, costs, and personnel resources in clinical and research-related decision-making processes in health care [<xref ref-type="bibr" rid="ref44">44</xref>], the hope that AI-based chatbots can serve as reliable decision-support tools must be critically questioned at this point in time.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>While DeepSeek, ChatGPT, and Gemini all excel on German-language MCQs, persistent issues, such as hallucinations, bias, and unsafe outputs mentioned in the literature, underscore the need for further refinement of these chatbots. Thus, future research should assess chatbot performance on tasks that demand free-text generation and complex reasoning without predefined answer options in real-world simulated environments. Evaluations should also extend across diverse languages, specialties, and examination formats. In addition, studies need to refine prompt engineering approaches to reduce verbosity-related errors and systematically monitor safety, bias, and regulatory compliance over time while accounting for the threat to reproducibility posed by continuous model updates.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Nevertheless, our reliance on medical MCQs may overestimate the real-world clinical utility of such AI-based chatbots because such formats cannot capture the nuance of actual patient-clinician interactions [<xref ref-type="bibr" rid="ref8">8</xref>]. Furthermore, presenting every item in German limits generalizability to other languages, health care settings, and assessment formats. In the literature on ophthalmology MCQs, for instance, DeepSeek’s accuracy rose from 81% in English to 86% in Chinese, whereas the accuracies of ChatGPT and Gemini fell from 72%-75% to 68%-71%, a pattern attributed to the higher proportion of Chinese tokens in DeepSeek’s training data [<xref ref-type="bibr" rid="ref38">38</xref>]. This finding challenges the common assumption that multilingual AI models are inherently biased toward the English language [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref45">45</xref>] and highlights how training‐corpus composition can drive language‐specific performance [<xref ref-type="bibr" rid="ref38">38</xref>]. To address concerns regarding reproducibility, we conducted a stability spot-check on a random subset of 20 questions in September 2025. As the original versions tested between February 21, 2025, and March 4, 2025, were no longer available, we used the most recent publicly accessible default web-based versions at that time (ChatGPT5, DeepSeek [version 3.2], and Gemini 2.5 Flash, all accessed on September 30, 2025). A total of 19 (95%) questions were answered correctly by all models, while 1 (5%) error initially made by ChatGPT (GPT-o3-mini) was now reproduced by DeepSeek (version 3.2). This demonstrates that ongoing model updates can change outcomes, underscoring that our findings are specific to particular model versions and access dates.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In conclusion, DeepSeek matches Gemini and ChatGPT in accuracy on German-language medical MCQs while offering novel insights and a transparent <italic>think-aloud</italic> glimpse into its reasoning. Yet, recurring hallucinations and documented biases make expert oversight and critical appraisal indispensable. Furthermore, the literature points to potential safety and regulatory concerns that could outweigh DeepSeek’s offline-deployment advantages, such as lower environmental footprint and operational costs. Ultimately, targeted research is needed to delineate DeepSeek’s failure modes, rigorously validate its safety and impartiality, and establish best-practice strategies.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Cleaned data for data analysis.</p>
        <media xlink:href="formative_v9i1e77357_app1.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 16 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Code for data analysis.</p>
        <media xlink:href="formative_v9i1e77357_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 222 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Accuracy of Gemini, ChatGPT, and DeepSeek in the German Progress Test Medicine.</p>
        <media xlink:href="formative_v9i1e77357_app3.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">MCQ</term>
          <def>
            <p>multiple-choice question</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">PTM</term>
          <def>
            <p>Progress Test Medicine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to thank Ari Soleman for proofreading this manuscript.</p>
      <p>DeepL and ChatGPT (OpenAI) were used for proofreading, providing translation support, and assisting with the linguistic aspects of this paper. In addition, ChatGPT aided in the statistical programming, with all artificial intelligence–generated output being critically reviewed by the authors.</p>
    </ack>
    <notes>
      <sec>
        <title>Funding</title>
        <p>The German Research Foundation provided funding for the article processing fee of this work.</p>
      </sec>
    </notes>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The datasets analyzed during this study are not publicly available due to copyright restrictions but are available from the corresponding author on reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>AM, TS, and YK designed the study. AM and RO collected the data and conducted the exploratory qualitative analysis. AM performed the data analysis and interpretation and drafted the manuscript. TS, RO, YK, and AUS critically reviewed and revised the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>AM received speaker support at both the Congress of the German Society for Clinical Chemistry and Laboratory Medicine 2024 and Medica 2024, as well as the Digital Laboratory research award from the German Society for Clinical Chemistry and Laboratory Medicine for her previous work on artificial intelligence–based chatbots. In addition, for another research manuscript, TS and AM were supported by OpenAI’s Researcher Access Program and Application Programming Interface. AM and TS received funding from the German Research Foundation for the article processing charges of previous work and will also receive German Research Foundation support for the article processing charges of this manuscript.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abulibdeh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Sejdić</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>The illusion of safety: a report to the FDA on AI healthcare product approvals</article-title>
          <source>PLOS Digit Health</source>
          <year>2025</year>
          <month>06</month>
          <day>05</day>
          <volume>4</volume>
          <issue>6</issue>
          <fpage>e0000866</fpage>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000866</pub-id>
          <pub-id pub-id-type="medline">40471897</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-25-00085</pub-id>
          <pub-id pub-id-type="pmcid">PMC12140231</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Amisha</collab>
            <name name-style="western">
              <surname>Malik</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Pathania</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rathaur</surname>
              <given-names>VK</given-names>
            </name>
          </person-group>
          <article-title>Overview of artificial intelligence in medicine</article-title>
          <source>J Family Med Prim Care</source>
          <year>2019</year>
          <month>07</month>
          <volume>8</volume>
          <issue>7</issue>
          <fpage>2328</fpage>
          <lpage>31</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jfmpc.com/article.asp?issn=2249-4863;year=2019;volume=8;issue=7;spage=2328;epage=2331;aulast=Amisha%2C"/>
          </comment>
          <pub-id pub-id-type="doi">10.4103/jfmpc.jfmpc_440_19</pub-id>
          <pub-id pub-id-type="medline">31463251</pub-id>
          <pub-id pub-id-type="pii">JFMPC-8-2328</pub-id>
          <pub-id pub-id-type="pmcid">PMC6691444</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beam</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Drazen</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>IS</given-names>
            </name>
            <name name-style="western">
              <surname>Leong</surname>
              <given-names>TY</given-names>
            </name>
            <name name-style="western">
              <surname>Manrai</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Rubin</surname>
              <given-names>EJ</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in medicine</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <month>03</month>
          <day>30</day>
          <volume>388</volume>
          <issue>13</issue>
          <fpage>1220</fpage>
          <lpage>1</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMe2206291</pub-id>
          <pub-id pub-id-type="medline">36988598</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Elangovan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gutierrez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>TF</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DS</given-names>
            </name>
          </person-group>
          <article-title>Large language models in medicine</article-title>
          <source>Nat Med</source>
          <year>2023</year>
          <month>08</month>
          <day>17</day>
          <volume>29</volume>
          <issue>8</issue>
          <fpage>1930</fpage>
          <lpage>40</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id>
          <pub-id pub-id-type="medline">37460753</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-023-02448-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Riese</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Streichert</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Comparison of the performance of GPT-3.5 and GPT-4 with that of medical students on the written German medical licensing examination: observational study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <month>02</month>
          <day>08</day>
          <volume>10</volume>
          <fpage>e50965</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e50965/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/50965</pub-id>
          <pub-id pub-id-type="medline">38329802</pub-id>
          <pub-id pub-id-type="pii">v10i1e50965</pub-id>
          <pub-id pub-id-type="pmcid">PMC10884900</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Utility of ChatGPT in clinical practice</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>06</month>
          <day>28</day>
          <volume>25</volume>
          <fpage>e48568</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e48568/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48568</pub-id>
          <pub-id pub-id-type="medline">37379067</pub-id>
          <pub-id pub-id-type="pii">v25i1e48568</pub-id>
          <pub-id pub-id-type="pmcid">PMC10365580</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ruthard</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Streichert</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Dear ChatGPT – can you teach me how to program an app for laboratory medicine?</article-title>
          <source>J Lab Med</source>
          <year>2024</year>
          <month>05</month>
          <volume>48</volume>
          <issue>5</issue>
          <fpage>197</fpage>
          <lpage>201</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/380568074_Dear_ChatGPT_-_can_you_teach_me_how_to_program_an_app_for_laboratory_medicine"/>
          </comment>
          <pub-id pub-id-type="doi">10.1515/labmed-2024-0034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Soleman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Riese</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Streichert</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Comparison of ChatGPT, Gemini, and Le Chat with physician interpretations of medical laboratory questions from an online health forum</article-title>
          <source>Clin Chem Lab Med</source>
          <year>2024</year>
          <month>11</month>
          <day>26</day>
          <volume>62</volume>
          <issue>12</issue>
          <fpage>2425</fpage>
          <lpage>34</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.degruyter.com/document/doi/10.1515/cclm-2024-0246"/>
          </comment>
          <pub-id pub-id-type="doi">10.1515/cclm-2024-0246</pub-id>
          <pub-id pub-id-type="medline">38804035</pub-id>
          <pub-id pub-id-type="pii">cclm-2024-0246</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ayers</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Poliak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Leas</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kelley</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Faix</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Longhurst</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Hogarth</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title>
          <source>JAMA Intern Med</source>
          <year>2023</year>
          <month>06</month>
          <day>01</day>
          <volume>183</volume>
          <issue>6</issue>
          <fpage>589</fpage>
          <lpage>96</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37115527"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id>
          <pub-id pub-id-type="medline">37115527</pub-id>
          <pub-id pub-id-type="pii">2804309</pub-id>
          <pub-id pub-id-type="pmcid">PMC10148230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Temsah</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Alhasan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Altamimi</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Jamal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Eyadhy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Malki</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Temsah</surname>
              <given-names>MH</given-names>
            </name>
          </person-group>
          <article-title>DeepSeek in healthcare: revealing opportunities and steering challenges of a new open-source artificial intelligence frontier</article-title>
          <source>Cureus</source>
          <year>2025</year>
          <month>02</month>
          <volume>17</volume>
          <issue>2</issue>
          <fpage>e79221</fpage>
          <pub-id pub-id-type="doi">10.7759/cureus.79221</pub-id>
          <pub-id pub-id-type="medline">39974299</pub-id>
          <pub-id pub-id-type="pmcid">PMC11836063</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>DeepSeek-AI</collab>
          </person-group>
          <article-title>DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on January 22, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2501.12948"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2501.12948</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Rousseau</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bian</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>From GPT to DeepSeek: significant gaps remain in realizing AI in healthcare</article-title>
          <source>J Biomed Inform</source>
          <year>2025</year>
          <month>03</month>
          <volume>163</volume>
          <fpage>104791</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.jbi.2025.104791"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2025.104791</pub-id>
          <pub-id pub-id-type="medline">39938624</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(25)00020-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC12188495</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Poo</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Reflections on DeepSeek's breakthrough</article-title>
          <source>Natl Sci Rev</source>
          <year>2025</year>
          <month>03</month>
          <volume>12</volume>
          <issue>3</issue>
          <fpage>nwaf044</fpage>
          <pub-id pub-id-type="doi">10.1093/nsr/nwaf044</pub-id>
          <pub-id pub-id-type="medline">40041025</pub-id>
          <pub-id pub-id-type="pii">nwaf044</pub-id>
          <pub-id pub-id-type="pmcid">PMC11879125</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kayaalp</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Prill</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sezgin</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Cong</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Królikowska</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hirschmann</surname>
              <given-names>MT</given-names>
            </name>
          </person-group>
          <article-title>DeepSeek versus ChatGPT: multimodal artificial intelligence revolutionizing scientific discovery. From language editing to autonomous content generation—redefining innovation in research and practice</article-title>
          <source>Knee Surg Sports Traumatol Arthrosc</source>
          <year>2025</year>
          <month>05</month>
          <volume>33</volume>
          <issue>5</issue>
          <fpage>1553</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1002/ksa.12628</pub-id>
          <pub-id pub-id-type="medline">39936363</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gibney</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>China's cheap, open AI model DeepSeek thrills scientists</article-title>
          <source>Nature</source>
          <year>2025</year>
          <month>02</month>
          <volume>638</volume>
          <issue>8049</issue>
          <fpage>13</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-025-00229-6</pub-id>
          <pub-id pub-id-type="medline">39849139</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-025-00229-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <source>Progress Test Medizin</source>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ptm-dashboard.charite.de/">https://ptm-dashboard.charite.de/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Karay</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Schauber</surname>
              <given-names>SK</given-names>
            </name>
          </person-group>
          <article-title>A validity argument for progress testing: examining the relation between growth trajectories obtained by progress tests and national licensing examinations using a latent growth curve approach</article-title>
          <source>Med Teach</source>
          <year>2018</year>
          <month>11</month>
          <volume>40</volume>
          <issue>11</issue>
          <fpage>1123</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1080/0142159X.2018.1472370</pub-id>
          <pub-id pub-id-type="medline">29950124</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alfertshofer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Knoedler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hoch</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Cotofana</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Panayi</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Kauke-Navarro</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tullius</surname>
              <given-names>SG</given-names>
            </name>
            <name name-style="western">
              <surname>Orgill</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Austen</surname>
              <given-names>WG</given-names>
            </name>
            <name name-style="western">
              <surname>Pomahac</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Knoedler</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Analyzing question characteristics influencing ChatGPT's performance in 3000 USMLE®-style questions</article-title>
          <source>Med Sci Educ</source>
          <year>2024</year>
          <month>09</month>
          <day>28</day>
          <volume>35</volume>
          <issue>1</issue>
          <fpage>257</fpage>
          <lpage>67</lpage>
          <pub-id pub-id-type="doi">10.1007/s40670-024-02176-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>R Core Team</collab>
          </person-group>
          <article-title>R: a language and environment for statistical computing</article-title>
          <source>R Foundation for Statistical Computing</source>
          <year>2022</year>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.r-project.org/">https://www.r-project.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Champely</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ekstrom</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dalgaard</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gill</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weibelzahl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Anandkumar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ford</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Volcic</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>De Rosario</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>pwr: basic functions for power analysis</article-title>
          <source>The Comprehensive R Archive Network</source>
          <year>2020</year>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/pwr/pwr.pdf">https://cran.r-project.org/web/packages/pwr/pwr.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Leeper</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Becker</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schoch</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>rio: a Swiss-army knife for data I/O</article-title>
          <source>The Comprehensive R Archive Network</source>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/rio/readme/README.html">https://cran.r-project.org/web/packages/rio/readme/README.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wickham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Averick</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bryan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>McGowan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>François</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Grolemund</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Hayes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hester</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhn</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pedersen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bache</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ooms</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Seidel</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Spinu</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Takahashi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Vaughan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wilke</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Woo</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yutani</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Welcome to the Tidyverse</article-title>
          <source>J Open Source Softw</source>
          <year>2019</year>
          <volume>4</volume>
          <issue>43</issue>
          <fpage>1686</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://joss.theoj.org/papers/10.21105/joss.01686"/>
          </comment>
          <pub-id pub-id-type="doi">10.21105/joss.01686</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sjoberg</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Larmarange</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Curry</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lavery</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Whiting</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zabor</surname>
              <given-names>EC</given-names>
            </name>
          </person-group>
          <article-title>gtsummary: presentation-ready data summary and analytic result tables</article-title>
          <source>Daniel D. Sjoberg</source>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.danieldsjoberg.com/gtsummary/reference/gtsummary-package.html">https://www.danieldsjoberg.com/gtsummary/reference/gtsummary-package.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kassambara</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>rstatix: pipe-friendly framework for basic statistical tests</article-title>
          <source>The Comprehensive R Archive Network</source>
          <year>2025</year>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cloud.r-project.org/web/packages/rstatix/index.html">https://cloud.r-project.org/web/packages/rstatix/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Larmarange</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lüdecke</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wickham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bojanowski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Briatte</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>labelled: manipulating labelled data</article-title>
          <source>The Comprehensive R Archive Network</source>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/labelled/index.html">https://cran.r-project.org/web/packages/labelled/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>fastDummies: fast creation of dummy (binary) columns and rows from categorical variables</article-title>
          <source>The Comprehensive R Archive Network</source>
          <year>2025</year>
          <month>07</month>
          <day>22</day>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/fastDummies/fastDummies.pdf">https://cran.r-project.org/web/packages/fastDummies/fastDummies.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Neuwirth</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>RColorBrewer: ColorBrewer palettes</article-title>
          <source>The Comprehensive R Archive Network</source>
          <year>2022</year>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/RColorBrewer/index.html">https://cran.r-project.org/web/packages/RColorBrewer/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilke</surname>
              <given-names>CO</given-names>
            </name>
          </person-group>
          <article-title>cowplot: streamlined plot theme and plot annotations for ggplot2</article-title>
          <source>The Comprehensive R Archive Network</source>
          <year>2025</year>
          <month>07</month>
          <day>07</day>
          <access-date>2025-11-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/cowplot/index.html">https://cran.r-project.org/web/packages/cowplot/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Universität zu Köln</collab>
          </person-group>
          <article-title>Ethikkommission der Medizinischen Fakultät der Universität zu Köln</article-title>
          <source>Universität zu Köln</source>
          <access-date>2025-09-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medfak.uni-koeln.de/forschung-transfer/translation-i-klinische-forschung/ethikkommission">https://medfak.uni-koeln.de/forschung-transfer/translation-i-klinische-forschung/ethikkommission</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Farquhar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kossen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhn</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gal</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Detecting hallucinations in large language models using semantic entropy</article-title>
          <source>Nature</source>
          <year>2024</year>
          <month>06</month>
          <volume>630</volume>
          <issue>8017</issue>
          <fpage>625</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38898292"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-024-07421-0</pub-id>
          <pub-id pub-id-type="medline">38898292</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-024-07421-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC11186750</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tomidis Chatzimanouil</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Goppelt</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Zeissig</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sachs</surname>
              <given-names>UJ</given-names>
            </name>
            <name name-style="western">
              <surname>Laass</surname>
              <given-names>MW</given-names>
            </name>
          </person-group>
          <article-title>Metamizole-induced agranulocytosis (MIA): a mini review</article-title>
          <source>Mol Cell Pediatr</source>
          <year>2023</year>
          <month>08</month>
          <day>17</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>6</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37589909"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s40348-023-00160-8</pub-id>
          <pub-id pub-id-type="medline">37589909</pub-id>
          <pub-id pub-id-type="pii">10.1186/s40348-023-00160-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC10435429</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rudin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Roos</surname>
              <given-names>NJ</given-names>
            </name>
            <name name-style="western">
              <surname>Duthaler</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Krähenbühl</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Toxicity of metamizole on differentiating HL60 cells and human neutrophil granulocytes</article-title>
          <source>Toxicology</source>
          <year>2019</year>
          <month>10</month>
          <day>01</day>
          <volume>426</volume>
          <fpage>152254</fpage>
          <pub-id pub-id-type="doi">10.1016/j.tox.2019.152254</pub-id>
          <pub-id pub-id-type="medline">31356851</pub-id>
          <pub-id pub-id-type="pii">S0300-483X(19)30209-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rodrigues Alessi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gomes</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Lopes de Castro</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Terumy Okamoto</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT in solving questions from the progress test (Brazilian National Medical Exam): a potential artificial intelligence tool in medical practice</article-title>
          <source>Cureus</source>
          <year>2024</year>
          <month>07</month>
          <volume>16</volume>
          <issue>7</issue>
          <fpage>e64924</fpage>
          <pub-id pub-id-type="doi">10.7759/cureus.64924</pub-id>
          <pub-id pub-id-type="medline">39156244</pub-id>
          <pub-id pub-id-type="pmcid">PMC11330648</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Friederichs</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Friederichs</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>März</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in medical school: how successful is AI in progress testing?</article-title>
          <source>Med Educ Online</source>
          <year>2023</year>
          <month>12</month>
          <volume>28</volume>
          <issue>1</issue>
          <fpage>2220920</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/10.1080/10872981.2023.2220920?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/10872981.2023.2220920</pub-id>
          <pub-id pub-id-type="medline">37307503</pub-id>
          <pub-id pub-id-type="pmcid">PMC10262795</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Okuhara</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Shirabe</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Nishiie</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Okada</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kiuchi</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT across different versions in medical licensing examinations worldwide: systematic review and meta-analysis</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <month>07</month>
          <day>25</day>
          <volume>26</volume>
          <fpage>e60807</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e60807/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/60807</pub-id>
          <pub-id pub-id-type="medline">39052324</pub-id>
          <pub-id pub-id-type="pii">v26i1e60807</pub-id>
          <pub-id pub-id-type="pmcid">PMC11310649</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alfertshofer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hoch</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Funk</surname>
              <given-names>PF</given-names>
            </name>
            <name name-style="western">
              <surname>Hollmann</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wollenberg</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Knoedler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Knoedler</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Sailing the seven seas: a multinational comparison of ChatGPT's performance on medical licensing examinations</article-title>
          <source>Ann Biomed Eng</source>
          <year>2024</year>
          <month>06</month>
          <volume>52</volume>
          <issue>6</issue>
          <fpage>1542</fpage>
          <lpage>5</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37553555"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03338-3</pub-id>
          <pub-id pub-id-type="medline">37553555</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03338-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC11082010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikhail</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Farah</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Milad</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nassrallah</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Mihalache</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Milad</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Antaki</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Balas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Popovic</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Feo</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Muni</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Keane</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Duval</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Performance of DeepSeek-R1 in ophthalmology: an evaluation of clinical decision-making and cost-effectiveness</article-title>
          <source>Br J Ophthalmol</source>
          <year>2025</year>
          <month>08</month>
          <day>20</day>
          <volume>109</volume>
          <issue>9</issue>
          <fpage>976</fpage>
          <lpage>81</lpage>
          <pub-id pub-id-type="doi">10.1136/bjo-2025-327360</pub-id>
          <pub-id pub-id-type="medline">40701781</pub-id>
          <pub-id pub-id-type="pii">bjo-2025-327360</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>DeepSeek-R1 outperforms Gemini 2.0 Pro, OpenAI o1, and o3-mini in bilingual complex ophthalmology reasoning</article-title>
          <source>Adv Ophthalmol Pract Res</source>
          <year>2025</year>
          <month>08</month>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>189</fpage>
          <lpage>95</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2667-3762(25)00029-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.aopr.2025.05.001</pub-id>
          <pub-id pub-id-type="medline">40678192</pub-id>
          <pub-id pub-id-type="pii">S2667-3762(25)00029-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC12269606</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mondillo</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Colosimo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Perrotta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Frattolillo</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Masino</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Comparative evaluation of advanced AI reasoning models in pediatric clinical decision support: ChatGPT O1 vs. DeepSeek-R1</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online on January 28, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medrxiv.org/content/10.1101/2025.01.27.25321169v1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2025.01.27.25321169</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Daily briefing: the pros and cons of DeepSeek</article-title>
          <source>Nature</source>
          <year>2025</year>
          <month>01</month>
          <day>30</day>
          <comment>Online ahead of print</comment>
          <pub-id pub-id-type="doi">10.1038/d41586-025-00330-w</pub-id>
          <pub-id pub-id-type="medline">39890911</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-025-00330-w</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wetsch</surname>
              <given-names>WA</given-names>
            </name>
            <name name-style="western">
              <surname>Steinbicker</surname>
              <given-names>AU</given-names>
            </name>
            <name name-style="western">
              <surname>Streichert</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Through ChatGPT's eyes: the large language model's stereotypes and what they reveal about healthcare</article-title>
          <source>J Med Syst</source>
          <year>2025</year>
          <month>02</month>
          <day>05</day>
          <volume>49</volume>
          <issue>1</issue>
          <fpage>20</fpage>
          <pub-id pub-id-type="doi">10.1007/s10916-025-02159-2</pub-id>
          <pub-id pub-id-type="medline">39907718</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10916-025-02159-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arrieta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ugarte</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Valle</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Parejo</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Segura</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>o3-mini vs DeepSeek-R1: which one is safer?</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on January 30, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2501.18438"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2501.18438</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liesenfeld</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dingemanse</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Rethinking open source generative AI: open-washing and the EU AI Act</article-title>
          <year>2024</year>
          <conf-name>FAccT '24: Proceedings of the 2024 ACM Conference on Fairness, Accountability, and Transparency</conf-name>
          <conf-date>June 3-6</conf-date>
          <conf-loc>Rio de Janeiro, Brazil</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.1145/3630106.3659005"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3630106.3659005</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Laymouna</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lessard</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schuster</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Engler</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lebouché</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Roles, users, benefits, and limitations of chatbots in health care: rapid review</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <month>07</month>
          <day>23</day>
          <volume>26</volume>
          <fpage>e56930</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e56930/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/56930</pub-id>
          <pub-id pub-id-type="medline">39042446</pub-id>
          <pub-id pub-id-type="pii">v26i1e56930</pub-id>
          <pub-id pub-id-type="pmcid">PMC11303905</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gabriel</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Bhatia</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Lost in translation: large language models in non-English content analysis</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on June 12, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2306.07377"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2306.07377</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
