<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e90242</article-id><article-id pub-id-type="doi">10.2196/90242</article-id><article-categories><subj-group subj-group-type="heading"><subject>Letter to the Editor</subject></subj-group></article-categories><title-group><article-title>Critical Limitations in Comparing ChatGPT and DeepSeek for Orthopedic Assessment</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Ayas</surname><given-names>Orhan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Acar</surname><given-names>Alaeddin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Orthopedics and Traumatology, Fethi Sekin City Hospital</institution><addr-line>Elaz&#x0131;&#x011F;</addr-line><country>Turkey</country></aff><aff id="aff2"><institution>Department of Neurosurgery, Kulu State Hospital</institution><addr-line>No 4, 139518 Street, Dinek, 
Kulu</addr-line><addr-line>Konya</addr-line><country>Turkey</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Iannaccio</surname><given-names>Amanda</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Alaeddin Acar, MD, Department of Neurosurgery, Kulu State Hospital, No 4, 139518 Street, Dinek, Kulu, Konya, 42770, Turkey, 90 542 472 37 23; <email>alaeacar@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>17</day><month>3</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e90242</elocation-id><history><date date-type="received"><day>23</day><month>12</month><year>2025</year></date><date date-type="accepted"><day>26</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Orhan Ayas, Alaeddin Acar. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 17.3.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e90242"/><related-article related-article-type="commentary-article" ext-link-type="doi" xlink:href="10.2196/75607" xlink:title="Comment on" xlink:type="simple">https://formative.jmir.org/2025/1/e75607</related-article><related-article related-article-type="commentary" ext-link-type="doi" xlink:href="10.2196/91470" xlink:title="Comment in" xlink:type="simple">https://formative.jmir.org/2026/1/e91470</related-article><kwd-group><kwd>ChatGPT</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>orthopedic</kwd><kwd>multiple-choice question</kwd><kwd>MCQ</kwd></kwd-group></article-meta></front><body><p>We read with great interest the study by Anusitviwat et al [<xref ref-type="bibr" rid="ref1">1</xref>], which compared the performance of ChatGPT and DeepSeek in orthopedic examinations. While the study provides timely insights into the utility of large language models (LLMs) in medical education, we identified specific methodological and terminological limitations that warrant clarification to ensure the validity and reproducibility of the findings.</p><sec id="s2"><title>Misinterpretation of Reliability Statistics</title><p>The authors state that the &#x201C;interrater reliability between the two LLMs&#x201D; was evaluated using the Cohen &#x03BA; coefficient [<xref ref-type="bibr" rid="ref1">1</xref>]. Mathematically, measuring the agreement between two independent raters (interrater) yields a single coefficient. However, the results report two separate values: &#x03BA; of 0.81 for ChatGPT and &#x03BA; of 0.78 for DeepSeek [<xref ref-type="bibr" rid="ref1">1</xref>]. 
This finding, combined with the methodology stating that questions were input on &#x201C;separate days&#x201D; [<xref ref-type="bibr" rid="ref1">1</xref>], indicates that the study actually measured intramodel consistency (test-retest reliability) rather than the agreement between the models. Labeling internal consistency as &#x201C;interrater reliability&#x201D; is terminologically inaccurate and misrepresents the statistical relationship between the two models.</p></sec><sec id="s3"><title>Linguistic Ambiguity and Generalizability</title><p>The manuscript does not specify the language of the input multiple-choice questions (Thai or English) used in the assessments. This omission is critical, as the impact of input language on LLM performance is well-documented. For instance, Noda et al [<xref ref-type="bibr" rid="ref2">2</xref>] demonstrated that GPT-4V&#x2019;s accuracy on the Japanese Otolaryngology Board Examination significantly improved from 24.7% (Japanese input) to 47.3% when translated into English. This finding underscores that models optimized for English exhibit distinct performance disparities in non-English languages. Without clarifying whether the assessments were administered in the local language or English, it is impossible to determine if the reported accuracy gap between ChatGPT (80.4%) and DeepSeek (74.2%) stems from medical reasoning capabilities or linguistic processing proficiency.</p></sec><sec id="s4"><title>Reproducibility and Interface Transparency</title><p>The methodology reports the use of &#x201C;Reason&#x201D; and &#x201C;DeepThink&#x201D; functions but does not explicitly state whether the models were accessed via web-based user interfaces or application programming interfaces [<xref ref-type="bibr" rid="ref1">1</xref>]. This distinction is vital for reproducibility. Web-based user interfaces are subject to opaque updates and lack the stability of controlled application programming interface environments. 
Without defining the access method and the specific prompt structures used, the experimental conditions cannot be replicated.</p></sec><sec id="s5"><title>Risk of Data Contamination</title><p>The authors note that the multiple-choice questions &#x201C;have been used in orthopedic examinations for more than 5 years.&#x201D; This longevity significantly increases the risk of data contamination, as older items likely exist in public repositories within LLM training corpora, potentially conflating memorization with reasoning. To ensure validity, recent benchmarks use private datasets (Busch et al [<xref ref-type="bibr" rid="ref3">3</xref>]) or questions postdating the model&#x2019;s training cutoff (Noda et al [<xref ref-type="bibr" rid="ref2">2</xref>]). The absence of such controls in this study undermines the internal validity of the comparison.</p></sec><sec id="s6"><title>Data Reporting Discrepancy</title><p>Finally, we noted a minor discrepancy in Table 2. In the &#x201C;Pelvic and spine injury&#x201D; category (n=19), the accuracy for the Reason function is listed as 16 (68.8%) [<xref ref-type="bibr" rid="ref1">1</xref>]. Mathematically, 16 of 19 corresponds to approximately 84.2%, not 68.8%. 
We respectfully invite the authors to clarify this value to ensure the precision of the tabulated data.</p></sec></body><back><ack><p>Google Gemini was used for language editing.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anusitviwat</surname><given-names>C</given-names> </name><name name-style="western"><surname>Suwannaphisit</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bvonpanttarananon</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tangtrakulwanich</surname><given-names>B</given-names> </name></person-group><article-title>Comparing ChatGPT and DeepSeek for assessment of multiple-choice questions in orthopedic medical education: cross-sectional study</article-title><source>JMIR Form Res</source><year>2025</year><month>12</month><day>19</day><volume>9</volume><fpage>e75607</fpage><pub-id pub-id-type="doi">10.2196/75607</pub-id><pub-id pub-id-type="medline">41418321</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noda</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ueno</surname><given-names>T</given-names> </name><name name-style="western"><surname>Koshu</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Performance of GPT-4V in answering the Japanese Otolaryngology Board Certification Examination questions: evaluation study</article-title><source>JMIR Med 
Educ</source><year>2024</year><month>03</month><day>28</day><volume>10</volume><fpage>e57054</fpage><pub-id pub-id-type="doi">10.2196/57054</pub-id><pub-id pub-id-type="medline">38546736</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Busch</surname><given-names>F</given-names> </name><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Makowski</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Truhn</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bressem</surname><given-names>KK</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>L</given-names> </name></person-group><article-title>Integrating text and image analysis: exploring GPT-4V&#x2019;s capabilities in advanced radiological applications across subspecialties</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>1</day><volume>26</volume><fpage>e54948</fpage><pub-id pub-id-type="doi">10.2196/54948</pub-id><pub-id pub-id-type="medline">38691404</pub-id></nlm-citation></ref></ref-list></back></article>