<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e91470</article-id><article-id pub-id-type="doi">10.2196/91470</article-id><article-categories><subj-group subj-group-type="heading"><subject>Letter to the Editor</subject></subj-group></article-categories><title-group><article-title>Authors&#x2019; Reply: Critical Limitations in Comparing ChatGPT and DeepSeek for Orthopedic Assessment</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Anusitviwat</surname><given-names>Chirathit</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Suwannaphisit</surname><given-names>Sitthiphong</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bvonpanttarananon</surname><given-names>Jongdee</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tangtrakulwanich</surname><given-names>Boonsin</given-names></name><degrees>MD</degrees><xref 
ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Prince of Songkla University</institution><addr-line>15 Karnjanavanich Road</addr-line><addr-line>Hat Yai</addr-line><addr-line>Songkhla</addr-line><country>Thailand</country></aff><aff id="aff2"><institution>Navamindradhiraj University</institution><addr-line>Bangkok</addr-line><addr-line>Bangkok</addr-line><country>Thailand</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Iannaccio</surname><given-names>Amanda</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Chirathit Anusitviwat, MD, Prince of Songkla University, 15 Karnjanavanich Road, Hat Yai, Songkhla, 90110, Thailand, 66 74451601; <email>pchirathit@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>17</day><month>3</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e91470</elocation-id><history><date date-type="received"><day>15</day><month>01</month><year>2026</year></date><date date-type="rev-recd"><day>14</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>26</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Chirathit Anusitviwat, Sitthiphong Suwannaphisit, Jongdee Bvonpanttarananon, Boonsin Tangtrakulwanich. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 17.3.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e91470"/><related-article related-article-type="commentary-article" ext-link-type="doi" xlink:href="10.2196/75607" xlink:title="Comment on" xlink:type="simple">https://formative.jmir.org/2025/1/e75607</related-article><related-article related-article-type="commentary" ext-link-type="doi" xlink:href="10.2196/90242" xlink:title="Comment in" xlink:type="simple">https://formative.jmir.org/2026/1/e90242/</related-article><kwd-group><kwd>ChatGPT</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>orthopedic</kwd><kwd>multiple-choice question</kwd><kwd>MCQ</kwd></kwd-group></article-meta></front><body><p>We thank you for the useful and constructive comments [<xref ref-type="bibr" rid="ref1">1</xref>] on our article &#x201C;Comparing ChatGPT and DeepSeek for Assessment of Multiple-Choice Questions in Orthopedic Medical Education: Cross-Sectional Study&#x201D; [<xref ref-type="bibr" rid="ref2">2</xref>]. 
This reply addresses the concerns raised in the letter to the editor.</p><sec id="s2"><title>Misinterpretation of Reliability Statistics</title><p>In our study, we administered the multiple-choice questions (MCQs) to ChatGPT and DeepSeek on separate days. All data from the two large language models (LLMs) were evaluated by two assessors. Although two assessors were used for each LLM, the reported Cohen &#x03BA; coefficient values represent within-model interrater reliability, not interrater reliability between the two LLMs [<xref ref-type="bibr" rid="ref3">3</xref>]. Therefore, describing these results as agreement between the two models is inaccurate.</p></sec><sec id="s3"><title>Linguistic Ambiguity and Generalizability</title><p>All MCQs used in our study were administered in English. No Thai-language inputs or translations were used. Therefore, the performance differences between the two models reflect model performance on English-language medical questions rather than variability due to language translation or non-English linguistic processing.</p></sec><sec id="s4"><title>Reproducibility and Interface Transparency</title><p>All models in our study were accessed via web-based user interfaces (UIs), not application programming interfaces. We acknowledge that web-based UIs may be subject to updates and lack version control. However, the web-based version of ChatGPT is easy to access and requires no software installation. It also allows quick testing and exploration without technical or cost barriers, making it well-suited for nontechnical users and educational studies [<xref ref-type="bibr" rid="ref4">4</xref>]. Therefore, we used the web-based UI in our study.</p></sec><sec id="s5"><title>Risk of Data Contamination</title><p>Even though these MCQs have been used for more than 5 years, the MCQs used in our study are from private orthopedic examinations. 
Thus, we believe that these items would not appear in public sources. Future research using newly created MCQs may be better for assessing the capability or efficacy of LLMs.</p></sec><sec id="s6"><title>Data Reporting Discrepancy</title><p>Upon re-examination, we confirm that the correct accuracy for the pelvic and spine injury category (n=19) using the Reason function is indeed 16 of 19, corresponding to approximately 84.2%. The value of 68.8% reported in Table 2 was a typographical error. This error has been corrected through a published corrigendum [<xref ref-type="bibr" rid="ref5">5</xref>].</p></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb2">MCQ</term><def><p>multiple-choice question</p></def></def-item><def-item><term id="abb3">UI</term><def><p>user interface</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayas</surname><given-names>O</given-names> </name><name name-style="western"><surname>Acar</surname><given-names>A</given-names> </name></person-group><article-title>Critical limitations in comparing ChatGPT and DeepSeek for orthopedic assessment</article-title><source>JMIR Form Res</source><year>2026</year><volume>10</volume><fpage>e90242</fpage><pub-id pub-id-type="doi">10.2196/90242</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anusitviwat</surname><given-names>C</given-names> </name><name name-style="western"><surname>Suwannaphisit</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Bvonpanttarananon</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tangtrakulwanich</surname><given-names>B</given-names> </name></person-group><article-title>Comparing ChatGPT and DeepSeek for assessment of multiple-choice questions in orthopedic medical education: cross-sectional study</article-title><source>JMIR Form Res</source><year>2025</year><month>12</month><day>19</day><volume>9</volume><fpage>e75607</fpage><pub-id pub-id-type="doi">10.2196/75607</pub-id><pub-id pub-id-type="medline">41418321</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McHugh</surname><given-names>ML</given-names> </name></person-group><article-title>Interrater reliability: the kappa statistic</article-title><source>Biochem Med (Zagreb)</source><year>2012</year><volume>22</volume><issue>3</issue><fpage>276</fpage><lpage>282</lpage><pub-id pub-id-type="medline">23092060</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Heo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Suh</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Shim</surname><given-names>WH</given-names> </name></person-group><article-title>Uncover this tech term: application programming interface for large language models</article-title><source>Korean J Radiol</source><year>2025</year><month>08</month><volume>26</volume><issue>8</issue><fpage>793</fpage><lpage>796</lpage><pub-id pub-id-type="doi">10.3348/kjr.2025.0360</pub-id><pub-id pub-id-type="medline">40736411</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anusitviwat</surname><given-names>C</given-names> </name><name name-style="western"><surname>Suwannaphisit</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bvonpanttarananon</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tangtrakulwanich</surname><given-names>B</given-names> </name></person-group><article-title>Correction: comparing ChatGPT and DeepSeek for assessment of multiple-choice questions in orthopedic medical education: cross-sectional study</article-title><source>JMIR Form Res</source><year>2026</year><month>02</month><day>26</day><volume>10</volume><fpage>e92549</fpage><pub-id pub-id-type="doi">10.2196/92549</pub-id><pub-id pub-id-type="medline">41747218</pub-id></nlm-citation></ref></ref-list></back></article>