<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e89726</article-id><article-id pub-id-type="doi">10.2196/89726</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>Clinical Evaluation of the Clinical Reasoning Process of Large Language Models in Nephrology: Comparative Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Yano</surname><given-names>Yuichiro</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kakizaki</surname><given-names>Hiroaki</given-names></name><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nagasu</surname><given-names>Hajime</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kishi</surname><given-names>Seiji</given-names></name><degrees>MD, 
PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Koshida</surname><given-names>Takeo</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nihei</surname><given-names>Yoshihito</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hirano</surname><given-names>Akira</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nangaku</surname><given-names>Masaomi</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mori</surname><given-names>Hirotake</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Naito</surname><given-names>Toshio</given-names></name><degrees>MD, MBA, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ohashi</surname><given-names>Mizuki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Maruyama</surname><given-names>Shoichi</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Matsui</surname><given-names>Isao</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Isaka</surname><given-names>Yoshitaka</given-names></name><degrees>MD, 
PhD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Okada</surname><given-names>Hirokazu</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Suzuki</surname><given-names>Yusuke</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kashihara</surname><given-names>Naoki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Department of General Medicine, Juntendo University</institution><addr-line>2-1-1, Hongo, Bunkyo-Ku</addr-line><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Artificial Intelligence Incubation Farm, Faculty of Medicine, Juntendo University</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff3"><institution>PeopleMedia, Inc</institution><addr-line>Osaka</addr-line><country>Japan</country></aff><aff id="aff4"><institution>Department of Nephrology and Hypertension, Kawasaki Medical School</institution><addr-line>Kurashiki</addr-line><addr-line>Okayama</addr-line><country>Japan</country></aff><aff id="aff5"><institution>Department of Nephrology, Juntendo University</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff6"><institution>Division of Nephrology and Endocrinology, The University of Tokyo</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff7"><institution>Department of Nephrology, Nagoya University</institution><addr-line>Nagoya</addr-line><addr-line>Aichi</addr-line><country>Japan</country></aff><aff id="aff8"><institution>Department of Nephrology, The University of 
Osaka</institution><addr-line>Suita</addr-line><addr-line>Osaka</addr-line><country>Japan</country></aff><aff id="aff9"><institution>Department of Nephrology, Saitama Medical University</institution><addr-line>Saitama</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Steenstra</surname><given-names>Ivan</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Kurban</surname><given-names>Hasan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Huang</surname><given-names>Jiajia</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Yuichiro Yano, MD, PhD, Department of General Medicine, Juntendo University, 2-1-1, Hongo, Bunkyo-Ku, Tokyo, 113-8421, Japan, 81 3-3813-3111; <email>y.yano@juntendo.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>3</day><month>6</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e89726</elocation-id><history><date date-type="received"><day>17</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>15</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>23</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Yuichiro Yano, Hiroaki Kakizaki, Hajime Nagasu, Seiji Kishi, Takeo Koshida, Yoshihito Nihei, Akira Hirano, Masaomi Nangaku, Hirotake Mori, Toshio Naito, Mizuki Ohashi, Shoichi Maruyama, Isao Matsui, Yoshitaka Isaka, Hirokazu Okada, Yusuke Suzuki, Naoki Kashihara. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 3.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e89726"/><abstract><p>This study evaluates the dynamic clinical reasoning of 4 leading large language models in complex nephrology cases, demonstrating that while Gemini 2.5 Pro achieved the highest reasoning scores and computational efficiency, all tested models excelled at static data synthesis but shared vulnerabilities in formulating nuanced differential diagnoses and in prospective clinical planning.</p></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>large language models</kwd><kwd>clinical reasoning</kwd><kwd>nephrology</kwd><kwd>evaluation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>While large language models (LLMs) are increasingly being applied in medicine, evaluating their performance relies on static knowledge tests, such as medical licensing examinations, which fail to capture the dynamic, iterative reasoning of real clinical practice [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. 
Recent benchmark studies have begun to address this gap; for instance, the MedR-Bench [<xref ref-type="bibr" rid="ref3">3</xref>] framework evaluates medical LLMs across 3 clinical stages using an automated, artificial intelligence (AI)&#x2013;driven &#x201C;reasoning evaluator&#x201D; to score free-text reasoning. However, while automated metrics provide scalability, they cannot fully replace rigorous human verification in medical contexts.</p><p>The aim of this study is to perform a clinical evaluation of 4 leading LLMs (GPT-o3 [OpenAI], Gemini 2.5 Pro [Google], DeepSeek-R1 [Hangzhou DeepSeek Artificial Intelligence], and Llama 4 Maverick [Meta]) in nephrology [<xref ref-type="bibr" rid="ref4">4</xref>], a specialty renowned for its complex, multisystemic pathologies and diagnostic challenges, using a multiagent architecture for temporal workflows [<xref ref-type="bibr" rid="ref5">5</xref>]. Rather than assessing clinical reasoning with broad automated metrics, we systematically deconstructed the reasoning process into 9 distinct, scorable cognitive steps mapped to real-world workflows.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>A detailed description of the methods is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Briefly, 4 nephrologists used the Delphi method to select 10 cases that met the inclusion criteria from over 100 case reports [<xref ref-type="bibr" rid="ref6">6</xref>]. As permission could not be obtained for 1 case, 9 cases were included in the final analysis.</p><p>We developed a clinical reasoning application using Dify [<xref ref-type="bibr" rid="ref7">7</xref>], a no-code AI development platform that enables the creation of AI agents leveraging LLMs via application programming interface (API) integration. 
We evaluated 4 LLMs: DeepSeek-R1 (DeepSeek-R1-0528, released May 28, 2025), Gemini 2.5 Pro (preview 03&#x2010;25, released May 25, 2025), GPT-o3 (released April 16, 2025), and Llama 4 Maverick (meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8, released July 20, 2025). All were accessed via the together.ai [<xref ref-type="bibr" rid="ref8">8</xref>] API. All 4 LLMs used the same sequential 3-agent architecture designed to mirror the temporal progression of clinical practice. To systematically evaluate model performance, we deconstructed clinical reasoning into 9 cognitive steps (<xref ref-type="table" rid="table1">Table 1</xref>).</p><p>The evaluation of the LLM outputs was conducted on July 20, 2025. The primary outcome was the reasoning quality score, measured on a 3-point scale (0=incorrect; 1=reasonable but suboptimal; 2=correct). The 4 nephrologists independently and blindly scored the randomized, deidentified outputs. Results were aggregated by a blinded researcher. Group differences were assessed using the Kruskal-Wallis test, followed by pairwise comparisons with Holm correction for multiple testing. A 2-sided <italic>P</italic> value &#x003C;.05 was considered statistically significant. Interrater reliability was assessed using the intraclass correlation coefficient (ICC[2,k]). Full prompts and system details are available in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>End-to-end multiagent clinical reasoning workflow and evaluated tasks. 
All agents operated under a strict system prompt with instructions to act as a &#x201C;Japanese nephrologist specialized in clinical reasoning and evidence-based medicine,&#x201D; thinking step-by-step in English while strictly relying only on provided information without assuming new facts.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Agent</td><td align="left" valign="bottom">Clinical stage</td><td align="left" valign="bottom">Input data</td><td align="left" valign="bottom">Nine cognitive steps and evaluated specific reasoning tasks (user prompts)</td><td align="left" valign="bottom">Expected output format and constraints (assistant prompts)</td></tr></thead><tbody><tr><td align="left" valign="top">Agent 1</td><td align="left" valign="top">Initial clinical assessment</td><td align="left" valign="top">Patient&#x2019;s chief concern and brief initial clinical findings</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Step 1: medical summarization; question: &#x201C;Summarize the patient as a medical problem.&#x201D;</p></list-item><list-item><p>Step 2: differential diagnosis; question: &#x201C;Provide three differential diagnoses (most likely &#x0026; &#x2019;must not miss&#x2019;).&#x201D;</p></list-item><list-item><p>Step 3: additional data acquisition; question: &#x201C;Suggest additional questions or physical exams to verify hypotheses.&#x201D;</p></list-item><list-item><p>Step 4: diagnostic planning; question: &#x201C;Recommend diagnostic tests with purposes and expected outcomes.&#x201D;</p></list-item></list></td><td align="left" valign="top">Include age, sex, and time course; present a prioritized differential list with rationale; list purposeful questions or examinations to rule in or out; develop a cost-effective and logical testing plan</td></tr><tr><td align="left" valign="top">Agent 2</td><td align="left" valign="top">Diagnostic refinement</td><td align="left" 
valign="top">Newly obtained diagnostic test results</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Step 5: results interpretation; question: &#x201C;Interpret test results in the context of the patient&#x2019;s condition.&#x201D;</p></list-item><list-item><p>Step 6: diagnostic updating; question: &#x201C;Update the diagnostic thinking and suggest the most likely working diagnosis.&#x201D;</p></list-item><list-item><p>Step 7: treatment formulation; question: &#x201C;Propose a treatment plan with rationale and alternative options.&#x201D;</p></list-item></list></td><td align="left" valign="top">Explain medical significance beyond mere abnormal values; appropriately revise the prioritization of differentials; develop a treatment plan considering both evidence-based medicine and patient factors</td></tr><tr><td align="left" valign="top">Agent 3</td><td align="left" valign="top">Therapeutic evaluation</td><td align="left" valign="top">Information on treatments administered and subsequent clinical outcomes</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Step 8: monitoring and risk mitigation; question: &#x201C;Propose strategies to monitor treatment effectiveness, specific risks, and countermeasures.&#x201D;</p></list-item><list-item><p>Step 9: contingency planning; question: &#x201C;Recommend actions if the condition fails to improve or changes suddenly.&#x201D;</p></list-item></list></td><td align="left" valign="top">List specific measures for evaluating efficacy and side effects; explain the thought process for rapidly reassessing causes and flexibly revising the plan</td></tr></tbody></table></table-wrap></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Under the Ethical Guidelines for Medical and Biological Research Involving Human Subjects in Japan, this study was exempt from institutional review board review and informed consent requirements, as it exclusively involved the secondary analysis of fully 
anonymized, publicly available case reports without accessing personal health information.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Overall performance differed significantly among the 4 LLMs (<xref ref-type="fig" rid="figure1">Figure 1A</xref>). Gemini 2.5 Pro achieved the highest average score (mean 7.57, SD 0.61), followed by GPT-o3 (mean 7.39, SD 0.81), DeepSeek-R1 (mean 7.13, SD 1.03), and Llama 4 Maverick (mean 6.23, SD 0.93). Across all models (<xref ref-type="fig" rid="figure1">Figure 1B</xref>), Q2 and Q7 were the most challenging tasks, with the lowest mean scores (mean 6.56, SD 1.13 and mean 6.58, SD 0.97, respectively), while the models performed best on Q1 and Q6 (mean 7.50, SD 0.77, and mean 7.50, SD 0.85, respectively). <xref ref-type="fig" rid="figure1">Figure 1C</xref> shows a heatmap of average scores by model for each clinical reasoning question. Gemini 2.5 Pro demonstrated superior or competitive performance, particularly on complex tasks such as Q6 (mean 7.89, SD 0.85), Q5 (mean 8.00, SD 0.87), and Q9 (mean 8.00, SD 0.88). The ICC(2,k) was 0.36 (95% CI 0.24&#x2010;0.46). Gemini 2.5 Pro was efficient (mean response time: 124.7, SD 16.0 sec), whereas DeepSeek-R1 incurred the highest computational cost, with the longest response time (mean 249.7, SD 68.6 sec) and highest token use (mean 16,105.1, SD 4508.5) (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). In contrast, Llama 4 Maverick had the shortest response time and the lowest token use.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Scores by model for each case and question, and distribution of scores by clinical reasoning question. (A) Scores for each case and question are displayed by model as box-and-whisker plots. 
Overall differences were assessed with the Kruskal-Wallis test; when significant, pairwise comparisons between models were performed with Holm correction for multiple testing. Asterisks indicate a statistically significant difference (2-sided <italic>P</italic> value &#x003C;.05) in performance compared to Llama 4 Maverick. (B) Box-and-whisker plots show the distribution of scores for each of the 9 clinical reasoning questions, aggregating the results from all 4 LLMs. The boxes represent IQRs, the lines inside the boxes indicate the medians, and the whiskers show the ranges of the data. Triangles mark the mean score for each question. The questions from 1 to 9 were as follows: summary of the medical problem; differential diagnoses and rationale; necessary physical examinations and rationale; plan for investigations/tests; interpretation of test results; reassessment of the differential diagnoses; treatment planning; evaluation of treatment; and management in case of clinical worsening. (C) The values are presented as a heatmap, where red outlines highlight the highest-scoring model for each question. Group differences were assessed using the Kruskal-Wallis test, followed by pairwise comparisons with Holm correction for multiple testing.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e89726_fig01.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>We demonstrate that while LLMs effectively processed clinical data, overall performance varied significantly among models. Gemini 2.5 Pro achieved the highest overall reasoning quality score while maintaining computational efficiency. 
Our step-by-step evaluation revealed a consistent pattern across all models: while they excelled at information synthesis and test interpretation (questions 1 and 6), they shared specific vulnerabilities in higher-order cognitive tasks, particularly in formulating nuanced differential diagnoses (question 2) and planning optimal interventions (question 7).</p><p>Recent benchmarks, such as AMIE, MAI-DxO, and MedR-Bench [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], have advanced AI evaluation from static examinations toward dynamic clinical workflows. However, these frameworks often assess reasoning using broad, automated metrics. By deconstructing clinical reasoning into a stage-gated, multiagent workflow evaluated by domain experts, our study pinpoints exact cognitive bottlenecks. Our results indicate that LLMs struggle heavily with the higher-order, divergent skills required for prospective planning [<xref ref-type="bibr" rid="ref10">10</xref>]. For example, when a nephrotic syndrome patient developed sudden-onset nocardiosis, the models failed to adequately pivot their proposed treatment plans. This exposes a critical gap between static knowledge retrieval and the fluid, iterative reality of complex multisystemic specialties like nephrology.</p><p>Furthermore, our findings challenge the assumption that bigger models are always better. We revealed a nuanced relationship between reasoning quality and computational efficiency. For real-world clinical deployment, where rapid bedside decision-making and sustainable API costs are paramount, identifying models that balance high reasoning quality with low computational overhead is an encouraging and practical metric.</p><p>A primary limitation is the small sample size of 9 nephrology cases evaluated across 9 specific questions, which restricts the generalizability of our findings across diverse, real-world clinical scenarios. 
However, the cases were rigorously curated via Delphi consensus from 104 candidates, prioritizing diagnostic complexity over breadth. By deconstructing these cases to generate 324 expert-scored data points per model, this granular approach might provide information power to detect statistically significant performance differences. Additionally, our reliance on expert-scored evaluation showed limited interrater agreement, and the study currently lacks extensive statistical validation. Our findings should be interpreted as an early, exploratory assessment of AI clinical reasoning rather than a definitively generalizable conclusion.</p><p>Ultimately, future model development in the medical field must move beyond static examinations to prioritize dynamic adaptability and prospective clinical planning. By emphasizing targeted cognitive evaluations and computational efficiency over sheer model size, the medical community can ensure the safe and sustainable integration of AI into real-world bedside practice.</p></sec></body><back><ack><p>The authors used Gemini (3.1 Pro) and ChatGPT (GPT-5) to refine the English composition. All generated content was critically reviewed by the authors. The experimental evaluation of the four models was conducted independently of these writing-assistant tools.</p></ack><notes><sec><title>Funding</title><p>This research was partially funded by the Advanced Medical Personnel Training Program (principal investigator TN) and was supported by the Ministry of Education, Culture, Sports, Science, and Technology of Japan.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>YY conceptualized the study and developed the methodology. HN, SK, TK, and YN curated the data. HK and MO conducted the formal analysis. AH, MN, HM, TN, SM, IM, YI, HO, YS, and NK supervised the study. 
TN acquired funding. YY drafted the manuscript, and all authors reviewed, edited, and approved the final version.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McDuff</surname><given-names>D</given-names> </name><name name-style="western"><surname>Schaekermann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Towards accurate differential diagnosis with large language models</article-title><source>Nature</source><year>2025</year><month>06</month><volume>642</volume><issue>8067</issue><fpage>451</fpage><lpage>457</lpage><pub-id pub-id-type="doi">10.1038/s41586-025-08869-4</pub-id><pub-id pub-id-type="medline">40205049</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Schaekermann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Palepu</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Towards conversational diagnostic artificial intelligence</article-title><source>Nature</source><year>2025</year><month>06</month><volume>642</volume><issue>8067</issue><fpage>442</fpage><lpage>450</lpage><pub-id 
pub-id-type="doi">10.1038/s41586-025-08866-7</pub-id><pub-id pub-id-type="medline">40205050</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qiu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Quantifying the reasoning abilities of LLMs on clinical cases</article-title><source>Nat Commun</source><year>2025</year><month>11</month><day>6</day><volume>16</volume><issue>1</issue><fpage>9799</fpage><pub-id pub-id-type="doi">10.1038/s41467-025-64769-1</pub-id><pub-id pub-id-type="medline">41198657</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boyle</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Martindale</surname><given-names>J</given-names> </name><name name-style="western"><surname>Parsons</surname><given-names>AS</given-names> </name><etal/></person-group><article-title>Development and validation of a formative assessment tool for nephrology fellows&#x2019; clinical reasoning</article-title><source>Clin J Am Soc Nephrol</source><year>2024</year><month>01</month><day>1</day><volume>19</volume><issue>1</issue><fpage>26</fpage><lpage>34</lpage><pub-id pub-id-type="doi">10.2215/CJN.0000000000000315</pub-id><pub-id pub-id-type="medline">37851423</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Berger</surname><given-names>A</given-names> </name><name name-style="western"><surname>Khanna</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Berghaus</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sifa</surname><given-names>R</given-names> </name></person-group><article-title>Reasoning LLMs in the medical domain: a literature survey</article-title><source>arXiv</source><comment>Preprint posted online on Aug 26, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2508.19097</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Humphrey-Murto</surname><given-names>S</given-names> </name><name name-style="western"><surname>Varpio</surname><given-names>L</given-names> </name><name name-style="western"><surname>Gonsalves</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wood</surname><given-names>TJ</given-names> </name></person-group><article-title>Using consensus group methods such as Delphi and nominal group in medical education research</article-title><source>Med Teach</source><year>2017</year><month>01</month><volume>39</volume><issue>1</issue><fpage>14</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1080/0142159X.2017.1245856</pub-id><pub-id pub-id-type="medline">27841062</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="web"><source>Dify</source><access-date>2025-12-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://dify.ai/jp">https://dify.ai/jp</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><source>together.ai</source><year>2025</year><access-date>2025-12-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.together.ai">https://www.together.ai</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Nori</surname><given-names>H</given-names> </name><name name-style="western"><surname>Daswani</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kelly</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Sequential diagnosis with language models</article-title><source>arXiv</source><comment>Preprint posted online on Jun 27, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2506.22405</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hager</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jungmann</surname><given-names>F</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title><source>Nat Med</source><year>2024</year><month>09</month><volume>30</volume><issue>9</issue><fpage>2613</fpage><lpage>2622</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id><pub-id pub-id-type="medline">38965432</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Delphi case selection process, multiagent system architecture, prompt design, and comprehensive model responses across 9 structured clinical cases.</p><media xlink:href="formative_v10i1e89726_app1.docx" xlink:title="DOCX File, 2308 KB"/></supplementary-material></app-group></back></article>