<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e76618</article-id><article-id pub-id-type="doi">10.2196/76618</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Teaching Clinical Reasoning in Health Care Professions Learners Using AI-Generated Script Concordance Tests: Mixed Methods Formative Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Hudon</surname><given-names>Alexandre</given-names></name><degrees>BEng, MSc, MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Phan</surname><given-names>V&#x00E9;ronique</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff6">6</xref><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Charlin</surname><given-names>Bernard</given-names></name><degrees>MEd, MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wittmer</surname><given-names>Ren&#x00E9;</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Psychiatry and Addictology, Faculty of Medicine, Universit&#x00E9; de Montr&#x00E9;al</institution><addr-line>Pavillon Roger-Gaudry, 2900 Bd &#x00C9;douard-Montpetit Local L-315</addr-line><addr-line>Montr&#x00E9;al</addr-line><addr-line>QC</addr-line><country>Canada</country></aff><aff id="aff2"><institution>Department of Psychiatry, Institut universitaire en sant&#x00E9; mentale de Montr&#x00E9;al</institution><addr-line>Montreal</addr-line><addr-line>QC</addr-line><country>Canada</country></aff><aff id="aff3"><institution>Centre de recherche de l'Institut universitaire en sant&#x00E9; mentale de Montr&#x00E9;al</institution><addr-line>Montreal</addr-line><addr-line>QC</addr-line><country>Canada</country></aff><aff id="aff4"><institution>Department of Psychiatry, Institut national de psychiatrie l&#x00E9;gale Philippe-Pinel</institution><addr-line>Montreal</addr-line><addr-line>QC</addr-line><country>Canada</country></aff><aff id="aff5"><institution>Groupe Interdisciplinaire de recherche sur la cognition et le raisonnement professionnel (GIRCoPRo), Universit&#x00E9; de Montr&#x00E9;al</institution><addr-line>Montreal</addr-line><addr-line>QC</addr-line><country>Canada</country></aff><aff id="aff6"><institution>Department of Pediatrics, Faculty of Medicine, Universit&#x00E9; de Montr&#x00E9;al</institution><addr-line>Montreal</addr-line><addr-line>QC</addr-line><country>Canada</country></aff><aff id="aff7"><institution>Department of Pediatrics, Centre Hospitalier Universitaire
Sainte-Justine</institution><addr-line>Montreal</addr-line><addr-line>QC</addr-line><country>Canada</country></aff><aff id="aff8"><institution>Centre de p&#x00E9;dagogie appliqu&#x00E9;e aux sciences de la sant&#x00E9;, Faculty of Medicine, Universit&#x00E9; de Montr&#x00E9;al</institution><addr-line>Montreal</addr-line><addr-line>QC</addr-line><country>Canada</country></aff><aff id="aff9"><institution>Department of Family Medicine and Emergency Medicine, Faculty of Medicine, Universit&#x00E9; de Montr&#x00E9;al</institution><addr-line>Montreal</addr-line><addr-line>QC</addr-line><country>Canada</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Potla</surname><given-names>Ravi Teja</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Guo</surname><given-names>Song-Bin</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kiyak</surname><given-names>Yavuz Selim</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Alexandre Hudon, BEng, MSc, MD, PhD, Department of Psychiatry and Addictology, Faculty of Medicine, Universit&#x00E9; de Montr&#x00E9;al, Pavillon Roger-Gaudry, 2900 Bd &#x00C9;douard-Montpetit Local L-315, Montr&#x00E9;al, QC, H3T 1J4, Canada, 1 514 343 6111; <email>alexandre.hudon.1@umontreal.ca</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>20</day><month>11</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e76618</elocation-id><history><date date-type="received"><day>27</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>19</day><month>10</month><year>2025</year></date><date
date-type="accepted"><day>10</day><month>11</month><year>2025</year></date></history><copyright-statement>&#x00A9; Alexandre Hudon, V&#x00E9;ronique Phan, Bernard Charlin, Ren&#x00E9; Wittmer. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 20.11.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e76618"/><abstract><sec><title>Background</title><p>The integration of artificial intelligence (AI) in medical education is evolving, offering new tools to enhance teaching and assessment. Among these, script concordance tests (SCTs) are well-suited to evaluate clinical reasoning in contexts of uncertainty. Traditionally, SCTs require expert panels for scoring and feedback, which can be resource-intensive. 
Recent advances in generative AI, particularly large language models (LLMs), suggest the possibility of replacing human experts with simulated ones, though this potential remains underexplored.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate whether LLMs can effectively simulate expert judgment in SCTs by using generative AI to author, score, and provide feedback for SCTs in cardiology and pneumology. A secondary objective was to assess students&#x2019; perceptions of the test&#x2019;s difficulty and the pedagogical value of AI-generated feedback.</p></sec><sec sec-type="methods"><title>Methods</title><p>A cross-sectional, mixed methods study was conducted with 25 second-year medical students who completed a 32-item SCT authored by ChatGPT-4o (OpenAI). Six LLMs (3 trained on the course material and 3 untrained) served as simulated experts to generate scoring keys and feedback. Students answered SCT questions, rated perceived difficulty, and selected the most helpful feedback explanation for each item. Quantitative analysis included scoring, difficulty ratings, and correlations between student and AI responses. Qualitative comments were thematically analyzed.</p></sec><sec sec-type="results"><title>Results</title><p>The average student score was 22.8 out of 32 (SD 1.6), with scores ranging from 19.75 to 26.75. Trained AI systems showed significantly higher concordance with student responses (&#x03C1;=0.64) than untrained models (&#x03C1;=0.41). AI-generated feedback was rated as most helpful in 62.5% of cases, especially when provided by trained models. The SCT demonstrated good internal consistency (Cronbach &#x03B1;=0.76), and students reported moderate perceived difficulty (mean 3.7, SD 1.1). 
Qualitative feedback highlighted appreciation for SCTs as reflective tools, while recommending clearer guidance on Likert-scale use and more contextual detail in vignettes.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This is among the first studies to demonstrate that trained generative AI models can reliably simulate expert clinical reasoning within a script-concordance framework. The findings suggest that AI can both streamline SCT design and offer educationally valuable feedback without compromising authenticity. Future studies should explore longitudinal effects on learning and assess how hybrid models (human and AI) can optimize reasoning instruction in medical education.</p></sec></abstract><kwd-group><kwd>script concordance test</kwd><kwd>clinical reasoning</kwd><kwd>artificial intelligence</kwd><kwd>large language models</kwd><kwd>medical education</kwd><kwd>formative assessment</kwd><kwd>generative AI</kwd><kwd>expert simulation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>One of the biggest challenges in medical education is evaluating clinical capabilities, especially higher-order abilities such as clinical reasoning and decision-making. Knowledge-based tests, such as multiple-choice questions, frequently fall short of capturing the subtleties of clinical reasoning in the face of diagnostic uncertainty [<xref ref-type="bibr" rid="ref1">1</xref>]. By simulating clinical interactions, tools such as workplace-based assessments and Objective Structured Clinical Examinations try to assess clinical reasoning, but they come with a high logistical and human cost. An additional method for addressing thinking in clinical environments with unclear definitions is the script concordance test (SCT). 
In contrast to standard assessments, SCTs ask students to use a Likert-type scale to rate the impact of scenarios that are followed by additional clinical information on a postulated hypothesis. Concordance with a panel of expert responses serves as the basis for scoring, which reflects the variety and probabilistic nature of actual clinical practice [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. SCTs are valuable, but creating and scoring them requires time and involves hiring several specialists who must give careful, consistent answers to a variety of questions.</p><p>The format of an SCT usually involves presenting a clinical vignette followed by a diagnostic, investigative, or therapeutic hypothesis. A new piece of information is then introduced, and the test-taker must judge, using a Likert-type scale, how this new data affects the likelihood or relevance of the hypothesis. Responses are scored by comparing the learner&#x2019;s choices to those of a panel of experienced clinicians, with partial credit awarded based on the distribution of expert answers rather than a single correct response. This scoring system captures the variability in expert judgment and allows the SCT to measure concordance with expert reasoning rather than factual recall. Validity evidence for the SCT includes its ability to discriminate between levels of expertise, its positive impact on cognitive engagement, and its reliability when a sufficient number of items and panel members are used [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. As such, SCTs are now used internationally in both undergraduate and postgraduate medical education, particularly in specialties that demand complex clinical judgment.</p><p>Artificial intelligence (AI) algorithms, such as large language models (LLMs), have developed in recent years, presenting a possible paradigm change in this field. 
Having been trained on extensive text datasets, LLMs are deep learning models that can produce language that is similar to that of a person, summarize intricate concepts, and even mimic domain-specific reasoning [<xref ref-type="bibr" rid="ref6">6</xref>]. These models, as opposed to rule-based systems, generate context-aware responses, often with good fluency, by identifying patterns in unstructured data. LLMs have demonstrated the ability to replicate patient or clinician discourse, produce high-quality feedback, and perform at or near the passing threshold on United States Medical Licensing Examination&#x2013;style questions [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>In the field of LLMs used to generate SCTs, several studies have demonstrated the usefulness of this tool for creating clinical vignettes in medical education and other health sciences fields [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. A recent study highlighted the educational quality of script concordance vignettes created using ChatGPT (OpenAI [<xref ref-type="bibr" rid="ref11">11</xref>]). To use tools based on generative AI in the design of educational materials, and to improve efficiency while optimizing human and material resources, it is essential to evaluate the level of difficulty of the proposed scenarios and questions [<xref ref-type="bibr" rid="ref12">12</xref>]. Moreover, to our knowledge, no study has yet demonstrated the ability of generative AI to embody the role of a clinical expert in selecting the most appropriate response to a given scenario and in providing feedback to learners.</p><p>The main objective of this project is to explore the performance of generative AI in its ability to create SCTs for undergraduate medical students and to embody the role of an expert. 
A secondary objective is to evaluate how medical learners perceive the difficulty level of the AI-generated SCTs, as well as their level of appreciation for each type of feedback provided.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This study used a cross-sectional, mixed methods experimental design to evaluate the performance of medical students on a generative AI&#x2013;authored formative SCT and to assess the use of AI systems as content experts for both scoring and providing formative feedback. The flow diagram of this study is provided in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Protocol flow diagram for this study. AI: artificial intelligence; SCT: script concordance test.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e76618_fig01.png"/></fig></sec><sec id="s2-2"><title>Participants and Recruitment</title><p>The study was conducted in November 2024 at the Universit&#x00E9; de Montr&#x00E9;al&#x2019;s Faculty of Medicine. To be included, participants needed to be enrolled in the second year of the undergraduate medical education program, have completed their cardiology course, and be preparing for their respirology exam, having completed the related course materials. This timing allowed for the inclusion of clinical reasoning questions involving symptoms and signs that may relate to both systems, given their close physiological and clinical connections. An email was sent to all potential participants to register for this formative examination.</p></sec><sec id="s2-3"><title>Test Development and Structure</title><p>The SCT was developed using generative AI (ChatGPT-4o) following established guidelines for script concordance testing [<xref ref-type="bibr" rid="ref13">13</xref>]. 
Four clinical vignettes (chronic obstructive pulmonary disease [COPD] exacerbation, hemoptysis, chest pain, and acute cough) were created, each containing 8 items for a total of 32. Prompts were standardized to ensure consistent vignette structure, 5-point Likert scaling (ranging from not likely to very likely, indicating how the new information would influence the hypothesis), and reasoning alignment across cardiopulmonary domains. Each item followed the classic SCT structure: (1) a clinical scenario introducing uncertainty, (2) a diagnostic, investigative, or therapeutic hypothesis, and (3) a new clinical finding. Items were designed and reviewed to reflect varying degrees of diagnostic ambiguity, probabilistic reasoning, and real-world uncertainty.</p><p>The full test, in French and its English translation, is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The following prompts were used to design the SCTs: (1) act as an expert in health sciences education at the university level, specializing in cardiology and pulmonology; (2) act as an expert in designing SCT vignettes; (3) generate a script concordance vignette that includes 8 questions based on a theme integrating both pulmonology and cardiology; and (4) create questions related to the vignette that begin with: &#x201C;If you are considering &#x2018;a diagnostic hypothesis&#x2019; and you find &#x2018;a sign or symptom,&#x2019; how does this finding affect the likelihood of the hypothesis?&#x201D; (using a Likert scale from &#x2212;2 to +2, which refers to not likely to very likely).</p><p>While the AI-generated items were designed to follow the canonical SCT format (clinical vignette, hypothesis, new information, and Likert-scale inference), some questions did not fully adhere to this structure.
This occurred when the generative model incorporated redundant information from the vignette into subsequent items, occasionally blurring the distinction between context and new data. The prompt did not explicitly constrain the model to exclude previously stated information, which likely contributed to partial format deviations.</p></sec><sec id="s2-4"><title>Use of Generative AI as an Expert Panel</title><p>The reference panel comprised 6 LLMs, chosen to balance diversity of reasoning with practical feasibility. Although the classical SCT format recommends 15&#x2010;20 human experts to ensure adequate response variability [<xref ref-type="bibr" rid="ref5">5</xref>], LLMs can rapidly generate consistent probabilistic judgments across multiple iterations. Pilot testing indicated that 6 distinct model architectures provided sufficient variability in reasoning patterns to construct an aggregated scoring key while keeping the process computationally manageable for formative evaluation. These models were divided into two groups:</p><list list-type="order"><list-item><p>Trained AI experts (n=3): these systems (ChatGPT-4o, Claude 3.7 [Anthropic], and Microsoft 365 Copilot) were not fine-tuned or modified at the model-parameter level. Instead, each was contextually conditioned using course-specific documents (lecture notes and clinical guidelines from the cardiopulmonary block) that were embedded directly into the prompt context before generating responses. This approach (often referred to as in-context learning or prompt-based domain adaptation) allowed the models to access relevant curricular information without altering their underlying architecture.</p></list-item><list-item><p>Untrained AI experts (n=3): the same base models were used without additional curricular material or contextual embedding, representing their general-purpose configuration.</p></list-item></list><p>Each AI system was prompted to respond to all 32 SCT items as if it were a senior clinical expert. 
Their responses were used to establish scoring keys and to generate formative feedback explanations for each item. The modal response across all 6 AI experts served as the reference answer for scoring.</p><p>The prompts used to generate feedback were the same across the different AI systems. They were: (1) act as a medical expert in cardiology and pneumology; (2) act as a SCT expert; (3) for this vignette and each of its related questions ([Insert Vignette] and [Insert Questions]), please pick the most appropriate answer from the Likert scale; and (4) for each of your answers, please provide an explanation as if you were trying to explain your reasoning to a medical student.</p></sec><sec id="s2-5"><title>Student Completion and Data Collection</title><p>Participants completed a 32-question AI-generated SCT via a Google Form. For each item, students (1) selected a Likert-scale response, (2) rated the item&#x2019;s perceived difficulty on a 7-point scale, (3) identified the most and least helpful feedback explanations among 6 anonymous AI-generated rationales, and (4) indicated whether they believed the question was AI- or human-authored. Open-ended comment boxes captured qualitative impressions. Completion was voluntary and anonymous, with implied consent through submission. The total estimated completion time was 35&#x2010;45 minutes. Responses were exported and cleaned in Microsoft Excel for analysis.</p></sec><sec id="s2-6"><title>Scoring and Analysis</title><p>Student answers were scored using the aggregate partial credit method: (1) one point was awarded if the student&#x2019;s response matched the modal expert answer, (2) partial credit (eg, 0.75 and 0.5) was granted if the response matched a minority AI expert choice, and (3) zero points were given if the response did not match any AI panel member.</p><p>Descriptive statistics were computed for total scores, individual item performance, and perceived difficulty. 
Internal consistency was measured using Cronbach &#x03B1;. AI-to-AI and AI-to-student concordance was evaluated using Spearman rank correlation (&#x03C1;).</p><p>In the feedback evaluation component, student preferences for expert rationales were tabulated across all items. Each expert (AI system) received a score reflecting how frequently its feedback was selected as the &#x201C;most helpful&#x201D; and &#x201C;least helpful,&#x201D; allowing comparison between trained and untrained AI systems as feedback providers.</p></sec><sec id="s2-7"><title>Qualitative Analysis</title><p>Open-ended responses were analyzed using an inductive thematic approach. Comments were reviewed, coded iteratively, and grouped into major and minor themes by 2 independent reviewers. Discrepancies were resolved through discussion and consensus. Themes focused on student perceptions of the SCT format, feedback quality, and suggestions for improvement.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>This study was approved by the Universit&#x00E9; de Montr&#x00E9;al&#x2019;s Research Ethics Committee in Education and Psychology under the project name 2024&#x2010;6168. Participation was anonymous and voluntary, with implied consent obtained through survey completion.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Participants</title><p>A total of 25 students volunteered to complete the SCT in a formative, nonevaluative setting. Each student also evaluated the difficulty of each question, selected the most and least helpful expert feedback, and provided qualitative comments on individual questions and on the overall examination. 
Six generative AI systems (3 trained on course content and 3 untrained) completed the test and provided justifications for each item, which served as immediate feedback for the students.</p></sec><sec id="s3-2"><title>Student Performance</title><p>Among the 25 second-year medical students who completed the SCT, the average score was 22.8 out of 32 (SD 1.6), with individual scores ranging from 19.75 to 26.75. The distribution of scores approximated a normal curve with a slight left skew, indicating generally strong alignment between student responses and the AI expert panel, while still highlighting variability in performance. This range suggests that the AI-generated SCT was capable of differentiating clinical reasoning abilities among participants, supporting its discriminatory power even in a formative context. The full distribution of results for each question is provided in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Distribution of the test results (N=25).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question (English)</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Range (min-max)</td><td align="left" valign="bottom">Median (IQR)</td></tr></thead><tbody><tr><td align="left" valign="top">Q1: To assess an acute COPD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> exacerbation, determine the impact of an oxygen saturation of 85%.</td><td align="left" valign="top">0.72 (0.15)</td><td align="left" valign="top">0.25-1</td><td align="left" valign="top">0.75 (0.75-0.75)</td></tr><tr><td align="left" valign="top">Q2: To evaluate acute pulmonary edema, judge the impact of a recent respiratory infection.</td><td align="left" valign="top">0.65 (0.2)</td><td align="left" valign="top">0-0.75</td><td align="left" valign="top">0.75 (0.75-0.75)</td></tr><tr><td align="left" 
valign="top">Q3: To assess the hypothesis of pulmonary embolism, determine the impact of dyspnea and tachycardia.</td><td align="left" valign="top">0.76 (0.17)</td><td align="left" valign="top">0.25-1</td><td align="left" valign="top">0.75 (0.75-0.75)</td></tr><tr><td align="left" valign="top">Q4: To evaluate myocardial dysfunction, judge the influence of a high heart rate.</td><td align="left" valign="top">0.56 (0.27)</td><td align="left" valign="top">0-1</td><td align="left" valign="top">0.75 (0.50-0.75)</td></tr><tr><td align="left" valign="top">Q5: To differentiate pulmonary edema from asthma exacerbation, assess the importance of a chest X-ray.</td><td align="left" valign="top">0.89 (0.21)</td><td align="left" valign="top">0.5-1</td><td align="left" valign="top">1 (1-1)</td></tr><tr><td align="left" valign="top">Q6: To evaluate a COPD exacerbation, determine how a history of smoking affects the hypothesis.</td><td align="left" valign="top">0.82 (0.15)</td><td align="left" valign="top">0.5-1</td><td align="left" valign="top">0.75 (0.75-1)</td></tr><tr><td align="left" valign="top">Q7: To guide diagnosis in respiratory distress, assess the importance of history-taking.</td><td align="left" valign="top">0.97 (0.11)</td><td align="left" valign="top">0.5-1</td><td align="left" valign="top">1 (1-1)</td></tr><tr><td align="left" valign="top">Q8: To diagnose pulmonary edema, judge the influence of acute electrocardiogram ischemia.</td><td align="left" valign="top">0.79 (0.19)</td><td align="left" valign="top">0.25-1</td><td align="left" valign="top">0.75 (0.75-1)</td></tr><tr><td align="left" valign="top">Q9: To assess tuberculosis as a differential diagnosis, evaluate the impact of night sweats and weight loss.</td><td align="left" valign="top">0.78 (0.08)</td><td align="left" valign="top">0.75-1</td><td align="left" valign="top">0.75 (0.75-0.75)</td></tr><tr><td align="left" valign="top">Q10: To assess the likelihood of chronic pulmonary hemoptysis, judge the 
influence of a history of bronchiectasis.</td><td align="left" valign="top">0.76 (0.13)</td><td align="left" valign="top">0.25-1</td><td align="left" valign="top">0.75 (0.75-0.75)</td></tr><tr><td align="left" valign="top">Q11: To assess life-threatening hemoptysis requiring urgent care, evaluate the impact of an oxygen saturation of 92%.</td><td align="left" valign="top">0.59 (0.32)</td><td align="left" valign="top">0-1</td><td align="left" valign="top">0.75 (0.25-0.75)</td></tr><tr><td align="left" valign="top">Q12: To evaluate bronchopulmonary cancer, determine the influence of a 30 pack-year smoking history.</td><td align="left" valign="top">0.86 (0.13)</td><td align="left" valign="top">0.75-1</td><td align="left" valign="top">0.75 (0.75-1)</td></tr><tr><td align="left" valign="top">Q13: To orient toward an infectious etiology like tuberculosis or pneumonia, assess the usefulness of the patient&#x2019;s history.</td><td align="left" valign="top">1 (0)</td><td align="left" valign="top">1-1</td><td align="left" valign="top">1 (1-1)</td></tr><tr><td align="left" valign="top">Q14: To distinguish possible causes of hemoptysis, assess the importance of performing a chest X-ray.</td><td align="left" valign="top">1 (0)</td><td align="left" valign="top">1-1</td><td align="left" valign="top">1 (1-1)</td></tr><tr><td align="left" valign="top">Q15: To assess active bronchiectasis, judge the interaction between smoking history and risk of complications.</td><td align="left" valign="top">0.82 (0.14)</td><td align="left" valign="top">0.5-1</td><td align="left" valign="top">0.75 (0.75-1)</td></tr><tr><td align="left" valign="top">Q16: To diagnose tuberculosis-related hemoptysis, evaluate the influence of prior tuberculosis exposure.</td><td align="left" valign="top">0.86 (0.13)</td><td align="left" valign="top">0.75-1</td><td align="left" valign="top">0.75 (0.75-1)</td></tr><tr><td align="left" valign="top">Q17: To assess acute coronary syndrome (ACS), evaluate the impact of 
ST elevation in DII, DIII, and aVF<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>.</td><td align="left" valign="top">0.99 (0.05)</td><td align="left" valign="top">0.75-1</td><td align="left" valign="top">1 (1-1)</td></tr><tr><td align="left" valign="top">Q18: To evaluate atypical ACS presentation, assess the influence of diabetes.</td><td align="left" valign="top">0.75 (0.16)</td><td align="left" valign="top">0.5-1</td><td align="left" valign="top">0.75 (0.75-0.75)</td></tr><tr><td align="left" valign="top">Q19: To assess aortic dissection as a cause of pain, evaluate the impact of hypertension.</td><td align="left" valign="top">0.77 (0.1)</td><td align="left" valign="top">0.5-1</td><td align="left" valign="top">0.75 (0.75-0.75)</td></tr><tr><td align="left" valign="top">Q20: To evaluate suspected pneumothorax, judge how pain radiation influences the hypothesis.</td><td align="left" valign="top">0.29 (0.27)</td><td align="left" valign="top">0-1</td><td align="left" valign="top">0.25 (0.25-0.25)</td></tr><tr><td align="left" valign="top">Q21: To differentiate between ACS and pulmonary embolism, assess the importance of the electrocardiogram.</td><td align="left" valign="top">0.96 (0.12)</td><td align="left" valign="top">0.5-1</td><td align="left" valign="top">1 (1-1)</td></tr><tr><td align="left" valign="top">Q22: To evaluate pulmonary embolism, determine the influence of elevated blood pressure on the hypothesis.</td><td align="left" valign="top">0.55 (0.23)</td><td align="left" valign="top">0.25-0.75</td><td align="left" valign="top">0.75 (0.25-0.75)</td></tr><tr><td align="left" valign="top">Q23: To assess ACS likelihood, judge how chest pain radiation differs between male and female patients.</td><td align="left" valign="top">0.77 (0.2)</td><td align="left" valign="top">0.25-1</td><td align="left" valign="top">0.75 (0.75-1)</td></tr><tr><td align="left" valign="top">Q24: To determine the urgency of management, assess the influence of diabetes and 
hypertension.</td><td align="left" valign="top">0.78 (0.21)</td><td align="left" valign="top">0.5-1</td><td align="left" valign="top">0.75 (0.50-1)</td></tr><tr><td align="left" valign="top">Q25: To assess community-acquired pneumonia, evaluate the impact of initial fever on the hypothesis.</td><td align="left" valign="top">0.86 (0.13)</td><td align="left" valign="top">0.75-1</td><td align="left" valign="top">0.75 (0.75-1)</td></tr><tr><td align="left" valign="top">Q26: To evaluate treatment-induced cough, determine the influence of ACE inhibitor use.</td><td align="left" valign="top">0.72 (0.18)</td><td align="left" valign="top">0.25-1</td><td align="left" valign="top">0.75 (0.75-0.75)</td></tr><tr><td align="left" valign="top">Q27: To assess severe pneumonia requiring hospitalization, evaluate the impact of a normal oxygen saturation (97%).</td><td align="left" valign="top">0.15 (0.16)</td><td align="left" valign="top">0-0.5</td><td align="left" valign="top">0.25 (0-0.25)</td></tr><tr><td align="left" valign="top">Q28: To assess acute bronchitis, evaluate the impact of crackles on the hypothesis.</td><td align="left" valign="top">0.43 (0.28)</td><td align="left" valign="top">0-1</td><td align="left" valign="top">0.25 (0.25-0.75)</td></tr><tr><td align="left" valign="top">Q29: To distinguish between ACE-inhibitor-induced<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> cough and acute respiratory infection, assess the usefulness of the history.</td><td align="left" valign="top">1 (0)</td><td align="left" valign="top">1-1</td><td align="left" valign="top">1 (1-1)</td></tr><tr><td align="left" valign="top">Q30: To assess upper respiratory tract infection, evaluate the influence of a 10-day duration of cough.</td><td align="left" valign="top">0.7 (0.2)</td><td align="left" valign="top">0.25-1</td><td align="left" valign="top">0.75 (0.75-0.75)</td></tr><tr><td align="left" valign="top">Q31: To differentiate pneumonia from bronchitis, assess the importance 
of a chest X-ray.</td><td align="left" valign="top">1 (0)</td><td align="left" valign="top">1-1</td><td align="left" valign="top">1 (1-1)</td></tr><tr><td align="left" valign="top">Q32: To guide the need for antibiotic treatment in suspected pneumonia, evaluate the role of initial fever and productive cough.</td><td align="left" valign="top">0.84 (0.2)</td><td align="left" valign="top">0.25-1</td><td align="left" valign="top">1 (0.75-1)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>COPD: chronic obstructive pulmonary disease.</p></fn><fn id="table1fn2"><p><sup>b</sup>aVF: augmented vector foot.</p></fn><fn id="table1fn3"><p><sup>c</sup>ACE: angiotensin-converting enzyme.</p></fn></table-wrap-foot></table-wrap><p>A more detailed item-level analysis revealed that questions with clearer clinical cues, such as Q1 (which evaluated the impact of hypoxemia [peripheral oxygen saturation of 85%] on the hypothesis of COPD exacerbation) and Q8 (which examined the influence of electrocardiogram findings on the diagnosis of acute pulmonary edema), had high rates of agreement with the expert panel. In these items, over 80% of student responses matched the most common expert answer, suggesting that students were comfortable applying well-established diagnostic patterns. In contrast, questions such as Q2 (regarding the influence of a recent respiratory infection on the hypothesis of pulmonary edema) and Q4 (assessing the relevance of tachycardia for myocardial dysfunction) showed greater variability in student responses, with less than 50% aligning with the modal expert choice. These items appeared to introduce more clinical ambiguity or demanded a higher level of inference, which may explain the broader dispersion of answers. 
Preliminary psychometric analysis demonstrated a Cronbach &#x03B1; of 0.76, indicating good internal consistency across the 32 items.</p><p>About 40% of students scored 25 or above, frequently aligning with expert responses even on more ambiguous questions. These students tended to report lower perceived difficulty and used the &#x201C;slightly confirmed&#x201D; response category more judiciously. The remaining 60% scored between 19 and 24 points and were more likely to select &#x201C;non influenced&#x201D; or &#x201C;slightly confirmed&#x201D; options, reflecting a more cautious reasoning style or uncertainty in applying new clinical information. Overall, the SCT demonstrated its utility in capturing variations in reasoning patterns and levels of diagnostic confidence among early clinical learners.</p></sec><sec id="s3-3"><title>Agreement Between AI and Medical Students</title><p>To evaluate the extent to which AI experts simulated expert reasoning, we examined concordance (the degree to which AI and student responses followed similar reasoning patterns) using Spearman rank correlation coefficient (&#x03C1;). This nonparametric measure assesses the strength and direction of association between 2 ranked sets of scores, where values closer to +1 indicate strong agreement and those near 0 indicate weak or no relationship.</p><p>Among the trained AI systems, ChatGPT-4o achieved the highest concordance with student responses (&#x03C1;=0.68; <italic>P</italic>&#x003C;.001), followed by Claude (&#x03C1;=0.64) and Microsoft Copilot (&#x03C1;=0.61). These coefficients represent moderate-to-strong positive correlations, suggesting that trained models reasoned in ways closely aligned with students&#x2019; decision patterns. 
In contrast, untrained models showed weaker correlations (average &#x03C1;=0.41), reflecting less consistent or more generic reasoning.</p><p>Educationally, this implies that contextualized AI models can mirror how medical students weigh diagnostic information when course-specific data are embedded in their prompts. Such alignment supports the potential use of trained AI not only as an assessment proxy but also as a feedback tool capable of modeling clinically coherent reasoning.</p><p>The untrained AI models demonstrated lower levels of agreement. The untrained version of ChatGPT-4o showed a moderate correlation (&#x03C1;=0.48; <italic>P</italic>=.04), while Claude and Copilot, when untrained, showed weaker and statistically nonsignificant correlations of 0.42 (<italic>P</italic>=.06) and 0.34 (<italic>P</italic>=.08), respectively. These models tended to exhibit less consistent patterns, often defaulting to neutral or noncommittal answers such as &#x201C;non influenced,&#x201D; especially in items requiring nuanced interpretation of clinical signs. Their reasoning lacked the contextual anchoring present in the trained models and showed more variability across similar clinical scenarios. On average, the untrained models produced a correlation coefficient of 0.41, substantially lower than their trained counterparts.</p></sec><sec id="s3-4"><title>Perceived Difficulty</title><p>In addition to completing the 32 SCT items, students were asked to rate the perceived difficulty of each question on a 7-point Likert scale, ranging from 1 (&#x201C;very easy&#x201D;) to 7 (&#x201C;very difficult&#x201D;). This measure provided insight into how learners experienced the complexity and cognitive demands of the AI-generated test items. The mean perceived difficulty across all questions was 3.7, suggesting that students generally found the test to be of moderate difficulty. 
This aligns with the formative purpose of the SCT and indicates that the AI-generated items were accessible yet still challenging enough to stimulate clinical reasoning.</p><p>When analyzed at the item level, some questions emerged as consistently more difficult. In particular, Q2 (which assessed how a recent respiratory infection influences the hypothesis of acute pulmonary edema) and Q4 (which explored the relevance of tachycardia in the context of suspected myocardial dysfunction) received higher average difficulty ratings, with mean scores exceeding 4.5 (SD 0.9). These questions involved ambiguous or indirect relationships between the new information and the hypothesis, requiring students to reason within clinical gray zones where data interpretation is not straightforward. In contrast, items that presented clear diagnostic anchors, such as Q1 (hypoxemia in COPD) and Q8 (electrocardiogram ischemia in pulmonary edema), were rated as significantly easier, with average difficulty scores below 3.0. These findings suggest that item clarity and familiarity with pathophysiological mechanisms strongly influence perceived complexity.</p><p>Furthermore, there was no statistically significant difference in perceived difficulty between items believed by students to be human-authored and those they believed to be AI-authored (<italic>t</italic> test; <italic>P</italic>=.47). As all items were in fact AI-generated, this finding reflects students&#x2019; perceived authenticity of the AI-written vignettes rather than a comparison with true human-authored items. This finding supports the face validity of AI-generated items, indicating that they are perceived as authentic and comparable in challenge to traditional, faculty-written SCT items. 
Several students even remarked in open comments that they were &#x201C;unable to tell&#x201D; which questions had been generated by AI, further supporting the indistinguishability of AI-authored content when properly crafted.</p><p>Cluster analysis of difficulty ratings also revealed subtle differences in student subgroups. High-performing students (as defined by SCT scores &#x2265;25/32) tended to rate the test as less difficult overall (mean 3.3, SD 0.8) compared to their peers (mean 3.9, SD 0.8), which may reflect their greater familiarity with clinical reasoning or more efficient script activation. In addition, free-text comments frequently noted that clinical ambiguity (rather than item length or language) was the primary factor contributing to perceived difficulty. Students consistently emphasized that the challenge came from &#x201C;figuring out what matters most&#x201D; in the presence of partial or conflicting clinical cues. The difficulty ratings offer an important validation metric for AI-generated SCTs. The overall moderate difficulty, the range of item-specific variation, and the lack of perceived discrepancy between AI- and human-generated content all point toward a well-calibrated test.</p><p>Spearman correlation analysis between item difficulty ratings and alignment with trained AI responses yielded a modest positive trend (&#x03C1;=0.32; <italic>P</italic>=.08), suggesting that items more closely matching trained AI reasoning were often perceived as less difficult. This pattern hints that when AI-generated reasoning aligns with instructional scripts, students experience smoother cognitive processing and reinforce the pedagogical relevance of model calibration.</p></sec><sec id="s3-5"><title>Evaluation of AI-Expert Feedback</title><p>In addition to rating the difficulty of each SCT item, students were asked to select the expert feedback they considered most helpful and least helpful among the options provided. 
For every SCT question, 6 expert rationales were made available to the medical students by the AI systems (trained and untrained). Students were instructed to base their selection on 3 specific criteria: clarity, clinical relevance, and overall educational value. This evaluation provided insight not only into the perceived credibility of AI-generated explanations but also into the comparative strengths of different expert types in formative feedback contexts.</p><p>Across all 32 questions, AI-generated feedback (especially from models trained on the course material) was selected as helpful in 62.5% of instances. Among these, ChatGPT-4o (trained) was most frequently chosen, followed by Claude (trained) and Copilot (trained). Students frequently cited the clarity of language, conciseness, and alignment with taught clinical reasoning strategies as reasons for preferring these explanations. Trained AI feedback was especially appreciated in items requiring pathophysiological reasoning, such as those related to acute coronary syndromes and differential diagnoses in respiratory presentations. The consistent use of structured logic and evidence-based phrasing contributed to a perception of trustworthiness and educational quality.</p><p>In contrast, the feedback generated by untrained AI systems was selected as most helpful in only 9.4% of instances and more frequently identified as least useful. The untrained models often produced generic or excessively cautious explanations, relying on vague phrases such as &#x201C;may support the hypothesis&#x201D; or &#x201C;needs further investigation&#x201D; without directly engaging with the specific clinical cues provided in the vignette. These models occasionally failed to recognize important pathophysiological links, leading to misalignment with students&#x2019; expectations. 
Several students noted that while the untrained AI responses were not necessarily incorrect, they lacked the pedagogical precision needed for effective feedback.</p></sec><sec id="s3-6"><title>Qualitative Feedback From Students</title><sec id="s3-6-1"><title>Overview</title><p>To complement the quantitative analyses, students were invited to provide open-ended comments after each SCT question and at the end of the test. These narrative responses were analyzed using an inductive thematic analysis. A total of 94 discrete comments were submitted by the 25 participating students. From these, three major themes were identified: (1) ambiguity and interpretation of Likert-scale options, (2) educational value of SCTs, and (3) recommendations for improvement in item design and feedback delivery. Key themes and examples of quotes are provided in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Key themes, descriptions, and their representative quotes.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Theme</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Representative quotes</td></tr></thead><tbody><tr><td align="left" valign="top">Ambiguity and interpretation of Likert-scale options</td><td align="left" valign="top">Students expressed uncertainty distinguishing between closely related options (&#x201C;slightly confirmed&#x201D; vs &#x201C;confirmed&#x201D;), reflecting the inherent challenge of probabilistic reasoning under limited data.</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>&#x201C;It&#x2019;s really hard to know when to say &#x2018;confirmed&#x2019; vs &#x2018;slightly confirmed&#x2019; without lab results.&#x201D;</p></list-item><list-item><p>&#x201C;Sometimes the answer depends on how I imagine the rest of the case.&#x201D;</p></list-item></list></td></tr><tr><td 
align="left" valign="top">Educational value of the SCT<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> format</td><td align="left" valign="top">Learners appreciated how SCTs stimulated reflective reasoning and moved beyond right-wrong logic, reinforcing their understanding of uncertainty.</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>&#x201C;This test made me think like a real clinician.&#x201D;</p></list-item><list-item><p>&#x201C;I liked how the questions made me weigh the value of each piece of information.&#x201D;</p></list-item></list></td></tr><tr><td align="left" valign="top">Recommendations for improvement</td><td align="left" valign="top">Students suggested adding contextual information (eg, vital signs and comorbidities) and calibration examples to improve clarity and confidence.</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>&#x201C;More case details would help me justify my choice better.&#x201D;</p></list-item><list-item><p>&#x201C;Before the real test, I would have liked an example with expert explanation.&#x201D;</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>SCT: script concordance test.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-6-2"><title>Ambiguity and Interpretation of Likert-Scale Options</title><p>A recurrent theme in the comments related to students&#x2019; uncertainty in differentiating between closely related answer choices. Many expressed difficulties in deciding whether a new piece of clinical information &#x201C;slightly confirmed&#x201D; versus &#x201C;confirmed&#x201D; a hypothesis, particularly when the vignette lacked strong diagnostic anchors.</p><p>This theme reflects a broader cognitive tension between probabilistic reasoning and binary decision-making in early clinical training. 
Students seemed to grasp the script concordance philosophy but struggled with the semantic granularity required by the scale, particularly in low-certainty situations.</p></sec><sec id="s3-6-3"><title>Educational Value of the SCT Format</title><p>Despite difficulties with Likert scales, students generally expressed strong appreciation for the SCT format as a learning tool. Many found it stimulated reflection on clinical uncertainty and offered a welcome departure from the right/wrong dichotomy typical of multiple-choice assessments.</p><p>Students also praised the opportunity to compare their reasoning to experts, particularly when feedback was provided in structured, explanatory formats. Some noted that this type of test encouraged them to revisit and revise their clinical scripts, reinforcing clinical reasoning pathways rather than isolated facts.</p></sec><sec id="s3-6-4"><title>Recommendations for Improvement</title><p>Several students offered constructive suggestions aimed at improving the test format and clarity. One prominent comment was the request for more context in the vignettes, such as vital signs, lab values, or imaging findings. Students felt that additional details would allow them to apply reasoning with greater precision and confidence.</p><p>Another frequently mentioned point was the desire for guidance or training on how to use the Likert scale. Some students suggested including sample questions or expert rationales before the actual test to calibrate their expectations.</p><p>Interestingly, several students highlighted their surprise at the quality of feedback provided by AI systems. These comments were often framed with curiosity or mild skepticism, indicating an openness to AI as a pedagogical tool, contingent on quality and relevance.</p><p>While students appreciated the reflective and authentic nature of the format, they also identified areas where clearer structure or training could enhance their experience. 
Their willingness to accept AI-generated feedback, when well-crafted, suggests that such technologies could play a productive role in clinical education, particularly when embedded within thoughtful educational frameworks. This thematic analysis suggests that future iterations of the SCT should include providing students with explicit guidance and calibration examples on how to interpret the Likert scale; enhancing vignettes with additional contextual cues (eg, vital signs or comorbidities) to support more confident reasoning; incorporating trained AI models as feedback generators, while ensuring faculty review for quality assurance; and designing preparatory activities that help learners understand how to engage with probabilistic reasoning in uncertain clinical scenarios.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study explores the performance of generative AI in its ability to create SCTs for undergraduate medical students and to embody the role of an expert model by fully designing, scoring, and providing feedback for an SCT in undergraduate medical education. Among the 25 second-year medical students who completed the AI-generated SCT, the mean score was 22.8 out of 32, with a distribution that reflected meaningful variation in clinical reasoning patterns. Students generally aligned well with expert-modeled responses, particularly on items with strong pathophysiological anchors, while questions requiring greater inferential reasoning yielded more dispersed answers. Trained AI systems (those provided with course-specific materials) achieved the highest concordance with both student performance and expected reasoning pathways, offering feedback that was frequently rated as the most helpful. Untrained AI models, in contrast, were perceived as less pedagogically effective and exhibited weaker correlations with student responses. 
The SCT also performed reliably, with a Cronbach &#x03B1; of 0.76, and was rated by students as moderately difficult, further reinforcing the test&#x2019;s capacity to capture meaningful distinctions in clinical reasoning. Interestingly, preliminary trends suggested that students with higher SCT scores tended to select feedback from trained AI systems more often, implying that the pedagogical structure of these explanations may resonate most with learners who already demonstrate stronger reasoning frameworks. This observation supports the idea that trained AI feedback could help reinforce expert-like reasoning scripts, though confirmatory analyses in larger samples are needed.</p></sec><sec id="s4-2"><title>Comparison With Previous Work</title><p>The integration of AI in formative clinical assessment tools, such as SCTs, is still at an early stage, but this study offers an early applied example of how LLMs can simulate aspects of expert reasoning within a script-concordance framework when appropriately contextualized. While previous research has already demonstrated the clinical reasoning capabilities of LLMs in diverse testing environments (eg, Nori et al [<xref ref-type="bibr" rid="ref14">14</xref>] and Singhal et al [<xref ref-type="bibr" rid="ref6">6</xref>]), this work extends these findings to a formative educational setting by integrating AI-generated items, scoring, and feedback within a single workflow. While previous studies have explored LLMs&#x2019; ability to answer multiple-choice questions or United States Medical Licensing Examination&#x2013;style questions, those formats tend to rely on recall and fixed answers [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. In contrast, SCTs demand probabilistic thinking and tolerance of ambiguity, qualities that better reflect real-world medical clinical reasoning and thus could be used in exams to test clinical reasoning. 
The ability of trained LLMs in this study to mirror the distribution of expert responses (and to provide feedback that students found clear and pedagogically relevant) suggests a step forward in the use of AI not merely as a content generator, but as a cognitive proxy for experienced clinicians.</p><p>This study also aligns with findings from recent work examining AI&#x2019;s capacity to replicate reflective thinking patterns. For instance, Nori et al [<xref ref-type="bibr" rid="ref14">14</xref>] highlighted that LLMs such as GPT-4 are increasingly capable of &#x201C;meta-cognitive&#x201D; behavior, adapting their justifications based on context and complexity. The findings support this, especially in items where the trained AI systems offered nuanced rationales that resonated with learners. Furthermore, the indistinguishability between human- and AI-authored feedback, as perceived by students in this study, aligns with previous reports by Lee and Song [<xref ref-type="bibr" rid="ref17">17</xref>], who found that students often could not discern between AI-generated and expert-written content when both were of high quality. Importantly, students in this cohort demonstrated openness to AI-generated feedback, provided it was contextually accurate and clearly structured, suggesting that AI can be accepted as a trusted educational voice when properly curated.</p><p>Nonetheless, this project also illustrates the current limitations of general-purpose LLMs. Untrained models, though capable of generating grammatically correct responses, frequently defaulted to cautious or generic reasoning, especially in questions requiring interpretation of subtle clinical cues. This reduced their pedagogical value and undermined student trust. 
These findings are consistent with recent critiques from Arora and Arora [<xref ref-type="bibr" rid="ref18">18</xref>], who argue that untrained LLMs lack the contextual grounding needed for expert-level interpretation, particularly in nuanced or culturally specific clinical scenarios. As such, effective integration of AI into medical education likely hinges on customized fine-tuning, prompt engineering, and ongoing human oversight to ensure that AI explanations remain accurate, relevant, and educationally meaningful.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study has several limitations. The sample size was modest and drawn from a single institution, which may limit generalizability. The test was also formative in nature, and students may not have engaged with the same level of cognitive effort as they would in summative settings. It is also possible that only students who expressed an interest in AI were accepted to participate in this SCT, which may affect the generalizability of the qualitative components of this study. In addition, while the AI models were trained on course materials, their responses still depended on prompt clarity and token limits, which may have shaped their output in subtle ways. No direct comparison was made between AI and human expert panels on the same test, which restricts our ability to determine whether the observed concordance between AI and student reasoning reflects genuine expert-level clinical judgment. As such, these findings demonstrate internal validity and feasibility rather than equivalence to clinician consensus. Future studies should include parallel human-expert panels to establish criterion validity and to calibrate the pedagogical quality of AI-generated scoring keys and feedback. As this was a single-round implementation, long-term effects on clinical reasoning or retention were not assessed. 
Future studies should explore how repeated exposure to AI-generated SCTs impacts learning trajectories and whether hybrid models (AI and human feedback) offer added value over AI alone.</p><p>Finally, a further limitation relates to the potential for hallucinations or inaccurate information in AI-generated explanations. While all outputs were reviewed for face plausibility and internal consistency, no formal content validation against gold-standard references was performed. As highlighted by Masters [<xref ref-type="bibr" rid="ref19">19</xref>], generative AI systems can produce fabricated or misleading references and medical details, a phenomenon known as &#x201C;AI hallucination.&#x201D; This risk underscores the importance of treating AI-generated feedback as supplementary rather than authoritative, especially in formative educational contexts. Future implementations should incorporate systematic expert review of AI-generated feedback to ensure clinical accuracy and safeguard learners from misinformation.</p></sec><sec id="s4-4"><title>Conclusions</title><p>As AI technologies evolve and become increasingly embedded in medical education, this study offers novel insight into how LLMs can assume the role traditionally occupied by human experts in clinical reasoning assessments. By designing and scoring an SCT entirely with generative AI systems, and by leveraging both trained and untrained models to simulate expert feedback, we demonstrated that AI can produce assessment content that is not only functionally valid but also well-received by learners. The high degree of alignment between student responses and trained AI-generated reference panels, alongside the positive reception of AI-authored feedback, suggests that these technologies hold real promise for supporting formative assessment. 
While distinctions remain between the effectiveness of trained versus untrained models, this study illustrates that with proper calibration, AI systems can enhance both the efficiency and pedagogical depth of clinical education. These findings open the door for further research into scalable, AI-assisted assessment models that can flexibly support reasoning under uncertainty, which is an essential competency in modern medical practice. Beyond feasibility, this work underscores the transformative potential of generative AI to democratize access to high-quality formative assessment. In academic environments where expert availability and time are limited, AI-assisted SCT generation could substantially reduce faculty workload, expand curricular coverage, and enable rapid adaptation of tests to evolving learning objectives. By lowering the resource barrier traditionally associated with expert panel-based assessment, this approach positions AI as a potential catalyst for sustainable, scalable innovation in medical education globally.</p></sec></sec></body><back><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">COPD</term><def><p>chronic obstructive pulmonary disease</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">SCT</term><def><p>script concordance test</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schuwirth</surname><given-names>LWT</given-names> 
</name><name name-style="western"><surname>Van der Vleuten</surname><given-names>CPM</given-names> </name></person-group><article-title>Programmatic assessment: from assessment of learning to assessment for learning</article-title><source>Med Teach</source><year>2011</year><volume>33</volume><issue>6</issue><fpage>478</fpage><lpage>485</lpage><pub-id pub-id-type="doi">10.3109/0142159X.2011.565828</pub-id><pub-id pub-id-type="medline">21609177</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Charlin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Roy</surname><given-names>L</given-names> </name><name name-style="western"><surname>Brailovsky</surname><given-names>C</given-names> </name><name name-style="western"><surname>Goulet</surname><given-names>F</given-names> </name><name name-style="western"><surname>van der Vleuten</surname><given-names>C</given-names> </name></person-group><article-title>The script concordance test: a tool to assess the reflective clinician</article-title><source>Teach Learn Med</source><year>2000</year><volume>12</volume><issue>4</issue><fpage>189</fpage><lpage>195</lpage><pub-id pub-id-type="doi">10.1207/S15328015TLM1204_5</pub-id><pub-id pub-id-type="medline">11273368</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lubarsky</surname><given-names>S</given-names> </name><name name-style="western"><surname>Charlin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Chalk</surname><given-names>C</given-names> </name><name name-style="western"><surname>van der Vleuten</surname><given-names>CPM</given-names> 
</name></person-group><article-title>Script concordance testing: a review of published validity evidence</article-title><source>Med Educ</source><year>2011</year><month>04</month><volume>45</volume><issue>4</issue><fpage>329</fpage><lpage>338</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2923.2010.03863.x</pub-id><pub-id pub-id-type="medline">21401680</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dory</surname><given-names>V</given-names> </name><name name-style="western"><surname>Gagnon</surname><given-names>R</given-names> </name><name name-style="western"><surname>Vanpee</surname><given-names>D</given-names> </name><name name-style="western"><surname>Charlin</surname><given-names>B</given-names> </name></person-group><article-title>How to construct and implement script concordance tests: insights from a systematic review</article-title><source>Med Educ</source><year>2012</year><month>06</month><volume>46</volume><issue>6</issue><fpage>552</fpage><lpage>563</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2923.2011.04211.x</pub-id><pub-id pub-id-type="medline">22626047</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gagnon</surname><given-names>R</given-names> </name><name name-style="western"><surname>Charlin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Coletti</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sauv&#x00E9;</surname><given-names>E</given-names> </name><name name-style="western"><surname>van der Vleuten</surname><given-names>C</given-names> </name></person-group><article-title>Assessment in the context of uncertainty: how many members are needed on the panel of reference of a script concordance test?</article-title><source>Med 
Educ</source><year>2005</year><month>03</month><volume>39</volume><issue>3</issue><fpage>284</fpage><lpage>291</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2929.2005.02092.x</pub-id><pub-id pub-id-type="medline">15733164</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Publisher correction: large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><day>10</day><volume>620</volume><issue>7973</issue><fpage>E19</fpage><lpage>E19</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06455-0</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeblick</surname><given-names>K</given-names> </name><name name-style="western"><surname>Schachtner</surname><given-names>B</given-names> </name><name name-style="western"><surname>Dexl</surname><given-names>J</given-names> </name><etal/></person-group><article-title>ChatGPT makes medicine easy to swallow: an exploratory case study on simplified radiology reports</article-title><source>Eur Radiol</source><year>2024</year><month>05</month><volume>34</volume><issue>5</issue><fpage>2817</fpage><lpage>2825</lpage><pub-id pub-id-type="doi">10.1007/s00330-023-10213-1</pub-id><pub-id pub-id-type="medline">37794249</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abd-Alrazaq</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>AlSaad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Alhuwail</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Large language models in medical education: opportunities, challenges, and future directions</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>1</day><volume>9</volume><fpage>e48291</fpage><pub-id pub-id-type="doi">10.2196/48291</pub-id><pub-id pub-id-type="medline">37261894</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>K&#x0131;yak</surname><given-names>YS</given-names> </name><name name-style="western"><surname>Emekli</surname><given-names>E</given-names> </name></person-group><article-title>Using large language models to generate script concordance test in medical education: ChatGPT and Claude</article-title><source>Rev Esp Edu Med</source><year>2024</year><volume>6</volume><issue>1</issue><pub-id pub-id-type="doi">10.6018/edumed.636331</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hudon</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kiepura</surname><given-names>B</given-names> </name><name name-style="western"><surname>Pelletier</surname><given-names>M</given-names> </name><name name-style="western"><surname>Phan</surname><given-names>V</given-names> </name></person-group><article-title>Using ChatGPT in psychiatry to design script concordance tests in undergraduate medical education: mixed methods study</article-title><source>JMIR Med Educ</source><year>2024</year><month>04</month><day>4</day><volume>10</volume><fpage>e54067</fpage><pub-id pub-id-type="doi">10.2196/54067</pub-id><pub-id pub-id-type="medline">38596832</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>GW</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>D</given-names> </name><name name-style="western"><surname>Moon</surname><given-names>J</given-names> </name></person-group><article-title>Utilizing generative AI for instructional design: exploring strengths, weaknesses, opportunities, and threats</article-title><source>TechTrends</source><year>2024</year><month>07</month><volume>68</volume><issue>4</issue><fpage>832</fpage><lpage>844</lpage><pub-id pub-id-type="doi">10.1007/s11528-024-00967-w</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fournier</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Demeester</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Charlin</surname><given-names>B</given-names> </name></person-group><article-title>Script concordance tests: guidelines for construction</article-title><source>BMC Med Inform Decis Mak</source><year>2008</year><month>05</month><day>6</day><volume>8</volume><fpage>18</fpage><pub-id pub-id-type="doi">10.1186/1472-6947-8-18</pub-id><pub-id pub-id-type="medline">18460199</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nori</surname><given-names>H</given-names> </name><name name-style="western"><surname>King</surname><given-names>N</given-names> </name><name name-style="western"><surname>McKinney</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Carignan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Horvitz</surname><given-names>E</given-names> </name></person-group><article-title>Capabilities of GPT-4 on medical challenge problems</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 12, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.13375</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluating the performance of GPT-3.5, GPT-4, and GPT-4o in the Chinese National Medical Licensing Examination</article-title><source>Sci Rep</source><year>2025</year><volume>15</volume><issue>1</issue><fpage>14119</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-98949-2</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Song</surname><given-names>KS</given-names> </name></person-group><article-title>Teachers&#x2019; and students&#x2019; perceptions of AI-generated concept explanations: implications for integrating generative AI in computer science education</article-title><source>Computers and Education: Artificial Intelligence</source><year>2024</year><month>12</month><volume>7</volume><fpage>100283</fpage><pub-id pub-id-type="doi">10.1016/j.caeai.2024.100283</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arora</surname><given-names>A</given-names> </name><name name-style="western"><surname>Arora</surname><given-names>A</given-names> </name></person-group><article-title>The promise of large language models in health care</article-title><source>The 
Lancet</source><year>2023</year><month>02</month><volume>401</volume><issue>10377</issue><fpage>641</fpage><pub-id pub-id-type="doi">10.1016/S0140-6736(23)00216-7</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Masters</surname><given-names>K</given-names> </name></person-group><article-title>Medical Teacher&#x2019;s first ChatGPT&#x2019;s referencing hallucinations: lessons for editors, reviewers, and teachers</article-title><source>Med Teach</source><year>2023</year><month>07</month><volume>45</volume><issue>7</issue><fpage>673</fpage><lpage>675</lpage><pub-id pub-id-type="doi">10.1080/0142159X.2023.2208731</pub-id><pub-id pub-id-type="medline">37183932</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Script concordance test (French and English).</p><media xlink:href="formative_v9i1e76618_app1.docx" xlink:title="DOCX File, 49 KB"/></supplementary-material></app-group></back></article>