<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e66478</article-id><article-id pub-id-type="doi">10.2196/66478</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Novel Evaluation Metric and Quantified Performance of ChatGPT-4 Patient Management Simulations for Early Clinical Education: Experimental Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Scherr</surname><given-names>Riley</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Spina</surname><given-names>Aidin</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dao</surname><given-names>Allen</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name 
name-style="western"><surname>Andalib</surname><given-names>Saman</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Halaseh</surname><given-names>Faris F</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Blair</surname><given-names>Sarah</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wiechmann</surname><given-names>Warren</given-names></name><degrees>MD, MBA, MSED</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rivera</surname><given-names>Ronald</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>School of Medicine, University of California, Irvine School of Medicine</institution><addr-line>1001 Health Sciences Road</addr-line><addr-line>Irvine</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Medicine, Stanford Medicine</institution><addr-line>Stanford</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff3"><institution>School of Medicine, Tufts University</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff4"><institution>Department of Emergency Medicine, University of California Irvine</institution><addr-line>Orange</addr-line><addr-line>CA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Zickler</surname><given-names>Christine</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Barakat</surname><given-names>Muna</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chien</surname><given-names>Nicholas</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Patel</surname><given-names>Shree</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Li</surname><given-names>Will</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Riley Scherr, BS, School of Medicine, University of California, Irvine School of Medicine, 1001 Health Sciences Road, Irvine, CA, 92617, United States, 1 9498246119; <email>rscherr@hs.uci.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>27</day><month>2</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e66478</elocation-id><history><date date-type="received"><day>14</day><month>09</month><year>2024</year></date><date date-type="rev-recd"><day>30</day><month>01</month><year>2025</year></date><date date-type="accepted"><day>31</day><month>01</month><year>2025</year></date></history><copyright-statement>&#x00A9; Riley Scherr, Aidin Spina, Allen Dao, Saman Andalib, Faris F Halaseh, Sarah Blair, Warren Wiechmann, Ronald Rivera. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 27.2.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e66478"/><abstract><sec><title>Background</title><p>Case studies have shown ChatGPT can run clinical simulations at the medical student level. However, no data have assessed ChatGPT&#x2019;s reliability in meeting desired simulation criteria such as medical accuracy, simulation formatting, and robust feedback mechanisms.</p></sec><sec><title>Objective</title><p>This study aims to quantify ChatGPT&#x2019;s ability to consistently follow formatting instructions and create simulations for preclinical medical student learners according to principles of medical simulation and multimedia educational technology.</p></sec><sec sec-type="methods"><title>Methods</title><p>Using ChatGPT-4 and a prevalidated starting prompt, the authors ran 360 separate simulations of an acute asthma exacerbation. A total of 180 simulations were given correct answers and 180 simulations were given incorrect answers. 
ChatGPT was evaluated for its ability to adhere to basic simulation parameters (stepwise progression, free response, interactivity), advanced simulation parameters (autonomous conclusion, delayed feedback, comprehensive feedback), and medical accuracy (vignette, treatment updates, feedback). Significance was determined with <italic>&#x03C7;</italic>&#x00B2; analyses using 95% CIs for odds ratios.</p></sec><sec sec-type="results"><title>Results</title><p>In total, 100% (n=360) of simulations met basic simulation parameters and were medically accurate. For advanced parameters, 55% (200/360) of all simulations delayed feedback, while the Correct arm (157/180, 87%) delayed feedback was significantly more than the Incorrect arm (43/180, 24%; <italic>P</italic>&#x003C;.001). A total of 79% (285/360) of simulations concluded autonomously, and there was no difference between the Correct and Incorrect arms in autonomous conclusion (146/180, 81% and 139/180, 77%; <italic>P</italic>=.36). Overall, 78% (282/360) of simulations gave comprehensive feedback, and there was no difference between the Correct and Incorrect arms in comprehensive feedback (137/180, 76% and 145/180, 81%; <italic>P</italic>=.31). ChatGPT-4 was not significantly more likely to conclude simulations autonomously (<italic>P</italic>=.34) and provide comprehensive feedback (<italic>P</italic>=.27) when feedback was delayed compared to when feedback was not delayed.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>These simulations have the potential to be a reliable educational tool for simple simulations and can be evaluated by a novel 9-part metric. Per this metric, ChatGPT simulations performed perfectly on medical accuracy and basic simulation parameters. It performed well on comprehensive feedback and autonomous conclusion. Delayed feedback depended on the accuracy of user inputs. A simulation meeting one advanced parameter was not more likely to meet all advanced parameters. 
Further work must be done to ensure consistent performance across a broader range of simulation scenarios.</p></sec></abstract><kwd-group><kwd>medical school simulations</kwd><kwd>AI in medical education</kwd><kwd>preclinical curriculum</kwd><kwd>ChatGPT</kwd><kwd>ChatGPT-4</kwd><kwd>medical simulation</kwd><kwd>simulation</kwd><kwd>multimedia</kwd><kwd>feedback</kwd><kwd>medical education</kwd><kwd>medical student</kwd><kwd>clinical education</kwd><kwd>pilot study</kwd><kwd>patient management</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>With the rise of generative artificial intelligence (AI) applications such as OpenAI&#x2019;s ChatGPT, research into its medical application has expanded. Early investigations assessed whether large language models (LLMs) could pass medical trainee licensing examinations [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Such studies indicated that LLMs can pass medical exams and possess solid foundations in medical reasoning [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>An expanding body of literature has focused on LLMs in medical education, including the perspectives of both students and seasoned clinicians [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. Medical students have shown interest in LLMs and frequently use or intend to use them educationally [<xref ref-type="bibr" rid="ref14">14</xref>]. Studies have also suggested that ChatGPT can be an effective tool for students when entering the clinical wards [<xref ref-type="bibr" rid="ref15">15</xref>]. Moving from theoretical, classroom-based instruction to hands-on patient care introduces new challenges [<xref ref-type="bibr" rid="ref16">16</xref>]. 
The steep learning curve heightens the need for reliable training tools, and generative AI technologies may satisfy this need.</p><p>Current literature suggests a need for rigorous validation of student use of generative AI. Frameworks such as RISE (role, input, steps, expectations; ie, inputting the LLM&#x2019;s role, anticipated input, required steps, and desired expectations) exist for prompt engineering but have been infrequently applied to medical student LLM use [<xref ref-type="bibr" rid="ref17">17</xref>]. Initial work established prompts for medical trainees to practice common clinical scenarios in platforms such as ChatGPT [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. These studies established that, given the precise wording of prompts, LLMs can act as an effective simulator of basic clinical scenarios. Despite this promising result, these technologies&#x2019; unknown accuracy and precision in displaying desirable parameters limits their broader applicability [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>In this study, we reviewed evidence-based resources for medical simulation and multimedia educational design. Using The Society for Simulation in Healthcare guidelines along with Richard Mayer&#x2019;s multimedia design principles, we created an evaluation system with 3 main evaluation categories: basic simulation parameters, advanced simulation parameters, and medical accuracy parameters. Each category was further divided into 3 subparameters. Basic parameters were composed of stepwise progression, free response, and interactivity. Advanced parameters were composed of autonomous conclusion, delayed feedback, and comprehensive feedback. Medical accuracy parameters were composed of clinical vignette, updates based on treatment, and feedback. 
See <xref ref-type="fig" rid="figure1">Figure 1</xref> for definitions.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Simulation parameters. Desirable simulation parameters of ChatGPT clinical simulations were based on multimedia educational technology principles and simulation in health care recommendations. Three major parameters were each divided into 3 corresponding subparameters with clear definitions. Subparameters constitute the total 9-part metric by which ChatGPT simulations were evaluated. &#x201C;P&#x201D; or green indicates a pass, and &#x201C;F&#x201D; or red indicates a fail for whether a simulation exhibits that characteristic.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e66478_fig01.png"/></fig></sec><sec id="s1-2"><title>Basic Parameters</title><p>Stepwise progression in simulation is supported by the segmenting principle of multimedia learning, meaning that learning is maximized when broken into smaller units [<xref ref-type="bibr" rid="ref22">22</xref>]. At breaks between units, the learner provides the next steps in the simulated patient&#x2019;s management with reasoning. This is supported by Mayer&#x2019;s self-explanation principle, which shows strong learning when previous knowledge is integrated into current learning by explaining one&#x2019;s reasoning [<xref ref-type="bibr" rid="ref23">23</xref>]. A user-responsive design further facilitates this and creates a generative learning activity poised to improve learning acquisition and retention [<xref ref-type="bibr" rid="ref24">24</xref>]. These are both in-line with real medical practice where every step of a treatment or diagnosis must be chosen. The free-response format for responses similarly mimics real medical practice, as it allows space for treatment justifications. 
It is also supported by the active processing principle, which asserts that learning is improved when people actively organize information into cognitive models and integrate prior knowledge to address the new task, rather than passively absorbing teaching points. This is just like in real life, where physicians create treatment plans de novo, rather than choosing from a selection of options [<xref ref-type="bibr" rid="ref25">25</xref>]. Finally, interactivity promotes cognitive engagement and is beneficial to learning outcomes, especially when planned and augmented by peer-to-peer and peer-to-teacher learning [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>].</p></sec><sec id="s1-3"><title>Advanced Parameters</title><p>Advanced parameters were selected to improve authenticity and decrease the cognitive load devoted to simulation mechanics, allowing greater focus on learning engagement [<xref ref-type="bibr" rid="ref28">28</xref>]. An autonomous conclusion is valuable because it provides a specific end point and prevents learners from following extraneous threads when learning goals have been achieved. Learners do not have to decide when they have learned enough; rather, they must meet the simulation goals to finish. Delayed feedback keeps novice learners from overrelying on immediate feedback cues to make decisions and promotes active learning in simulations [<xref ref-type="bibr" rid="ref29">29</xref>]. However, Mayer suggests immediate feedback may be better for some tasks such as solo novice learning or guided reasoning with an expert, but the Society for Simulation in Healthcare guidelines suggest that delayed debrief is most appropriate, and we accordingly chose delayed feedback for our metric [<xref ref-type="bibr" rid="ref30">30</xref>]. This also mimics the real-life practice of medicine, where the only indication of a treatment&#x2019;s accuracy is a change in patient stability. 
Delaying feedback also keeps these simulations in-line with the active-processing principle that active learning is more effective than passive learning from regular performance feedback cues [<xref ref-type="bibr" rid="ref29">29</xref>]. Finally, comprehensive feedback falls in-line with Mayer&#x2019;s signaling principle. Signaling, or indicating what information is vital, highlights specific learning points rather than generalizing about a user&#x2019;s performance [<xref ref-type="bibr" rid="ref31">31</xref>]. Comprehensive feedback thus ensures meaningful performance assessments with specific takeaways.</p></sec><sec id="s1-4"><title>Medical Accuracy Parameters</title><p>Medical accuracy is perhaps the most important. This is a common-sense principle that undergirds all the other parameters&#x2019; efficacy. Medical accuracy categories were defined whenever there were physiologic, pharmacologic, or pathologic assertions.</p><p>A 9-part model for evaluating AI medical simulations is therefore proposed. Using a previously engineered and verified starting prompt, this model helps quantify the consistency of an LLM (ChatGPT-4) in following prompt commands to create simulations exhibiting these 9 features of effective simulations. The provided results aim to better inform recommendations to students and educators who use such resources in their clinical education and establish a method for future evaluation of simulation reliability.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>The Society for Simulation in Healthcare recommends simulation sessions be high or low fidelity with low physical realism, making AI simulation sessions acceptable when high-fidelity sessions are unavailable [<xref ref-type="bibr" rid="ref30">30</xref>]. 
This is also congruent with the authentic learning environments principle which suggests that learning can happen equivalently in any environment as long as the design adheres to effective learning principles [<xref ref-type="bibr" rid="ref32">32</xref>]. The Society also recommends training should be spaced, frequent, short, and skill-oriented sessions. Therefore, we built our prompt around asthma exacerbation management, a short lesson with evidence-based best practices to be done in 5 to 10 minutes with specific learning takeaways. This is further supported by the Society&#x2019;s recommendation that each session end with focused feedback via structured debrief, so our prompt accordingly addressed this. The only criteria we were unable to meet were recommendations for in-situ practice and the use of interprofessional teams. However, the authentic learning environments principle again suggests that these simulations can have educational benefits even though they are not in situ, given our use of educational design principles. The following prompt, developed with features of RISE prompt engineering, was subsequently used [<xref ref-type="bibr" rid="ref9">9</xref>]:</p><p>&#x201C;Please create a clinical scenario on a patient presenting to the hospital with an acute asthma exacerbation and quiz me on what the proper next step of management is. Please make it free response and interactive, meaning you ask me what the next step is one question at a time, and then I write out what I would do, and then you ask me another question based on how my answer would affect the patient. Please update/change the patient&#x2019;s condition based on my actions, and do not tell me the right answers until the end of the scenario.&#x201D;</p><p>Simulations were run on 1 of 3 ChatGPT-4 accounts due to ChatGPT-4&#x2019;s token limits and split across Safari, Google Chrome, and Firefox browsers from March 29, 2024, to April 24, 2024. 
Each simulation was run on its own browser tab session of ChatGPT-4 to eliminate session-dependent memory. All cases simulated an acute asthma exacerbation and focused on treatment steps.</p><p>An appropriate treatment algorithm for acute asthma exacerbation was developed based on the review of recommendations from the American Thoracic Society (a leading pulmonology society) and the American Academy of Family Physicians, and confirmed by author RR, a board-certified emergency medicine physician at a high-volume academic medical center [<xref ref-type="bibr" rid="ref33">33</xref>]. Movement to the next step of the algorithm depended on the simulated patient&#x2019;s clinical condition, and the tester could proceed to the final step (discharge) if the patient was appropriately stabilized in a stepwise manner. For example, if a patient was no longer in respiratory distress and had normal oxygen saturation after administration of a short-acting muscarinic antagonist, the tester would skip magnesium sulfate and endotracheal intubation and proceed to patient education and counseling. See <xref ref-type="fig" rid="figure2">Figure 2</xref> for treatment workflow.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Acute asthma simulation treatment algorithms. Treatment algorithms for acute asthma exacerbation simulations were developed on societal guidelines and were used as a template for ChatGPT simulation inputs. The green pathway represents &#x201C;correct treatment&#x201D; and was followed for 180 simulations in the correct subgroup. The red path represents &#x201C;incorrect treatment,&#x201D; a variation of the correct treatment algorithm, and was followed for 180 simulations in the incorrect subgroup.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e66478_fig02.png"/></fig><p>Simulations were divided into 2 subgroups based on adherence to the treatment workflow. 
The &#x201C;Correct Treatment&#x201D; subgroup followed the treatment workflow. The &#x201C;Incorrect Treatment&#x201D; subgroup added administration of a macrolide antibiotic (azithromycin) as the third step (<xref ref-type="fig" rid="figure2">Figure 2</xref>). Progression through the algorithm for each simulation depended on the patient&#x2019;s response to treatment, but all incorrect simulations included steps 1, 2, 3, and 6 at minimum. In total, 180 simulations each were run for the Correct and Incorrect treatment arms, leaving a total of 360 simulations for analysis. Simulations were evaluated according to our 9-part model using the definitions in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>Author RS resolved unclear data points. Simulations were reviewed by all authors for medical accuracy and confirmed with author RR, a licensed, board-certified emergency medicine physician with 8 years of clinical experience.</p><p>Descriptive statistics were run on GraphPad Prism (version 10.2.3; Graphstats Technologies) and Excel (version 16.85; Microsoft Corp). Statistical significance was determined by <italic>&#x03C7;</italic>&#x00B2; test, with an &#x03B1; level set at .05.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Study data were collected by an author (AD) and did not involve personal information or observation of any person&#x2019;s private or public information. As such data were collected from ChatGPT, all study data were inherently anonymous. 
This study was reviewed under the University of California, Irvine&#x2019;s Institutional Review Board Protocol #3213 and deemed not human subjects research and not requiring institutional review board review.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>A summary of the results is listed in <xref ref-type="table" rid="table1">Table 1</xref> and represented in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Simulation characteristics<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Combined (n=360), n (%)</td><td align="left" valign="bottom">Correct (n=180), n (%)</td><td align="left" valign="bottom">Incorrect (n=180), n (%)</td><td align="left" valign="bottom"><italic>&#x03C7;</italic>&#x00B2; (<italic>df</italic>)</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Basic parameters</td><td align="left" valign="top">360 (100)</td><td align="left" valign="top">180 (100)</td><td align="left" valign="top">180 (100)</td><td align="left" valign="top">0 (1)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Medical accuracy</td><td align="left" valign="top">360 (100)</td><td align="left" valign="top">180 (100)</td><td align="left" valign="top">180 (100)</td><td align="left" valign="top">0 (1)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Comprehensive feedback</td><td align="left" valign="top">282 (78)</td><td align="left" valign="top">137 (76)</td><td align="left" valign="top">145 (81)</td><td align="left" valign="top">1.05 (1)</td><td align="left" valign="top">.31</td></tr><tr><td align="left" 
valign="top">Autonomous conclusion</td><td align="left" valign="top">285 (79)</td><td align="left" valign="top">146 (81)</td><td align="left" valign="top">139 (77)</td><td align="left" valign="top">0.825 (1)</td><td align="left" valign="top">.36</td></tr><tr><td align="left" valign="top">Delayed feedback</td><td align="left" valign="top">200 (55)</td><td align="left" valign="top">157 (87)</td><td align="left" valign="top">43 (24)</td><td align="left" valign="top">146.2 (1)</td><td align="left" valign="top">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Simulations were given the correct or incorrect treatment algorithm inputs and evaluated on exhibition of characteristics against the 9-part metric. Successful exhibition of a parameter was calculated as a percent of all simulations in a subgroup. The table provides a summary of simulation outcomes.</p></fn><fn id="table1fn2"><p><sup>b</sup><italic>P</italic> values were calculated from <italic>&#x03C7;</italic>&#x00B2; values</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>ChatGPT performance on simulation parameters by correct and incorrect inputs. Radar plot showing correct (blue) and incorrect (green) ChatGPT-4 performance on simulation parameters. Each point represents a simulation parameter. Lines indicate the percentage of simulations exhibiting that parameter, with the circumference of the shape indicating 100%. Simulations were given the correct or incorrect acute asthma exacerbation treatment algorithm inputs and evaluated by the 9-part metric developed from multimedia education and simulation principles. Successful exhibition of a parameter was calculated as a percent of all simulations in a subgroup and plotted on the radar plot. 
Correct and Incorrect subgroups performed similarly on all metrics except &#x201C;delayed feedback.&#x201D;</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e66478_fig03.png"/></fig><p>In 360 completed simulations, ChatGPT adhered to basic parameters of stepwise progression of treatment, appropriate patient status adjustments based on treatment inputs, and free response format in 100% (n=360) of simulations. It was also found to be 100% medically accurate in its vignettes, treatment updates, and feedback. There was no difference in medical accuracy between ChatGPT-4 outputs after correct versus incorrect tester treatment responses.</p><p>Results for advanced parameters were mixed. Among the correct treatment subgroup (n=180), 87% (157/180) of simulations delayed feedback until the end of the scenario. Only 24% (43/180) of the simulations with incorrect treatment (n=180) delayed feedback (<italic>P</italic>&#x003C;.001). In all but 2 of these simulations with immediate (nondelayed) feedback, the feedback came after the administration of azithromycin. When combining correct and incorrect simulations (n=360), ChatGPT-4 demonstrated a 55% success rate in delaying feedback.</p><p>Simulations with correct treatments concluded the scenario autonomously 81% (146/180) of the time, while those with incorrect treatment concluded autonomously 77% (139/180; <italic>P</italic>=.36). Overall, 79% (285/360) of all simulations, regardless of treatment accuracy, concluded autonomously without tester input.</p><p>Correct simulations provided comprehensive feedback 76% (137/180) of the time while incorrect simulations provided comprehensive feedback 81% (145/180) of the time (<italic>P</italic>=.31). 
Combined analysis showed ChatGPT-4 provided comprehensive feedback on 78% (282/360) of outputs.</p><p>Further analysis revealed that ChatGPT-4 was not more likely to conclude the simulation autonomously (<italic>P</italic>=.34) and provide comprehensive feedback (<italic>P</italic>=.27) when feedback was delayed compared to when feedback was not delayed (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Impact of delayed feedback on simulation autonomous conclusion and comprehensive feedback.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Delayed feedback</td><td align="left" valign="bottom">Early feedback</td></tr></thead><tbody><tr><td align="left" valign="top">Autonomous conclusion<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">162</td><td align="left" valign="top">123</td></tr><tr><td align="left" valign="top">No autonomous conclusion</td><td align="left" valign="top">38</td><td align="left" valign="top">37</td></tr><tr><td align="left" valign="top">Comprehensive feedback<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">161</td><td align="left" valign="top">121</td></tr><tr><td align="left" valign="top">No comprehensive feedback</td><td align="left" valign="top">39</td><td align="left" valign="top">39</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Number of all simulations regardless of subgroup that reached an autonomous conclusion versus those that did not based on delayed or early feedback timing.</p></fn><fn id="table2fn2"><p><sup>b</sup>Number of all simulations regardless of subgroup that provided comprehensive feedback versus those that did not based on delayed or early feedback timing.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" 
sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Basic Parameters</title><p>ChatGPT-4 performed perfectly on basic simulation parameters, which is a requirement for it to be a viable educational tool. Its performance agrees with other research on prompt engineering showing proper prompt generation can produce high-quality results [<xref ref-type="bibr" rid="ref34">34</xref>]. However, even given the use of proper prompt engineering techniques, perfect performance is surprising given ChatGPT&#x2019;s known inconsistencies [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. This may reflect its algorithmic refinement and increased LLM training from widespread use. Given this known feature of LLMs, we should expect it to continue improving over time.</p></sec><sec id="s4-2"><title>Medical Accuracy</title><p>ChatGPT was also medically accurate for both patient progression within the simulation and in its feedback; simulated patients had reasonable clinical presentations, responded appropriately to therapies, and feedback was in accordance with basic pharmacology, physiology, antibiotic stewardship, and clinical practice. This agrees with other findings on ChatGPT&#x2019;s clinical reasoning abilities, but the consistently excellent performance is again impressive. However, acute asthma exacerbation is not an exceedingly complex problem and has well-established management guidelines that likely exist in ChatGPT&#x2019;s training data, so its medical accuracy cannot be generalized to more complex clinical simulations like intensive care management [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Further research needs to be done into LLM performance differences between complex and simple simulations.</p></sec><sec id="s4-3"><title>Advanced Parameters</title><p>ChatGPT&#x2019;s performance on feedback was variable. 
The observed significant difference in delaying feedback is noteworthy. Delayed feedback best mirrors real-life clinical scenarios where immediate feedback is not always available to redirect incorrect thought processes [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. However, immediate feedback to incorrect answers mirrors the role a teacher might take with a student in simulation, where the teacher does not give a correct answer but instead explains why a learner&#x2019;s answer was incorrect in an attempt to give a better chance at the correct answer. It is also in-line with the expertise reversal effect that suggests novice learners will do better with more intense educational guidance, while advanced learners do better with less [<xref ref-type="bibr" rid="ref38">38</xref>]. One possibility is that ChatGPT may be intentionally providing more structured guidance to learners who make errors, recognizing the need for additional support.</p><p>Delayed feedback best mirrors real-life clinical scenarios where immediate feedback is not always available to redirect incorrect thought processes. This may be particularly important for learners intent on practicing in smaller community settings where there are fewer opportunities for collaboration. However, this independence is a major developmental process in residency training, and thus, may be above the training level for which these simulations were targeted. Additionally, health care systems are interdisciplinary and collaborative. Expecting completely delayed feedback on patient care may be unrealistic considering shift hand-offs, pharmacists, nurses, medical record alerts, and other checks. 
Furthermore, the difference between delayed feedback for correct and incorrect treatments may offer different learning experiences for learners of different skills; it is unclear whether these different learning experiences would be inequitable or if they would offer new opportunities for learning (ie, learners who know the proper treatment are ready to practice independently, whereas learners still mastering treatment protocols are not forced to practice independently when they are not ready).</p><p>Another consideration is that different prompt engineering could resolve this variability. A prompt could include the instructions to &#x201C;not give feedback until the end of the scenario, and instead of giving feedback on incorrect answers, make the patient more unstable as a sign that the choice was incorrect, then explain this incorrect choice in the summary feedback at the end of the simulation.&#x201D; Alternatively, because ChatGPT&#x2019;s algorithm learns from user inputs, learners running simulations in the same chat window could give ChatGPT feedback as they run simulations to hone in on their desired parameters, including desired feedback mechanisms. Future studies will need to evaluate differential prompt engineering, learner experiences, and learner preferences.</p><p>Of note, ChatGPT also gave hints to guide the tester after incorrect inputs. For example, ChatGPT could change from asking about the &#x201C;next step in management&#x201D; to asking about the &#x201C;next step in controlling bronchoconstriction,&#x201D; thus nudging the tester toward administering a bronchodilator. This feature, as well as not delaying feedback after incorrect inputs, might be ChatGPT exhibiting flow, the educational and gaming principle that tasks should be made difficult enough to optimally challenge the learner, but not so difficult that the learner quits [<xref ref-type="bibr" rid="ref39">39</xref>]. 
Again, further studies will need to evaluate learner experiences and preferences.</p><p>Comprehensive feedback was not significantly different between correct and incorrect simulations, with an overall rate of 78% (282/360). This high performance is encouraging, as comprehensive feedback offers more detailed learning opportunities and touchpoints for self-study. While this study did not explore the possibility of learners requesting expanded feedback, doing so could potentially increase the rate of comprehensive feedback to close to 100%, thereby enhancing the educational value of the simulations. ChatGPT concluded autonomously at similar rates to comprehensive feedback, which is also encouraging. Autonomous conclusion prevents learners from becoming &#x201C;trapped&#x201D; in the simulation, primed by the continuing simulation to believe there is more treatment needed even when the patient is stable and treatment should conclude. While autonomous conclusion at 100% would be best for efficiency and ease of use, forcing a learner to decide when to end the simulation can also mimic the testing style of oral examinations used by some medical specialties for board certification. This is a skill expected of advanced learners who have completed residency, but earlier learners could be easily prompted to end the simulation when they feel they have completed their treatments. Proper instructions and framing could address this problem from the prompt generator&#x2019;s perspective.</p><p>ChatGPT-4 was not significantly more likely to conclude simulations autonomously and provide comprehensive feedback when feedback was delayed. 
This suggests that ChatGPT-4 does not necessarily perform desirable actions in clusters, meaning that learners who make mistakes are not at risk of losing some of the simulation&#x2019;s desirable characteristics.</p></sec><sec id="s4-4"><title>Future Work</title><p>This study focused on a single mistake in the treatment workflow, but future studies should look at performance with multiple mistakes. ChatGPT-4&#x2019;s ability to display desirable simulation parameters in the presence of multiple mistakes would be crucial for it to be a viable educational tool; real learners may make multiple mistakes of varying severity, and ChatGPT-4 would need to handle this. As is, the feasibility of using this tool given ChatGPT&#x2019;s varied performance on advanced parameters is still unclear. Medical educators may be hesitant to use a tool that is not completely standardized and risks inequitable learning outcomes, while others may welcome a tool that seemingly adapts to the learner&#x2019;s level. Quantified student learning gains after simulation use are thus needed for educators to make informed decisions in their classrooms and simulation centers and are the subject of ongoing work. However, with the aforementioned testing, we foresee the following potential implementation: educators would use the most advanced version of ChatGPT available and tailor the prompt provided here to their students&#x2019; learning goals. Simulations would focus on simple diagnostic or therapeutic problems with straightforward algorithms. Simulations would be paired with robust pre- or postsimulation didactics to reinforce simulation concepts and not be the sole method of instruction. 
Ideally, simulations would be overseen by an experienced educator capable of monitoring simulations for accuracy.</p><p>Future work should focus on measuring student performance after using simulations to assess educational utility, trialing simulations on different LLMs to assess if the best LLM exists for this purpose, and trialing LLM performance on increasingly complex, nonalgorithmic simulations with multiple wrong answers. All of these are important for this tool&#x2019;s external validity.</p></sec><sec id="s4-5"><title>Limitations</title><p>This study has several limitations which should be addressed. First, the simulations were limited to a single clinical scenario&#x2014;acute asthma exacerbation&#x2014;which may not be representative of ChatGPT-4&#x2019;s performance across a wider range of medical conditions. More complex scenarios reflecting the breadth of clinical practice and training level cannot be assumed to function as the simulations provided here. Our simulations were also run on only 1 prompt to limit confounders, but different prompts may produce varied results. Additionally, the study only compared correct treatment and 1 type of incorrect treatment, excluding a broader spectrum of clinical inaccuracies that may occur in real-world educational settings. Furthermore, simulations were run on 3 different ChatGPT-4 accounts and across different browsers, which, although necessary due to token limits, might introduce variability in the model&#x2019;s responses. Finally, the results of this study are contingent on the current application program interface settings established by ChatGPT and each new update of the LLM can potentially impact reproducibility. 
However, this does not detract from this study&#x2019;s conclusions given its intent to establish the methodology for evaluating simulations, rather than being a definitive recommendation for implementing LLM into preclinical medical education.</p></sec><sec id="s4-6"><title>Conclusions</title><p>The 9-part model described here can be useful in evaluating ChatGPT simulations as a learning tool in accordance with established principles of medical simulation and multimedia design in educational technology. The use of this model can help standardize the evaluation of LLM simulation research. As an example use of this model, ChatGPT performed well in creating practice clinical scenarios that adhered to 3 simulation parameter categories. It performed excellently on 2 primary features, medical accuracy and basic simulation parameters, and reasonably well on advanced parameters. Variation in simulation characteristics based on the accuracy of learner inputs is a point of concern. This tool&#x2019;s impact on student learning is an important next step to explore, but these simulations demonstrate ChatGPT&#x2019;s potential to be a reliable educational tool for the appropriate preclinical-to-clinical learning level.</p></sec></sec></body><back><notes><sec><title>Data Availability</title><p>All data generated or analyzed during this study are included in this published article and in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">RISE</term><def><p>role, input, steps, expectations</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nakao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Miki</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nakamura</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Capability of GPT-4V(ision) in the Japanese National Medical Licensing Examination: evaluation study</article-title><source>JMIR Med Educ</source><year>2024</year><month>03</month><day>12</day><volume>10</volume><fpage>e54393</fpage><pub-id pub-id-type="doi">10.2196/54393</pub-id><pub-id pub-id-type="medline">38470459</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: 
potential for AI-assisted medical education using large language models</article-title><source>PLOS Digital Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>OY</given-names> </name><name name-style="western"><surname>Connolly</surname><given-names>ID</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT and GPT-4 on Neurosurgery written board examinations</article-title><source>Neurosurgery</source><year>2023</year><month>12</month><day>1</day><volume>93</volume><issue>6</issue><fpage>1353</fpage><lpage>1365</lpage><pub-id pub-id-type="doi">10.1227/neu.0000000000002632</pub-id><pub-id pub-id-type="medline">37581444</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Vaid</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments</article-title><source>Sci Rep</source><year>2023</year><month>10</month><day>1</day><volume>13</volume><issue>1</issue><fpage>16492</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-43436-9</pub-id><pub-id pub-id-type="medline">37779171</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Mohammad</surname><given-names>B</given-names> </name><name name-style="western"><surname>Supti</surname><given-names>T</given-names> </name><name name-style="western"><surname>Alzubaidi</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The pros and cons of using ChatGPT in medical education: a scoping review</article-title><source>Stud Health Technol Inform</source><year>2023</year><month>06</month><day>29</day><volume>305</volume><fpage>644</fpage><lpage>647</lpage><pub-id pub-id-type="doi">10.3233/SHTI230580</pub-id><pub-id pub-id-type="medline">37387114</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boscardin</surname><given-names>CK</given-names> </name><name name-style="western"><surname>Gin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Golde</surname><given-names>PB</given-names> </name><name name-style="western"><surname>Hauer</surname><given-names>KE</given-names> </name></person-group><article-title>ChatGPT and generative artificial intelligence for medical education: potential impact and opportunity</article-title><source>Acad Med</source><year>2024</year><month>01</month><day>1</day><volume>99</volume><issue>1</issue><fpage>22</fpage><lpage>27</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000005439</pub-id><pub-id pub-id-type="medline">37651677</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khan</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Jawaid</surname><given-names>M</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Sajjad</surname><given-names>M</given-names> 
</name></person-group><article-title>ChatGPT&#x2014;reshaping medical education and clinical management</article-title><source>Pak J Med Sci</source><year>2023</year><volume>39</volume><issue>2</issue><fpage>605</fpage><lpage>607</lpage><pub-id pub-id-type="doi">10.12669/pjms.39.2.7653</pub-id><pub-id pub-id-type="medline">36950398</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scherr</surname><given-names>R</given-names> </name><name name-style="western"><surname>Halaseh</surname><given-names>FF</given-names> </name><name name-style="western"><surname>Spina</surname><given-names>A</given-names> </name><name name-style="western"><surname>Andalib</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rivera</surname><given-names>R</given-names> </name></person-group><article-title>ChatGPT interactive medical simulations for early clinical education: case study</article-title><source>JMIR Med Educ</source><year>2023</year><month>11</month><day>10</day><volume>9</volume><fpage>e49877</fpage><pub-id pub-id-type="doi">10.2196/49877</pub-id><pub-id pub-id-type="medline">37948112</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Halaseh</surname><given-names>FF</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Danza</surname><given-names>CN</given-names> </name><name name-style="western"><surname>Halaseh</surname><given-names>R</given-names> </name><name name-style="western"><surname>Spiegelman</surname><given-names>L</given-names> </name></person-group><article-title>ChatGPT&#x2019;s role in improving education among patients seeking emergency medical treatment</article-title><source>West J Emerg 
Med</source><year>2024</year><month>09</month><volume>25</volume><issue>5</issue><fpage>845</fpage><lpage>855</lpage><pub-id pub-id-type="doi">10.5811/westjem.18650</pub-id><pub-id pub-id-type="medline">39319818</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>B</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>A</given-names> </name></person-group><article-title>Embracing ChatGPT for medical education: exploring its impact on doctors and medical students</article-title><source>JMIR Med Educ</source><year>2024</year><month>04</month><day>10</day><volume>10</volume><fpage>e52483</fpage><pub-id pub-id-type="doi">10.2196/52483</pub-id><pub-id pub-id-type="medline">38598263</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Araujo</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Cruz-Correia</surname><given-names>R</given-names> </name></person-group><article-title>Incorporating ChatGPT in medical informatics education: mixed methods study on student perceptions and experiential integration proposals</article-title><source>JMIR Med Educ</source><year>2024</year><month>03</month><day>20</day><volume>10</volume><fpage>e51151</fpage><pub-id pub-id-type="doi">10.2196/51151</pub-id><pub-id pub-id-type="medline">38506920</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sallam</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title><source>Healthcare (Basel)</source><year>2023</year><month>03</month><day>19</day><volume>11</volume><issue>6</issue><fpage>887</fpage><pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id><pub-id pub-id-type="medline">36981544</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alkhaaldi</surname><given-names>SMI</given-names> </name><name name-style="western"><surname>Kassab</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Dimassi</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Medical student experiences and perceptions of chatgpt and artificial intelligence: cross-sectional study</article-title><source>JMIR Med Educ</source><year>2023</year><month>12</month><day>22</day><volume>9</volume><fpage>e51302</fpage><pub-id pub-id-type="doi">10.2196/51302</pub-id><pub-id pub-id-type="medline">38133911</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Skryd</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lawrence</surname><given-names>K</given-names> </name></person-group><article-title>ChatGPT as a tool for medical education and clinical decision-making on the wards: case study</article-title><source>JMIR Form Res</source><year>2024</year><month>05</month><day>8</day><volume>8</volume><fpage>e51346</fpage><pub-id pub-id-type="doi">10.2196/51346</pub-id><pub-id 
pub-id-type="medline">38717811</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>HW</given-names> </name><name name-style="western"><surname>Hong</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Nam</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>KY</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>JI</given-names> </name></person-group><article-title>Medical students&#x2019; perceived stress and perceptions regarding clinical clerkship during the COVID-19 pandemic</article-title><source>PLoS ONE</source><year>2022</year><volume>17</volume><issue>10</issue><fpage>e0277059</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0277059</pub-id><pub-id pub-id-type="medline">36315569</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heston</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Khun</surname><given-names>C</given-names> </name></person-group><article-title>Prompt engineering in medical education</article-title><source>IME</source><year>2023</year><volume>2</volume><issue>3</issue><fpage>198</fpage><lpage>205</lpage><pub-id pub-id-type="doi">10.3390/ime2030019</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Musallam</surname><given-names>E</given-names> </name><name name-style="western"><surname>Alhaj Ali</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Alkhafaji</surname><given-names>M</given-names> </name></person-group><article-title>OpenAI&#x2019;s ChatGPT clinical simulation: an innovative teaching strategy for clinical nursing education</article-title><source>Nurse Educ</source><year>2024</year><volume>49</volume><issue>6</issue><fpage>E361</fpage><lpage>E362</lpage><pub-id pub-id-type="doi">10.1097/NNE.0000000000001657</pub-id><pub-id pub-id-type="medline">38728116</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lucas</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Upperman</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>JR</given-names> </name></person-group><article-title>A systematic review of large language models and their implications in medical education</article-title><source>Med Educ</source><year>2024</year><month>11</month><volume>58</volume><issue>11</issue><fpage>1276</fpage><lpage>1285</lpage><pub-id pub-id-type="doi">10.1111/medu.15402</pub-id><pub-id pub-id-type="medline">38639098</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeyaraman</surname><given-names>M</given-names> </name><name name-style="western"><surname>K</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Jeyaraman</surname><given-names>N</given-names> </name><name name-style="western"><surname>Nallakumarasamy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yadav</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bondili</surname><given-names>SK</given-names> </name></person-group><article-title>ChatGPT in medical education and research: a boon or a 
bane?</article-title><source>Cureus</source><year>2023</year><month>08</month><volume>15</volume><issue>8</issue><fpage>e44316</fpage><pub-id pub-id-type="doi">10.7759/cureus.44316</pub-id><pub-id pub-id-type="medline">37779749</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van de Ridder</surname><given-names>JMM</given-names> </name><name name-style="western"><surname>Shoja</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Rajput</surname><given-names>V</given-names> </name></person-group><article-title>Finding the place of ChatGPT in medical education</article-title><source>Acad Med</source><year>2023</year><month>08</month><day>1</day><volume>98</volume><issue>8</issue><fpage>867</fpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000005254</pub-id><pub-id pub-id-type="medline">37162206</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Clark</surname><given-names>RC</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>Multimedia learning in e-courses</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>842</fpage><lpage>882</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.040</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wylie</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Chi</surname><given-names>MTH</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>The self-explanation principle in multimedia learning</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>413</fpage><lpage>432</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.021</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Luetner</surname><given-names>D</given-names> </name><name name-style="western"><surname>Schmeck</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>The generative activity principle in multimedia learning</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>433</fpage><lpage>448</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.022</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name 
name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>Cognitive theory of multimedia learning</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>43</fpage><lpage>71</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.005</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Low</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sweller</surname><given-names>J</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>The modality principle in multimedia learning</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>227</fpage><lpage>246</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.012</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Plass</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Schwartz</surname><given-names>RN</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>Multimedia learning with simulations</article-title><source>The Cambridge Handbook of Multimedia 
Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>729</fpage><lpage>761</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.036</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>Principles for reducing extraneous processing in multimedia learning</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>279</fpage><lpage>315</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.015</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>CI</given-names> </name><name name-style="western"><surname>Priest</surname><given-names>HA</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>The feedback principle in multimedia learning</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>449</fpage><lpage>463</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.023</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stefanidis</surname><given-names>D</given-names> 
</name><name name-style="western"><surname>Cook</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kalantar-Motamedi</surname><given-names>SM</given-names> </name><etal/></person-group><article-title>Society for simulation in healthcare guidelines for simulation training</article-title><source>Simul Healthc</source><year>2024</year><month>01</month><day>1</day><volume>19</volume><issue>1S</issue><fpage>S4</fpage><lpage>S22</lpage><pub-id pub-id-type="doi">10.1097/SIH.0000000000000776</pub-id><pub-id pub-id-type="medline">38240614</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>van Gog</surname><given-names>T</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>The signaling (or cueing) principle in multimedia learning</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>263</fpage><lpage>278</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.014</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Feldon</surname><given-names>DF</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>S</given-names> </name><name name-style="western"><surname>Clark</surname><given-names>RE</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name 
name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>Fifteen common but questionable principles of multimedia learning</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2021</year><edition>3</edition><publisher-name>Cambridge University Press</publisher-name><fpage>25</fpage><lpage>40</lpage><pub-id pub-id-type="doi">10.1017/9781108894333.005</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dabbs</surname><given-names>W</given-names> </name><name name-style="western"><surname>Bradley</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Chamberlin</surname><given-names>SM</given-names> </name></person-group><article-title>Acute asthma exacerbations: management strategies</article-title><source>Am Fam Physician</source><year>2024</year><month>01</month><volume>109</volume><issue>1</issue><fpage>43</fpage><lpage>50</lpage><pub-id pub-id-type="medline">38227870</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat 
Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alkaissi</surname><given-names>H</given-names> </name><name name-style="western"><surname>McFarlane</surname><given-names>SI</given-names> </name></person-group><article-title>Artificial hallucinations in ChatGPT: implications in scientific writing</article-title><source>Cureus</source><year>2023</year><month>02</month><volume>15</volume><issue>2</issue><fpage>e35179</fpage><pub-id pub-id-type="doi">10.7759/cureus.35179</pub-id><pub-id pub-id-type="medline">36811129</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hattie</surname><given-names>J</given-names> </name><name name-style="western"><surname>Timperley</surname><given-names>H</given-names> </name></person-group><article-title>The power of feedback</article-title><source>Rev Educ Res</source><year>2007</year><month>03</month><volume>77</volume><issue>1</issue><fpage>81</fpage><lpage>112</lpage><pub-id pub-id-type="doi">10.3102/003465430298487</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chamberland</surname><given-names>M</given-names> </name><name name-style="western"><surname>Setrakian</surname><given-names>J</given-names> </name><name name-style="western"><surname>St-Onge</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bergeron</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Mamede</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>HG</given-names> </name></person-group><article-title>Does providing the correct diagnosis as feedback after self-explanation improve medical students diagnostic performance?</article-title><source>BMC Med Educ</source><year>2019</year><month>06</month><day>11</day><volume>19</volume><issue>1</issue><fpage>194</fpage><pub-id pub-id-type="doi">10.1186/s12909-019-1638-3</pub-id><pub-id pub-id-type="medline">31185971</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kalyuga</surname><given-names>S</given-names></name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mayer</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Fiorella</surname><given-names>L</given-names> </name></person-group><article-title>The expertise reversal principle in multimedia learning</article-title><source>The Cambridge Handbook of Multimedia Learning</source><year>2014</year><edition>2</edition><publisher-name>Cambridge University Press</publisher-name><fpage>576</fpage><lpage>597</lpage><pub-id pub-id-type="doi">10.1017/CBO9781139547369.028</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Csikszentmihalyi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abuhamdeh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nakamura</surname><given-names>J</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Csikszentmihalyi</surname><given-names>M</given-names> </name></person-group><source>Flow and the Foundations of 
Positive Psychology: The Collected Works of Mihaly Csikszentmihalyi</source><year>2014</year><publisher-name>Springer</publisher-name><fpage>227</fpage><lpage>238</lpage><pub-id pub-id-type="doi">10.1007/978-94-017-9088-8_15</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary files.</p><media xlink:href="formative_v9i1e66478_app1.xlsx" xlink:title="XLSX File, 62 KB"/></supplementary-material></app-group></back></article>