<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e84904</article-id><article-id pub-id-type="doi">10.2196/84904</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Fine-Tuned Large Language Models for Generating Multiple-Choice Questions in Anesthesiology: Psychometric Comparison With Faculty-Written Items</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>H&#x00F6;lzing</surname><given-names>Carlos Ramon</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Meynhardt</surname><given-names>Charlotte</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Meybohm</surname><given-names>Patrick</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>K&#x00F6;nig</surname><given-names>Sarah</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kranke</surname><given-names>Peter</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Anaesthesiology, Intensive Care, Emergency and Pain Medicine, University Hospital W&#x00FC;rzburg</institution><addr-line>Oberd&#x00FC;rrbacher Str. 6</addr-line><addr-line>W&#x00FC;rzburg</addr-line><country>Germany</country></aff><aff id="aff2"><institution>Institute of Medical Teaching and Medical Education Research, University Hospital W&#x00FC;rzburg</institution><addr-line>W&#x00FC;rzburg</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Rezigalla</surname><given-names>Assad Ali</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ding</surname><given-names>Liang</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Carlos Ramon H&#x00F6;lzing, MD, Department of Anaesthesiology, Intensive Care, Emergency and Pain Medicine, University Hospital W&#x00FC;rzburg, Oberd&#x00FC;rrbacher Str. 
6, W&#x00FC;rzburg, 97080, Germany; <email>hoelzing_c@ukw.de</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>18</day><month>2</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e84904</elocation-id><history><date date-type="received"><day>26</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>18</day><month>12</month><year>2025</year></date><date date-type="accepted"><day>24</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Carlos Ramon H&#x00F6;lzing, Charlotte Meynhardt, Patrick Meybohm, Sarah K&#x00F6;nig, Peter Kranke. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 18.2.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e84904"/><abstract><sec><title>Background</title><p>Multiple-choice questions (MCQs) are widely used in medical education to ensure standardized and objective assessment. 
Developing high-quality items requires both subject expertise and methodological rigor. Large language models (LLMs) offer new opportunities for automated item generation. However, most evaluations rely on general-purpose prompting, and psychometric comparisons with faculty-written items remain scarce.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate whether a fine-tuned LLM can generate MCQs (Type A) in anesthesiology with psychometric properties comparable to those written by expert faculty.</p></sec><sec sec-type="methods"><title>Methods</title><p>The study was embedded in the regular written anesthesiology examination of the eighth-semester medical curriculum with 157 students. The examination comprised 30 single best-answer MCQs, of which 15 were generated by senior faculty and 15 by a fine-tuned GPT-based model. A custom GPT-based (GPT-4) model was adapted with anesthesiology lecture slides, the National Competence-Based Learning Objectives Catalogue (NKLM 2.0), past examination questions, and faculty publications using supervised instruction-tuning with standardized prompt&#x2013;response pairs. Item analysis followed established psychometric standards.</p></sec><sec sec-type="results"><title>Results</title><p>In total, 29 items (14 expert, 15 LLM-generated) were analyzed. Expert-generated questions had a mean difficulty of 0.81 (SD 0.19), point-biserial correlation of 0.16 (SD 0.07), and discrimination index of 0.09 (SD 0.08). LLM-generated items had a mean difficulty of 0.79 (SD 0.18), point-biserial correlation of 0.17 (SD 0.04), and discrimination index of 0.08 (SD 0.11). Mann-Whitney <italic>U</italic> tests revealed no significant differences between expert- and LLM-generated items for difficulty (<italic>P</italic>=.38), point-biserial correlation coefficient (<italic>P</italic>=.96), or discrimination index (<italic>P</italic>=.59). Categorical analyses confirmed no significant group differences. 
Both sets, however, showed only modest psychometric quality.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Supervised fine-tuned LLMs are capable of generating MCQs with psychometric properties comparable to those written by experienced faculty. Given the limitations and cohort-dependency of psychometric indices, automated item generation should be considered a complement rather than a replacement for manual item writing. Further research with larger item sets and multi-institutional validation is needed to confirm generalizability and optimize integration of LLM-based tools into assessment development.</p></sec></abstract><kwd-group><kwd>medical education</kwd><kwd>multiple-choice questions</kwd><kwd>large language models</kwd><kwd>fine-tuning</kwd><kwd>psychometrics</kwd><kwd>assessment</kwd><kwd>anesthesiology</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Multiple-choice questions (MCQs) are fundamental to the objective assessment of medical students. They allow standardized testing across large cohorts and play a central role in evaluating foundational and applied knowledge [<xref ref-type="bibr" rid="ref1">1</xref>]. However, the development of high-quality MCQs demands not only deep domain knowledge but also significant methodological and didactic expertise [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. Effective items must balance appropriate difficulty, plausible distractors, minimal cueing, and strong discriminatory power to differentiate between varying levels of student performance [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Recent advances in artificial intelligence (AI), particularly large language models (LLMs), offer novel tools for automated question generation. 
For instance, efforts comparing ChatGPT-3.5&#x2013;generated MCQs with expert-written items in neurophysiology revealed similar difficulty levels but lower discriminatory power in LLM-generated questions [<xref ref-type="bibr" rid="ref5">5</xref>]. A systematic review of LLM use in medical MCQ generation found that while LLMs can produce examination-relevant items, many require additional modification due to quality issues [<xref ref-type="bibr" rid="ref6">6</xref>]. Other studies highlight linguistic and structural shortcomings in automatically generated MCQs, particularly regarding distractor plausibility and alignment with instructional content [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>Recent domain-specific efforts such as Hypnos [<xref ref-type="bibr" rid="ref9">9</xref>], CDGen [<xref ref-type="bibr" rid="ref10">10</xref>], and the Chinese Anesthesiology Benchmark [<xref ref-type="bibr" rid="ref11">11</xref>] have demonstrated that LLMs can be effectively fine-tuned or benchmarked within anesthesiology. However, these studies primarily focus on domain adaptation and benchmark performance rather than psychometric validation of automatically generated examination items. To address this gap, a GPT-based model was adapted using anesthesia-specific teaching materials, the National Competence-Based Learning Objectives Catalogue in Medicine (NKLM 2.0), past examination items, and faculty publications [<xref ref-type="bibr" rid="ref12">12</xref>]. 
Item development for both expert- and AI-generated questions was systematically mapped to the NKLM 2.0, Bloom&#x2019;s taxonomy, and the local examination blueprint to ensure comprehensive curricular coverage and to allow a fair psychometric comparison.</p><p>This study aimed to evaluate whether a fine-tuned LLM can generate MCQs (Type A) in anesthesiology with psychometric properties comparable to those written by expert faculty.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This study analyzed the performance of MCQs used in the regular written examinations of anesthesiology in the eighth semester of medical training with 157 students. The examination consisted of 30 items. Half of the items (n=15) were written by senior faculty members, and half (n=15) were generated by a fine-tuned LLM. Nine faculty members from the Department of Anesthesiology, each with at least 10 years of experience, participated in item creation. All had prior training in assessment design through institutional workshops on multiple-choice item writing. In addition, all items were independently reviewed by an educational specialist with a Master of Medical Education degree to ensure adherence to established item-writing principles. Faculty were aware of the study but blinded to the psychometric comparison during data collection.</p><p>Data analysis was performed fully anonymously. The participating students were regular medical students in their eighth semester. They were not informed about the origin of the examination questions and therefore did not know whether an item was generated by faculty or the LLM.</p><p>A customized GPT-based model was developed specifically for this study. The model was built as a domain-adapted instance of GPT-3.5-Turbo, configured to generate single-best-answer MCQs. 
Adaptation followed a supervised instruction-tuning approach: several hundred standardized prompt-response pairs were created using anesthesiology lecture slides, NKLM 2.0, past examination questions, and faculty publications. Faculty publications were included to capture authentic domain phrasing and ensure that the model reflected institution-specific conceptualizations of anesthetic procedures. Previous research shows that faculty development and the use of high-quality source material improve item validity and discrimination [<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>These materials were curated to align with Bloom&#x2019;s taxonomy and national curricular requirements [<xref ref-type="bibr" rid="ref14">14</xref>]. The fine-tuning pipeline can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Item analysis followed established psychometric standards. Difficulty was defined as the mean proportion of correct responses (0&#x2010;1). Values between 0.30 and 0.70 are generally considered optimal, those greater than 0.70 indicate easy items, and those less than 0.30 indicate difficult items [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. The point-biserial correlation was classified as follows: negative correlation (<italic>r</italic>&#x003C;0), very low correlation (0&#x2264;<italic>r</italic>&#x003C;0.10), low correlation (0.10&#x2264;<italic>r</italic>&#x2264;0.20), and acceptable correlation (<italic>r</italic>&#x2265;0.20) [<xref ref-type="bibr" rid="ref15">15</xref>]. The discrimination index (D) was calculated as the difference in difficulty between the upper and lower 27% performance groups, with values of 0.40 and above considered excellent; 0.30&#x2010;0.39, good; 0.20&#x2010;0.29, acceptable; and those less than 0.20, poor [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. 
Statistical analysis was performed using SPSS Statistics version 27 (IBM Corp). Graphs were created with Prism 9 (GraphPad Software). Nominal variables were summarized as counts and percentages. The Shapiro-Wilk test was used to test for normal distribution. Group comparisons of categorical data were performed with the chi-square test or Fisher exact test if expected frequencies were less than 5. Continuous data were reported as mean and SD values and compared using the Mann-Whitney <italic>U</italic> test. A significance level of <italic>P</italic>&#x2264;.05 was applied.</p><p><xref ref-type="fig" rid="figure1">Figure 1</xref> summarizes the item-generation workflow, including consolidated inputs, supervised instruction-tuning, the custom GPT MCQ generator, and parallel faculty-written items converging into the examination.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>The study was submitted to the Ethics Committee of the University of W&#x00FC;rzburg, which confirmed (reference number 2024-258-ka on November 11, 2024) that no formal review was required and that no ethical objections were raised.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Item-generation workflow. Consolidated inputs (anesthesiology lecture slides, NKLM 2.0, prior examination items, faculty publications) inform supervised instruction-tuning of a custom GPT configured for single-best-answer MCQs to produce 15 AI-generated questions. In parallel, faculty authored 15 questions. AI: artificial intelligence; MCQ: multiple-choice question.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e84904_fig01.png"/></fig></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>A total of 30 MCQs were analyzed. 
One expert-generated item was excluded from analysis due to a strongly negative discrimination index (&#x2212;0.22) and negative point-biserial correlation (&#x2212;0.20). Its difficulty (<italic>P</italic>=0.86) indicated a ceiling effect, suggesting that most students answered it correctly despite unclear key wording.</p><p>The final dataset therefore included 14 expert-generated and 15 AI-generated items. <xref ref-type="table" rid="table1">Table 1</xref> displays the descriptive metrics for expert- and AI-generated items. Expert-generated items showed a mean difficulty of 0.81 (SD 0.19), a mean point-biserial correlation of 0.16 (SD 0.07), and a mean discrimination index of 0.09 (SD 0.08). AI-generated items had a mean difficulty of 0.79 (SD 0.18), a mean point-biserial correlation of 0.17 (SD 0.04), and a mean discrimination index of 0.08 (SD 0.11). Mann-Whitney <italic>U</italic> tests indicated no significant differences between expert- and AI-generated items with respect to difficulty (<italic>P</italic>=.38), point-biserial correlation (<italic>P</italic>=.96), or discrimination index (<italic>P</italic>=.59; <xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Overview of question metrics by expert and artificial intelligence (AI).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question created by</td><td align="left" valign="bottom">Questions, n</td><td align="left" valign="bottom" colspan="3">Metrics</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Minimum</td><td align="left" valign="bottom">Maximum</td><td align="left" valign="bottom">Mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Expert</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" 
valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Difficulty</td><td align="left" valign="top">&#x2003;14</td><td align="left" valign="top">&#x2003;0.48</td><td align="left" valign="top">&#x2003;0.99</td><td align="left" valign="top">&#x2003;0.81 (0.19)</td></tr><tr><td align="left" valign="top">&#x2003;Point-biserial correlation</td><td align="left" valign="top">&#x2003;14</td><td align="left" valign="top">&#x2003;&#x2212;0.02</td><td align="left" valign="top">&#x2003;0.20</td><td align="left" valign="top">&#x2003;0.16 (0.07)</td></tr><tr><td align="left" valign="top">&#x2003;Discrimination index</td><td align="left" valign="top">&#x2003;14</td><td align="left" valign="top">&#x2003;0.01</td><td align="left" valign="top">&#x2003;0.24</td><td align="left" valign="top">&#x2003;0.09 (0.08)</td></tr><tr><td align="left" valign="top">AI</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Difficulty</td><td align="left" valign="top">&#x2003;15</td><td align="left" valign="top">&#x2003;0.44</td><td align="left" valign="top">&#x2003;0.98</td><td align="left" valign="top">&#x2003;0.79 (0.18)</td></tr><tr><td align="left" valign="top">&#x2003;Point-biserial correlation</td><td align="left" valign="top">&#x2003;15</td><td align="left" valign="top">&#x2003;0.08</td><td align="left" valign="top">&#x2003;0.25</td><td align="left" valign="top">&#x2003;0.17 (0.04)</td></tr><tr><td align="left" valign="top">&#x2003;Discrimination index</td><td align="left" valign="top">&#x2003;15</td><td align="left" valign="top">&#x2003;&#x2212;0.07</td><td align="left" valign="top">&#x2003;0.33</td><td align="left" valign="top">&#x2003;0.08 (0.11)</td></tr></tbody></table></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Psychometric characteristics of AI- and expert-generated multiple-choice items<italic>.</italic> (A) Item 
difficulty, (B) point-biserial correlation, and (C) discrimination index are displayed for each question. AI: artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e84904_fig02.png"/></fig><p>Reference ranges are as follows:</p><list list-type="bullet"><list-item><p>Difficulty (<italic>P</italic>)=0.30-0.70 is considered desirable;</p></list-item><list-item><p>Discrimination (<italic>r</italic><sub>pb</sub>)&#x2265;0.20 is considered acceptable;</p></list-item><list-item><p><italic>r</italic><sub>pb</sub>&#x003C;0 indicates flawed items [<xref ref-type="bibr" rid="ref17">17</xref>].</p></list-item></list><p>The categorical distributions of item properties are summarized in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Psychometric characteristics of expert- and LLM<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>-generated items displayed side by side for difficulty, point-biserial correlation, and discrimination index.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">Questions, n (%)</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Expert-generated (n=14)</td><td align="left" valign="bottom">LLM-generated (n=15)</td></tr></thead><tbody><tr><td align="left" valign="top">Difficulty categories</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Extremely difficult (<italic>P</italic>&#x2264;0.25)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">&#x2003;Tends to be difficult (0.25&#x003C;<italic>P</italic>&#x2264;0.4)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td 
align="left" valign="top">&#x2003;Optimal difficulty (0.4&#x003C;<italic>P</italic>&#x2264;0.8)</td><td align="left" valign="top">5 (35.7)</td><td align="left" valign="top">6 (40.0)</td></tr><tr><td align="left" valign="top">&#x2003;Very simple (0.8&#x003C;<italic>P</italic>&#x2264;0.9)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">3 (20.0)</td></tr><tr><td align="left" valign="top">&#x2003;Extremely simple (<italic>P</italic>&#x003E;0.9)</td><td align="left" valign="top">9 (64.3)</td><td align="left" valign="top">6 (40.0)</td></tr><tr><td align="left" valign="top">Point-biserial correlation categories</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Negative correlation (<italic>r</italic>&#x003C;0)</td><td align="left" valign="top">1 (7.1)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">&#x2003;Very low correlation (0&#x2264;<italic>r</italic>&#x003C;0.10)</td><td align="left" valign="top">0 (0)</td><td align="left" 
valign="top">1 (6.7)</td></tr><tr><td align="left" valign="top">&#x2003;Low correlation (0.1&#x2264;r&#x2264;0.20)</td><td align="left" valign="top">8 (57.1)</td><td align="left" valign="top">10 (66.7)</td></tr><tr><td align="left" valign="top">&#x2003;Acceptable correlation (<italic>r</italic>&#x2265;0.20)</td><td align="left" valign="top">5 (35.7)</td><td align="left" valign="top">4 (26.7)</td></tr><tr><td align="left" valign="top">Discrimination index categories</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Urgent need for revision (D&#x2019;&#x003C;0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">3 (20.0)</td></tr><tr><td align="left" valign="top">&#x2003;Need for revision (0&#x2264;D&#x2019;&#x003C;0.2)</td><td align="left" valign="top">12 (85.7)</td><td align="left" valign="top">11 (73.3)</td></tr><tr><td align="left" valign="top">&#x2003;Check required (0.2&#x2264;D&#x2019;&#x003C;0.3)</td><td align="left" valign="top">2 (14.3)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">&#x2003;Potential for improvement (0.3&#x2264;D&#x2019;&#x003C;0.4)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (6.7)</td></tr><tr><td align="left" valign="top">&#x2003;Item effectively distinguishes (D&#x2019;&#x2265;0.4)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we compared psychometric properties (difficulty, point-biserial correlation, and discrimination) of MCQs generated by a supervised fine-tuned LLM with those written by expert faculty in an undergraduate anesthesiology examination. 
Although no statistically significant differences were observed, the overall quality of both item sets remained moderate. The point-biserial correlations and discrimination indices suggest that neither set reliably distinguishes higher- from lower-performing students, a finding consistent with previous research indicating that even expert-authored items often underperform in psychometric analyses [<xref ref-type="bibr" rid="ref18">18</xref>]. This pattern aligns with broader evidence in medical education, where cohort studies have demonstrated that AI-generated MCQs often achieve discrimination indices similar to expert-generated items but tend to be easier overall and still require expert review to ensure distractor plausibility and alignment with higher-order learning objectives [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>Supervised adaptation with domain-specific materials likely contributed to the close alignment of psychometric indices between AI and faculty-written items. Other work shows that when AI-mediated question generation is guided by domain content, structured prompts, or instruction tuning, the output more closely resembles faculty items in both difficulty and discrimination [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Notably, neither item set in our study consistently achieved high point-biserial correlation or discrimination, confirming that generating functionally effective distractors remains a challenge for both experts and LLMs [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. 
Prior studies have similarly identified that AI items often underperform in assessing higher cognitive levels or using plausible distractors without ambiguity [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>The absence of psychometric superiority in either group suggests that AI-assisted question generation can produce items of comparable statistical quality to traditional item writing. However, psychometric analysis alone is insufficient for examination quality assurance; human oversight remains essential to safeguard content validity, blueprint alignment, and cognitive level coverage. Studies in high-stakes examination settings show that expert review reduces factual inaccuracies and improves alignment with assessment blueprints [<xref ref-type="bibr" rid="ref31">31</xref>]. Importantly, in our study, the fine-tuned LLM generated all 15 candidate items within a few minutes. While we evaluated only a subset psychometrically, our study demonstrates that domain-adapted LLMs support rapid item drafting at scale. Automatic item generation methods have long promised efficiency gains by expanding item pools from templates rather than crafting each item manually [<xref ref-type="bibr" rid="ref32">32</xref>]. Recent AI studies show that LLM-based MCQ generation can approach human performance while drastically reducing human effort [<xref ref-type="bibr" rid="ref33">33</xref>]. In practice, educators may use LLM throughput to generate large candidate sets and then filter, refine, and align items to the blueprint and cognitive levels, shifting effort from generation toward qualitative review and validation.</p></sec><sec id="s4-2"><title>Limitations and Future Work</title><p>Study limitations include a small item sample size, single-institution administration, and fine-tuning with primarily local teaching resources, which may reduce external validity. 
Cognitive level of items (eg, recall vs application) was not measured, although comparative studies indicate this is an important differentiator between AI- vs expert-generated MCQs [<xref ref-type="bibr" rid="ref31">31</xref>]. Future work should involve larger item pools, multi-institutional validation, and systematic qualitative review of items, including stem clarity, distractor plausibility, and distractor efficiency, as well as cognitive demand. It would also be valuable to compare different fine-tuning or prompt-engineering strategies and to assess students&#x2019; perceptions of AI-generated items [<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec><sec id="s4-3"><title>Conclusion</title><p>This study demonstrates that a supervised fine-tuned LLM can generate MCQs with psychometric properties comparable to those created by experienced faculty. While neither approach consistently produced items with high point-biserial correlation or discrimination, the results indicate that automated question generation can complement traditional item writing in medical education.</p></sec></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">MCQ</term><def><p>multiple-choice question</p></def></def-item><def-item><term id="abb4">NKLM 2.0</term><def><p>National Competence-Based Learning Objectives Catalogue in Medicine</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>St-Onge</surname><given-names>C</given-names> </name><name name-style="western"><surname>Young</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Renaud</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Cummings</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Drescher</surname><given-names>O</given-names> </name><name name-style="western"><surname>Varpio</surname><given-names>L</given-names> </name></person-group><article-title>Sound practices: An exploratory study of building and monitoring multiple-choice exams at Canadian undergraduate medical education programs</article-title><source>Acad Med</source><year>2021</year><month>02</month><day>1</day><volume>96</volume><issue>2</issue><fpage>271</fpage><lpage>277</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000003659</pub-id><pub-id pub-id-type="medline">32769474</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Sideris</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Catanzano</surname><given-names>T</given-names> </name></person-group><article-title>Writing High-Quality Multiple-Choice Questions</article-title><source>Image-Based Teaching</source><year>2022</year><publisher-name>Springer</publisher-name><fpage>123</fpage><lpage>146</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-11890-6_9</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Collins</surname><given-names>J</given-names> </name></person-group><article-title>Education techniques for lifelong learning: writing multiple-choice questions for continuing medical education activities and self-assessment 
modules</article-title><source>Radiographics</source><year>2006</year><volume>26</volume><issue>2</issue><fpage>543</fpage><lpage>551</lpage><pub-id pub-id-type="doi">10.1148/rg.262055145</pub-id><pub-id pub-id-type="medline">16549616</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Caldwell</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Pate</surname><given-names>AN</given-names> </name></person-group><article-title>Effects of question formats on student and item performance</article-title><source>Am J Pharm Educ</source><year>2013</year><month>05</month><day>13</day><volume>77</volume><issue>4</issue><fpage>71</fpage><pub-id pub-id-type="doi">10.5688/ajpe77471</pub-id><pub-id pub-id-type="medline">23716739</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Laupichler</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Rother</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Grunwald Kadow</surname><given-names>IC</given-names> </name><name name-style="western"><surname>Ahmadi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Raupach</surname><given-names>T</given-names> </name></person-group><article-title>Large language models in medical education: Comparing ChatGPT- to human-generated exam questions</article-title><source>Acad Med</source><year>2024</year><month>05</month><day>1</day><volume>99</volume><issue>5</issue><fpage>508</fpage><lpage>512</lpage><pub-id pub-id-type="doi">10.1097/ACM.0000000000005626</pub-id><pub-id pub-id-type="medline">38166323</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Artsi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Konen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>G</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Large language models for generating medical examinations: systematic review</article-title><source>BMC Med Educ</source><year>2024</year><month>03</month><day>29</day><volume>24</volume><issue>1</issue><fpage>354</fpage><pub-id pub-id-type="doi">10.1186/s12909-024-05239-y</pub-id><pub-id pub-id-type="medline">38553693</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gr&#x00E9;visse</surname><given-names>C</given-names> </name><name name-style="western"><surname>Pavlou</surname><given-names>MAS</given-names> </name><name name-style="western"><surname>Schneider</surname><given-names>JG</given-names> </name></person-group><article-title>Docimological quality analysis of LLM-generated multiple choice questions in computer science and medicine</article-title><source>SN Comput Sci</source><year>2024</year><volume>5</volume><issue>5</issue><fpage>636</fpage><pub-id pub-id-type="doi">10.1007/s42979-024-02963-6</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al Shuraiqi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Aal Abdulsalam</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Masters</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zidoum</surname><given-names>H</given-names> </name><name name-style="western"><surname>AlZaabi</surname><given-names>A</given-names> </name></person-group><article-title>Automatic generation of medical case-based multiple-choice questions (MCQs): A review of methodologies, applications, evaluation, and future directions</article-title><source>BDCC</source><year>2024</year><volume>8</volume><issue>10</issue><fpage>139</fpage><pub-id pub-id-type="doi">10.3390/bdcc8100139</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhan</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Hypnos: A domain-specific large language model for anesthesiology</article-title><source>Neurocomputing</source><year>2025</year><month>04</month><volume>624</volume><fpage>129389</fpage><pub-id pub-id-type="doi">10.1016/j.neucom.2025.129389</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Fine-tuning LLMs for anesthesiology via compositional data generation</article-title><source>IEEE Trans Emerg Top Comput Intell</source><year>2025</year><volume>9</volume><issue>6</issue><fpage>4051</fpage><lpage>4065</lpage><pub-id 
pub-id-type="doi">10.1109/TETCI.2025.3567602</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Benchmarking medical LLMs on anesthesiology: A comprehensive dataset in Chinese</article-title><source>IEEE Trans Emerg Top Comput Intell</source><year>2025</year><volume>9</volume><issue>4</issue><fpage>3057</fpage><lpage>3071</lpage><pub-id pub-id-type="doi">10.1109/TETCI.2024.3502465</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Medizinischer Fakult&#x00E4;tentag</collab></person-group><source>Nationaler Kompetenzbasierter Lernzielkatalog Medizin</source><year>2025</year><access-date>2026-02-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://nklm.de/menu">https://nklm.de/menu</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Naeem</surname><given-names>N</given-names> </name><name name-style="western"><surname>van der Vleuten</surname><given-names>C</given-names> </name><name name-style="western"><surname>Alfaris</surname><given-names>EA</given-names> </name></person-group><article-title>Faculty development on item writing substantially improves item quality</article-title><source>Adv Health Sci Educ Theory Pract</source><year>2012</year><month>08</month><volume>17</volume><issue>3</issue><fpage>369</fpage><lpage>376</lpage><pub-id 
pub-id-type="doi">10.1007/s10459-011-9315-2</pub-id><pub-id pub-id-type="medline">21837548</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bloom</surname><given-names>BS</given-names> </name><etal/></person-group><source>Taxonomy of Educational Objectives</source><year>1964</year><volume>2</volume><publisher-name>Longmans</publisher-name></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Escudero</surname><given-names>EB</given-names> </name><name name-style="western"><surname>Reyna</surname><given-names>NL</given-names> </name><name name-style="western"><surname>Morales</surname><given-names>MR</given-names> </name></person-group><article-title>The level of difficulty and discrimination power of the Basic Knowledge and Skills Examination (EXHCOBA)</article-title><source>Revista Electr&#x00F3;nica de Investigaci&#x00F3;n Educativa</source><year>2000</year><access-date>2026-02-10</access-date><volume>2</volume><issue>1</issue><fpage>2</fpage><comment><ext-link ext-link-type="uri" xlink:href="https://redie.uabc.mx/redie/article/download/15/27/75">https://redie.uabc.mx/redie/article/download/15/27/75</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>M&#x00F6;ltner</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schellberg</surname><given-names>D</given-names> </name><name name-style="western"><surname>J&#x00FC;nger</surname><given-names>J</given-names> </name></person-group><article-title>Grundlegende quantitative analysen medizinischer pr&#x00FC;fungen</article-title><year>2006</year><publisher-name>GMS Zeitschrift f&#x00FC;r Medizinische 
Ausbildung</publisher-name></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Moran</surname><given-names>V</given-names> </name></person-group><source>Item and Exam Analysis, in Item Writing for Nurse Educators</source><year>2023</year><publisher-name>Springer International Publishing</publisher-name><fpage>55</fpage><lpage>64</lpage></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rush</surname><given-names>BR</given-names> </name><name name-style="western"><surname>Rankin</surname><given-names>DC</given-names> </name><name name-style="western"><surname>White</surname><given-names>BJ</given-names> </name></person-group><article-title>The impact of item-writing flaws and item complexity on examination item difficulty and discrimination value</article-title><source>BMC Med Educ</source><year>2016</year><month>09</month><day>29</day><volume>16</volume><issue>1</issue><fpage>250</fpage><pub-id pub-id-type="doi">10.1186/s12909-016-0773-3</pub-id><pub-id pub-id-type="medline">27681933</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheung</surname><given-names>BHH</given-names> </name><name name-style="western"><surname>Lau</surname><given-names>GKK</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>GTC</given-names> </name><etal/></person-group><article-title>ChatGPT versus human in generating medical graduate exam multiple choice questions-A multinational prospective study (Hong Kong S.A.R., Singapore, Ireland, and the United Kingdom)</article-title><source>PLOS ONE</source><year>2023</year><volume>18</volume><issue>8</issue><fpage>e0290691</fpage><pub-id 
pub-id-type="doi">10.1371/journal.pone.0290691</pub-id><pub-id pub-id-type="medline">37643186</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name><name name-style="western"><surname>Portugez</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gross</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Advantages and pitfalls in utilizing artificial intelligence for crafting medical examinations: A medical education pilot study with GPT-4</article-title><source>BMC Med Educ</source><year>2023</year><month>10</month><day>17</day><volume>23</volume><issue>1</issue><fpage>772</fpage><pub-id pub-id-type="doi">10.1186/s12909-023-04752-w</pub-id><pub-id pub-id-type="medline">37848913</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayub</surname><given-names>I</given-names> </name><name name-style="western"><surname>Hamann</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hamann</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>MJ</given-names> </name></person-group><article-title>Exploring the potential and limitations of Chat Generative Pre-trained Transformer (ChatGPT) in generating board-style dermatology questions: A qualitative analysis</article-title><source>Cureus</source><year>2023</year><month>08</month><volume>15</volume><issue>8</issue><fpage>e43717</fpage><pub-id pub-id-type="doi">10.7759/cureus.43717</pub-id><pub-id pub-id-type="medline">37638266</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Kaya</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sonmez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Halici</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yildirim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Coskun</surname><given-names>A</given-names> </name></person-group><article-title>Comparison of AI-generated and clinician-designed multiple-choice questions in emergency medicine exam: a psychometric analysis</article-title><source>BMC Med Educ</source><year>2025</year><month>07</month><day>1</day><volume>25</volume><issue>1</issue><fpage>949</fpage><pub-id pub-id-type="doi">10.1186/s12909-025-07528-6</pub-id><pub-id pub-id-type="medline">40597998</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Elzayyat</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mohammad</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Zaqout</surname><given-names>S</given-names> </name></person-group><article-title>Assessing LLM-generated vs. 
expert-created clinical anatomy MCQs: a student perception-based comparative study in medical education</article-title><source>Med Educ Online</source><year>2025</year><month>12</month><volume>30</volume><issue>1</issue><fpage>2554678</fpage><pub-id pub-id-type="doi">10.1080/10872981.2025.2554678</pub-id><pub-id pub-id-type="medline">40884796</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Emekli</surname><given-names>E</given-names> </name><name name-style="western"><surname>Karahan</surname><given-names>BN</given-names> </name></person-group><article-title>AI in radiography education: Evaluating multiple-choice questions difficulty and discrimination</article-title><source>J Med Imaging Radiat Sci</source><year>2025</year><month>07</month><volume>56</volume><issue>4</issue><fpage>101896</fpage><pub-id pub-id-type="doi">10.1016/j.jmir.2025.101896</pub-id><pub-id pub-id-type="medline">40157013</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bitew</surname><given-names>SK</given-names> </name><etal/></person-group><article-title>Distractor generation for multiple-choice questions with predictive prompting and large language models</article-title><conf-name>European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases</conf-name><conf-date>Sep 18-22, 2023</conf-date><pub-id pub-id-type="doi">10.1007/978-3-031-74627-7_4</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baldwin</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mee</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Yaneva</surname><given-names>V</given-names> </name><etal/></person-group><article-title>A natural-language-processing-based procedure for generating distractors for multiple-choice questions</article-title><source>Eval Health Prof</source><year>2022</year><month>12</month><volume>45</volume><issue>4</issue><fpage>327</fpage><lpage>340</lpage><pub-id pub-id-type="doi">10.1177/01632787211046981</pub-id><pub-id pub-id-type="medline">34753326</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>R</given-names> </name><etal/></person-group><article-title>High-quality distractors generation for human exam based on reinforcement learning from preference feedback</article-title><source>Natural Language Processing and Chinese Computing</source><year>2025</year><publisher-name>Springer Nature Singapore</publisher-name><pub-id pub-id-type="doi">10.1007/978-981-97-9440-9_8</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><name name-style="western"><surname>VanLehn</surname><given-names>K</given-names> </name></person-group><article-title>Evaluation of auto-generated distractors in multiple choice questions from a semantic network</article-title><source>Interactive Learning Environments</source><year>2021</year><month>08</month><day>18</day><volume>29</volume><issue>6</issue><fpage>1019</fpage><lpage>1036</lpage><pub-id pub-id-type="doi">10.1080/10494820.2019.1619586</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abdulghani</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Ahmad</surname><given-names>F</given-names> </name><name name-style="western"><surname>Aldrees</surname><given-names>A</given-names> </name><name name-style="western"><surname>Khalil</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ponnamperuma</surname><given-names>G</given-names> </name></person-group><article-title>The relationship between non-functioning distractors and item difficulty of multiple choice questions: A descriptive analysis</article-title><source>J Health Spec</source><year>2014</year><volume>2</volume><issue>4</issue><fpage>148</fpage><pub-id pub-id-type="doi">10.4103/1658-600X.142784</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rezigalla</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Eleragi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Elhussein</surname><given-names>AB</given-names> </name><etal/></person-group><article-title>Item analysis: the impact of distractor efficiency on the difficulty index and discrimination power of multiple-choice items</article-title><source>BMC Med Educ</source><year>2024</year><month>04</month><day>24</day><volume>24</volume><issue>1</issue><fpage>445</fpage><pub-id pub-id-type="doi">10.1186/s12909-024-05433-y</pub-id><pub-id pub-id-type="medline">38658912</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Law</surname><given-names>AK</given-names> </name><name name-style="western"><surname>So</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lui</surname><given-names>CT</given-names> </name><etal/></person-group><article-title>AI versus human-generated multiple-choice questions for medical 
education: a cohort study in a high-stakes examination</article-title><source>BMC Med Educ</source><year>2025</year><month>02</month><day>8</day><volume>25</volume><issue>1</issue><fpage>208</fpage><pub-id pub-id-type="doi">10.1186/s12909-025-06796-6</pub-id><pub-id pub-id-type="medline">39923067</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Embretson</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Kingston</surname><given-names>NM</given-names> </name></person-group><article-title>Automatic item generation: A more efficient process for developing mathematics achievement items?</article-title><source>J Educational Measurement</source><year>2018</year><month>03</month><access-date>2026-02-10</access-date><volume>55</volume><issue>1</issue><fpage>112</fpage><lpage>131</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://onlinelibrary.wiley.com/toc/17453984/55/1">https://onlinelibrary.wiley.com/toc/17453984/55/1</ext-link></comment><pub-id pub-id-type="doi">10.1111/jedm.12166</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Olney</surname><given-names>AM</given-names> </name></person-group><source>Generating Multiple Choice Questions from a Textbook: LLMs Match Human Performance on Most Metrics</source><year>2023</year><publisher-name>Grantee Submission</publisher-name></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ng</surname><given-names>O</given-names> </name><etal/></person-group><source>Student Perspective Matters for GenAI in Question Setting in Medical Education</source><year>2025</year><publisher-name>Medical Science 
Educator</publisher-name><pub-id pub-id-type="doi">10.1007/s40670-025-02396-7</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Technical workflow and dataset construction for fine-tuned large language model&#x2013;mediated item generation.</p><media xlink:href="formative_v10i1e84904_app1.pdf" xlink:title="PDF File, 164 KB"/></supplementary-material></app-group></back></article>