<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v9i1e77580</article-id>
      <article-id pub-id-type="pmid">41337739</article-id>
      <article-id pub-id-type="doi">10.2196/77580</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
<article-title>Feasibility of a Specialized Large Language Model for Postgraduate Medical Examination Preparation: Single-Center Proof-of-Concept Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Burns</surname>
            <given-names>Michael</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Fukuzawa</surname>
            <given-names>Fumitoshi</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Leong</surname>
            <given-names>Yun Hao</given-names>
          </name>
          <degrees>MRCS, MMed</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Division of Anesthesiology and Perioperative Medicine</institution>
            <institution>Singapore General Hospital</institution>
            <addr-line>31 Third Hospital Ave</addr-line>
            <addr-line>Singapore, 168753</addr-line>
            <country>Singapore</country>
            <fax>65 63213411</fax>
            <phone>65 63213411</phone>
            <email>leong.yun.hao@singhealth.com.sg</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2571-4491</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Nambiar</surname>
            <given-names>Lathiga</given-names>
          </name>
          <degrees>MBBS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-1267-350X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Tay</surname>
            <given-names>Victoria Y J</given-names>
          </name>
          <degrees>MMed</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-4759-0966</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Lie</surname>
            <given-names>Sui An</given-names>
          </name>
          <degrees>MRCP, MMed</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8565-1934</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Yuhe</surname>
            <given-names>Ke</given-names>
          </name>
          <degrees>MMed</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7193-4749</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Division of Anesthesiology and Perioperative Medicine</institution>
        <institution>Singapore General Hospital</institution>
        <addr-line>Singapore</addr-line>
        <country>Singapore</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Anesthesiology</institution>
        <institution>Sengkang General Hospital</institution>
        <addr-line>Singapore</addr-line>
        <country>Singapore</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yun Hao Leong <email>leong.yun.hao@singhealth.com.sg</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>3</day>
        <month>12</month>
        <year>2025</year>
      </pub-date>
      <volume>9</volume>
      <elocation-id>e77580</elocation-id>
      <history>
        <date date-type="received">
          <day>15</day>
          <month>5</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>12</day>
          <month>10</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>7</day>
          <month>11</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>10</day>
          <month>11</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Yun Hao Leong, Lathiga Nambiar, Victoria Y J Tay, Sui An Lie, Ke Yuhe. Originally published in JMIR Formative Research (https://formative.jmir.org), 03.12.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2025/1/e77580" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models (LLMs) are increasingly used in medical education for feedback and grading; yet their role in postgraduate examination preparation remains uncertain due to inconsistent grading, hallucinations, and user acceptance.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study evaluates the Personalized Anesthesia Study Support (PASS), a specialized GPT-4 model developed to assist candidates preparing for Singapore’s postgraduate specialist anesthesiology examination. We assessed user acceptance, grading interrater reliability, and hallucination detection rates to determine the feasibility of integrating specialized LLMs into high-stakes examination preparation.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>PASS was built on OpenAI’s GPT-4 and adapted with domain-specific prompts and references. Twenty-one senior anesthesiology residents completed a mock short answer question examination, which was independently graded by 3 human examiners and 3 PASS iterations. Participants reviewed feedback from both PASS and standard GPT-4 and completed a technology acceptance model (TAM) survey. Grading reliability was evaluated using Cohen and Fleiss κ. Hallucination rates were assessed by participants and examiners.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Of the 21 participants, 17 (81%) completed the TAM survey, generating 136 responses. PASS scored significantly higher than standard GPT-4 in usefulness (mean 4.25, SD 0.50 vs mean 3.44, SD 0.82; <italic>P</italic>&#60;.001), efficiency (mean 4.12, SD 0.61 vs mean 3.41, SD 0.74; <italic>P</italic>&#60;.001), and likelihood of future use (mean 4.13, SD 0.75 vs mean 3.59, SD 0.90; <italic>P</italic>&#60;.001), with no significant difference in ease of use (mean 4.56, SD 0.63 vs mean 4.50, SD 0.61; <italic>P</italic>=.35). Internal grading reliability was moderate for PASS (κ=0.522) and fair for human examiners (κ=0.275). Across 316 PASS-generated responses, 67 hallucinations and 189 deviations were labeled. Hallucination labeling rates were comparable between candidates (10/67, 15%) and examiners (57/249, 22.9%; <italic>P</italic>=.21), while examiners labeled significantly more deviations (168/249, 67.5% vs 21/67, 31%; <italic>P</italic>&#60;.001).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>PASS demonstrated strong user acceptance and grading reliability, suggesting feasibility in high-stakes examination preparation. Experienced learners could identify major hallucinations at comparable rates to examiners, suggesting potential in self-directed learning but with continued need for caution. Further research should refine grading accuracy and explore multicenter evaluation of specialized LLMs for postgraduate medical education.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language model</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>technology acceptance model</kwd>
        <kwd>medical education</kwd>
        <kwd>postgraduate examination</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The integration of artificial intelligence (AI) into education has accelerated in recent years, particularly with the rapid evolution of large language models (LLMs) such as OpenAI’s GPT-4 and Anthropic’s Claude. These models employ transformer-based deep learning architectures to process and generate human-like text, enabling dynamic adaptation across educational domains. In medical education, LLMs have been applied for automated question generation, simulated case discussions, feedback provision, and adaptive tutoring [<xref ref-type="bibr" rid="ref1">1</xref>]. Recent reviews highlight that LLMs can support formative learning through contextual explanations and feedback personalization while also noting persistent issues in transparency, bias, and factual accuracy [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref4">4</xref>].</p>
      <p>Despite these promising uses, major challenges remain in applying LLMs to high-stakes assessment. Studies on LLM-based grading report inconsistent results, including conservative grading patterns, variable interrater reliability (IRR) with human assessors [<xref ref-type="bibr" rid="ref5">5</xref>], and context-dependent scoring variability [<xref ref-type="bibr" rid="ref6">6</xref>]. Even advanced LLMs such as GPT-4 have shown inconsistent scoring in complex clinical assessments, underscoring the importance of standardized rubrics, transparent prompt reporting, and model calibration [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Most prior work has evaluated general-purpose LLMs with minimal adaptation, which limits their accuracy in assessing domain-specific medical content. While prompt engineering has been explored to optimize response consistency [<xref ref-type="bibr" rid="ref9">9</xref>], few studies have explored the feasibility of specialized LLMs tailored to specific educational needs.</p>
      <p>Another major concern surrounding LLMs in education is the phenomenon of hallucinations, where models generate responses that appear plausible but are factually incorrect or misleading [<xref ref-type="bibr" rid="ref10">10</xref>]. Previous studies have reported hallucination rates as high as 27.1% in LLM-generated feedback [<xref ref-type="bibr" rid="ref11">11</xref>], raising concerns about the potential for misinformation. Recent work in medical education has categorized hallucinations by cognitive level and clinical severity, noting that even subtle deviations can distort learners’ conceptual understanding [<xref ref-type="bibr" rid="ref12">12</xref>]. Given the high-stakes nature of medical training, it is essential to understand not only how frequently these hallucinations occur but also the extent to which educators and students can detect and mitigate their impact.</p>
      <p>In parallel, user acceptance strongly influences whether LLM-based systems are adopted into medical curricula. Although several studies report favorable perceptions of LLM-assisted educational tools among undergraduate students [<xref ref-type="bibr" rid="ref13">13</xref>], there remains a gap in understanding postgraduate learners’ perceptions, particularly in specialist examination preparation. Regardless of technical performance, effective integration of LLMs into educational practice depends on user acceptance, which is shaped by perceived usefulness, ease of use, and trustworthiness [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
      <p>To address these issues, our study aims to develop and pilot the Personalized Anesthesia Study Support (PASS), a specialized GPT designed to provide grading and feedback for candidates preparing for Singapore’s postgraduate specialist examination in anesthesiology. We present this as a proof-of-concept study to explore the feasibility and potential role of specialized LLMs as supplemental tools in postgraduate examination preparation. Our primary objective was to assess end user acceptance of PASS by using a technology acceptance model (TAM) survey. Secondary objectives included evaluating the grading reliability compared to human examiners and analyzing the hallucination detection trends.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study was conducted at Singapore General Hospital under the SingHealth Anesthesiology Residency Program. It was reviewed by the Singapore General Hospital Research Office and was granted exemption from formal institutional review board review, as it involved routine educational activities and the use of anonymized resident data. Participation was voluntary, and no identifiable personal data were collected. Completion of the TAM survey was considered implied consent. All data were deidentified prior to analysis, and no compensation was provided to the participants.</p>
      </sec>
      <sec>
        <title>Recruitment and Study Participants</title>
<p>Participants were recruited in a systematic cohort-based approach within the SingHealth Anesthesiology Residency Program at Singapore General Hospital. All anesthesiology residents actively preparing for an upcoming Master of Medicine (MMed) anesthesiology examination were scheduled to complete a routine, in-person mock short answer question (SAQ) assessment and were invited to participate in the study. Eligibility required active enrollment in the residency program with at least 32 months of anesthesia-related training experience, consistent with the criteria for sitting the MMed examination. No additional exclusion criteria were applied, as the cohort was already homogeneous in the training level and examination eligibility. At the start of the session, a member of the program faculty provided a standardized study briefing, explaining its objectives, procedures, and voluntary nature of participation. Residents were informed that participation would not influence their training evaluation or examination outcomes. Implied consent was obtained following this briefing. A total of 21 residents, representing the full cohort of candidates preparing for that year’s MMed examination, consented and participated in the study. The cohort comprised residents in years 2 to 4 of training, with a gender distribution of 14 females and 7 males.</p>
      </sec>
      <sec>
        <title>Development Process of PASS</title>
        <p>PASS was developed using OpenAI’s GPT-4 model and customized via the ChatGPT Custom GPT interface to provide grading and detailed feedback for SAQ-style responses. Two study investigators (LYH and KY) conducted 3 iterative rounds of pilot testing by using simulated essay responses modeled after previous MMed anesthesiology examination questions. Each round involved a systematic review of PASS outputs by both investigators and 2 faculty anesthesiologists to evaluate the clinical accuracy, feedback clarity, and grading consistency. Refinements across iterations focused on clarifying role-based prompts, improving alignment with examiner expectations, and reducing hallucinated or ambiguous statements in the generated feedback. After each testing round, the prompt was revised to enhance the specificity of the instructions and the standardization of the grading language. The final prompt version, presented in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, reflected cumulative adjustments derived from these pilot evaluations and represented the configuration used in the study.</p>
        <p>The reference materials in PASS included the 2024 MMed anesthesiology syllabus (PDF, 32 pages), examiner feedback reports (n=5, 1-2 pages each), past SAQ papers (2018-2024), and standard textbooks such as Miller’s Anesthesia (9th edition) and Yao &#38; Artusio’s Anesthesiology: Problem-Oriented Patient Management (9th edition).</p>
        <p>Prompt engineering followed the principles outlined by Meskó [<xref ref-type="bibr" rid="ref15">15</xref>], focusing on specificity and contextualization. Role-playing prompts were used to instruct the LLM to adopt the perspective of an anesthesiologist examiner, ensuring clinically accurate and assessment-calibrated evaluations.</p>
      </sec>
      <sec>
        <title>Study Design and Grading Evaluation</title>
        <p>Participants completed a closed-book mock examination of 4 SAQs. These were adapted from prior internal mock examinations aligned with recent MMed anesthesiology examination themes and reviewed by another anesthesiology faculty to ensure content validity. Examiner 1 developed a standardized mark scheme and graded all 84 scripts as per standard assessment practices. Each SAQ constituted 1 script, with 21 participants producing 84 total responses.</p>
        <p>Additionally, 2 independent human examiners (examiners 2 and 3) also graded all the scripts based on the standardized mark scheme. Examiners 2 and 3 independently evaluated the PASS grading iterations alongside examiner 1. Two study investigators generated 3 separate PASS grading iterations per script in parallel chat windows. PASS was not provided with the official grading rubric and relied solely on its internal knowledge base. SAQ responses were graded on a 0-8 scale and stratified into poor (0-4.5), average (5-5.5), and good (6-8) categories. The stratified gradings were then analyzed for grading IRR, with examiner 1’s scores serving as the reference standard when required.</p>
      </sec>
      <sec>
        <title>TAM Evaluation</title>
<p>Participants subsequently evaluated both PASS and standard GPT-4 feedback by using a customized TAM survey [<xref ref-type="bibr" rid="ref16">16</xref>]. TAM is grounded in the theory of reasoned action, which posits that an individual’s behavioral intention to use a technology is shaped by their attitudes toward the technology and by perceived usefulness. This framework has been widely applied to understand user acceptance of new technologies, including those used in medical education [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Four dimensions were rated on a 5-point Likert scale (1=least agreement, 5=strongest agreement): (1) usefulness for examination preparation, (2) efficiency in study preparation, (3) ease of use, and (4) likelihood of future use. Participation in the survey was voluntary and anonymous. Incomplete surveys were excluded from the analysis.</p>
      </sec>
      <sec>
        <title>Hallucination and Deviation Analysis</title>
<p>PASS responses and feedback to participant essays were reviewed independently by study investigators and participants to detect hallucinations and deviations. Hallucinations were defined as “any incorrect information or statements that may cause moderate to major patient harm,” while deviations were defined as “variations from usual clinical practice unlikely to result in patient harm.” The frequency of hallucination and deviation detection was compared between groups.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>All statistical evaluations were performed in the Excel and Python 3.8 environment. The TAM scores for PASS versus GPT-4 were compared using paired 2-sided <italic>t</italic> tests. Fleiss κ was used to calculate IRR within PASS and human examiner groups, and agreement between individual examiners was evaluated using Cohen κ analysis. Chi-square tests were used to compare hallucination and deviation detection between examiners and candidates. As this was a proof-of-concept feasibility study, no a priori power calculation was performed.</p>
        <p>This paper adheres to the STROBE (Strengthening the Reporting of Observational Studies in Epidemiology) reporting guideline [<xref ref-type="bibr" rid="ref19">19</xref>], and a completed STROBE checklist has been included in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>TAM Results</title>
        <p>Of the 21 participants, 17 (81%) completed the TAM survey, giving a total of 136 survey responses. Four participants did not complete the TAM survey, and no data imputation was performed. User perceptions of PASS were positive. Mean ratings for PASS were as follows: usefulness (4.25, SD 0.50), efficiency (4.12, SD 0.61), ease of use (4.56, SD 0.63), and likelihood of future use (4.13, SD 0.75). Paired 2-sided <italic>t</italic> tests comparing PASS to standard GPT-4 revealed statistically significant differences in 3 of the 4 domains. PASS was rated significantly higher than GPT-4 in usefulness (mean difference=0.81, 95% CI 0.66-0.96; <italic>P</italic>&#60;.001), efficiency (mean difference=0.71, 95% CI 0.53-0.88; <italic>P</italic>&#60;.001), and likelihood of future use (mean difference=0.54, 95% CI 0.39-0.70; <italic>P</italic>&#60;.001). There was no significant difference between PASS and GPT-4 in ease of use (mean difference=0.06, 95% CI –0.07 to 0.18; <italic>P</italic>=.35). A summary of the TAM results can be found in <xref ref-type="table" rid="table1">Table 1</xref> and <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Technology acceptance model survey results comparing PASSa and standard GPT-4 in a single-center observational feasibility study involving postgraduate anesthesiology residents (N=21).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="290"/>
            <col width="170"/>
            <col width="170"/>
            <col width="250"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Question</td>
                <td>PASS, mean (SD)</td>
                <td>GPT, mean (SD)</td>
                <td>Mean difference (95% CI)</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Useful for my examination preparation</td>
                <td>4.25 (0.5)</td>
                <td>3.441 (0.817)</td>
                <td>0.809 (0.66 to 0.96)</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>Helps me to study efficiently</td>
                <td>4.118 (0.612)</td>
                <td>3.412 (0.738)</td>
                <td>0.706 (0.53 to 0.88)</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>Easy to use</td>
                <td>4.559 (0.632)</td>
                <td>4.5 (0.611)</td>
                <td>0.059 (–0.07 to 0.18)</td>
                <td>.35</td>
              </tr>
              <tr valign="top">
                <td>I will use it in future examinations</td>
                <td>4.132 (0.751)</td>
                <td>3.588 (0.902)</td>
                <td>0.544 (0.39 to 0.70)</td>
                <td>&#60;.001</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>PASS: Personalized Anesthesia Study Support.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Flow diagram summarizing the design and data pathways of this study. A total of 21 MMed anesthesiology residents participated in a mock SAQ examination comprising 4 questions (84 total scripts). Each script was independently graded by 3 human examiners and 3 iterations of the specialized large language model (PASS 1-3). MMed: Master of Medicine; PASS: Personalized Anesthesia Study Support; SAQ: short answer question; TAM: technology acceptance model.</p>
          </caption>
          <graphic xlink:href="formative_v9i1e77580_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Grading Evaluation</title>
<p>A total of 504 scripts were marked by 6 examiners (3 humans, 3 PASS). PASS examiners demonstrated moderate internal IRR (κ=0.522), whereas human examiners exhibited fair internal IRR (κ=0.275), based on the benchmarks of Landis and Koch [<xref ref-type="bibr" rid="ref20">20</xref>]. Individually, the agreement of PASS and human examiners with examiner 1 was as follows: PASS 1 (κ=0.237), PASS 2 (κ=0.193), PASS 3 (κ=0.140), examiner 2 (κ=0.149), and examiner 3 (κ=0.470). Among these, examiner 3 demonstrated the highest agreement with examiner 1, while PASS 3 exhibited the lowest. The complete interexaminer agreement matrix can be found in <xref rid="figure2" ref-type="fig">Figure 2</xref>. When analyzed as a group (<xref ref-type="table" rid="table2">Table 2</xref>), PASS examiners approached a moderate level of agreement with the combined group of human examiners (examiners 1, 2, and 3) (κ=0.357, 95% CI 0.156-0.558). Pairwise agreement between PASS and individual human examiners ranged from κ=0.159 (PASS vs examiner 1, slight) to κ=0.429 (PASS vs examiner 3, moderate).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
<p>Interexaminer agreement matrix heatmap showing pairwise Cohen κ values among human examiners (examiners 1-3) and PASS iterations (PASS 1-3). Each cell represents the level of agreement between 2 graders, with values ranging from 0 (no agreement) to 1 (perfect agreement). Darker red shading indicates higher agreement, lighter blue tones represent moderate agreement, and deep blue tones indicate lower agreement. PASS iterations demonstrated higher internal consistency (κ=0.52-0.57) compared with human examiners (κ=0.15-0.47), suggesting more standardized grading performance across large language model assessments. PASS: Personalized Anesthesia Study Support.</p>
          </caption>
          <graphic xlink:href="formative_v9i1e77580_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Interexaminer agreement between PASSa and human examiners in a single-center observational feasibility study among anesthesiology residents (N=21).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Comparison</td>
                <td>κ score (95% CI)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>PASS versus combined examiners (1,2,3)</td>
                <td>0.357 (0.156 to 0.558)</td>
              </tr>
              <tr valign="top">
                <td>PASS versus examiner 1</td>
                <td>0.159 (–0.053 to 0.372)</td>
              </tr>
              <tr valign="top">
                <td>PASS versus examiner 2</td>
                <td>0.193 (–0.017 to 0.404)</td>
              </tr>
              <tr valign="top">
                <td>PASS versus examiner 3</td>
                <td>0.429 (0.228 to 0.630)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>PASS: Personalized Anesthesia Study Support.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Hallucination and Deviation Detection Rates</title>
        <p>A total of 316 PASS-generated responses were evaluated for hallucinations and deviations. Examiners conducted 78.8% (249/316) of the evaluations, while candidates accounted for 21.2% (67/316). Across all responses, 67 hallucinations and 189 deviations were labeled. Each hallucination or deviation was labeled by independent reviewers. No statistically significant difference was observed in the hallucination detection rates between candidates and examiners (<italic>P</italic>=.21). Candidates identified 10 hallucinations across 67 responses (15%), while examiners detected 57 hallucinations across 249 responses (22.9%). In contrast, examiners identified significantly more deviations than candidates. Candidates detected 21 deviations (31.3%), whereas examiners identified 168 deviations (67.5%)—a difference that was highly significant (<italic>P</italic>&#60;.001) (<xref ref-type="table" rid="table3">Table 3</xref>).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Comparison of hallucination and deviation labeling by candidates and examiners. Values indicate detection rate (%). Differences were tested using chi-square analysis.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="250"/>
            <col width="250"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Candidates (n=67), n (%)</td>
                <td>Examiners (n=249), n (%)</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Hallucinations identified</td>
                <td>10 (14.9)</td>
                <td>57 (22.9)</td>
                <td>.21</td>
              </tr>
              <tr valign="top">
                <td>Deviations identified</td>
                <td>21 (31.3)</td>
                <td>168 (67.5)</td>
                <td>&#60;.001</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Primary Outcome: Technology Acceptance and User Perceptions</title>
        <p>This proof-of-concept study demonstrates strong end user acceptance of a specialized LLM, supporting the potential role of tailored AI tools in high-stakes medical examination preparation. Younger learners increasingly prefer technology-driven educational tools that provide prompt feedback, personalized learning experiences, and interactive engagement [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Specialized LLM platforms such as our case example PASS offer an innovative approach to meet these expectations.</p>
        <p>Our findings are consistent with prior research showing that LLM-based educational tools are generally well-received across educational contexts [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. However, user acceptance specifically for high-stakes examination preparation, where performance directly impacts career progression, remains underexplored [<xref ref-type="bibr" rid="ref25">25</xref>]. Given that user acceptance is a key factor in the successful adoption of new technologies [<xref ref-type="bibr" rid="ref26">26</xref>], our results add evidence that specialized LLMs can be successfully incorporated as study aids for postgraduate medical examinations.</p>
        <p>Notably, PASS, a specialized version of GPT-4 customized with domain-specific content, received significantly higher ratings in usefulness, efficiency, and likelihood of future use compared to standard GPT-4. This supports the argument that domain-specialized enhancements improve both learner experience and perceived educational value [<xref ref-type="bibr" rid="ref27">27</xref>].</p>
      </sec>
      <sec>
        <title>Grading Reliability</title>
        <p>In our study, PASS demonstrated a higher internal grading IRR (κ=0.522) compared to human examiners (κ=0.275), suggesting that LLM-based grading may provide more standardized evaluations. When human examiners were analyzed as a group, PASS showed statistically significant agreement with them (κ=0.357), indicating that PASS grading trends align with human assessment.</p>
        <p>Timely and accurate grading feedback is crucial for effective examination preparation, as it helps candidates refine their responses and align their answering techniques with examiner expectations [<xref ref-type="bibr" rid="ref28">28</xref>]. However, faculty availability is often limited, restricting or delaying access to grading opportunities. Additionally, human grading is inherently variable, influenced by factors such as individual biases and grading fatigue [<xref ref-type="bibr" rid="ref29">29</xref>]. Our results suggest that specialized LLMs could offer scalable, consistent supplemental grading and feedback in postgraduate education, potentially alleviating faculty workload and enabling self-directed formative assessment.</p>
        <p>However, PASS’s grading agreement with humans remained only moderate, indicating that its reliability may not yet be sufficient for formal grading. This finding aligns with previous research highlighting the limitations of LLMs as formal grading tools [<xref ref-type="bibr" rid="ref30">30</xref>]. Despite these constraints, specialized LLMs remain valuable as self-assessment tools, allowing students to estimate their performance during examination preparation.</p>
        <p>PASS was intentionally not provided with the official marking rubric to reflect the realistic conditions of learner-driven use. This design choice likely contributed to moderate agreement levels, as rubric-guided calibration could enhance consistency between AI and human grading. Future studies may investigate whether standardized grading frameworks improve IRR and alignment with human examiners.</p>
      </sec>
      <sec>
        <title>Hallucinations and Deviations in AI-Generated Feedback</title>
        <p>Hallucinations in LLM-generated feedback are a well-documented concern [<xref ref-type="bibr" rid="ref1">1</xref>]. In medical education, this issue is particularly critical, as even minor misconceptions can lead to significant errors in clinical management, potentially resulting in adverse patient outcomes [<xref ref-type="bibr" rid="ref31">31</xref>]. While previous studies have discussed potential ways to mitigate the effect of hallucinations [<xref ref-type="bibr" rid="ref32">32</xref>], there is a notable gap in research comparing the detection rate of hallucinations between tutors and learners.</p>
        <p>Encouragingly, our study shows that candidates were able to identify hallucinations at a rate comparable to examiners. The ability of candidates to recognize and filter out major erroneous feedback on their own suggests that specialized LLMs show potential for use under supervision in examination preparation. However, it is important to note that all candidates in our study had at least 32 months of anesthesia experience, which may have contributed to their ability to detect errors effectively. Less experienced learners may not demonstrate the same level of discernment and could require closer supervision when using AI-generated feedback.</p>
        <p>Additionally, our results show that candidates were significantly less likely than examiners to detect deviations from standard clinical practice. Failure to recognize clinically relevant, even if minor, deviations from standard practice may inadvertently reinforce incorrect reasoning patterns. If undetected, such deviations could perpetuate unsafe habits or incomplete understanding, which, in a clinical context, might affect patient management. Incorporating structured faculty review or critical appraisal exercises into curricula could help learners identify and mitigate such errors when using AI feedback.</p>
        <p>Our findings suggest important considerations for future use of LLMs in education. Educators must assess whether learners possess the critical skills to evaluate AI-generated content before incorporating LLM feedback into curricula. Future research should explore hallucination detection across learner levels and strategies to build critical evaluation skills alongside AI-assisted learning.</p>
      </sec>
      <sec>
        <title>Limitations and Future Work</title>
        <p>This study has several limitations. First, the small, single-center cohort of 21 participants may limit the generalizability of our findings. All participants were experienced anesthesiology residents from the same training program and were approaching their final examinations. Consequently, the results may not extend to less experienced trainees, learners from other specialties, or different educational settings. Junior trainees may have greater difficulty discerning hallucinations or subtle deviations in AI-generated feedback, underscoring the continued importance of supervised use and faculty guidance. Future multicenter studies involving a broader range of specialties, experience levels, and training environments will be essential to evaluate the scalability and educational impact of specialized LLMs. Second, no formal power calculation was conducted, given the exploratory, proof-of-concept nature of the study. In addition, intraexaminer reliability was not assessed, which may have influenced the variability in human grading. Finally, this study did not examine the direct effect of using PASS on actual examination performance or learning outcomes. Assessing these educational and behavioral impacts should be a key focus of subsequent research to determine the long-term value and safety of integrating specialized LLMs into postgraduate medical education.</p>
        <p>Building on these preliminary findings, future research should focus on multicenter validation of specialized LLMs across different institutions, specialties, and learner experience levels to better define their generalizability. Further work is also needed to evaluate the longitudinal educational impact of LLM-assisted feedback on examination performance, critical thinking, and clinical reasoning. Incorporating standardized grading rubrics or structured prompt frameworks may improve grading reliability and enhance alignment with human examiners. In parallel, research should explore strategies to train learners in AI literacy and critical appraisal, ensuring they can appropriately interpret and verify model outputs. Ultimately, integrating specialized LLMs into postgraduate education should aim to augment, not replace, human judgment, fostering safe, reflective, and supervised use of AI in medical learning environments.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study provides proof-of-concept evidence supporting the feasibility and user acceptance of specialized LLMs as supplemental study aids for postgraduate medical examination preparation. The specialized LLM (PASS) demonstrated strong user acceptance, outperforming standard GPT-4 in perceived usefulness, study efficiency, and likelihood of future use. PASS also showed moderate grading consistency and agreement with human examiners, indicating potential value in supporting formative, self-directed learning.</p>
        <p>While experienced learners were able to recognize major hallucinations at rates comparable to examiners, their lower detection of subtle deviations underscores the continued necessity of human oversight when using AI-generated educational feedback. These findings highlight the potential of specialized LLMs to enhance postgraduate learning under supervision while reinforcing the importance of ongoing faculty involvement, quality control, and iterative model refinement before broader integration into training programs.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Personalized Anesthesia Study Support prompt.</p>
        <media xlink:href="formative_v9i1e77580_app1.docx" xlink:title="DOCX File , 19 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>STROBE checklist.</p>
        <media xlink:href="formative_v9i1e77580_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 239 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">IRR</term>
          <def>
            <p>interrater reliability</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">MMed</term>
          <def>
            <p>Master of Medicine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">PASS</term>
          <def>
            <p>Personalized Anesthesia Study Support</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">SAQ</term>
          <def>
            <p>short answer question</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">STROBE</term>
          <def>
            <p>Strengthening the Reporting of Observational Studies in Epidemiology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">TAM</term>
          <def>
            <p>technology acceptance model</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We would like to extend our gratitude to the SingHealth Anesthesiology Residency Program for their support during the project. During the preparation of this manuscript, the authors employed large language models (ChatGPT) solely to assist with limited aspects of text refinement. These tools were used exclusively for (1) paraphrasing technical methodological descriptions to improve readability while preserving the scientific meaning and (2) optimizing grammatical structure in complex sentences. All large language model–generated content underwent rigorous verification against the original study data and was subject to critical revision by all coauthors to ensure factual accuracy and adherence to the study findings. The final manuscript represents the authors' scholarly work and interpretation.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The datasets generated and analyzed during this study are not publicly available due to its sensitive nature but are available from the corresponding author on reasonable request. Deidentified data supporting the findings of this study may be shared for academic or research purposes upon submission of an appropriate data use agreement.</p>
      </sec>
    </notes>
    <notes>
      <sec>
        <title>Funding</title>
        <p>The authors have not declared a specific grant for this research from any funding agency in the public, commercial, or not-for-profit sectors.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>Conceptualization: YHL, LN, YK</p>
        <p>Product development: YHL, YK</p>
        <p>Formal analysis: YHL</p>
        <p>Investigation: YHL, LN, VYJT, SAL, YK</p>
        <p>Methodology: YHL, LN, VYJT, SAL</p>
        <p>Supervision: YHL, YK</p>
        <p>Writing – original draft: YHL, LN, YK</p>
        <p>Writing – review &#38; editing: YHL, LN, VYJT, SAL, YK</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abd-Alrazaq</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>AlSaad</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Alhuwail</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Healy</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Latifi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Aziz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Damseh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Alabed Alrazak</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sheikh</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Large language models in medical education: opportunities, challenges, and future directions</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>06</month>
          <day>01</day>
          <volume>9</volume>
          <fpage>e48291</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e48291/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48291</pub-id>
          <pub-id pub-id-type="medline">37261894</pub-id>
          <pub-id pub-id-type="pii">v9i1e48291</pub-id>
          <pub-id pub-id-type="pmcid">PMC10273039</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mohammad</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Supti</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Alzubaidi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Alam</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Househ</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The pros and cons of using ChatGPT in medical education: a scoping review</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2023</year>
          <month>06</month>
          <day>29</day>
          <volume>305</volume>
          <fpage>644</fpage>
          <lpage>647</lpage>
          <pub-id pub-id-type="doi">10.3233/SHTI230580</pub-id>
          <pub-id pub-id-type="medline">37387114</pub-id>
          <pub-id pub-id-type="pii">SHTI230580</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lucas</surname>
              <given-names>HC</given-names>
            </name>
            <name name-style="western">
              <surname>Upperman</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>JR</given-names>
            </name>
          </person-group>
          <article-title>A systematic review of large language models and their implications in medical education</article-title>
          <source>Med Educ</source>
          <year>2024</year>
          <month>11</month>
          <volume>58</volume>
          <issue>11</issue>
          <fpage>1276</fpage>
          <lpage>1285</lpage>
          <pub-id pub-id-type="doi">10.1111/medu.15402</pub-id>
          <pub-id pub-id-type="medline">38639098</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Assessment of large language models’ performances and hallucinations for Chinese postgraduate medical entrance examination</article-title>
          <source>Discov Educ</source>
          <year>2025</year>
          <month>03</month>
          <day>13</day>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>59</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1007/s44217-025-00446-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s44217-025-00446-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Flodén</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Grading exams using large language models: a comparison between human and AI grading of exams in higher education using ChatGPT</article-title>
          <source>British Educational Res J</source>
          <year>2024</year>
          <month>09</month>
          <day>16</day>
          <volume>51</volume>
          <issue>1</issue>
          <fpage>201</fpage>
          <lpage>224</lpage>
          <pub-id pub-id-type="doi">10.1002/berj.4069</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quah</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sng</surname>
              <given-names>TJH</given-names>
            </name>
            <name name-style="western">
              <surname>Yong</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Islam</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Reliability of ChatGPT in automated essay scoring for dental undergraduate examinations</article-title>
          <source>BMC Med Educ</source>
          <year>2024</year>
          <month>09</month>
          <day>03</day>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>962</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-024-05881-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-024-05881-6</pub-id>
          <pub-id pub-id-type="medline">39227811</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-024-05881-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC11373238</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Law</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lui</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>YF</given-names>
            </name>
            <name name-style="western">
              <surname>Cheung</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Kei-Ching Hung</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Graham</surname>
              <given-names>CA</given-names>
            </name>
          </person-group>
          <article-title>AI versus human-generated multiple-choice questions for medical education: a cohort study in a high-stakes examination</article-title>
          <source>BMC Med Educ</source>
          <year>2025</year>
          <month>02</month>
          <day>08</day>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>208</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-025-06796-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-025-06796-6</pub-id>
          <pub-id pub-id-type="medline">39923067</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-025-06796-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC11806894</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Emirtekin</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Large language model-powered automated assessment: a systematic review</article-title>
          <source>Applied Sciences</source>
          <year>2025</year>
          <month>05</month>
          <day>20</day>
          <volume>15</volume>
          <issue>10</issue>
          <fpage>5683</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3390/app15105683"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/app15105683</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fernández</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>López-Torres</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fernández</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Vázquez-García</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT as an instructor’s assistant for generating and scoring exams</article-title>
          <source>J Chem Educ</source>
          <year>2024</year>
          <month>08</month>
          <day>14</day>
          <volume>101</volume>
          <issue>9</issue>
          <fpage>3780</fpage>
          <lpage>3788</lpage>
          <pub-id pub-id-type="doi">10.1021/acs.jchemed.4c00231</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elsayed</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>The impact of hallucinated information in large language models on student learning outcomes: a critical examination of misinformation risks in AI-assisted education</article-title>
          <source>Northern Reviews on Algorithmic Research, Theoretical Computation, and Complexity</source>
          <access-date>2025-02-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://northernreviews.com/index.php/NRATCC/article/view/2024-08-07/7">https://northernreviews.com/index.php/NRATCC/article/view/2024-08-07/7</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xi</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Rashid</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gehringer</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>On assessing the faithfulness of LLM-generated feedback on student assignments</article-title>
          <year>2024</year>
          <conf-name>Proceedings of the 17th International Conference on Educational Data Mining</conf-name>
          <conf-date>July 14-17, 2024</conf-date>
          <conf-loc>Atlanta, GA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://zenodo.org/records/12729868"/>
          </comment>
          <pub-id pub-id-type="doi">10.5281/zenodo.12729867</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yue</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Integrating AI into clinical education: evaluating general practice trainees' proficiency in distinguishing AI-generated hallucinations and impacting factors</article-title>
          <source>BMC Med Educ</source>
          <year>2025</year>
          <month>03</month>
          <day>19</day>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>406</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-025-06916-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-025-06916-2</pub-id>
          <pub-id pub-id-type="medline">40108629</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-025-06916-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC11924592</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mondal</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Karri</surname>
              <given-names>JKK</given-names>
            </name>
            <name name-style="western">
              <surname>Ramasubramanian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mondal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Juhi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>A qualitative survey on perception of medical students on the use of large language models for educational purposes</article-title>
          <source>Adv Physiol Educ</source>
          <year>2025</year>
          <month>03</month>
          <day>01</day>
          <volume>49</volume>
          <issue>1</issue>
          <fpage>27</fpage>
          <lpage>36</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.physiology.org/doi/10.1152/advan.00088.2024?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub++0pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1152/advan.00088.2024</pub-id>
          <pub-id pub-id-type="medline">39447120</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Assessing clinical medicine students' acceptance of large language model: based on technology acceptance model</article-title>
          <source>BMC Med Educ</source>
          <year>2024</year>
          <month>11</month>
          <day>03</day>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>1251</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-024-06232-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-024-06232-1</pub-id>
          <pub-id pub-id-type="medline">39490999</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-024-06232-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC11533422</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meskó</surname>
              <given-names>Bertalan</given-names>
            </name>
          </person-group>
          <article-title>Prompt engineering as an important emerging skill for medical professionals: tutorial</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>10</month>
          <day>04</day>
          <volume>25</volume>
          <fpage>e50638</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e50638/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/50638</pub-id>
          <pub-id pub-id-type="medline">37792434</pub-id>
          <pub-id pub-id-type="pii">v25i1e50638</pub-id>
          <pub-id pub-id-type="pmcid">PMC10585440</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>FD</given-names>
            </name>
            <name name-style="western">
              <surname>Granić</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <source>The Technology Acceptance Model: 30 Years of TAM</source>
          <year>2024</year>
          <month>03</month>
          <day>04</day>
          <publisher-loc>Cham</publisher-loc>
          <publisher-name>Springer International Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Granić</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Marangunić</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Technology acceptance model in educational context: a systematic literature review</article-title>
          <source>Brit J Educational Tech</source>
          <year>2019</year>
          <month>07</month>
          <day>09</day>
          <volume>50</volume>
          <issue>5</issue>
          <fpage>2572</fpage>
          <lpage>2593</lpage>
          <pub-id pub-id-type="doi">10.1111/bjet.12864</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>DY</given-names>
            </name>
            <name name-style="western">
              <surname>Lehto</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>User acceptance of YouTube for procedural learning: an extension of the technology acceptance model</article-title>
          <source>Computers &#38; Education</source>
          <year>2013</year>
          <month>02</month>
          <volume>61</volume>
          <fpage>193</fpage>
          <lpage>208</lpage>
          <pub-id pub-id-type="doi">10.1016/j.compedu.2012.10.001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>von Elm</surname>
              <given-names>Erik</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Egger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pocock</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gøtzsche</surname>
              <given-names>Peter C</given-names>
            </name>
            <name name-style="western">
              <surname>Vandenbroucke</surname>
              <given-names>J</given-names>
            </name>
            <collab>STROBE Initiative</collab>
          </person-group>
          <article-title>The Strengthening the Reporting of Observational Studies in Epidemiology (STROBE) statement: guidelines for reporting observational studies</article-title>
          <source>Ann Intern Med</source>
          <year>2007</year>
          <month>10</month>
          <day>16</day>
          <volume>147</volume>
          <issue>8</issue>
          <fpage>573</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.acpjournals.org/doi/10.7326/0003-4819-147-8-200710160-00010?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub++0pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.7326/0003-4819-147-8-200710160-00010</pub-id>
          <pub-id pub-id-type="medline">17938396</pub-id>
          <pub-id pub-id-type="pii">147/8/573</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Landis</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Koch</surname>
              <given-names>GG</given-names>
            </name>
          </person-group>
          <article-title>The measurement of observer agreement for categorical data</article-title>
          <source>Biometrics</source>
          <year>1977</year>
          <month>03</month>
          <volume>33</volume>
          <issue>1</issue>
          <fpage>159</fpage>
          <lpage>174</lpage>
          <pub-id pub-id-type="medline">843571</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cruz</surname>
              <given-names>ROD</given-names>
            </name>
          </person-group>
          <article-title>Pedagogical practice preferences among generational groups of learners: towards effective twenty-first century higher education</article-title>
          <source>JUTLP</source>
          <year>2020</year>
          <month>05</month>
          <day>02</day>
          <volume>17</volume>
          <issue>5</issue>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://open-publishing.org/journals/index.php/jutlp/article/view/424"/>
          </comment>
          <pub-id pub-id-type="doi">10.53761/1.17.5.6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shorey</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Rajendran</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ang</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Learning styles, preferences and needs of generation Z healthcare students: scoping review</article-title>
          <source>Nurse Educ Pract</source>
          <year>2021</year>
          <month>11</month>
          <volume>57</volume>
          <fpage>103247</fpage>
          <pub-id pub-id-type="doi">10.1016/j.nepr.2021.103247</pub-id>
          <pub-id pub-id-type="medline">34768214</pub-id>
          <pub-id pub-id-type="pii">S1471-5953(21)00283-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Neumann</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sowe</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Decker</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jarke</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>An LLM-driven chatbot in higher education for databases and information systems</article-title>
          <source>IEEE Trans Educ</source>
          <year>2025</year>
          <month>2</month>
          <volume>68</volume>
          <issue>1</issue>
          <fpage>103</fpage>
          <lpage>116</lpage>
          <pub-id pub-id-type="doi">10.1109/te.2024.3467912</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bernabei</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Colabianchi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Falegnami</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Costantino</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Students’ use of large language models in engineering education: a case study on technology acceptance, perceptions, efficacy, and detection chances</article-title>
          <source>Computers and Education: Artificial Intelligence</source>
          <year>2023</year>
          <volume>5</volume>
          <fpage>100172</fpage>
          <pub-id pub-id-type="doi">10.1016/j.caeai.2023.100172</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>French</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dickerson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mulder</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>A review of the benefits and drawbacks of high-stakes final examinations in higher education</article-title>
          <source>High Educ</source>
          <year>2023</year>
          <month>12</month>
          <day>01</day>
          <volume>88</volume>
          <issue>3</issue>
          <fpage>893</fpage>
          <lpage>918</lpage>
          <pub-id pub-id-type="doi">10.1007/s10734-023-01148-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>FD</given-names>
            </name>
          </person-group>
          <article-title>User acceptance of information technology: system characteristics, user perceptions and behavioral impacts</article-title>
          <source>International Journal of Man-Machine Studies</source>
          <year>1993</year>
          <month>3</month>
          <volume>38</volume>
          <issue>3</issue>
          <fpage>475</fpage>
          <lpage>487</lpage>
          <pub-id pub-id-type="doi">10.1006/imms.1993.1022</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nazir</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive survey of ChatGPT: advancements, applications, prospects, and challenges</article-title>
          <source>Meta Radiol</source>
          <year>2023</year>
          <month>09</month>
          <volume>1</volume>
          <issue>2</issue>
          <fpage>100022</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.metrad.2023.100022"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.metrad.2023.100022</pub-id>
          <pub-id pub-id-type="medline">37901715</pub-id>
          <pub-id pub-id-type="pii">100022</pub-id>
          <pub-id pub-id-type="pmcid">PMC10611551</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Naujoks</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Harder</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Händel</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Testing pays off twice: potentials of practice tests and feedback regarding exam performance and judgment accuracy</article-title>
          <source>Metacognition Learning</source>
          <year>2022</year>
          <month>03</month>
          <day>18</day>
          <volume>17</volume>
          <issue>2</issue>
          <fpage>479</fpage>
          <lpage>498</lpage>
          <pub-id pub-id-type="doi">10.1007/s11409-022-09295-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Faherty</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Counihan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kropmans</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Finn</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Inter-rater reliability in clinical assessments: do examiner pairings influence candidate ratings?</article-title>
          <source>BMC Med Educ</source>
          <year>2020</year>
          <month>05</month>
          <day>11</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>147</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-020-02009-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-020-02009-4</pub-id>
          <pub-id pub-id-type="medline">32393228</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-020-02009-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC7212618</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Boey</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>YY</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>AHS</given-names>
            </name>
          </person-group>
          <article-title>Evaluating large language models for criterion-based grading from agreement to consistency</article-title>
          <source>NPJ Sci Learn</source>
          <year>2024</year>
          <month>12</month>
          <day>30</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>79</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41539-024-00291-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41539-024-00291-1</pub-id>
          <pub-id pub-id-type="medline">39738131</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41539-024-00291-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC11683144</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhui</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fenghe</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xuehu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Qining</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Ethical considerations and fundamental principles of large language models in medical education: viewpoint</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <month>08</month>
          <day>01</day>
          <volume>26</volume>
          <fpage>e60083</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e60083/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/60083</pub-id>
          <pub-id pub-id-type="medline">38971715</pub-id>
          <pub-id pub-id-type="pii">v26i1e60083</pub-id>
          <pub-id pub-id-type="pmcid">PMC11327620</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ishii</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Towards mitigating LLM hallucination via self reflection</article-title>
          <source>Findings of the Association for Computational Linguistics</source>
          <year>2023</year>
          <month>12</month>
          <conf-name>EMNLP 2023</conf-name>
          <conf-date>Dec 6-10, 2023</conf-date>
          <conf-loc>Singapore</conf-loc>
          <fpage>1827</fpage>
          <lpage>1843</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.123</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
