<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e66207</article-id><article-id pub-id-type="doi">10.2196/66207</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Medical Misinformation in AI-Assisted Self-Diagnosis: Development of a Method (EvalPrompt) for Analyzing Large Language Models</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Zada</surname><given-names>Troy</given-names></name><degrees>BASc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Tam</surname><given-names>Natalie</given-names></name><degrees>BASc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Barnard</surname><given-names>Francois</given-names></name><degrees>BASc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Van Sittert</surname><given-names>Marlize</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bhat</surname><given-names>Venkat</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Rambhatla</surname><given-names>Sirisha</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Management Sciences and Engineering, University of Waterloo</institution><addr-line>200 University Avenue West</addr-line><addr-line>Waterloo</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff2"><institution>Faculty of Law, University of Toronto</institution><addr-line>Toronto</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff3"><institution>Department of Psychiatry, University of Toronto</institution><addr-line>Toronto</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff4"><institution>Interventional Psychiatry Program, St. 
Michael&#x2019;s Hospital, Unity Health Toronto</institution><addr-line>Toronto</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Chartash</surname><given-names>David</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lungeanu</surname><given-names>Diana</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Gaurav Kumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Huang</surname><given-names>Yue</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Sirisha Rambhatla, PhD, Department of Management Sciences and Engineering, University of Waterloo, 200 University Avenue West, Waterloo, ON, N2L 3G1, Canada, 1 5198884567 ext 33279; <email>sirisha.rambhatla@uwaterloo.ca</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>10</day><month>3</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e66207</elocation-id><history><date date-type="received"><day>06</day><month>09</month><year>2024</year></date><date date-type="rev-recd"><day>29</day><month>01</month><year>2025</year></date><date date-type="accepted"><day>29</day><month>01</month><year>2025</year></date></history><copyright-statement>&#x00A9; Troy Zada, Natalie Tam, Francois Barnard, Marlize Van Sittert, Venkat Bhat, Sirisha Rambhatla. 
Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 10.3.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e66207"/><abstract><sec><title>Background</title><p>Rapid integration of large language models (LLMs) in health care is sparking global discussion about their potential to revolutionize health care quality and accessibility. At a time when improving health care quality and access remains a critical concern for countries worldwide, the ability of these models to pass medical examinations is often cited as a reason to use them for medical training and diagnosis. 
However, the impact of their inevitable use as a self-diagnostic tool and their role in spreading health care misinformation has not been evaluated.</p></sec><sec><title>Objective</title><p>This study aims to assess the effectiveness of LLMs, particularly ChatGPT, from the perspective of an individual self-diagnosing to better understand the clarity, correctness, and robustness of the models.</p></sec><sec sec-type="methods"><title>Methods</title><p>We propose the comprehensive testing methodology evaluation of LLM prompts (EvalPrompt). This evaluation methodology uses multiple-choice medical licensing examination questions to evaluate LLM responses. Experiment 1 prompts ChatGPT with open-ended questions to mimic real-world self-diagnosis use cases, and experiment 2 performs sentence dropout on the correct responses from experiment 1 to mimic self-diagnosis with missing information. Humans then assess the responses returned by ChatGPT for both experiments to evaluate the clarity, correctness, and robustness of ChatGPT.</p></sec><sec sec-type="results"><title>Results</title><p>In experiment 1, we found that ChatGPT-4.0 was deemed correct for 31% (29/94) of the questions by both nonexperts and experts, with only 34% (32/94) agreement between the 2 groups. Similarly, in experiment 2, which assessed robustness, 61% (92/152) of the responses continued to be categorized as correct by all assessors. As a result, in comparison to a passing threshold of 60%, ChatGPT-4.0 is considered incorrect and unclear, though robust. This indicates that sole reliance on ChatGPT-4.0 for self-diagnosis could increase the risk of individuals being misinformed.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The results highlight the modest capabilities of LLMs, as their responses are often unclear and inaccurate. Any medical advice provided by LLMs should be cautiously approached due to the significant risk of misinformation. 
However, evidence suggests that LLMs are steadily improving and could potentially play a role in health care systems in the future. To address the issue of medical misinformation, there is a pressing need for the development of a comprehensive self-diagnosis dataset. This dataset could enhance the reliability of LLMs in medical applications by featuring more realistic prompt styles with minimal information across a broader range of medical fields.</p></sec></abstract><kwd-group><kwd>ChatGPT</kwd><kwd>health care</kwd><kwd>LLM</kwd><kwd>misinformation</kwd><kwd>self-diagnosis</kwd><kwd>large language model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Large language models (LLMs) have grown in popularity with an ever-expanding list of applications due to their efficiency and accessibility [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. With their increased usage, LLMs are gaining user trust [<xref ref-type="bibr" rid="ref3">3</xref>], partly due to the anthropomorphic responses produced by models such as GPT-4o, even though they can generate misinformation at scale [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Recent reports highlight the inability of differentiating truth from misinformation, and the potential collapse of health care systems, as major disruptors on the horizon [<xref ref-type="bibr" rid="ref6">6</xref>]. This emphasizes the urgent need to develop solutions to ensure the delivery of factual information.</p><p>In health care, self-diagnosis through web searches has become a widespread practice and is especially important for underserved communities [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref10">10</xref>], which means that the prospective usage of LLMs in this domain is inevitable. 
However, relying solely on online searches for health information can result in severe misinformation as content on social media often spreads more rapidly than scientific knowledge [<xref ref-type="bibr" rid="ref11">11</xref>]. Inaccurate content, conspiracy theories, and false claims are all forms of misinformation which can impact public perceptions, alter behaviors, and reduce trust in health care systems [<xref ref-type="bibr" rid="ref12">12</xref>]. Moreover, the ongoing global shortage of health care workers [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref16">16</xref>] has driven government entities and health care organizations to explore the use of LLMs as health care assistants and expertise replacements for diagnosis and education [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. Thus, there is a need to study the quality and reliability of LLM-generated responses to health care&#x2013;related questions.</p></sec><sec id="s1-2"><title>Prior Work</title><p>Recent work has focused on analyzing ChatGPT across various industries, including its application within health care. ChatGPT is a natural language processing model distinctive for its narrative response style to user input [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Studies have assessed its performance on examinations [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref26">26</xref>] and its utility as a self-studying tool [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], leveraging its ability to provide tailored responses and immediate feedback. 
Furthermore, ChatGPT has demonstrated potential in assisting research and academic writing by enhancing efficiency and mitigating gaps in researcher knowledge [<xref ref-type="bibr" rid="ref29">29</xref>]. However, the increased usage of ChatGPT raises significant ethical concerns regarding plagiarism, bias, transparency, inaccuracy, and health equity [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref34">34</xref>].</p><p>ChatGPT has also demonstrated superior performance in answering medical questions compared to other LLMs [<xref ref-type="bibr" rid="ref35">35</xref>]. Research has explored its applications in medical education, including its effectiveness on licensing examinations, tailored learning experiences, and comprehension of complex medical concepts and clinical reasoning [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. Other areas of study have focused on identifying inefficiencies and inaccuracies within clinical workflows, medical research, and diagnoses, with the objective of integrating LLMs to optimize documentation, triage, and clinical data management procedures [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref44">44</xref>]. Additionally, investigations into diagnostic assistance have integrated patient questionnaires and medical imaging with LLMs [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref45">45</xref>-<xref ref-type="bibr" rid="ref48">48</xref>]. 
Despite the apparent high performance of LLMs in health care, they remain inferior compared to the judgment of human clinicians [<xref ref-type="bibr" rid="ref49">49</xref>].</p><p>In summary, prior work has generally evaluated the trustworthiness of LLMs [<xref ref-type="bibr" rid="ref50">50</xref>], along with specifically examining their performance in medical situations. These methods have used the multiple-choice questions from the United States Medical Licensing Exam (USMLE) [<xref ref-type="bibr" rid="ref51">51</xref>] to evaluate the capability of LLMs in attaining scores near the passing threshold of 60% [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref52">52</xref>]. However, these evaluations inaccurately depict the capability of LLMs for self-diagnosis. In practical situations where individuals use technology to self-diagnose, they would not include answers when posing questions and would not provide the same level of information as the examination questions.</p></sec><sec id="s1-3"><title>Goal of This Study</title><p>This study critically examines the performance of LLMs in responding to health care&#x2013;related questions. To achieve this, we propose the evaluation of LLM prompts (EvalPrompt) methodology. This evaluation procedure contains detailed guidelines to assess ChatGPT&#x2019;s response to open-ended questions and validate the robustness of these responses using a sentence dropout method. This 2-staged approach, to our knowledge, is the first comprehensive strategy aimed at better understanding LLM responses and their implications for medical misinformation. We hypothesize that LLMs, particularly ChatGPT-4.0 (referred to as GPT-4.0), are currently unsuitable for self-diagnosis purposes since a significant portion of responses will be ambiguous or incorrect. 
In particular, this hypothesis is validated if GPT-4.0 fails to surpass a minimum threshold of 60% [<xref ref-type="bibr" rid="ref20">20</xref>] for each of the following three questions:</p><list list-type="order"><list-item><p>Are the responses clear? This question can be answered by analyzing the response consistency.</p></list-item><list-item><p>Are the responses genuinely correct? This question can be answered by identifying the responses classified as correct by all assessors.</p></list-item><list-item><p>Are the responses robust? This question can be answered by conducting an ablation study on the correct responses.</p></list-item></list></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>In this section, the considerations and preparations for the decided dataset are first specified. Then, the assessor procedures and guidelines are discussed along with the process used to analyze the output answers from ChatGPT. Finally, the complete testing methodology is introduced. The overall process uses both nonmedical and medical experts for assessment and is segmented into two experiments: (1) ChatGPT responses on USMLE step 1 open-ended prompts and (2) ChatGPT robustness and ablation study analysis. The GPT-4.0 responses were generated for each of the USMLE questions using Python, with the code and datasets available in the EvalPrompt (evaluation of large language model prompts) repository [<xref ref-type="bibr" rid="ref53">53</xref>].</p></sec><sec id="s2-2"><title>Dataset Considerations and Preparations</title><sec id="s2-2-1"><title>Overview</title><p>The USMLE [<xref ref-type="bibr" rid="ref51">51</xref>] dataset consists of 3 steps undertaken by medical students throughout their program. Each step is a test consisting of multiple-choice, single-answer, and no-justification questions. 
Particularly, in this work, the questions are extracted from step 1 since the step 2 and step 3 questions are medically complex for the general population. Furthermore, only textual questions were kept, resulting in a dataset containing 94 single-answer questions that would be used for prompting ChatGPT. <xref ref-type="other" rid="box1">Textbox 1</xref> [<xref ref-type="bibr" rid="ref51">51</xref>] displays a sample of an extracted step 1 question.</p><boxed-text id="box1"><title> Sample question 3.1 is directly extracted from the United States Medical Licensing Exam step 1 test. This question is a multiple-choice question with a correct answer of (<bold>D</bold>).</title><p>Question: In a sample of 100 individuals, the mean leukocyte count is 7500/mm&#x00B3;, with a standard deviation of 1000/mm&#x00B3;. If the leukocyte counts in this population follow a normal (gaussian) distribution, approximately 50% of individuals will have which of the following total leukocyte counts?</p><p>(A) 5500&#x2013;9500/mm&#x00B3;</p><p>(B) &#x003C;6500/mm&#x00B3; or &#x003E;8500/mm&#x00B3;</p><p>(C) 6500&#x2013;8500/mm&#x00B3;</p><p>(D) &#x003C;7500/mm&#x00B3;</p><p>(E) &#x003E;9500/mm&#x00B3;</p><p>Answer: (D) &#x003C;7500/mm&#x00B3;</p></boxed-text></sec><sec id="s2-2-2"><title>Baseline ChatGPT Answer Analysis</title><p>The first experiment established the foundation for all testing. From the initial multiple-choice questions, each question was transformed into an open-ended question to accurately simulate the circumstances of an individual interacting with ChatGPT. 
This process was accomplished by removing the multiple-choice options and replacing any instance of &#x201C;which of the following&#x201D; with &#x201C;what.&#x201D; An example transformation is provided in <xref ref-type="other" rid="box2">Textbox 2</xref> [<xref ref-type="bibr" rid="ref51">51</xref>], where the original question from <xref ref-type="other" rid="box1">Textbox 1</xref> was transformed into an open-ended question.</p><boxed-text id="box2"><title> Transformed United States Medical Licensing Exam step 1 test question based on the original <xref ref-type="other" rid="box1">Textbox 1</xref> question. The transformation involves removing the (<bold>A</bold>)-(<bold>E</bold>) options and replacing the text &#x201C;which of the following&#x201D; with &#x201C;what.&#x201D;</title><p>Question: In a sample of 100 individuals, the mean leukocyte count is 7500/mm&#x00B3;, with a standard deviation of 1000/mm&#x00B3;. If the leukocyte counts in this population follow a normal (gaussian) distribution, approximately 50% of individuals will have what total leukocyte counts?</p></boxed-text></sec><sec id="s2-2-3"><title>ChatGPT Robustness and Ablation Study Analysis</title><p>The second experiment was built upon the foundation established in the first experiment by using only the questions that were categorized as correct (C) by all assessors. Using this subset of data, new questions were iteratively generated by removing a sentence from the question and prompting ChatGPT with the modified question. This process is repeated across all the sentences of a question, excluding the final question sentence indicated by the &#x201C;what...?&#x201D; structure. For instance, if a question has 3 sentences, then 3 versions of the same question would be generated, where each question variation has 2 sentences. The first version would remove the first sentence, the second version would remove the second sentence, and the third version would remove the third sentence. 
The ablation study aimed to simulate how a self-diagnosing individual may overlook information within their health-related searches.</p><p>Each of the open-ended modified questions was then provided to ChatGPT using the OpenAI application programming interface to aggregate the LLM responses for the questions. Specifically, the <italic>gpt-4</italic> model was used for GPT-4.0, along with specific parameters to ensure consistency among the responses and that each question would be answered independently. For instance, the model&#x2019;s temperature was set to zero to ensure the responses were focused and deterministic. Details of the complete GPT-4.0 model and corresponding parameters are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, along with <xref ref-type="other" rid="box3">Textbox 3</xref> [<xref ref-type="bibr" rid="ref51">51</xref>] displaying the GPT-4.0 response to the open-ended <xref ref-type="other" rid="box2">Textbox 2</xref> question.</p><boxed-text id="box3"><title> GPT-4.0 response to United States Medical Licensing Exam step 1 test question based on the transformed <xref ref-type="other" rid="box2">Textbox 2</xref> question. The assessor can then compare the GPT-4.0 answer with the original multiple-choice options to evaluate the correctness of the response.</title><p>Answer: Approximately 50% of individuals will have total leukocyte counts of 7500/mm&#x00B3; or less. Explanation: In a normal distribution, the mean value is the point at which approximately 50% of the data falls below and 50% falls above. Therefore, in this case, where the mean leukocyte count is 7500/mm&#x00B3;, approximately 50% of individuals will have leukocyte counts of 7500/mm&#x00B3; or less.</p></boxed-text></sec></sec><sec id="s2-3"><title>Assessor Procedures and Guidelines</title><p>The 94 open-ended questions served as prompts for ChatGPT, where 3 nonexperts and 3 experts evaluated the generated responses. 
Nonexperts were used to simulate the self-diagnosis process, while experts were used to assess the response quality.</p><p>Each assessor was provided with detailed procedures and guidelines as outlined in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> to ensure the response categorization process was standardized among the assessors. Assessors were instructed to categorize the responses as either C, partially correct (PC), incorrect (I), or ambiguous (A), and were provided with detailed examples to distinguish between the categories. Moreover, assessors were directed to evaluate the responses independently to ensure no bias.</p></sec><sec id="s2-4"><title>Testing Methodology</title><p>We develop the testing methodology EvalPrompt, as summarized in <xref ref-type="fig" rid="figure1">Figure 1</xref>. In total, <inline-formula><mml:math id="ieqn1"><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>94</mml:mn></mml:math></inline-formula> questions were selected from the USMLE step 1 test. 
Each question contained a question portion <inline-formula><mml:math id="ieqn2"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, answer options <inline-formula><mml:math id="ieqn3"><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and a correct answer <inline-formula><mml:math id="ieqn4"><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, altogether forming the dataset of multiple-choice questions <inline-formula><mml:math id="ieqn5"><mml:mi>D</mml:mi><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mfenced open="{" close="}" separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msubsup></mml:math></inline-formula>. Subsequently, each question, <inline-formula><mml:math id="ieqn6"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, was extracted and transformed into an open-ended variation which was then presented to ChatGPT.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>EvalPrompt summary. (1) A subset of 94 USMLE step 1 [<xref ref-type="bibr" rid="ref51">51</xref>] questions consisting of multiple-choice, single-answer questions are selected. 
(2) These 94 questions are modified to produce open-ended prompts. (3) The open-ended prompts are processed through the ChatGPT API. (4) ChatGPT&#x2019;s answers are recorded and presented to <italic><inline-formula><mml:math id="ieqn7"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:mstyle></mml:math></inline-formula></italic> independent assessors to categorize as either correct, partially correct, incorrect, or ambiguous. (5) The categorizations classified as correct by all assessors are aggregated to formulate a new dataset for sensitivity analysis. (6) The prompts produced from the iterative sentence dropout are processed through ChatGPT. (7) The independent assessors categorize ChatGPT&#x2019;s answers. These categorizations are then aggregated based on the agreement where all assessors categorized the answer as correct. The resulting dataset, <inline-formula><mml:math id="ieqn8"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>W</mml:mi></mml:mrow></mml:mstyle></mml:math></inline-formula>, is the subset of USMLE question variations that ChatGPT answered correctly. USMLE: United States Medical Licensing Exam.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e66207_fig01.png"/></fig><p>After collecting and processing the answers from ChatGPT, 3 nonexpert and 3 expert assessors independently evaluated each ChatGPT answer, <inline-formula><mml:math id="ieqn9"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. In total, <inline-formula><mml:math id="ieqn10"><mml:mi>K</mml:mi><mml:mo>=</mml:mo><mml:mn>6</mml:mn></mml:math></inline-formula> assessors were used. 
The assessors were instructed to provide a label <inline-formula><mml:math id="ieqn11"><mml:mi>z</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>Z</mml:mi><mml:mo>;</mml:mo><mml:mi>Z</mml:mi><mml:mo>=</mml:mo><mml:mi> </mml:mi><mml:mfenced open="{" close="}" separators="|"><mml:mrow><mml:mi>C</mml:mi><mml:mo>,</mml:mo><mml:mi>P</mml:mi><mml:mi>C</mml:mi><mml:mo>,</mml:mo><mml:mi>I</mml:mi><mml:mo>,</mml:mo><mml:mi>A</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula> denoting whether the answer, in comparison to the ground truth, <inline-formula><mml:math id="ieqn12"><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, was C, PC, I, or A, respectively. The assessment process can be expressed as a function <inline-formula><mml:math id="ieqn13"><mml:msup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msup><mml:mfenced separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> for the <inline-formula><mml:math id="ieqn14"><mml:msup><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> assessor, where the assessor would use the correct answer to the question along with ChatGPT&#x2019;s answer to categorize the response.</p><p>After the <inline-formula><mml:math id="ieqn15"><mml:mi>k</mml:mi></mml:math></inline-formula> assessors finished the categorization process, the questions categorized 
as C by all <inline-formula><mml:math id="ieqn16"><mml:mi>k</mml:mi></mml:math></inline-formula> assessors were aggregated to define a subset dataset, <inline-formula><mml:math id="ieqn17"><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula>. Once the resulting dataset, <inline-formula><mml:math id="ieqn18"><mml:mi>&#x03B2;</mml:mi><mml:mo>=</mml:mo><mml:mfenced open="{" close="}" separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mi> </mml:mi><mml:mo>|</mml:mo><mml:mi> </mml:mi><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi> </mml:mi><mml:mo>&#x2200;</mml:mo><mml:mi> </mml:mi><mml:mi>k</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>K</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula> was generated, the first experiment was concluded.</p><p>The second experiment was built upon the dataset and assessments completed during the first experiment. An ablation study was conducted over the questions within dataset <inline-formula><mml:math id="ieqn19"><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula> via an iterative sentence dropout by performing <inline-formula><mml:math id="ieqn20"><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> iterations over an open-ended prompt <inline-formula><mml:math id="ieqn21"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, where <inline-formula><mml:math id="ieqn22"><mml:mi>i</mml:mi></mml:math></inline-formula> is the number of sentences in the prompt. 
For the <inline-formula><mml:math id="ieqn23"><mml:msup><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> iteration, the <inline-formula><mml:math id="ieqn24"><mml:msup><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> sentence was removed from the prompt before running the prompt through ChatGPT, as expressed by <inline-formula><mml:math id="ieqn25"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, where <inline-formula><mml:math id="ieqn26"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula>. Each <inline-formula><mml:math id="ieqn27"><mml:msup><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> sentence was iteratively removed and processed except for the final sentence, which contained the question. Including the final sentence was mandatory to ensure ChatGPT provided an appropriate response.</p><p>After processing the sentence dropout questions through ChatGPT, the same 6 assessors evaluated the responses, <inline-formula><mml:math id="ieqn28"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>. 
The assessment process can again be expressed as a function <inline-formula><mml:math id="ieqn29"><mml:msup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msup><mml:mfenced separators="|"><mml:mrow><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi></mml:mrow></mml:msubsup><mml:mi mathvariant="normal"> </mml:mi><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> for the <inline-formula><mml:math id="ieqn30"><mml:msup><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> assessor, where the assessor would use the correct answer to the question along with ChatGPT&#x2019;s answer to categorize the response. 
This generated the resulting dataset, <inline-formula><mml:math id="ieqn31"><mml:mi>W</mml:mi></mml:math></inline-formula>, defined as <inline-formula><mml:math id="ieqn32"><mml:mi>W</mml:mi><mml:mo>=</mml:mo><mml:mfenced open="{" close="}" separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mi> </mml:mi><mml:mo>|</mml:mo><mml:mi> </mml:mi><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>`</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi> </mml:mi><mml:mo>&#x2200;</mml:mo><mml:mi> </mml:mi><mml:mi>k</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>K</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula>, which is the subset of USMLE question variations for the certainly correct prompts.</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>Since the aim of this study is to analyze ChatGPT and not human subjects, a research ethics board review was not required. The evaluation of ChatGPT&#x2019;s responses was carried out on a volunteer basis, and all assessors were informed that contributing to the experiments would not result in any safety or privacy risks.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Are the Responses Clear?</title><p>To determine if the GPT-4.0 responses are clear, response categorizations across assessors were compared. As displayed in <xref ref-type="fig" rid="figure2">Figure 2</xref>, categorization disparities exist across the nonexpert and expert assessors with few alignments across assessors as depicted in <xref ref-type="table" rid="table1">Table 1</xref>. Namely, the number of C categorizations ranges from 39 to 51, while the number of I categorizations ranges from 12 to 39. 
This wide range of values for the categories immediately depicts the uncertainty across assessors due to each assessor having varying backgrounds and levels of medical expertise.</p><p>Analyzing the categorization for each question independently, many discrepancies exist among the assessors. As detailed in <xref ref-type="table" rid="table2">Table 2</xref>, on average 52% ([(52+46)/2]/94, SD of 3) of the responses in experiment 1 and 73% ([(119+104)/2]/152, SD of 7.5) of the responses in experiment 2 were categorized identically by the assessors. For example, USMLE question 54.1 was categorized by the nonexperts as PC, A, and C, and by the experts as I, PC, and PC, respectively. The inconsistency in categorizations depicts that LLM responses do not yet have a single apparent answer, but rather are still subject to interpretation depending on the individual.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Experiment 1: ChatGPT responses on USMLE step 1 open-ended prompts. Individual categorizations for the nonexpert and expert assessors, where each bar represents an individual&#x2019;s categorizations. The categorizations drastically vary across assessors, where some assessors categorized more responses as correct while other assessors categorized more responses as incorrect on the same dataset. USMLE: United States Medical Licensing Exam.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e66207_fig02.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Experiment 1: ChatGPT responses on USMLE<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> step 1 open-ended prompts. Overall response categorization between nonexpert and expert assessors. The top left section indicates the number of instances, regardless of correctness, where all nonexpert and expert assessors categorized a question identically. 
The bottom right section indicates the total number of questions categorized as correct by all nonexpert and expert assessors.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="4">Expert</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Identical</td><td align="left" valign="bottom">Mismatch</td><td align="left" valign="bottom">Correct</td><td align="left" valign="bottom">Other</td></tr></thead><tbody><tr><td align="left" valign="top">Nonexpert</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Identical</td><td align="left" valign="top">32</td><td align="left" valign="top">20</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mismatch</td><td align="left" valign="top">14</td><td align="left" valign="top">28</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Correct</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">29</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">6</td><td align="left" valign="top">55</td></tr></tbody></table><table-wrap-foot><fn 
id="table1fn1"><p><sup>a</sup>USMLE: United States Medical Licensing Exam.</p></fn><fn id="table1fn2"><p><sup>b</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Number of identical categorizations among all individuals within the 2 groups for the 2 experiments. Experiment 1 had 94 questions in total, with 52 and 46 of the questions being categorized the same among the nonexpert and expert assessors, respectively. Similarly, experiment 2 had 152 responses in total spanning 29 unique questions, with 119 and 104 of the questions being categorized the same among the nonexpert and expert assessors, respectively.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">Experiment 1</td><td align="left" valign="bottom" colspan="2">Experiment 2</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Nonexpert</td><td align="left" valign="top">Expert</td><td align="left" valign="top">Nonexpert</td><td align="left" valign="top">Expert</td></tr></thead><tbody><tr><td align="left" valign="top">Correct</td><td align="left" valign="top">33</td><td align="left" valign="top">35</td><td align="left" valign="top">111</td><td align="left" valign="top">96</td></tr><tr><td align="left" valign="top">Partially correct</td><td align="left" valign="top">1</td><td align="left" valign="top">4</td><td align="left" valign="top">0</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">Incorrect</td><td align="left" valign="top">18</td><td align="left" valign="top">7</td><td align="left" valign="top">8</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top">Ambiguous</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" 
valign="top">Overall agreement</td><td align="left" valign="top">52</td><td align="left" valign="top">46</td><td align="left" valign="top">119</td><td align="left" valign="top">104</td></tr></tbody></table></table-wrap><p>The nonexpert and expert assessors reached an identical conclusion for 52 and 46 of the questions for experiment 1, respectively, as detailed in <xref ref-type="table" rid="table2">Table 2</xref>. However, the groups collectively could only reach an identical conclusion for 32 of the questions as listed in <xref ref-type="table" rid="table1">Table 1</xref>. Since only 34% (32/94) of the responses were consistent across all assessors, the 60% threshold could not be met indicating that the responses are unclear. The decrease in the collective number of identically classified responses suggests that the GPT-4.0 responses are still too ambiguous for assessors to reach appropriate conclusions. The responses are not obvious enough such that anyone, regardless of their background and expertise, can reach the same conclusion.</p></sec><sec id="s3-2"><title>Are the Responses Genuinely Correct?</title><p>To determine the number of GPT-4.0 responses that are genuinely correct, the responses categorized as C by all nonexpert and expert assessors were analyzed and compared. As provided in <xref ref-type="table" rid="table1">Table 1</xref>, the experiment 1 categorizations where all assessors agreed are listed. In particular, 29 of the 94 responses were categorized as C, meaning that GPT-4.0 is certainly correct 31% (29/94) of the time.</p><p>The limited number of C responses indicates that GPT-4.0 is not often factual. As portrayed in <xref ref-type="table" rid="table2">Table 2</xref>, nonexpert and expert assessors classified 33 and 35 of the responses as C, respectively, even though collectively 29 responses were considered C. 
Since only 31% (29/94) of the responses were considered C across all assessors, the 60% threshold could not be met, indicating that the responses are mostly incorrect. Hence, even though both groups approximately categorized the same number of responses as C, there are still many responses that are either A or I.</p></sec><sec id="s3-3"><title>Are the Responses Robust?</title><p>To determine if the GPT-4.0 responses are robust, an ablation study was conducted on the 29 correct responses. Reprompting GPT-4.0 with similar variations of the correct questions tested its ability to attain the correct answer even with information missing. This process aimed to simulate the self-diagnosis process since each individual would prompt LLMs with varying levels of information; some individuals would provide extensive details, while others may provide limited information. This variability allowed GPT-4.0&#x2019;s robustness to be assessed.</p><p><xref ref-type="table" rid="table2">Table 2</xref> provided the categorization details of the ablation study, consisting of 29 unique questions with 152 question variations. On average 68% ([(111+96)/2]/152, SD of 7.5) of the responses continue to be categorized as C even after removing information for each group. These results are far greater than GPT-4.0&#x2019;s accuracy on the initial 94 questions, which was correct only 31% (29/94) of the time. Moreover, <xref ref-type="fig" rid="figure3">Figure 3</xref> depicts the categorizations for each of the 6 assessors. Assessors categorized the experiment 2 dataset C much more frequently than the experiment 1 dataset. On average, 80% ([(127+120+105+116+133+129)/6]/152, SD of 9.34) of the assessors categorized the questions as C, depicting GPT-4.0&#x2019;s robustness on answers that are certainly correct.</p><p><xref ref-type="table" rid="table3">Table 3</xref> also displays the assessment similarities between the nonexpert and expert assessors for experiment 2. 
A total of 92 of the 152 questions were categorized C by all nonexpert and expert assessors, meaning that 61% (92/152) of the responses are certainly correct. Since 61% (92/152) of the responses for the sentence dropout experiment were categorized as C across all assessors, the 60% threshold is met indicating that the responses are robust. In other words, if GPT-4.0 correctly answers a question, it is likely to correctly answer a similar variation of the question again, even if some information is missing.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Experiment 2: ChatGPT robustness and ablation study analysis. Individual categorization for the nonexpert and expert assessors, where each bar represents an individual&#x2019;s categorizations. Out of the 152 questions, the assessors on average categorized 122 of the questions as correct, depicting its robustness.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e66207_fig03.png"/></fig><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Experiment 2: ChatGPT robustness and ablation study analysis. Overall response categorization between nonexpert and expert assessors. The top left section indicates the number of instances, regardless of correctness, where all nonexpert and expert assessors categorized a question identically. 
The bottom right section indicates the total number of questions categorized as correct by all nonexpert and expert assessors.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="4">Expert</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Identical</td><td align="left" valign="top">Mismatch</td><td align="left" valign="top">Correct</td><td align="left" valign="top">Other</td></tr></thead><tbody><tr><td align="left" valign="top">Nonexpert</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Identical</td><td align="left" valign="top">95</td><td align="left" valign="top">27</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mismatch</td><td align="left" valign="top">9</td><td align="left" valign="top">21</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Correct</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">92</td><td align="left" valign="top">22</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">4</td><td align="left" valign="top">34</td></tr></tbody></table><table-wrap-foot><fn 
id="table3fn1"><p><sup>a</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>The hypothesis that ChatGPT is currently unsuitable for self-diagnosis is proved. From each of the 3 addressed assessments with a passing threshold of 60%, GPT-4.0 marginally only passed one. The analysis indicates that GPT-4.0 is generally unclear and incorrect when providing medical information. However, when GPT-4.0 provides correct responses, it remains robust enough to continue answering these questions accurately even when some information is missing. <xref ref-type="table" rid="table4">Table 4</xref> summarizes the results of the evaluation procedure for GPT-4.0.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Results summarization of the evaluation procedure for GPT-4.0. With a minimum threshold of 60%, the GPT-4.0 responses are deemed generally unclear and incorrect, while exhibiting robustness when providing correct answers.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Definition</td><td align="left" valign="bottom">Success rate (%)</td><td align="left" valign="bottom">Pass or fail</td></tr></thead><tbody><tr><td align="left" valign="top">Are the responses clear?</td><td align="left" valign="top">Agreement reached between all assessors</td><td align="left" valign="top">34</td><td align="left" valign="top">Fail</td></tr><tr><td align="left" valign="top">Are the responses genuinely correct?</td><td align="left" valign="top">Responses categorized as correct by all assessors</td><td align="left" valign="top">31</td><td align="left" valign="top">Fail</td></tr><tr><td align="left" valign="top">Are the responses robust?</td><td align="left" valign="top">Responses continue to be categorized as correct by all assessors</td><td align="left" 
valign="top">61</td><td align="left" valign="top">Pass</td></tr></tbody></table></table-wrap></sec><sec id="s4-2"><title>Implications</title><p>Recognizing that the GPT-4.0 responses are often ambiguous, its medical advice should be accepted with caution since these responses can vary widely in interpretation depending on the individual. Additionally, since only a small number of responses were found to be genuinely correct, LLMs still require improvement before they can reliably be used in a medical setting. Without these improvements, LLMs risk misinforming individuals.</p><p>Even though ChatGPT cannot currently be used for self-diagnosis, substantial evidence indicates that LLMs are continuously improving, suggesting their potential future use in health care systems. As EvalPrompt was also conducted on an earlier GPT-3.5 version, as detailed in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>, the significant difference between the model performances proves that LLMs continue to improve as time progresses.</p><p>With ChatGPT being highly acclaimed for its success in passing medical examinations, researchers have proposed using ChatGPT in areas such as medical education and medical report creation [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. However, ChatGPT&#x2019;s ability to answer examination questions does not inherently equate to genuine medical comprehension and proficiency. Instead, using ChatGPT in these medical settings can undermine health care systems since ChatGPT&#x2019;s overconfidence can result in misinforming individuals.</p><p>Unlike practicing clinicians, ChatGPT lacks formal testing and accreditation for its abilities. It has not undergone accredited medical education or licensing, lacks approval for clinical practice, and has not demonstrated the necessary understanding or skill set to support its claims. 
While clinicians face severe consequences for errors, such as medical malpractice charges or license revocation, ChatGPT lacks such liability. Thus, relying on ChatGPT before it becomes accurate, consistent, and robust poses a significant risk of misleading health care practitioners and the general public.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study has signified a substantial advancement in the quality of medical advice that LLMs can provide and their potential utility in the health care industry. Although the USMLE questions provide a solid foundation for simulating the self-diagnosis process via LLMs, the prompts contain extensive detail. In other words, an ordinary individual typically would not input paragraphs of information when self-diagnosing and may lack explicit technical health care knowledge. Therefore, future research can explore using a dataset of self-diagnosis questions that contains less detailed information to accurately assess the capability of LLMs. As a result, developing larger and more realistic self-diagnosing datasets can enhance the training of LLMs, in turn improving its performance. Additionally, LLMs are inclined to excel in tasks well-represented in the training data, potentially leading to lower performance in niche problems [<xref ref-type="bibr" rid="ref17">17</xref>]. This raises equity concerns, as questions concerning underrepresented groups may endure poor performance. Although not explicitly examined in this study, future research could generate datasets with diverse question designs across medical specializations to ensure all fields are represented in training.</p></sec><sec id="s4-4"><title>Conclusions</title><p>While LLMs make headlines for passing medical licensing examinations and are consequently being considered as candidates to train the next generation of health care professionals, it is evident that LLMs&#x2019; capabilities are (understandably) modest at this time. 
More importantly, this misplaced trust in these systems can lead to reliance on their use for self-diagnosis by the public. In constructing EvalPrompt for assessing the capabilities of LLMs in medical contexts, the extent of misinformation inversely correlates to the LLMs&#x2019; performance. LLMs that offer unclear and inaccurate responses are more likely to misinform individuals. Ultimately, this investigation presents challenges for machine learning researchers to build more transparent artificial intelligence&#x2013;powered models capable of reasoning and responding responsibly, while also highlighting the need for a dataset that incorporates more realistic prompt styles.</p></sec></sec></body><back><ack><p>Sirisha Rambhatla, Ph.D., would like to acknowledge the support of the Natural Sciences and Engineering Research Council of Canada (NSERC) Discovery Grant (RGPIN-2022-03512).</p></ack><notes><sec><title>Data Availability</title><p>The step 1 United States Medical Licensing Exam dataset along with the GPT-4.0 answers analyzed during this study are available in the EvalPrompt repository [<xref ref-type="bibr" rid="ref53">53</xref>]. 
The datasets for assessor categorizations analyzed during this study are not publicly available due to ChatGPT only being analyzed in aggregate, but are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>VB is supported by an Academic Scholar Award from the University of Toronto Department of Psychiatry and has received research funding from the Canadian Institutes of Health Research, Brain &#x0026; Behavior Foundation, Ontario Ministry of Health Innovation Funds, Royal College of Physicians and Surgeons of Canada, Department of National Defence (Government of Canada), New Frontiers in Research Fund, Associated Medical Services Inc Healthcare, American Foundation for Suicide Prevention, Roche Canada, Novartis, and Eisai.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">A</term><def><p>ambiguous</p></def></def-item><def-item><term id="abb2">C</term><def><p>correct</p></def></def-item><def-item><term id="abb3">EvalPrompt</term><def><p>evaluation of large language model prompts</p></def></def-item><def-item><term id="abb4">I</term><def><p>incorrect</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">PC</term><def><p>partially correct</p></def></def-item><def-item><term id="abb7">USMLE</term><def><p>United States Medical Licensing Exam</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shahsavar</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>A</given-names> </name></person-group><article-title>User intentions to use ChatGPT for self-diagnosis and health-related purposes: cross-sectional survey study</article-title><source>JMIR Hum 
Factors</source><year>2023</year><month>05</month><day>17</day><volume>10</volume><fpage>e47564</fpage><pub-id pub-id-type="doi">10.2196/47564</pub-id><pub-id pub-id-type="medline">37195756</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Taecharungroj</surname><given-names>V</given-names> </name></person-group><article-title>&#x201C;What can ChatGPT do?&#x201D; analyzing early reactions to the innovative AI Chatbot on Twitter</article-title><source>BDCC</source><year>2023</year><month>03</month><volume>7</volume><issue>1</issue><fpage>35</fpage><pub-id pub-id-type="doi">10.3390/bdcc7010035</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Alhalbusi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ismail</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Abdelfattah</surname><given-names>F</given-names> </name></person-group><article-title>Chatting with ChatGPT: decoding the mind of Chatbot users and unveiling the intricate connections between user perception, trust and stereotype perception on self-esteem and psychological well-being</article-title><source>Curr Psychol</source><year>2024</year><month>03</month><volume>43</volume><issue>9</issue><fpage>7843</fpage><lpage>7858</lpage><pub-id pub-id-type="doi">10.1007/s12144-023-04989-0</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Pan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Chen</surname><given-names>W</given-names> </name><name name-style="western"><surname>Nakov</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kan</surname><given-names>MY</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>WY</given-names> </name></person-group><article-title>On the risk of misinformation pollution with large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 26, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.13661</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Weidinger</surname><given-names>L</given-names> </name><name name-style="western"><surname>Uesato</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rauh</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Taxonomy of risks posed by language models</article-title><conf-name>FAccT &#x2019;22</conf-name><conf-date>Jun 21-24, 2022</conf-date><conf-loc>Seoul, Republic of Korea</conf-loc><pub-id pub-id-type="doi">10.1145/3531146.3533088</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><article-title>Disruptions on the horizon</article-title><source>Policy Horizons Canada</source><year>2024</year><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://horizons.service.canada.ca/en/2024/disruptions/index.shtml">https://horizons.service.canada.ca/en/2024/disruptions/index.shtml</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goyder</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>McPherson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Glasziou</surname><given-names>P</given-names> </name></person-group><article-title>Self diagnosis</article-title><source>BMJ</source><year>2009</year><month>11</month><day>11</day><volume>339</volume><fpage>b4418</fpage><pub-id pub-id-type="doi">10.1136/bmj.b4418</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacobs</surname><given-names>W</given-names> </name><name name-style="western"><surname>Amuta</surname><given-names>AO</given-names> </name><name name-style="western"><surname>Jeon</surname><given-names>KC</given-names> </name></person-group><article-title>Health information seeking in the digital age: an analysis of health information seeking behavior among US adults</article-title><source>Cogent Social Sciences</source><year>2017</year><month>01</month><day>1</day><volume>3</volume><issue>1</issue><fpage>1302785</fpage><pub-id pub-id-type="doi">10.1080/23311886.2017.1302785</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Swire-Thompson</surname><given-names>B</given-names> </name><name name-style="western"><surname>Lazer</surname><given-names>D</given-names> </name></person-group><article-title>Public health and online misinformation: challenges and recommendations</article-title><source>Annu Rev Public Health</source><year>2020</year><month>04</month><day>2</day><volume>41</volume><fpage>433</fpage><lpage>451</lpage><pub-id pub-id-type="doi">10.1146/annurev-publhealth-040119-094127</pub-id><pub-id pub-id-type="medline">31874069</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>White</surname><given-names>RW</given-names> </name><name name-style="western"><surname>Horvitz</surname><given-names>E</given-names> </name></person-group><article-title>Experiences with web search on medical concerns and self diagnosis</article-title><source>AMIA Annu Symp Proc</source><year>2009</year><month>11</month><day>14</day><volume>2009</volume><fpage>696</fpage><lpage>700</lpage><pub-id pub-id-type="medline">20351943</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Mikati</surname><given-names>IK</given-names> </name><name name-style="western"><surname>Hoteit</surname><given-names>R</given-names> </name><name name-style="western"><surname>Harb</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Defining misinformation and related terms in health-related literature: scoping review</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>9</day><volume>25</volume><fpage>e45731</fpage><pub-id pub-id-type="doi">10.2196/45731</pub-id><pub-id pub-id-type="medline">37556184</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Okoro</surname><given-names>YO</given-names> </name><name name-style="western"><surname>Ayo-Farai</surname><given-names>O</given-names> </name><name name-style="western"><surname>Maduka</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Okongwu</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Sodamade</surname><given-names>OT</given-names> </name></person-group><article-title>A review of health misinformation on digital platforms: challenges and countermeasures</article-title><source>Int J Appl Res Soc 
Sci</source><year>2024</year><month>03</month><day>8</day><volume>6</volume><issue>1</issue><fpage>23</fpage><lpage>36</lpage><pub-id pub-id-type="doi">10.51594/ijarss.v6i1.689</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boniol</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kunjumen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nair</surname><given-names>TS</given-names> </name><name name-style="western"><surname>Siyam</surname><given-names>A</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Diallo</surname><given-names>K</given-names> </name></person-group><article-title>The global health workforce stock and distribution in 2020 and 2030: a threat to equity and &#x201C;universal&#x201D; health coverage?</article-title><source>BMJ Glob Health</source><year>2022</year><month>06</month><volume>7</volume><issue>6</issue><fpage>e009316</fpage><pub-id pub-id-type="doi">10.1136/bmjgh-2022-009316</pub-id><pub-id pub-id-type="medline">35760437</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kuehn</surname><given-names>BM</given-names> </name></person-group><article-title>Clinician shortage exacerbates pandemic-fueled &#x201C;mental health crisis&#x201D;</article-title><source>JAMA</source><year>2022</year><month>06</month><day>14</day><volume>327</volume><issue>22</issue><fpage>2179</fpage><pub-id pub-id-type="doi">10.1001/jama.2022.8661</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Michel</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Ecarnot</surname><given-names>F</given-names> </name></person-group><article-title>The shortage of skilled workers in Europe: its impact on geriatric medicine</article-title><source>Eur Geriatr Med</source><year>2020</year><month>06</month><volume>11</volume><issue>3</issue><fpage>345</fpage><lpage>347</lpage><pub-id pub-id-type="doi">10.1007/s41999-020-00323-0</pub-id><pub-id pub-id-type="medline">32328964</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Turale</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nantsupawat</surname><given-names>A</given-names> </name></person-group><article-title>Clinician mental health, nursing shortages and the COVID-19 pandemic: crises within crises</article-title><source>Int Nurs Rev</source><year>2021</year><month>03</month><volume>68</volume><issue>1</issue><fpage>12</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1111/inr.12674</pub-id><pub-id pub-id-type="medline">33891772</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garg</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Urs</surname><given-names>VL</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Chaudhary</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Paliwal</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kar</surname><given-names>SK</given-names> </name></person-group><article-title>Exploring the role of ChatGPT in patient care (diagnosis and treatment) and medical 
research: a systematic review</article-title><source>Health Promot Perspect</source><year>2023</year><volume>13</volume><issue>3</issue><fpage>183</fpage><lpage>191</lpage><pub-id pub-id-type="doi">10.34172/hpp.2023.22</pub-id><pub-id pub-id-type="medline">37808939</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Horesh</surname><given-names>A</given-names> </name></person-group><article-title>Using ChatGPT to study medicine: learn the basics</article-title><source>FutureDoctorAI</source><year>2023</year><month>03</month><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://futuredoctor.ai/chatgpt">https://futuredoctor.ai/chatgpt</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Iftikhar</surname><given-names>L</given-names> </name><name name-style="western"><surname>Iftikhar</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Hanif</surname><given-names>MI</given-names> </name></person-group><article-title>DocGPT: impact of ChatGPT-3 on health services as a virtual doctor</article-title><source>EC Paediatrics</source><year>2023</year><month>03</month><access-date>2025-02-20</access-date><volume>12</volume><issue>3</issue><fpage>45</fpage><lpage>55</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.researchgate.net/profile/Linta-Iftikhar/publication/370288279_DocGPT_Impact_of_ChatGPT-3_on_Health_Services_as_a_Virtual_Doctor/links/644b5bac5762c95ac35b52ce/DocGPT-Impact-of-ChatGPT-3-on-Health-Services-as-a-Virtual-Doctor.pdf">https://www.researchgate.net/profile/Linta-Iftikhar/publication/370288279_DocGPT_Impact_of_ChatGPT-3_on_Health_Services_as_a_Virtual_Doctor/links/644b5bac5762c95ac35b52ce/DocGPT-Impact-of-ChatGPT-3-on-Health-Services-as-a-Virtual-Doctor.pdf</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>H</given-names> </name></person-group><article-title>The rise of ChatGPT: Exploring its potential in medical education</article-title><source>Anat Sci Educ</source><year>2024</year><volume>17</volume><issue>5</issue><fpage>926</fpage><lpage>931</lpage><pub-id pub-id-type="doi">10.1002/ase.2270</pub-id><pub-id pub-id-type="medline">36916887</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name 
name-style="western"><surname>Primack</surname><given-names>D</given-names> </name></person-group><article-title>Here come the robot doctors</article-title><source>Axios</source><year>2023</year><month>01</month><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.axios.com/2023/01/18/chatgpt-ai-health-care-doctors">https://www.axios.com/2023/01/18/chatgpt-ai-health-care-doctors</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sedaghat</surname><given-names>S</given-names> </name></person-group><article-title>Early applications of ChatGPT in medical practice, education and research</article-title><source>Clin Med (Lond)</source><year>2023</year><month>05</month><volume>23</volume><issue>3</issue><fpage>278</fpage><lpage>279</lpage><pub-id pub-id-type="doi">10.7861/clinmed.2023-0078</pub-id><pub-id pub-id-type="medline">37085182</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>C</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the medical licensing exams? 
The implications of large language models for medical education and knowledge assessment</article-title><source>medRxiv</source><comment>Preprint posted online on  Dec 26, 2022</comment><pub-id pub-id-type="doi">10.1101/2022.12.23.22283901</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Scott</surname><given-names>K</given-names> </name></person-group><article-title>Microsoft teams up with OpenAI to exclusively license GPT-3 language model</article-title><source>The Official Microsoft Blog</source><year>2020</year><month>09</month><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://blogs.microsoft.com/blog/2020/09/22/microsoft-teams-up-with-openai-to-exclusively-license-gpt-3-language-model">https://blogs.microsoft.com/blog/2020/09/22/microsoft-teams-up-with-openai-to-exclusively-license-gpt-3-language-model</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Hickman</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Monahan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schwarcz</surname><given-names>DB</given-names> </name></person-group><article-title>ChatGPT goes to law school</article-title><source>SSRN Journal</source><year>2022</year><volume>71</volume><issue>3</issue><fpage>387</fpage><lpage>400</lpage><pub-id pub-id-type="doi">10.2139/ssrn.4335905</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sallam</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Salim</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Barakat</surname><given-names>M</given-names> </name><name name-style="western"><surname>Al-Tammemi</surname><given-names>AB</given-names> </name></person-group><article-title>ChatGPT applications in medical, dental, pharmacy, and public health education: a descriptive study highlighting the advantages and limitations</article-title><source>Narra J</source><year>2023</year><month>04</month><volume>3</volume><issue>1</issue><fpage>e103</fpage><pub-id pub-id-type="doi">10.52225/narra.v3i1.103</pub-id><pub-id pub-id-type="medline">38450035</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Dis</surname><given-names>EAM</given-names> </name><name name-style="western"><surname>Bollen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zuidema</surname><given-names>W</given-names> </name><name name-style="western"><surname>van Rooij</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bockting</surname><given-names>CL</given-names> </name></person-group><article-title>ChatGPT: five priorities for research</article-title><source>Nature</source><year>2023</year><month>02</month><volume>614</volume><issue>7947</issue><fpage>224</fpage><lpage>226</lpage><pub-id pub-id-type="doi">10.1038/d41586-023-00288-7</pub-id><pub-id pub-id-type="medline">36737653</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhai</surname><given-names>X</given-names> </name></person-group><article-title>ChatGPT user experience: implications for education</article-title><source>SSRN Journal</source><year>2022</year><pub-id 
pub-id-type="doi">10.2139/ssrn.4312418</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arora</surname><given-names>A</given-names> </name><name name-style="western"><surname>Arora</surname><given-names>A</given-names> </name></person-group><article-title>The promise of large language models in health care</article-title><source>The Lancet</source><year>2023</year><month>02</month><volume>401</volume><issue>10377</issue><fpage>641</fpage><pub-id pub-id-type="doi">10.1016/S0140-6736(23)00216-7</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Biswas</surname><given-names>SS</given-names> </name></person-group><article-title>Role of Chat GPT in public health</article-title><source>Ann Biomed Eng</source><year>2023</year><month>05</month><volume>51</volume><issue>5</issue><fpage>868</fpage><lpage>869</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03172-7</pub-id><pub-id pub-id-type="medline">36920578</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liebrenz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Schleifer</surname><given-names>R</given-names> </name><name name-style="western"><surname>Buadze</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bhugra</surname><given-names>D</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>A</given-names> </name></person-group><article-title>Generating scholarly content with ChatGPT: ethical challenges for medical publishing</article-title><source>Lancet Digit 
Health</source><year>2023</year><month>03</month><volume>5</volume><issue>3</issue><fpage>e105</fpage><lpage>e106</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00019-5</pub-id><pub-id pub-id-type="medline">36754725</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sallam</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title><source>Healthcare (Basel)</source><year>2023</year><month>03</month><day>19</day><volume>11</volume><issue>6</issue><fpage>887</fpage><pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id><pub-id pub-id-type="medline">36981544</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ufuk</surname><given-names>F</given-names> </name></person-group><article-title>The role and limitations of large language models such as ChatGPT in clinical settings and medical journalism</article-title><source>Radiology</source><year>2023</year><month>05</month><volume>307</volume><issue>3</issue><fpage>e230276</fpage><pub-id pub-id-type="doi">10.1148/radiol.230276</pub-id><pub-id pub-id-type="medline">36880943</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dada</surname><given-names>A</given-names> </name><name name-style="western"><surname>Puladi</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kleesiek</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Egger</surname><given-names>J</given-names> </name></person-group><article-title>ChatGPT in healthcare: a taxonomy and systematic review</article-title><source>Comput Methods Programs Biomed</source><year>2024</year><month>03</month><volume>245</volume><fpage>108013</fpage><pub-id pub-id-type="doi">10.1016/j.cmpb.2024.108013</pub-id><pub-id pub-id-type="medline">38262126</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Holmes</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Evaluating large language models on a highly-specialized topic, radiation oncology physics</article-title><source>Front Oncol</source><year>2023</year><volume>13</volume><fpage>1219326</fpage><pub-id pub-id-type="doi">10.3389/fonc.2023.1219326</pub-id><pub-id pub-id-type="medline">37529688</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Agrawal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hegselmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sontag</surname><given-names>D</given-names> </name></person-group><article-title>Large language models are few-shot clinical information extractors</article-title><conf-name>The 2022 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 7-11, 2022</conf-date><conf-loc>Abu Dhabi, United Arab 
Emirates</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.130</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cascella</surname><given-names>M</given-names> </name><name name-style="western"><surname>Montomoli</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bellini</surname><given-names>V</given-names> </name><name name-style="western"><surname>Bignami</surname><given-names>E</given-names> </name></person-group><article-title>Evaluating the feasibility of ChatGPT in healthcare: an analysis of multiple clinical and research scenarios</article-title><source>J Med Syst</source><year>2023</year><month>03</month><day>4</day><volume>47</volume><issue>1</issue><fpage>33</fpage><pub-id pub-id-type="doi">10.1007/s10916-023-01925-4</pub-id><pub-id pub-id-type="medline">36869927</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harrer</surname><given-names>S</given-names> </name></person-group><article-title>Attention is not all you need: the complicated case of ethically using large language models in healthcare and medicine</article-title><source>EBioMedicine</source><year>2023</year><month>04</month><volume>90</volume><fpage>104512</fpage><pub-id pub-id-type="doi">10.1016/j.ebiom.2023.104512</pub-id><pub-id pub-id-type="medline">36924620</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jayakumar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Moore</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Furlough</surname><given-names>KA</given-names> 
</name><etal/></person-group><article-title>Comparison of an artificial intelligence-enabled patient decision aid vs educational material on decision quality, shared decision-making, patient experience, and functional outcomes in adults with knee osteoarthritis: a randomized clinical trial</article-title><source>JAMA Netw Open</source><year>2021</year><month>02</month><day>1</day><volume>4</volume><issue>2</issue><fpage>e2037107</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2020.37107</pub-id><pub-id pub-id-type="medline">33599773</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patel</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Lam</surname><given-names>K</given-names> </name></person-group><article-title>ChatGPT: the future of discharge summaries?</article-title><source>Lancet Digit Health</source><year>2023</year><month>03</month><volume>5</volume><issue>3</issue><fpage>e107</fpage><lpage>e108</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00021-3</pub-id><pub-id pub-id-type="medline">36754724</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>XJ</given-names> </name><name name-style="western"><surname>Cheor</surname><given-names>WL</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Ab Rahman</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Bakrin</surname><given-names>IH</given-names> </name></person-group><article-title>Artificial intelligence (AI) in breast imaging: a scientometric umbrella review</article-title><source>Diagnostics 
(Basel)</source><year>2022</year><month>12</month><day>9</day><volume>12</volume><issue>12</issue><fpage>3111</fpage><pub-id pub-id-type="doi">10.3390/diagnostics12123111</pub-id><pub-id pub-id-type="medline">36553119</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xue</surname><given-names>VW</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>P</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>WC</given-names> </name></person-group><article-title>The potential impact of ChatGPT in clinical and translational medicine</article-title><source>Clin Transl Med</source><year>2023</year><month>03</month><volume>13</volume><issue>3</issue><fpage>e1216</fpage><pub-id pub-id-type="doi">10.1002/ctm2.1216</pub-id><pub-id pub-id-type="medline">36856370</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>A</given-names> </name><name name-style="western"><surname>PourNejatian</surname><given-names>N</given-names> </name><etal/></person-group><article-title>A large language model for electronic health records</article-title><source>NPJ Digit Med</source><year>2022</year><month>12</month><day>26</day><volume>5</volume><issue>1</issue><fpage>194</fpage><pub-id pub-id-type="doi">10.1038/s41746-022-00742-2</pub-id><pub-id pub-id-type="medline">36572766</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Horvat</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Veeraraghavan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Nahas</surname><given-names>CSR</given-names> </name><etal/></person-group><article-title>Combined artificial intelligence and radiologist model for predicting rectal cancer treatment response from magnetic resonance imaging: an external validation study</article-title><source>Abdom Radiol (NY)</source><year>2022</year><month>08</month><volume>47</volume><issue>8</issue><fpage>2770</fpage><lpage>2782</lpage><pub-id pub-id-type="doi">10.1007/s00261-022-03572-8</pub-id><pub-id pub-id-type="medline">35710951</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pun</surname><given-names>FW</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>GHD</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>HW</given-names> </name><etal/></person-group><article-title>Hallmarks of aging-based dual-purpose disease and age-associated targets predicted using PandaOmics AI-powered discovery engine</article-title><source>Aging (Albany NY)</source><year>2022</year><month>03</month><day>29</day><volume>14</volume><issue>6</issue><fpage>2475</fpage><lpage>2506</lpage><pub-id pub-id-type="doi">10.18632/aging.203960</pub-id><pub-id pub-id-type="medline">35347083</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kamineni</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Lie</surname><given-names>W</given-names> </name><name name-style="western"><surname>Succi</surname><given-names>MD</given-names> </name></person-group><article-title>Evaluating ChatGPT as an adjunct for radiologic decision-making</article-title><source>medRxiv</source><comment>Preprint posted online on  Feb 7, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.02.02.23285399</pub-id><pub-id pub-id-type="medline">36798292</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ouyang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>D</given-names> </name></person-group><article-title>Interactive computer-aided diagnosis on medical image using large language models</article-title><source>Commun Eng</source><year>2024</year><month>09</month><day>17</day><volume>3</volume><issue>1</issue><fpage>133</fpage><pub-id pub-id-type="doi">10.1038/s44172-024-00271-8</pub-id><pub-id pub-id-type="medline">39284899</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>TrustLLM: trustworthiness in large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 10, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.05561</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="web"><article-title>Step 1 sample test questions</article-title><source>USMLE</source><year>2022</year><month>06</month><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.usmle.org/exam-resources/step-1-materials/step-1-sample-test-questions">https://www.usmle.org/exam-resources/step-1-materials/step-1-sample-test-questions</ext-link></comment></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Ziaei</surname><given-names>R</given-names> </name><name name-style="western"><surname>Schmidgall</surname><given-names>S</given-names> </name></person-group><article-title>Language models are susceptible to incorrect patient 
self-diagnosis in medical applications</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 17, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.09362</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Zada</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tam</surname><given-names>N</given-names> </name><name name-style="western"><surname>Rambhatla</surname><given-names>S</given-names> </name></person-group><article-title>EvalPrompt: analyzing large language models for self-diagnosis</article-title><source>GitHub</source><year>2022</year><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/criticalml-uw/EvalPrompt">https://github.com/criticalml-uw/EvalPrompt</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>The selected ChatGPT-4.0 model along with the corresponding parameters.</p><media xlink:href="formative_v9i1e66207_app1.docx" xlink:title="DOCX File, 24 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Procedures and guidelines were provided to the assessors to evaluate the LLM responses. LLM: large language model.</p><media xlink:href="formative_v9i1e66207_app2.docx" xlink:title="DOCX File, 143 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Categorization results for ChatGPT-3.5 in comparison to ChatGPT-4.0.</p><media xlink:href="formative_v9i1e66207_app3.docx" xlink:title="DOCX File, 413 KB"/></supplementary-material></app-group></back></article>