<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e68223</article-id><article-id pub-id-type="doi">10.2196/68223</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Fact-Checking Large Language Model Responses to a Health Care Prompt: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Ryan</surname><given-names>Padhraig</given-names></name><degrees>BSc, PhD, MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Davoren</surname><given-names>Orla</given-names></name><degrees>BA</degrees></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Elwyn</surname><given-names>Glyn</given-names></name><degrees>BA, MB, BCh, MSc, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Pharmaceutical Society of Ireland</institution><addr-line>Dublin</addr-line><country>Ireland</country></aff><aff id="aff2"><institution>Dartmouth Institute for Health 
Policy and Clinical Practice, Dartmouth College</institution><addr-line>1 Medical Center Drive, Williamson Translational Research Building, Level 5</addr-line><addr-line>Lebanon</addr-line><addr-line>NH</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Ozek</surname><given-names>Burcu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Glyn Elwyn, BA, MB, BCh, MSc, PhD, Dartmouth Institute for Health Policy and Clinical Practice, Dartmouth College, 1 Medical Center Drive, Williamson Translational Research Building, Level 5, Lebanon, NH, 03756, United States, 1 603-646-5678; <email>glynelwyn@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>15</day><month>4</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e68223</elocation-id><history><date date-type="received"><day>31</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>09</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>10</day><month>09</month><year>2025</year></date></history><copyright-statement>&#x00A9; Padhraig Ryan, Orla Davoren, Glyn Elwyn. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 15.4.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e68223"/><abstract><sec><title>Background</title><p>Large language models use machine learning to produce natural language. These models have a range of potential applications in health care, such as patient education and diagnosis. However, evaluations of large language models in health care are still scarce.</p></sec><sec><title>Objective</title><p>This study aimed to (1) evaluate the accuracy and efficiency of automated fact-checking by 2 large language models and (2) illustrate a process through which a large language model might support a patient in redrafting a prompt to include key information needed for patient safety.</p></sec><sec sec-type="methods"><title>Methods</title><p>A parallel comparison of 2 large language models and 3 human experts was conducted. A clinical scenario was devised in which a woman aged 23 years questions the safety of retinoid treatment for acne by sending prompts to 2 large language models (GPT-4o and OpenBioLLM-70B). 
GPT-4o and OpenBioLLM-70B were asked to suggest improvements to the patient&#x2019;s initial prompt to elicit key information for clinical decision-making. After the patient sent the revised prompt to the large language models, the models were then asked to fact-check the final response. To test the generalizability of automated fact-checking, a set of 20 clinical statements on disparate topics, mostly related to drug indications, contraindications, and side effects, was developed. The large language models also fact-checked these 20 medical statements. The results were compared against the evaluations of 3 clinical experts. The outcome measures were as follows: (1) percentage of accuracy of automated fact-checking, (2) time to complete fact-checking, and (3) a binary outcome for prompt redrafting (advising the patient to revise her prompt by naming her acne medication to address safety concerns).</p></sec><sec sec-type="results"><title>Results</title><p>For the scenario of a patient with acne, GPT-4o and OpenBioLLM-70B both had 86% agreement with the clinical experts&#x2019; fact-checking. The large language models did not consistently convey the urgency of discontinuing isotretinoin treatment when pregnancy is suspected. In addition, the models did not adequately convey the importance of folic acid supplementation during pregnancy. For the set of 20 medical claims, GPT-4o fact-checking had 100% agreement with that of human experts, whereas OpenBioLLM-70B had 95% agreement. OpenBioLLM-70B diverged from human experts and GPT-4o on 1 question related to pediatric use of antihistamines. The expert fact-checks took a mean time of 18 (SD 3.74) minutes, GPT-4o took 42 seconds, and OpenBioLLM-70B took 33 minutes. The GPT-4o responses for the acne scenario had some inconsistencies but zero fabrication and no obvious omissions. 
In contrast, OpenBioLLM-70B omitted 1 key information item needed for patient safety.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4o can interact with patients to improve the quality and comprehensiveness of the information contained in health-related prompts. GPT-4o and OpenBioLLM-70B can conduct efficient fact-checking that is close to the level of accuracy of human experts. Human experts need to perform additional checks for accuracy and safety.</p></sec></abstract><kwd-group><kwd>machine learning</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>deep learning</kwd><kwd>patient medication knowledge</kwd><kwd>patient participation</kwd><kwd>shared decision-making</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Generative artificial intelligence (GenAI) is a form of artificial intelligence that can generate text, images, or other data types. Tools such as ChatGPT and Gemini offer a new opportunity for patients and clinicians to access health information [<xref ref-type="bibr" rid="ref1">1</xref>].</p><p>There is evidence that patients are willing to use ChatGPT for self-diagnosis [<xref ref-type="bibr" rid="ref2">2</xref>]. Clinicians are willing to use ChatGPT for various tasks in practice [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. In some cases, GenAI has suggested diagnoses that have eluded clinicians [<xref ref-type="bibr" rid="ref5">5</xref>]. However, GenAI can provide misleading and factually incorrect information authoritatively without indicating that it is unreliable [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Clinicians might be able to distinguish factually correct from incorrect GenAI responses in their areas of expertise but not in topics in which they only have general knowledge, whereas patients will always need support to fact-check GenAI responses. 
There is a need for improved tools to support clinicians and patients in this process.</p><p>The role of GenAI may extend beyond answering users&#x2019; health care prompts. GenAI may also have a role to play in the subsequent fact-checking of these responses. This additional layer of &#x201C;self-checking&#x201D; may bolster the accuracy of responses. There is scant evidence on the ability of GenAI to perform this function. There is also scant literature describing the manner in which a patient might interact with GenAI to accomplish this in a real-world context.</p><p>Ni et al [<xref ref-type="bibr" rid="ref8">8</xref>] evaluated the accuracy of large language models (LLMs), a prominent form of GenAI, in fact-checking public health statements. They found that the accuracy ranged from 66% to 90% depending on the type of prompt and the incorporation of relevant documents into the LLM system.</p><p>Zarharan et al [<xref ref-type="bibr" rid="ref9">9</xref>] compared the accuracy of a number of LLMs for fact-checking public health statements. When zero-shot prompting was used, the most accurate model was GPT-4. Zero-shot prompting means that the user does not input any relevant examples from which the LLM could learn what constitutes a satisfactory response. In contrast, when the user provided one or more examples from which the model could learn (known as single- or few-shot prompting), open-source models achieved a level of accuracy that was comparable to that of the proprietary GPT-4 model. The highest accuracy was achieved by Vicuna-13B and Mistral-7B after undergoing fine-tuning, a process that adjusts some of the LLM&#x2019;s weights for improved performance in a specific task. These models achieved 68.5% and 72.0% <italic>F</italic><sub>1</sub>-scores, respectively.</p><p>LLMs may also have a role to play in patient portals. A patient portal is a mechanism whereby patients can communicate with health care providers via the electronic health record. 
Chen et al [<xref ref-type="bibr" rid="ref10">10</xref>] used an LLM to draft responses to simulated patient messages (N=156) in a patient portal in a radiation oncology service at a tertiary care hospital. The LLM&#x2019;s responses were compared to responses generated by human experts. The human responses were shorter (34 vs 169 words). The LLM drafts posed a risk of severe harm in 7.1% of responses and a risk of death in 0.6% of responses. These risks were generally due to underestimating or failing to convey the acuity of a clinical scenario.</p><p>An advanced form of LLM is known as a reasoning model. Reasoning models break a prompt into smaller tasks and incrementally derive a conclusion. Vladika et al [<xref ref-type="bibr" rid="ref11">11</xref>] used a reasoning model to verify medical statements. For each statement, an LLM generated up to 5 new questions in response. These questions were then answered using the model&#x2019;s internal knowledge combined with a web search. Subsequently, another LLM integrated the responses to these questions and used this information to fact-check the original statement.</p><p>This study had 2 aims. The first was to evaluate the accuracy and efficiency of automated fact-checking by 2 LLMs. We compared the LLMs&#x2019; responses to those of human experts who searched for evidence sources. The second aim was to illustrate a process through which a patient might interact with an LLM to craft a prompt with appropriate information to support patient safety. We chose to use the following LLMs: GPT-4o (OpenAI), a commonly used framework, and OpenBioLLM-70B, an open-source model that performs strongly on biomedical benchmarks [<xref ref-type="bibr" rid="ref12">12</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Patient Involvement</title><p>To ensure realism and relevance for patients, we adopted a patient-centered research approach. 
Our patient coauthor experienced acne in her late 20s and was prescribed treatments. She was involved in developing the clinical scenario and the initial prompt that was the basis for the evaluation.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>No ethics approval was required for this work as it did not involve data obtained from human participants, in accordance with the policies of the Dartmouth Health Institutional Review Board.</p></sec><sec id="s2-3"><title>Scenario</title><p>On the basis of our patient author&#x2019;s experience, we proposed a query from a hypothetical patient, equivalent to a &#x201C;prompt&#x201D; to be given to LLMs. A hypothetical patient aged 23 years is taking isotretinoin to treat her acne. She is concerned she might be pregnant but has not had a pregnancy test. She asks an LLM for advice. Her first prompt (<xref ref-type="other" rid="box1">Textbox 1</xref>) does not specify isotretinoin. This is the first step in the process in <xref ref-type="fig" rid="figure1">Figure 1</xref>. This scenario contributes to both aims of this study. The LLMs generate responses that can serve as material for fact-checking (aim 1), whereas the scenario also illustrates a realistic manner in which a patient might interact with LLMs (aim 2).</p><boxed-text id="box1"><title> Initial large language model prompt from a hypothetical patient requesting guidance on acne treatment in the context of possible pregnancy.</title><p>&#x201C;I&#x2019;m a 23-year-old woman. I think I might be pregnant. I&#x2019;m pretty healthy, I just take acne capsules, otherwise I&#x2019;m not on any medicines. 
What should I do?&#x201D;</p></boxed-text><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Prompt improvement and fact-checking process that can be undertaken by a patient using a large language model (LLM).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e68223_fig01.png"/></fig></sec><sec id="s2-4"><title>Automated Prompt Improvement and Fact-Checking Process</title><p>When patients craft a prompt for an LLM, they may fail to convey critical aspects of their condition or medical history, leading to suboptimal or even harmful responses from an LLM. To address this, we developed a redrafting prompt (<xref ref-type="other" rid="box2">Textbox 2</xref>) asking the LLMs to suggest ways to improve the patient&#x2019;s initial prompt, such as by including details of medication or symptoms. This is step 2 of the process shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The LLM then suggests ways to refine the patient&#x2019;s initial prompt. The redrafting prompt can constrain the length of the LLM&#x2019;s response to avoid overwhelming the user, for example, &#x201C;Please list the three most important items of information&#x201D; to include. In our approach, we did not constrain the LLM. The LLM persona for the redrafting prompt (&#x201C;You&#x2019;re a medical expert&#x201D;) was informed by a research study by Pal et al [<xref ref-type="bibr" rid="ref13">13</xref>].</p><boxed-text id="box2"><title> Redrafting prompt: hypothetical patient requests guidance from a large language model to refine her original prompt regarding acne treatment in the context of possible pregnancy.</title><p>&#x201C;You&#x2019;re a medical expert. Consider the statement below from a patient. 
Please state what additional information you would want from the patient in order to make a good decision.&#x201D;</p></boxed-text><p>As part of this process, a patient author (OD) reviewed the LLM responses. The patient author flagged the potential harm of drug treatment as concerning; hence, our hypothetical patient added the name of her acne treatment to her revised prompt but did not make any further changes (step 3 in <xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><p>Two LLMs, GPT-4o and OpenBioLLM-70B, generated responses for each prompt. GPT-4o was accessed via the ChatGPT web portal. OpenBioLLM-70B was deployed on Google Colab using an NVIDIA A100 graphics processing unit. The automated prompt improvement and fact-checking process is summarized in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The prompts and the exact responses were dated and stored so that we could apply and compare the 2 fact-checking methods. The prompts were sent to GPT-4o and OpenBioLLM-70B on September 16, 2024, and January 6, 2025, respectively.</p></sec><sec id="s2-5"><title>LLM Fact-Checking</title><p>As part of this scenario, we imagined that the patient also wanted to fact-check the response to the revised prompt but was unsure about how to accomplish this task. We outlined a process through which the patient can prompt an LLM to conduct an automated fact-check of the previous response. This is compared against a manual method that we designed, which, although unlikely to be routinely used by patients or clinicians, provides a benchmark against which to compare GenAI fact-checking.</p><p>The LLM fact-check involves 2 separate prompts. The first prompt checks factuality, whereas the second prompt asks the LLM to identify inconsistencies and contradictions. These prompts are complementary. These 2 automated fact-checking prompts are shown in <xref ref-type="other" rid="box3">Textbox 3</xref>. 
The phrasing of the fact-checking prompt was informed by the research by Zhang et al [<xref ref-type="bibr" rid="ref14">14</xref>].</p><boxed-text id="box3"><title> Prompts for fact-check and consistency check: hypothetical patient asks a large language model to check the factuality and consistency of its previous responses.</title><p><bold>Fact-check</bold></p><p>&#x201C;You&#x2019;re a medical expert. Evaluate the truthfulness of the statement below. Consider your sources, context and date while assessing. To answer return &#x2018;Final Answer: {verdict}, {reason}.&#x2019; You must respond with a valid verdict: (&#x2018;false,&#x2019; &#x2018;mostly-false,&#x2019; &#x2018;half-true,&#x2019; or &#x2018;true&#x2019;) or &#x2018;uncertain,&#x2019; providing reasoning and citing sources by providing the domain of pertinent search results. Here is the statement to check:&#x201D;</p><p><bold>Consistency check</bold></p><p>&#x201C;You&#x2019;re a medical expert. Please identify any inconsistencies or contradictions in the following statement:&#x201D;</p></boxed-text></sec><sec id="s2-6"><title>&#x201C;Gold-Standard&#x201D; Comparator: Fact-Checking by Human Experts</title><p>The manual fact-checking process consisted of 6 steps. To remove potential bias, one judge (author PR) completed manual fact-checking before the LLM fact-checking. The other 2 judges did not have access to the LLMs&#x2019; fact-checking responses. All 3 judges are pharmacists, and all conducted fact-checking independently. The fact-checking process steps were as follows:</p><list list-type="bullet"><list-item><p>Step 1&#x2014;identify information claims in the response. A claim is a discrete phrase containing relevant information. Each claim was tabulated (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). A sentence might contain multiple claims. 
For example, the response to the initial prompt referred to &#x201C;blood tests, urine tests, and possibly an ultrasound.&#x201D; These 3 tests were classified as 3 separate claims.</p></list-item><list-item><p>Step 2&#x2014;determine the accuracy of each claim. This can be done in several ways; for example, a simple binary classification (true or false) can be used. We adapted the approach by Quelle and Bovet [<xref ref-type="bibr" rid="ref15">15</xref>] and used 5 categories: true, mostly true, mostly false, false, and uncertain.</p></list-item><list-item><p>Step 3&#x2014;identify contradictory or inconsistent claims.</p></list-item><list-item><p>Step 4&#x2014;identify claims that may pose clinical risk. This may comprise ambiguous, misleading, or omitted information relevant to patient safety.</p></list-item><list-item><p>Step 5&#x2014;assess the quality of the evidence that supports the response claims. This involved online searches using Google and PubMed to check the veracity of each claim. Online information sources were considered eligible if the source was judged to be credible and evidence-based. This assessment was based on the experts&#x2019; pharmacology training and interpretation of medical evidence using criteria such as sample size and study design [<xref ref-type="bibr" rid="ref16">16</xref>].</p></list-item><list-item><p>Step 6&#x2014;assess comprehensiveness by reviewing cited sources and determining whether critical pieces of information were omitted.</p></list-item></list></sec><sec id="s2-7"><title>Evaluating the Generalizability of LLM Fact-Checking</title><p>As part of aim 1, we evaluated the generalizability of LLMs&#x2019; ability to fact-check health care claims. The 2 LLMs assessed the factuality of 20 clinical claims. Independently, the 3 expert reviewers assessed the same claims.</p><p>The 20 claims are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
To achieve a balanced dataset, 10 claims were considered to be true by the human judges, and 10 were false. A blend of true and false claims enabled the calculation of a true positive rate and a true negative rate. For example, a true claim was &#x201C;Mercaptopurine is indicated for maintenance of remission of Crohn&#x2019;s Disease.&#x201D; A false claim was &#x201C;Trifluoperazine is safe in patients with phaeochromocytoma.&#x201D; The claims addressed a variety of topics: clinical indications, contraindications, treatment duration, side effects of treatment, and drug interactions.</p><p>We used the following prompt for the LLMs:</p><disp-quote><p>You&#x2019;re a medical expert. Please assess the strength of evidence for the health claim below. Consider your sources, context and date while assessing. To answer return &#x201C;Final Answer: {verdict}, {reason}.&#x201D; You must respond with a valid verdict: (&#x201C;True,&#x201D; &#x201C;False&#x201D;). Here is the claim to check:</p></disp-quote><p>This prompt was administered to both LLMs for each of the 20 claims.</p></sec><sec id="s2-8"><title>GenAI Research</title><p>A methodological checklist for GenAI research has been developed by Sallam et al [<xref ref-type="bibr" rid="ref17">17</xref>]. This checklist can be found in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>The first task for the LLMs was to offer guidance to help redraft the user&#x2019;s initial prompt (second step of <xref ref-type="fig" rid="figure1">Figure 1</xref>). ChatGPT&#x2019;s guidance for redrafting (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) addressed 7 topics. Crucially, GPT-4o requested more specific details on the patient&#x2019;s acne medication in light of the potential harm during pregnancy. 
In general, GPT-4o adopted a patient-centered approach, requesting information on the user&#x2019;s support system and lifestyle factors. GPT-4o&#x2019;s response was extensive and could have led to a major revision of the prompt. However, the prime concern of the patient was the potential harm from acne medication, so she revised her prompt by naming the medication.</p><p>The revised prompt submitted to the LLMs was as follows:</p><disp-quote><p>I&#x2019;m a 23-year-old woman. I think I might be pregnant. I&#x2019;m pretty healthy, I just take isotretinoin for acne otherwise I&#x2019;m not on any medicines. What should I do?</p></disp-quote><p>In contrast, OpenBioLLM-70B&#x2019;s response to the redrafting prompt is shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. This response was shorter than the GPT-4o response (75 vs 279 words, respectively). OpenBioLLM-70B mentioned 6 topics, such as a missed period, home pregnancy test, and contraceptive use. However, OpenBioLLM-70B did not mention the risk of acne medication during pregnancy and did not ask the patient to provide more information on her medication. Hence, in this redrafting task, GPT-4o was superior to OpenBioLLM-70B in terms of patient safety.</p><p>After the prompt redrafting step, the revised prompt was submitted to the 2 LLMs. The LLMs both generated a response to the revised prompt. The LLMs were then prompted to fact-check the GPT-4o response as this contained more detail on the safe use of acne medication.</p><p><xref ref-type="table" rid="table1">Table 1</xref> shows the results of comparing human expert and automated fact-checking. The complete assessment is provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The experts followed the manual fact-checking process and identified 16 claims to verify. Of the 16 claims, the experts deemed 13 (81.3%) to be true, and 3 (18.8%) were classified as mostly true. 
This manual process (including the search for sources and specifying citations in the table of claims) took 18 minutes (full results are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Results of fact-checking by GPT-4o, OpenBioLLM-70B, and human experts for isotretinoin treatment of acne in the context of possible pregnancy.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Human experts<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">GPT-4o</td><td align="left" valign="bottom">OpenBioLLM-70B</td></tr></thead><tbody><tr><td align="left" valign="top">Number of claims identified</td><td align="left" valign="top">16</td><td align="left" valign="top">13</td><td align="left" valign="top">13</td></tr><tr><td align="left" valign="top">Number of true claims</td><td align="left" valign="top">13</td><td align="left" valign="top">13</td><td align="left" valign="top">13</td></tr><tr><td align="left" valign="top">Number of mostly true claims</td><td align="left" valign="top">3</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Number of false claims</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Number of uncertain claims</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Time taken to review (mean and SD shown for the 3 expert reviewers)</td><td align="left" valign="top">18 (3.74) min</td><td align="left" valign="top">42 s</td><td align="left" valign="top">33 min, 21 s</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>The following 
information sources were used as references in the manual search by the human experts: the National Health Service, the Centers for Disease Control and Prevention, the Health Service Executive, the National Institutes of Health, the Mayo Clinic, <italic>PLOS One</italic>, the National Organization for Rare Disorders, the American Pregnancy Association, and the American College of Obstetricians and Gynecologists.</p></fn></table-wrap-foot></table-wrap><p>By comparison, GPT-4o identified 13 claims to verify and deemed all these claims to be true. This process took 42 seconds. The experts disagreed with the LLM classification of the claim about the use of folic acid because the response understated the importance of taking folic acid during pregnancy. The experts also disagreed with this response regarding the discontinuation of isotretinoin. The experts believed that isotretinoin treatment should be paused immediately without waiting for the results of a pregnancy test or a clinical visit given the risk involved. The response from the LLM was inconsistent; a section early in the response recommended stopping isotretinoin after a positive pregnancy test, whereas the conclusion recommended immediate cessation of isotretinoin. This inconsistency was not detected by the consistency check (<xref ref-type="other" rid="box3">Textbox 3</xref>).</p><p><xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> shows ChatGPT&#x2019;s response to the fact-check for consistency. The LLM suggested 4 improvements: the use of more sensitive pregnancy tests, safer treatments for acne, mental health support, and follow-up monitoring for maternal and fetal health. 
In contrast, the experts focused primarily on the urgency of discontinuing isotretinoin treatment and seeking advice from a clinician who could devise a comprehensive care plan in cooperation with the patient.</p><p>As part of aim 1, our study also tested the generalizability of LLMs&#x2019; fact-checking capability. The 20 medical statements under scrutiny are shown in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. GPT-4o had 100% agreement with human experts (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), whereas OpenBioLLM-70B had 95% agreement (Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). For example, both LLMs found the following statement to be true:</p><disp-quote><p>Combined hormonal contraceptives should not be used in patients with acute porphyrias or gallstones.</p></disp-quote><p>Both LLMs correctly deemed the following statement to be false:</p><disp-quote><p>Sumatriptan is used to treat dizziness and drowsiness.</p></disp-quote><p>One question related to chlorphenamine, an antihistamine drug. In its response, OpenBioLLM-70B incorrectly stated the following:</p><disp-quote><p>False, The evidence does not support the claim that chlorphenamine can relieve the symptoms of allergy in children as young as 1 month.</p></disp-quote><p>In contrast, GPT-4o and the 3 human experts asserted that there was evidence to support the use of chlorphenamine in this age group. GPT-4o mentioned the following:</p><disp-quote><p>According to reputable sources, including the British National Formulary for Children (BNFc), chlorphenamine can be used in infants as young as 1 month old, albeit with adjusted dosing and under medical supervision. Its use in this age group is typically reserved for specific indications due to concerns about sedation and anticholinergic side effects. 
Still, the evidence supports its effectiveness and safety when properly dosed and monitored.</p></disp-quote><p>The complete LLM responses to fact-checking of these 20 medical statements can be found in Tables S3 and S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This scenario-based study demonstrates that an LLM can become a valuable source of evidence-based information by being prompted to undertake self-checking processes. The responses were checked by 3 experts and found to be fact-based; there was no fabrication or essential omissions. However, the LLMs did not consistently recommend immediately stopping isotretinoin or sufficiently guide the patient to take folic acid. A clinician could have corrected these errors, but a patient could have been misled.</p><p>By using methods in which LLMs are asked to revise the quality of prompts and undertake fact-checking for truthfulness and consistency, we showed that LLM responses could be improved to the point where the results showed an 86% agreement with those of experts undertaking verification. There are efficiency implications as the GPT-4o process was 24 times faster than the check conducted by the experts. Such capability and speed are superior by far to those of a nonexpert. The use of OpenBioLLM-70B was slower than that of the human experts, but the use of infrastructure on Google Colab was not optimized for quick model serving. OpenBioLLM-70B deployment could, in principle, be accelerated to match the latency of GPT-4o.</p></sec><sec id="s4-2"><title>Results in the Context of Similar Studies</title><p>Our study aligns with recent evidence that LLMs can play a helpful role in fact-checking in various domains [<xref ref-type="bibr" rid="ref18">18</xref>]. 
LLMs are more accurate on health-related questions when scientific consensus exists but are less able to respond when the evidence is controversial, ambiguous, or recently published [<xref ref-type="bibr" rid="ref19">19</xref>]. Kusunose et al [<xref ref-type="bibr" rid="ref20">20</xref>] found that GPT-3.5 achieved 80% accuracy for questions about hypertension guidelines, but the accuracy dropped to 36% when the evidence was weaker. LLMs can support shared decision-making by highlighting for clinicians the questions that patients are most likely to ask and providing written information in accessible ways to patients [<xref ref-type="bibr" rid="ref21">21</xref>]. Standardized datasets have been developed to measure the factuality of LLMs in domains such as health care [<xref ref-type="bibr" rid="ref22">22</xref>]. However, expert-crafted questions may differ significantly from patients&#x2019; questions in terms of clarity and inclusion of relevant information. Thus, it is important to involve patients in this process of LLM validation.</p></sec><sec id="s4-3"><title>Unanswered Questions and Future Research</title><p>Despite our results, we do not recommend using LLMs in routine clinical practice without clinician verification. However, as patients will inevitably use LLMs and find flawed information that will be misinterpreted [<xref ref-type="bibr" rid="ref23">23</xref>], clinicians must find methods to adapt to LLMs. Future research needs to further evaluate the capability of LLMs to self-improve and self-check comparing the responses to realistic benchmarks. As LLMs are based on stochastic algorithms, future research can investigate mechanisms to integrate LLMs with rule-based logic systems to mitigate inaccuracy and improve fact-checking. 
Bangerter et al [<xref ref-type="bibr" rid="ref24">24</xref>] found that a hybrid approach to fact-checking combining LLMs and fuzzy logic increased reliability, but this was not applied to health care claims.</p><p>Further research is also needed on the use of LLM agents. Agents can engage in back-and-forth with patients to ensure an appropriate level of detail for evidence-based decision-making. Agents can then craft an optimal prompt and deliver this to a subject matter expert LLM. Research is also needed into optimal methods of keeping LLMs up-to-date with new evidence. This could be a blend of fine-tuning some parameters of the LLM and giving the LLM access to new documents in a technique known as retrieval-augmented generation. Another fruitful area for research is likely to be multimodal artificial intelligence models, for example, integrating medical imaging, data tables (eg, trends of blood pressure and cholesterol measurements), and natural language input.</p><p>A multidisciplinary approach is needed to ensure the safe use of LLMs. Haltaufderheide and Ranisch [<xref ref-type="bibr" rid="ref25">25</xref>] argue that the degree of acceptable human oversight for LLMs in health care varies depending on the potential for harm. Han et al [<xref ref-type="bibr" rid="ref26">26</xref>] found that subtle modification of approximately 1% of an LLM&#x2019;s parameters can cause it to learn incorrect biomedical facts. There is a risk of malicious actors exploiting this vulnerability.</p></sec><sec id="s4-4"><title>Strengths and Weaknesses of This Method</title><p>A strength of this study was the use of an actual clinical question: a concerning situation for many who have acne and are sexually active. The clinical scenario was co-designed with our patient coauthor, who also guided the development of our methods and interpretations of the responses. All prompts and LLM responses were dated and archived to ensure transparency and reproducibility. 
We demonstrated how to increase the ability of LLMs to rapidly deliver high-quality information by prompting GenAI systems to self-improve and self-evaluate. This mimics a process whereby a clinician asks a patient to provide additional information. Given the inevitable use of LLMs as a source of health care information, these prompt revision and fact-checking methods are relevant to a wide audience. We used a panel of 3 judges to bolster the validity of the evaluation. We tested the generalizability of fact-checking by using 20 claims on disparate health care topics (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Our study has a number of limitations. Only 1 detailed clinical scenario, related to acne management, was developed. Many other clinical scenarios should be evaluated to improve our understanding of the capabilities of LLMs. Our study used 2 LLMs for fact-checking, but new LLMs are being developed that may improve the accuracy of fact-checking. There may be scope to improve performance by using a specialized LLM for fact-checking and another LLM to respond directly to patients&#x2019; prompts. A rigorous comparison of prompt engineering techniques for fact-checking was not conducted in our study. Furthermore, although our patient author (OD) improved the realism of our scenario, there may have been some Hawthorne effect, whereby behavior changes when a person knows they are under observation. Patients may interact differently with LLMs in a natural setting. Finally, fact-checking, even by experts, is imperfect. Clinical guidelines and high-quality scientific literature reviews often have inconsistencies, and interpretations of evidence may differ [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. 
This was mitigated by the use of 3 human experts for fact-checking, but there is still potential for inaccuracy.</p></sec><sec id="s4-5"><title>Broader Implications</title><p>Scientific processes rely on self-correction, such as peer review, which, although fallible, over time serves to separate fact from fiction. Given that patients will turn to LLMs, regulators could set accuracy thresholds for GenAI [<xref ref-type="bibr" rid="ref29">29</xref>]. It may be feasible to build in prompt improvement and fact-checking when health care questions are submitted and withhold responses if questions are unclear [<xref ref-type="bibr" rid="ref30">30</xref>] or if there is insufficient scientific consensus.</p></sec></sec></body><back><ack><p>The authors wish to thank Thomas Broe, member of the Pharmaceutical Society of Ireland, and Andrew Murphy, member of the Pharmaceutical Society of Ireland, for fact-checking health care statements. The authors did not use generative artificial intelligence in the ideation or writing process for this manuscript despite analyzing the accuracy of 2 generative artificial intelligence models.</p></ack><notes><sec><title>Funding</title><p>No funding was received for this study.</p></sec><sec><title>Data Availability</title><p>All data generated or analyzed during this study are included in this published article (and its supplementary information files). Python code is available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>PR and GE conceptualized the study, conducted the analysis, and wrote the manuscript. 
OD was involved in developing the clinical scenario and prompts, editing the manuscript, and agreeing on the final version.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations:</title><def-list><def-item><term id="abb1">GenAI</term><def><p>generative artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mirza</surname><given-names>FN</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>OY</given-names> </name><name name-style="western"><surname>Connolly</surname><given-names>ID</given-names> </name><etal/></person-group><article-title>Using ChatGPT to facilitate truly informed medical consent</article-title><source>NEJM AI</source><year>2024</year><month>01</month><day>25</day><volume>1</volume><issue>2</issue><pub-id pub-id-type="doi">10.1056/AIcs2300145</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shahsavar</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>A</given-names> </name></person-group><article-title>User intentions to use ChatGPT for self-diagnosis and health-related purposes: cross-sectional survey study</article-title><source>JMIR Hum Factors</source><year>2023</year><month>05</month><day>17</day><volume>10</volume><fpage>e47564</fpage><pub-id pub-id-type="doi">10.2196/47564</pub-id><pub-id pub-id-type="medline">37195756</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Kisvarday</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yarahuan</surname><given-names>J</given-names> </name><etal/></person-group><article-title>ChatGPT use among pediatric health care providers: cross-sectional survey study</article-title><source>JMIR Form Res</source><year>2024</year><month>09</month><day>12</day><volume>8</volume><fpage>e56797</fpage><pub-id pub-id-type="doi">10.2196/56797</pub-id><pub-id pub-id-type="medline">39265163</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blease</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Locher</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gaab</surname><given-names>J</given-names> </name><name name-style="western"><surname>H&#x00E4;gglund</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mandl</surname><given-names>KD</given-names> </name></person-group><article-title>Generative artificial intelligence in primary care: an online survey of UK general practitioners</article-title><source>BMJ Health Care Inform</source><year>2024</year><month>09</month><day>17</day><volume>31</volume><issue>1</issue><fpage>e101102</fpage><pub-id pub-id-type="doi">10.1136/bmjhci-2024-101102</pub-id><pub-id pub-id-type="medline">39288998</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>Toward the eradication of medical diagnostic 
errors</article-title><source>Science</source><year>2024</year><month>01</month><day>26</day><volume>383</volume><issue>6681</issue><fpage>eadn9602</fpage><pub-id pub-id-type="doi">10.1126/science.adn9602</pub-id><pub-id pub-id-type="medline">38271508</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cao</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Kwon</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Ghaziani</surname><given-names>TT</given-names> </name><etal/></person-group><article-title>Accuracy of information provided by ChatGPT regarding liver cancer surveillance and diagnosis</article-title><source>AJR Am J Roentgenol</source><year>2023</year><month>10</month><volume>221</volume><issue>4</issue><fpage>556</fpage><lpage>559</lpage><pub-id pub-id-type="doi">10.2214/AJR.23.29493</pub-id><pub-id pub-id-type="medline">37222278</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>22</day><volume>25</volume><fpage>e48659</fpage><pub-id pub-id-type="doi">10.2196/48659</pub-id><pub-id pub-id-type="medline">37606976</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ni</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Qian</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jaulent</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Bousquet</surname><given-names>C</given-names> </name></person-group><article-title>Scientific evidence and specific context: leveraging large language models for health fact-checking</article-title><source>Online Inf Rev</source><year>2024</year><volume>48</volume><issue>7</issue><fpage>1488</fpage><lpage>1514</lpage><pub-id pub-id-type="doi">10.1108/OIR-02-2024-0111</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zarharan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wullschleger</surname><given-names>P</given-names> </name><name name-style="western"><surname>Pilehvar</surname><given-names>MT</given-names> </name><name name-style="western"><surname>Foster</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kia</surname><given-names>BB</given-names> </name></person-group><article-title>Tell me why: explainable public health fact-checking with large language models</article-title><source>arXiv</source><comment>Preprint posted online on  May 15, 2024</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2405.09454</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Guevara</surname><given-names>M</given-names> </name><name name-style="western"><surname>Moningi</surname><given-names>S</given-names> 
</name><etal/></person-group><article-title>The effect of using a large language model to respond to patient messages</article-title><source>Lancet Digit Health</source><year>2024</year><month>06</month><volume>6</volume><issue>6</issue><fpage>e379</fpage><lpage>e381</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00060-8</pub-id><pub-id pub-id-type="medline">38664108</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Vladika</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hacajova</surname><given-names>I</given-names> </name><name name-style="western"><surname>Matthes</surname><given-names>F</given-names> </name></person-group><article-title>Step-by-step fact verification system for medical claims with explainable reasoning</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 20, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.14765</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dorfner</surname><given-names>FJ</given-names> </name><name name-style="western"><surname>Dada</surname><given-names>A</given-names> </name><name name-style="western"><surname>Busch</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Biomedical large languages models seem not to be superior to generalist models on unseen medical data</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 25, 2024</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2408.13833</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pal</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Umapathi</surname><given-names>LK</given-names> </name><name name-style="western"><surname>Sankarasubbu</surname><given-names>M</given-names> </name></person-group><article-title>Med-HALT: medical domain hallucination test for large language models</article-title><access-date>2026-01-28</access-date><conf-name>Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)</conf-name><conf-date>Dec 6-7, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.conll-1.21.pdf">https://aclanthology.org/2023.conll-1.21.pdf</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Atomic calibration of LLMs in long-form generations</article-title><comment>Preprint posted online on  Oct 17, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.13246</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Quelle</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bovet</surname><given-names>A</given-names> </name></person-group><article-title>The perils and promises of fact-checking with large language models</article-title><source>Front Artif Intell</source><year>2024</year><month>02</month><day>7</day><volume>7</volume><fpage>1341697</fpage><pub-id pub-id-type="doi">10.3389/frai.2024.1341697</pub-id><pub-id pub-id-type="medline">38384276</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atkins</surname><given-names>D</given-names> </name><name name-style="western"><surname>Best</surname><given-names>D</given-names> </name><name name-style="western"><surname>Briss</surname><given-names>PA</given-names> </name><etal/></person-group><article-title>Grading quality of evidence and strength of recommendations</article-title><source>BMJ</source><year>2004</year><month>06</month><day>19</day><volume>328</volume><issue>7454</issue><fpage>1490</fpage><pub-id pub-id-type="doi">10.1136/bmj.328.7454.1490</pub-id><pub-id pub-id-type="medline">15205295</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sallam</surname><given-names>M</given-names> </name><name name-style="western"><surname>Barakat</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sallam</surname><given-names>M</given-names> </name></person-group><article-title>A preliminary checklist (METRICS) to standardize the design and reporting of studies on generative artificial intelligence-based models in health care education and practice: development study involving a literature review</article-title><source>Interact J Med Res</source><year>2024</year><month>02</month><day>15</day><volume>13</volume><fpage>e54704</fpage><pub-id pub-id-type="doi">10.2196/54704</pub-id><pub-id pub-id-type="medline">38276872</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Augenstein</surname><given-names>I</given-names> </name><name name-style="western"><surname>Baldwin</surname><given-names>T</given-names> </name><name name-style="western"><surname>Cha</surname><given-names>M</given-names> 
</name><etal/></person-group><article-title>Factuality challenges in the era of large language models and opportunities for fact-checking</article-title><source>Nat Mach Intell</source><year>2024</year><month>08</month><day>22</day><volume>6</volume><fpage>852</fpage><lpage>863</lpage><pub-id pub-id-type="doi">10.1038/s42256-024-00881-z</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Srivastava</surname><given-names>B</given-names> </name></person-group><article-title>Did chatbots miss their &#x201C;Apollo Moment&#x201D;? Potential, gaps, and lessons from using collaboration assistants during COVID-19</article-title><source>Patterns (N Y)</source><year>2021</year><month>08</month><day>13</day><volume>2</volume><issue>8</issue><fpage>100308</fpage><pub-id pub-id-type="doi">10.1016/j.patter.2021.100308</pub-id><pub-id pub-id-type="medline">34430927</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kusunose</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kashima</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sata</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of the accuracy of ChatGPT in answering clinical questions on the Japanese Society of Hypertension Guidelines</article-title><source>Circ J</source><year>2023</year><month>06</month><day>23</day><volume>87</volume><issue>7</issue><fpage>1030</fpage><lpage>1033</lpage><pub-id pub-id-type="doi">10.1253/circj.CJ-23-0308</pub-id><pub-id pub-id-type="medline">37286486</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Elwyn</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>P</given-names> </name><name name-style="western"><surname>Blumkin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Weeks</surname><given-names>WB</given-names> </name></person-group><article-title>Meet generative AI&#x2026; your new shared decision-making assistant</article-title><source>BMJ Evid Based Med</source><year>2024</year><month>09</month><day>20</day><volume>29</volume><issue>5</issue><fpage>292</fpage><lpage>295</lpage><pub-id pub-id-type="doi">10.1136/bmjebm-2023-112651</pub-id><pub-id pub-id-type="medline">38866469</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hilton</surname><given-names>J</given-names> </name><name name-style="western"><surname>Evans</surname><given-names>O</given-names> </name></person-group><article-title>TruthfulQA: measuring how models mimic human falsehoods</article-title><access-date>2026-01-28</access-date><conf-name>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>May 22-27, 2022</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2022.acl-long.229.pdf">https://aclanthology.org/2022.acl-long.229.pdf</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Small</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Wiesenfeld</surname><given-names>B</given-names> </name><name name-style="western"><surname>Brandfield-Harvey</surname><given-names>B</given-names> 
</name><etal/></person-group><article-title>Large language model-based responses to patients' in-basket messages</article-title><source>JAMA Netw Open</source><year>2024</year><month>07</month><day>1</day><volume>7</volume><issue>7</issue><fpage>e2422399</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.22399</pub-id><pub-id pub-id-type="medline">39012633</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bangerter</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Fenza</surname><given-names>G</given-names> </name><name name-style="western"><surname>Furno</surname><given-names>D</given-names> </name><etal/></person-group><article-title>A hybrid framework integrating LLM and ANFIS for explainable fact-checking</article-title><source>IEEE Trans Fuzzy Syst</source><year>2024</year><volume>33</volume><issue>12</issue><fpage>4180</fpage><lpage>4189</lpage><pub-id pub-id-type="doi">10.1109/TFUZZ.2024.3431710</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haltaufderheide</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ranisch</surname><given-names>R</given-names> </name></person-group><article-title>The ethics of ChatGPT in medicine and healthcare: a systematic review on Large Language Models (LLMs)</article-title><source>NPJ Digit Med</source><year>2024</year><month>07</month><day>8</day><volume>7</volume><issue>1</issue><fpage>183</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01157-x</pub-id><pub-id pub-id-type="medline">38977771</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nebelung</surname><given-names>S</given-names> </name><name name-style="western"><surname>Khader</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Medical large language models are susceptible to targeted misinformation attacks</article-title><source>NPJ Digit Med</source><year>2024</year><month>10</month><day>23</day><volume>7</volume><issue>1</issue><fpage>288</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01282-7</pub-id><pub-id pub-id-type="medline">39443664</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>O&#x2019;Connell</surname><given-names>NE</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Wand</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Ward</surname><given-names>SP</given-names> </name></person-group><article-title>Clinical guidelines for low back pain: a critical review of consensus and inconsistencies across three major guidelines</article-title><source>Best Pract Res Clin Rheumatol</source><year>2016</year><month>12</month><volume>30</volume><issue>6</issue><fpage>968</fpage><lpage>980</lpage><pub-id pub-id-type="doi">10.1016/j.berh.2017.05.001</pub-id><pub-id pub-id-type="medline">29103554</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cotie</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Vanzella</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Pakosh</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Ghisi</surname><given-names>GL</given-names> </name></person-group><article-title>A systematic review of clinical practice guidelines and consensus statements for cardiac rehabilitation delivery: consensus, divergence, and important knowledge gaps</article-title><source>Can J Cardiol</source><year>2024</year><month>03</month><volume>40</volume><issue>3</issue><fpage>330</fpage><lpage>346</lpage><pub-id pub-id-type="doi">10.1016/j.cjca.2023.10.016</pub-id><pub-id pub-id-type="medline">38376955</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Mccoy</surname><given-names>AB</given-names> </name><etal/></person-group><article-title>Using large language model to guide patients to create efficient and comprehensive clinical care message</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>08</month><day>1</day><volume>31</volume><issue>8</issue><fpage>1665</fpage><lpage>1670</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae142</pub-id><pub-id pub-id-type="medline">38917441</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Bressem</surname><given-names>KK</given-names> </name><name name-style="western"><surname>Busch</surname><given-names>F</given-names> </name><name name-style="western"><surname>Nebelung</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Truhn</surname><given-names>D</given-names> </name></person-group><article-title>Comparative analysis of multimodal large language model performance on clinical vignette questions</article-title><source>JAMA</source><year>2024</year><month>04</month><day>16</day><volume>331</volume><issue>15</issue><fpage>1320</fpage><lpage>1321</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.27861</pub-id><pub-id pub-id-type="medline">38497956</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompts and large language model responses.</p><media xlink:href="formative_v10i1e68223_app1.doc" xlink:title="DOC File, 1489 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Methodological checklist.</p><media xlink:href="formative_v10i1e68223_app2.doc" xlink:title="DOC File, 35 KB"/></supplementary-material></app-group></back></article>