<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e81606</article-id><article-id pub-id-type="doi">10.2196/81606</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Use of Commercially Available Large Language Models to Generate Information Leaflets on Post&#x2013;Intensive Care Syndrome: Clinical Utility Assessment</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Hata</surname><given-names>Nanami</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Oami</surname><given-names>Takehiko</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kawakami</surname><given-names>Eiryo</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Hanai</surname><given-names>Akiko</given-names></name><degrees>OTR, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Nakada</surname><given-names>Taka-aki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Emergency and Critical Care Medicine, Chiba University Graduate School of Medicine</institution><addr-line>1-8-1 Inohana, Chuo</addr-line><addr-line>Chiba</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Department of Artificial Intelligence Medicine, Graduate School of Medicine, Chiba University</institution><addr-line>Chiba</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Institute for Advanced Academic Research, Chiba University</institution><addr-line>Chiba</addr-line><country>Japan</country></aff><aff id="aff4"><institution>Predictive Medicine Special Project (PMSP), RIKEN Center for Integrative Medical Sciences (IMS), RIKEN</institution><addr-line>Kanagawa</addr-line><country>Japan</country></aff><aff id="aff5"><institution>Division of Applied Mathematical Science, RIKEN Center for Interdisciplinary Theoretical and Mathematical Sciences, RIKEN</institution><addr-line>Kanagawa</addr-line><country>Japan</country></aff><aff id="aff6"><institution>Faculty of Informatics, Graduate School of Informatics, Chiba University</institution><addr-line>Chiba</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hasan</surname><given-names>Abul</given-names></name></contrib><contrib 
contrib-type="reviewer"><name name-style="western"><surname>Mitra</surname><given-names>Avijit</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Bievre</surname><given-names>Nicolas</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chen</surname><given-names>Pei-fu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Taka-aki Nakada, MD, PhD, Department of Emergency and Critical Care Medicine, Chiba University Graduate School of Medicine, 1-8-1 Inohana, Chuo, Chiba, 260-8677, Japan, +81-43-226-2372; <email>takanakada0@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>14</day><month>5</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e81606</elocation-id><history><date date-type="received"><day>31</day><month>07</month><year>2025</year></date><date date-type="rev-recd"><day>26</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>26</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Nanami Hata, Takehiko Oami, Eiryo Kawakami, Akiko Hanai, Taka-aki Nakada. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 14.5.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e81606"/><abstract><sec><title>Background</title><p>Patients and their families without medical knowledge may find professional health care information difficult to understand. The use of large language models (LLMs) to simplify and translate complex medical content holds promise for improving comprehension while reducing the burden on health care providers tasked with delivering explanations.</p></sec><sec><title>Objective</title><p>This study aims to evaluate the quality of information leaflets generated using commercially available LLMs.</p></sec><sec sec-type="methods"><title>Methods</title><p>Informational texts on post&#x2013;intensive care syndrome were generated using 6 different LLMs and 4 prompt designs with varying levels of instructional guidance. Clinical practice guideline documents were uploaded and provided to the models as reference context, reflecting a pragmatic clinical scenario without model tuning or advanced retrieval pipelines. In total, 72 texts were generated (6 models &#x00D7; 4 prompts &#x00D7; 3 outputs). 
After excluding texts shorter than 500 characters (n=16) and those without explicit mention of post&#x2013;intensive care syndrome (n=3), 53 texts remained. To enable balanced human evaluation across model-prompt combinations, the longest eligible response from each pair was selected (4 prompts &#x00D7; 4 models; n=16). Following independent expert review by 2 medical specialists, 7 texts were excluded, leaving 9 texts for final analysis. Ten individuals, including health care professionals and nonmedical personnel, assessed the texts using a 10-point Likert scale across multiple quality domains. An LLM-based parallel assessment was also conducted, and scores were compared across models and evaluator groups.</p></sec><sec sec-type="results"><title>Results</title><p>In the human evaluation of the selected 9 texts, the generated texts achieved an average score of 6.8 or higher across all evaluation criteria, with no potentially harmful content identified. The text generated by LLaMA 3 70B, using a step-by-step approach combined with text-augmented prompting based on clinical guidelines, received the highest overall score, whereas the lowest-rated text was produced using a simple prompt without text augmentation. Although no consistent trends were observed across LLMs or prompt engineering strategies, text-augmented prompting was generally associated with higher evaluation scores. Ratings differed between professional and nonprofessional evaluators. Given the feasibility-driven screening process and the resulting limited sample size, the findings should be interpreted as exploratory and descriptive rather than definitive estimates of overall model performance.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Among the selected texts included in the final human evaluation, informational materials generated using commercially available LLMs were generally rated as acceptable by human evaluators, and none contained harmful content. 
These findings suggest that LLMs may support the development of patient-facing informational materials under feasibility-constrained conditions, although further validation with larger and more diverse samples is warranted.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>post&#x2013;intensive care syndrome</kwd><kwd>clinical practice guidelines</kwd><kwd>artificial intelligence</kwd><kwd>generative AI</kwd><kwd>retrieval-augmented generation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Medical documents such as clinical practice guidelines are intended to support decision-making by health care professionals and patients based on recommendations for standard examinations and treatments [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. However, because these guidelines are primarily written by medical specialists, their content is typically challenging for patients and their families to comprehend. Moreover, adapting these documents for diverse audiences is time-consuming and labor-intensive. A study evaluating the costs associated with creating patient education materials found that the average annual expenditure per center was approximately $65,401 for pamphlet development and $19,819 for annual review, highlighting the significant investment required for these endeavors [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>With recent advancements in generative artificial intelligence (AI), large language models (LLMs) have become capable of instantly summarizing documents and rephrasing content in specific contexts [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. 
Leveraging this technology to tailor professional medical content for patients and their families may enhance the readability of materials originally designed for health care providers [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Although the use of commercially available LLMs renders medical documents more patient-friendly, hallucinations can result in content inaccuracy that can be challenging to regulate [<xref ref-type="bibr" rid="ref9">9</xref>]. LLMs can be augmented with domain-specific knowledge through retrieval-augmented generation [<xref ref-type="bibr" rid="ref10">10</xref>], allowing the integration of up-to-date and contextually relevant information into generated outputs [<xref ref-type="bibr" rid="ref11">11</xref>]. The creation of disease- or condition-specific informational leaflets through LLMs enhances patient and family comprehension and reduces the burden on clinicians. However, the feasibility and validity of using commercially available LLMs to generate information leaflets for different audiences remain inadequately explored.</p><p>Post&#x2013;intensive care syndrome (PICS) encompasses physical, cognitive, and psychological impairments that occur during or after an intensive care unit stay and frequently persist after hospital discharge [<xref ref-type="bibr" rid="ref12">12</xref>]. PICS affects the long-term prognosis and quality of life of intensive care unit survivors and has a significant psychological impact on their families [<xref ref-type="bibr" rid="ref13">13</xref>]. A growing concern in emergency and critical care medicine is promoting the awareness of PICS among patients and their families, which is essential for its effective prevention and management [<xref ref-type="bibr" rid="ref14">14</xref>]. The focus on PICS underscores a critical unmet medical need. 
Patients, their families, and health care professionals are often required to respond to this condition unexpectedly without access to adequate informational resources. The development of condition-specific, readable, and medically accurate informational leaflets through LLMs and text-augmented prompting approach based on clinical guidelines has the potential to improve comprehension among nonexpert audiences and reduce the communication burden on clinicians during acute and emotionally charged periods [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>This study aimed to evaluate the clinical utility and safety of information leaflets on PICS for patients and their families, developed using an accessible commercially available LLM. For the qualitative assessment, medical experts, nonexpert individuals, and an LLM were used to review the generated content.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Settings</title><p>An evaluation study was conducted to evaluate the interpretability and readability of information leaflets on PICS generated using LLMs. 
Using prompt engineering and a text-augmented prompting approach, we generated patient-friendly explanations of clinical practice guidelines and evaluated the quality of the generated materials.</p></sec><sec id="s2-2"><title>Prompt Design and Content Generation</title><p>Four prompts were designed to explore different content generation strategies and levels of guidance (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><list list-type="bullet"><list-item><p>Prompt 1: zero-shot without the uploaded context&#x2014;Simple instructions directed the model to explain PICS in a patient-friendly manner without the use of examples or an external context.</p></list-item><list-item><p>Prompt 2: zero-shot with the uploaded context&#x2014;Basic instructions to explain PICS, supplemented with clinical practice guideline content, were provided as an additional context via the text-augmented prompting approach.</p></list-item><list-item><p>Prompt 3: few-shot with the uploaded context&#x2014;The model was provided with specific examples of questions and answers to guide responses. The text-augmented prompting approach was used to ensure evidence-based content.</p></list-item><list-item><p>Prompt 4: step-by-step with the uploaded context&#x2014;A detailed instruction set guided the model to retrieve relevant information and deliver a comprehensive explanation of PICS, including causes, symptoms, treatments, and practical advice for patients and families. Although prompt 4 instructed the model to &#x201C;search for the latest clinical guidelines,&#x201D; the models operated without internet access. 
This instruction was intended as a directive for contextual grounding&#x2014;specifically, to command the model to treat the uploaded document as the &#x201C;latest&#x201D; source retrieved from a search, thereby prioritizing the provided context over the model&#x2019;s pretrained internal knowledge.</p></list-item></list><p>These prompts were applied to 6 LLMs&#x2014;ChatGPT-4o, LLaMA 3 70B, MedItron:7b, Gemma, Mistral, and MedLLaMA2&#x2014;with 3 outputs generated per prompt-model combination, resulting in a total of 72 texts (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). To reflect the models most likely to be used by health care professionals, we focused on available models through Ollama and selected the 4 models with the highest number of pulls as of May 2024, as well as the 2 models with the highest number of pulls identified by searching for the keyword &#x201C;medical.&#x201D; For open-source models, we used Ollama with its default generation settings at the time of execution (July 2024) without parameter tuning. The settings included a temperature of 0.8, repeat penalty of 1.1, repeat_last_n of 64, seed of 0, and a context window size (num_ctx) of 2048 tokens. We did not perform token counting or explicit verification of context retention during generation. Therefore, depending on the combined length of the system instructions, prompts, uploaded guideline text, and generated output, partial truncation of the contextual input may have occurred. 
This configuration was maintained to simulate a standard user environment where context window adjustments are not typically performed.</p><p>For prompt conditions using guideline context (prompts 2&#x2010;4), the generation workflow was as follows:</p><list list-type="order"><list-item><p>The PICS section of the Japanese version of the Surviving Sepsis Campaign Guidelines was prepared as an uploaded reference document</p></list-item><list-item><p>The uploaded document was supplied together with the prompt instruction during generation</p></list-item><list-item><p>The LLM generated patient- and family-oriented explanations by paraphrasing the guideline content in plain language</p></list-item></list><p>No embedding-based retrieval, vector database indexing, or automated selection of guideline passages was performed. Parameter tuning and advanced retrieval pipelines were not intentionally implemented to reflect a pragmatic clinical scenario in which health care professionals use commercially available LLMs with minimal technical customization [<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s2-3"><title>Selection of Generated Texts</title><p>For human evaluation, we compared the relative quality of practically usable outputs across model-prompt configurations. Because human evaluation requires substantial time and effort, the number of texts was restricted to a feasible sample size, and clearly incomplete outputs were excluded prior to evaluation.</p><p>The generated texts were selected through a structured 3-step process. First, 72 texts (6 models &#x00D7; 4 prompts &#x00D7; 3 outputs, <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) were screened for minimum length (&#x2265;500 Japanese characters) and explicit mention of PICS, leaving 53 texts. 
Second, for each model-prompt combination, the longest output was selected as a proxy for informational coverage to reduce the likelihood of including incomplete or truncated responses and to ensure sufficient content for human evaluation, resulting in 21 texts. Third, to ensure balanced representation across prompts and models, 4 models (ChatGPT-4o, LLaMA 3 70B, MedLLaMA2, and Mistral) were retained. Because all MedLLaMA2 outputs for prompt 3 had been excluded during initial screening, the longest excluded output was reinstated to maintain balance. This process yielded 16 final texts (4 models &#x00D7; 4 prompts, <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). These texts were initially generated in English and subsequently formatted into Japanese A4-sized informational leaflets using a GPT-4o&#x2013;based system. The translation quality of GPT-4 has been evaluated in prior research against human translators and was found to be comparable to junior professional translators in terms of overall error rates, supporting the reliability of this approach [<xref ref-type="bibr" rid="ref17">17</xref>]. Subsequently, 2 board-certified emergency physicians (NH and TO) independently reviewed the 16 leaflets for accuracy, clarity, and completeness. Any disagreements between the reviewers were resolved through discussion to reach a consensus. Nine leaflets were selected for further evaluation (<xref ref-type="fig" rid="figure1">Figure 1</xref>, <xref ref-type="table" rid="table1">Table 1</xref>, and <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). Seven texts were excluded because of limited informational content or language that was more suited for medical professionals; however, there were no critical errors that could potentially cause harm to patients or their families. 
Details of the screening process and the rationale for expert-based selection are provided in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p><p>Ten individuals were recruited as evaluators: 2 nonmedical individuals, 2 health care providers (1 nurse and 1 physical therapist), 4 specialists with expertise in PICS (3 emergency physicians and 1 physical therapist), and 2 nonspecialist physicians without PICS-specific expertise. These evaluators were requested to assess 9 selected texts in terms of clarity, readability, and other factors, using 10 evaluation criteria. The evaluation framework was developed with reference to prior large-scale LLM assessment studies in medicine, including the approach described by Singhal et al [<xref ref-type="bibr" rid="ref4">4</xref>].</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the selection process for the 9 final texts. A total of 72 texts were generated using 4 prompt designs across the 6 LLMs, with 3 outputs each. After excluding texts less than 500 characters (n=16) and those lacking explicit mention of PICS (n=3), 53 texts remained. The longest text was selected from each unique prompt-model pair (n=16). Following an expert review by 2 medical specialists, 7 texts were excluded, resulting in 9 final texts for analysis. 
LLM: large language model; PICS: post&#x2013;intensive care syndrome.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e81606_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Overview of the 9 selected texts (prompt, large language model, and number of outputs).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Serial number</td><td align="left" valign="bottom">Prompt</td><td align="left" valign="bottom">Large language model</td><td align="left" valign="bottom">Number of outputs</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Prompt 1 (zero-shot without the uploaded context)</td><td align="left" valign="top">ChatGPT-4o</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Prompt 2 (zero-shot with the uploaded context)</td><td align="left" valign="top">ChatGPT-4o</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Prompt 4 (step-by-step with the uploaded context)</td><td align="left" valign="top">ChatGPT-4o</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Prompt 1 (zero-shot without the uploaded context)</td><td align="left" valign="top">LLaMA 3 70B</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Prompt 3 (few-shot with the uploaded context)</td><td align="left" valign="top">LLaMA 3 70B</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Prompt 4 (step-by-step with the uploaded context)</td><td align="left" valign="top">LLaMA 3 70B</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Prompt 1 
(zero-shot without the uploaded context)</td><td align="left" valign="top">MedLLaMA2</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Prompt 1 (zero-shot without the uploaded context)</td><td align="left" valign="top">Mistral</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Prompt 4 (step-by-step with the uploaded context)</td><td align="left" valign="top">Mistral</td><td align="left" valign="top">3</td></tr></tbody></table></table-wrap><p>The survey was conducted using Google Forms and included the following components.</p><list list-type="order"><list-item><p>Evaluator demographics: age, sex, profession, and area of expertise</p></list-item><list-item><p>Evaluation items rated on a 10-point Likert scale:</p><list list-type="bullet"><list-item><p>Scientific or clinical validity: how scientifically and clinically appropriate is the text?</p></list-item><list-item><p>Degree of harmfulness: to what extent could the content be harmful?</p></list-item><list-item><p>Frequency of harmfulness: how likely is it that harm could result from this content?</p></list-item><list-item><p>Readability: how comprehensible is the text?</p></list-item><list-item><p>Accuracy: is the text medically accurate?</p></list-item><list-item><p>Logical consistency: is the text logically structured?</p></list-item><list-item><p>Difficulty to read: how difficult is the text to read?</p></list-item><list-item><p>Inappropriateness: does the text include inappropriate content?</p></list-item><list-item><p>Omissions: are any essential elements missing from the text?</p></list-item><list-item><p>Potential for bias: does the content include information that may not be applicable to medical populations?</p></list-item></list></list-item></list><p>Because higher raw scores in degree of harmfulness, frequency of harmfulness, inappropriateness, omissions, and potential for 
bias represented greater risk or lower quality, these items were reverse-scored prior to analysis (transformed as 11 &#x2212; original score) so that higher scores consistently indicated better performance across all domains.</p></sec><sec id="s2-4"><title>LLM-Based Evaluation</title><p>In addition to human evaluation, LLM-based assessments were conducted using GPT-4o for structured comparison with human ratings. Two role-based prompts were used: &#x201C;You are a medical doctor with extensive experience in emergency care&#x201D; (physician-role) and &#x201C;You are a family member of a patient in the intensive care unit&#x201D; (patient-role).</p><p>The 9 selected texts were assessed using the following 6 aggregated dimensions:</p><list list-type="bullet"><list-item><p>Agreement with scientific consensus</p></list-item><list-item><p>Possibility and likelihood of harm</p></list-item><list-item><p>Evidence of comprehension</p></list-item><list-item><p>Reasoning and retrieval ability</p></list-item><list-item><p>Presence of inappropriate, incorrect, or missing content</p></list-item><list-item><p>Possibility of bias in the response</p></list-item></list><p>Among these, harm-related dimensions, presence of inappropriate, incorrect, or missing content, and possibility of bias were reverse-scored (transformed as 11 &#x2212; original score) so that higher values consistently indicated better performance.</p></sec><sec id="s2-5"><title>Statistical Analysis</title><p>Objective assessments of the LLM-generated texts were conducted using human evaluations across 10 predefined criteria and AI-based evaluations incorporating 6 components. Subgroup analyses were performed to compare the evaluation metrics according to the prompt type, LLM used, and the use of the text-augmented prompting approach. For graphical presentation of the evaluation results, 95% CIs were estimated using cluster bootstrapping with resampling at the evaluator level (5000 iterations). 
Additionally, we examined the differences in the evaluation scores according to age, sex, profession, and area of expertise.</p><p>As an exploratory analysis, all 72 generated texts (6 models &#x00D7; 4 prompts &#x00D7; 3 outputs) were additionally evaluated using the same GPT-4o&#x2013;based assessment framework. This analysis aimed to examine evaluation patterns across multiple outputs within each model-prompt pair. To account for clustering of outputs within model-prompt pairs, a mixed-effects model was applied with model-prompt combinations specified as random intercepts. In addition, the mean scores of the 3 outputs within each model-prompt pair were calculated and analyzed using the same framework. Given the limited concordance observed between human and LLM-based evaluations, these analyses were considered exploratory and descriptive.</p><p>To assess the interevaluator variability between the human and LLM evaluators, we calculated the weighted &#x03BA; across the 3 groups (human evaluators, GPT-generated responses simulating a medical doctor, and GPT-generated responses simulating a family member). Interrater reliability among the 10 human evaluators was assessed using an intraclass correlation coefficient (ICC). A 2-way mixed-effects model with absolute agreement (ICC [A,10]) was applied to evaluate consistency across the 10 evaluation criteria.</p><p>Categorical variables are reported as absolute numbers and percentages, whereas continuous variables are expressed as mean (SD) or median (interquartile range), as appropriate. All statistical analyses were performed using GraphPad Prism 10 (GraphPad Software), pandas (v1.0.5), numpy (v1.21.4), seaborn (v0.11.2), and matplotlib (v3.5.1) tools in Python (v3.9.0).</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This study involved human evaluators who assessed text materials generated by LLMs; however, the evaluators were not the research participants of the investigation. 
The study focused on the evaluation of AI-generated explanatory texts rather than on human participants themselves. No personal or identifiable information was collected from the evaluators. According to the <italic>Ethical Guidelines for Medical and Health Research Involving Human Subjects</italic> issued by the Ministry of Education, Culture, Sports, Science and Technology; the Ministry of Health, Labour and Welfare; and the Ministry of Economy, Trade and Industry of Japan (2017, amended 2021) [<xref ref-type="bibr" rid="ref18">18</xref>], this type of research is considered outside the scope of studies requiring institutional review board approval because it does not involve interventions, collection of personal data, or research targeting human participants.</p><p>Therefore, institutional review board approval and formal informed consent were not required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Characteristics of Human Evaluators</title><p>Ten individuals evaluated the generated texts. Among them, 5 (50%) specialized in emergency and critical care medicine, 1 (10%) in internal medicine, 1 (10%) in psychiatry, and 3 (30%) in other fields. 
Two (20%) evaluators were involved in the development of the clinical practice guidelines for PICS (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Characteristics of human evaluators (N=10).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Human evaluators, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Sex</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">6 (60)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">4 (40)</td></tr><tr><td align="left" valign="top" colspan="2">Age (y)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>30&#x2010;34</td><td align="left" valign="top">4 (40)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>35&#x2010;39</td><td align="left" valign="top">3 (30)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>40&#x2010;44</td><td align="left" valign="top">2 (20)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>45&#x2010;49</td><td align="left" valign="top">1 (10)</td></tr><tr><td align="left" valign="top" colspan="2">Occupation</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Physician</td><td align="left" valign="top">5 (50)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nurse</td><td align="left" valign="top">1 (10)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Physical therapist</td><td align="left" valign="top">2 (20)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonmedical personnel</td><td align="left" valign="top">2 (20)</td></tr><tr><td align="left" valign="top" colspan="2">Specialty</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency and critical care medicine</td><td align="left" valign="top">5 (50)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Internal medicine</td><td align="left" valign="top">1 (10)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Psychiatry</td><td align="left" valign="top">1 (10)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Others</td><td align="left" valign="top">3 (30)</td></tr><tr><td align="left" valign="top" colspan="2">Involvement in the guideline development</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">2 (20)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">8 (80)</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Human Evaluations</title><p>All selected texts received mean scores of 6.8 or higher across all evaluation items. 
Among the 9 outputs, the text generated by LLaMA 3 70B using a step-by-step approach combined with the uploaded context (prompt 4) had the highest overall mean score in the human evaluations. In contrast, the text produced using a simple zero-shot prompt and without the uploaded context (prompt 1) had the lowest overall mean score. These comparisons are descriptive and should be interpreted with caution given the limited number of texts per prompt condition. Items related to readability, difficulty in reading, and omission tended to receive comparatively lower ratings than the other evaluation domains (<xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><p>The interrater reliability among the 10 human evaluators was high. The ICC indicated strong agreement (ICC[A,10]=0.903, 95% CI 0.869&#x2010;0.930; <italic>P</italic>&#x003C;.001), indicating strong consistency in the application of the 10 evaluation criteria across evaluators.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Human evaluation scores for the 9 selected texts across 10 evaluation criteria. Ten human evaluators assessed the 9 selected explanatory texts using 10 predefined evaluation criteria covering both informational quality and safety-related aspects. The forest plot shows the overall mean score for each criterion across the evaluated texts, with error bars representing 95% bootstrap CIs. Quality-related items are displayed in the upper panel, while safety-related items are shown in the lower panel to improve interpretability. Blue diamonds indicate the model configuration with the highest mean score among the evaluated texts, whereas red squares indicate the configuration with the lowest mean score. 
For negatively framed items (ie, degree of harmfulness, frequency of harmfulness, inappropriateness, omissions, and potential for bias), scores were reversed so that higher values consistently indicated better performance across all evaluation dimensions.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e81606_fig02.png"/></fig></sec><sec id="s3-3"><title>Assessments Using an LLM</title><p>GPT-4 evaluations indicated that the text generated by ChatGPT-4o using a step-by-step approach combined with the uploaded context (prompt 4) had comparatively higher scores for readability and scientific soundness among the evaluated outputs, which was broadly consistent with patterns observed in the human evaluations. By contrast, prompt 1 (zero-shot without the uploaded context) tended to receive lower ratings (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). The personalized GPT evaluations simulating a medical doctor and a family member showed substantial agreement (weighted &#x03BA; coefficient: +0.818).</p></sec><sec id="s3-4"><title>Subgroup Analyses of LLMs and Prompt Engineering Strategies by Human Evaluators</title><p>For the LLM models, outputs generated using LLaMA 3 70B and Mistral tended to receive comparatively higher scores in domains such as scientific or clinical validity, accuracy, and readability. Among the prompt categories, prompt 2 (zero-shot with the uploaded context) appeared to show comparatively higher scores across multiple dimensions, whereas prompt 1 (zero-shot without the uploaded context) showed lower scores. Outputs generated using the uploaded context tended to receive higher scores for scientific validity and lower scores for omissions. 
Similarly, the use of prompt engineering strategies tended to be associated with higher scores for accuracy and readability and lower difficulty-in-reading ratings (<xref ref-type="fig" rid="figure3">Figure 3</xref>).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Subgroup analyses of human evaluation scores according to model and prompting strategies. Evaluations by 10 human evaluators for the 9 selected explanatory texts are presented according to (A) large language models, (B) prompt design, (C) the use of a text-augmented prompting approach based on clinical guidelines, and (D) prompt engineering strategies. The forest plots show the mean scores for each evaluation item across subgroups, with error bars representing 95% bootstrap CIs. Evaluation items related to informational quality are presented in the upper panels, whereas safety-related items are shown in the lower panels to facilitate visual distinction. For negatively framed items (ie, degree of harmfulness, frequency of harmfulness, inappropriateness, omissions, and potential for bias), scores were reversed so that higher values consistently indicated better performance across all evaluation dimensions.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e81606_fig03.png"/></fig></sec><sec id="s3-5"><title>Subgroup Analyses of Human Evaluation According to Evaluator Characteristics</title><p>Although some variability was observed across age groups, female evaluators tended to assign slightly higher scores than male evaluators. Physicians tended to provide marginally lower overall scores than nonmedical evaluators. 
No clear differences in evaluation scores were observed according to direct involvement in the development of the PICS guidelines (<xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>).</p></sec><sec id="s3-6"><title>Interrater Variability Between Human and LLM Evaluators</title><p>Variability between human and AI evaluations was observed, particularly for the highest- and lowest-ranked texts. Weighted &#x03BA; coefficients suggested low-to-negative agreement (human vs GPT simulating a medical doctor &#x2212;0.423; human vs GPT simulating a family member &#x2212;0.276; <xref ref-type="fig" rid="figure4">Figure 4</xref>). Nevertheless, texts that were ranked highly by both human and LLM evaluators tended to have been generated using a step-by-step approach combined with text-augmented prompting, although the specific LLM models differed.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Interevaluator variability between human and artificial intelligence evaluators. This figure illustrates the interrater variability in total evaluation scores between human and artificial intelligence evaluators (simulating a medical doctor and a family member). Each row represents a different model (1-9), and each column represents the rank assigned by a different type of evaluator: human evaluators, the GPT simulating a medical doctor, and the GPT simulating a family member. The ranking scores ranged from 1 (best) to 9 (worst). Darker colors indicate higher ranks (worse evaluations), whereas lighter colors indicate better performance. 
GPT: generative pretrained transformer.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e81606_fig04.png"/></fig></sec><sec id="s3-7"><title>Mixed-Effects Model Analysis of LLM-Based Evaluations Across 72 Generated Texts</title><p>The 9 texts selected through the screening process were among the higher-scoring outputs based on the LLM-based evaluation of all 72 generated texts (<xref ref-type="supplementary-material" rid="app8">Multimedia Appendices 8</xref><xref ref-type="supplementary-material" rid="app9"/>-<xref ref-type="supplementary-material" rid="app10">10</xref>). In mixed-effects models including all 72 texts (<xref ref-type="supplementary-material" rid="app11">Multimedia Appendices 11</xref> and <xref ref-type="supplementary-material" rid="app12">12</xref>), Gemma, MedLLaMA, and Meditron:7b showed significantly lower mean scores compared with LLaMA3 70B in both patient- and physician-simulated evaluations. ChatGPT-4o and Mistral did not show statistically significant differences. Prompt engineering strategies were not significantly associated with scores in these models. Sensitivity analyses using mean scores across the 3 repeated outputs demonstrated similar directional trends (<xref ref-type="supplementary-material" rid="app13">Multimedia Appendices 13</xref> and <xref ref-type="supplementary-material" rid="app14">14</xref>).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, information leaflets on PICS generated by commercially available LLMs for patients and their families were acceptable in terms of feasibility and validity. Among the 9 selected texts, those created using prompt engineering and a text-augmented prompting approach tended to receive comparatively higher scores for readability, scientific accuracy, and overall quality, as assessed by human and AI evaluators. 
Subgroup analyses suggested that nonmedical evaluators tended to assign higher scores. Notably, the evaluation results from GPT-4 did not align with the human ratings. These findings should be interpreted as descriptive, given the limited number of texts in each subgroup.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>A key strength of this study is that assessments were performed by nonmedical individuals and multidisciplinary health care professionals, including physicians with expertise in relevant fields. Although the information leaflets were intended for patients and their families, validation by medical professionals is essential to ensure content accuracy and appropriateness. Another notable strength of this study was the use of commercially available LLMs with a text-augmented prompting approach to generate these materials. Although these models typically lack domain-specific knowledge, constructing a specialized LLM requires substantial resources. Our findings suggest that commercially available LLMs can produce content of satisfactory quality, thereby potentially facilitating their integration into clinical settings.</p><p>This study highlights the critical influence of LLM selection on the quality of information leaflets related to PICS. Given the diversity of their training datasets, LLMs inherently produce variable output quality [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. A previous study evaluated the performance of 4 LLMs in generating therapeutic recommendations across 3 medical specialties and revealed significant differences in quality, accuracy, and potential harm. Notably, GPT-4 demonstrated a high level of alignment with physician assessments and proved to be effective in automating these evaluations [<xref ref-type="bibr" rid="ref21">21</xref>]. 
Another study reported that GPT-4 outperformed other LLMs and human responses in generating accurate, relevant, helpful, and safe answers to patient questions regarding laboratory test results [<xref ref-type="bibr" rid="ref22">22</xref>]. In our study, the LLaMA 3 70B and Mistral models tended to produce comparatively higher-quality outputs, which may be an important factor influencing performance.</p><p>In addition, we explored the impact of prompt engineering using strategies, including zero-shot, few-shot, and step-by-step approaches. Prompt engineering strategies can significantly affect the accuracy, integrity, and overall quality of the generated content [<xref ref-type="bibr" rid="ref23">23</xref>]. Our results demonstrated that texts generated using simplistic zero-shot prompts without the uploaded context tended to receive lower scores across all evaluation criteria. In contrast, the step-by-step approach, which was previously reported to improve the output quality [<xref ref-type="bibr" rid="ref24">24</xref>], tended to show comparatively higher scores. Moreover, the integration of retrieval-augmented generation has been reported to enhance factual accuracy and reduce content omissions [<xref ref-type="bibr" rid="ref10">10</xref>]. Integrating structured guideline reformatting and advanced prompt engineering strategies using the GPT-4 Turbo significantly improved the accuracy of clinical decision support in chronic hepatitis C management, with the guideline context and formatting demonstrating more effectiveness than few-shot learning [<xref ref-type="bibr" rid="ref25">25</xref>]. Taken together, these findings suggest the potential importance of using structured prompts and the integration of external knowledge retrieval in optimizing LLM performance in health care communications.</p><p>Subgroup analyses revealed significant differences in the evaluation scores based on evaluator characteristics. 
Female evaluators typically assigned higher scores than their male counterparts, which may reflect potential gender-related differences in the perception of readability or clarity [<xref ref-type="bibr" rid="ref26">26</xref>]. In addition, physicians consistently rated the texts lower than nonmedical evaluators, which may reflect stricter professional standards or a more critical appraisal of clinical accuracy and completeness [<xref ref-type="bibr" rid="ref27">27</xref>]. No clear differences in evaluation scores were observed according to direct involvement in the development of the PICS guidelines. This may support the validity and objectivity of the proposed evaluation methodology. The documents generated using the LLM were more acceptable to nonmedical individuals, suggesting that these texts may be relatively easier to use for communication with patients and their families. However, owing to the variation in ratings among evaluators, it is important to consider that perceptions may differ depending on the medical experience of the patients or their families.</p><p>Interevaluator variability analyses highlighted key differences between human- and AI-based assessments. Although both human and AI evaluators tended to rank outputs generated using step-by-step prompting combined with a text-augmented prompting approach more favorably, variability was observed in the assessment of texts at extremes of performance, particularly in terms of scientific validity, readability, and overall appropriateness. Notably, the use of prompts simulating a family member improved the agreement between the LLMs and human experts, although the degree of alignment varied across the subscales. 
This observation suggests that the complexity of the evaluation content and the quality of the generated outputs can influence agreement levels, emphasizing the importance of careful prompt engineering and caution when relying solely on AI-based assessments [<xref ref-type="bibr" rid="ref28">28</xref>]. Similar trends have been observed in clinical evaluations. For instance, when comparing ChatGPT&#x2019;s assessment of clinical cases with evaluations by 2 board-certified physicians, only moderate interevaluator reliability was achieved, with notable discrepancies in domains such as diagnosis and treatment decisions, highlighting the risk of overreliance on AI in sensitive medical contexts [<xref ref-type="bibr" rid="ref29">29</xref>]. Future studies should further investigate the factors contributing to human-AI discrepancies and optimize methodologies to enhance the alignment between AI-generated evaluations and expert human judgments.</p><p>Effective communication is a key determinant of patient satisfaction. Patient satisfaction is strongly associated with the quality of physician-patient communication and that of discharge planning, thus emphasizing the importance of interpersonal engagement in clinical care [<xref ref-type="bibr" rid="ref30">30</xref>]. In recent years, patients have increasingly turned to the internet and digital media to obtain health-related information independently [<xref ref-type="bibr" rid="ref31">31</xref>]. Although such information-seeking behavior empowers patients, it introduces new communication challenges for clinicians, who must address misinformation and convey accurate explanations effectively under limited timeframes. 
Integrating commercially available LLMs into health care communication workflows may help bridge this gap by enabling patients to interact with reliable, understandable, and personalized explanations, potentially enhancing their engagement and satisfaction.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study has several limitations.</p><p>First, all the generated texts focused exclusively on PICS, limiting the generalizability of the findings to other clinical subjects.</p><p>Second, a deliberate design choice in this study was to prioritize real-world feasibility over technical optimization. Accordingly, we used default generation parameters and a simple document-upload approach rather than implementing a fully indexed retrieval-augmented generation pipeline, reflecting how health care professionals are most likely to use commercially available LLMs in routine clinical practice. Furthermore, the use of a default 2048-token context window represents a technical limitation. We did not verify whether the full text of the uploaded guidelines was retained in the context window during generation. Consequently, partial truncation may have occurred, potentially attenuating the benefits of context-augmented prompting. While this limits the internal validity of the models&#x2019; theoretical maximum performance, it accurately reflects real-world deployment conditions, where clinicians typically rely on default system settings without control over context management. In addition, the instruction to &#x201C;search&#x201D; in prompt 4 was semantically ambiguous given the offline environment. While our functional intent was to enforce reliance on the uploaded context rather than pretrained knowledge, the explicit use of the term &#x201C;search&#x201D;&#x2014;instead of &#x201C;extract&#x201D; or &#x201C;summarize&#x201D;&#x2014;may have inadvertently triggered retrieval-simulation behaviors or role-playing. 
Since the generated texts were evaluated based on this prompt, we cannot rule out the possibility that this phrasing influenced the stylistic presentation or narrative structure of the outputs. This discrepancy represents a methodological limitation regarding prompt design.</p><p>Third, no formal sample size calculation was performed because this was a pilot study intended to explore preliminary trends. The human evaluation represents a comparative assessment among outputs, meeting a minimal threshold of completeness rather than the full distribution of all generated texts. The selection process was not statistically driven but was designed to ensure the feasibility of human evaluation. In addition, the selection procedures, including the use of the longest output as a proxy for informational coverage and the expert-based filtering prior to evaluation, may have preferentially retained more complete or higher-quality texts while excluding less suitable outputs. As a result, the evaluated sample may not fully represent the distribution of all generated texts and may have led to an overestimation of overall quality. Therefore, the findings should not be interpreted as estimates of overall model performance, but rather as exploratory comparisons across model-prompt configurations under feasibility-constrained conditions. Moreover, the final human evaluation was based on 9 selected texts. Certain prompts (prompts 2 and 3) were represented by only a single text (n=1). Such small and uneven group sizes substantially limit statistical power and the reliability of detecting meaningful differences across prompt strategies. Therefore, these comparisons are exploratory and descriptive and do not allow definitive conclusions regarding the superiority of any specific prompt configuration.</p><p>Fourth, the number of human evaluators was relatively small, which may have affected the representativeness of the findings. 
Although interevaluator reliability was formally assessed using an ICC, residual variability across evaluators and evaluation items may still have remained. Furthermore, discrepancies were observed between human and LLM-based evaluations; however, the limited sample size made it difficult to systematically examine the structural factors underlying these differences. Future studies using larger datasets are needed to investigate the mechanisms driving divergence between human and LLM assessments and to develop AI-based evaluation methods that more accurately reflect human judgment patterns.</p><p>Fifth, LLM outputs were generated in English and subsequently translated into Japanese using a GPT-4&#x2013;based system prior to evaluation. Accordingly, human evaluators assessed the translated Japanese texts rather than the original English outputs. Although translations were performed without formal human postediting, prior studies suggest that GPT-4 achieves translation quality comparable to that of junior professional translators [<xref ref-type="bibr" rid="ref17">17</xref>], supporting the general adequacy of this approach. However, subtle linguistic or cultural nuances may not have been fully preserved. In addition, the GPT-4&#x2013;based translation process may have systematically normalized sentence structure and wording, potentially increasing readability- and clarity-related scores, independent of the intrinsic quality of the source LLM outputs. Therefore, evaluation scores&#x2014;particularly for domains such as readability, clarity, and difficulty of reading&#x2014;may reflect not only the quality of the LLM-generated content but also the characteristics of the translation process, which may complicate the attribution of differences to individual models or prompt strategies. 
In this context, at the time of the study, several commercially available LLMs exhibited more stable performance in English than in Japanese, which motivated the decision to generate outputs in English prior to translation. While this approach reflects a pragmatic real-world workflow, it introduces an additional processing layer that may affect the interpretability. With recent advances in multilingual capabilities and agent-based control of LLMs, direct generation in Japanese has become increasingly feasible. Future studies should therefore consider evaluating outputs generated natively in the target language to better isolate the effects of model architecture and prompt design. In addition, because safety-related domains consistently received high scores, their inclusion in the composite score may have obscured differences in quality-related scores. To enhance transparency, safety and quality domains were additionally presented separately in the figure.</p><p>Finally, patients and their family members were not included in the evaluation process, which may limit the real-world applicability of our findings. For clinical translation, incorporating direct feedback from a target audience would provide stronger evidence of clinical effectiveness.</p></sec><sec id="s4-4"><title>Conclusions</title><p>Among the selected texts included in the final human evaluation, informational materials generated using commercially available LLMs were generally rated as acceptable by human evaluators, and none contained harmful content. These findings suggest that LLMs may support the development of patient-facing informational materials under feasibility-constrained conditions, although further validation with larger and more diverse samples is warranted.</p></sec></sec></body><back><ack><p>We would like to express our sincere gratitude to all the participants who completed the survey and contributed their valuable insights to this study. We thank Shunsuke Osawa for script editing. 
We acknowledge that the explanatory texts used as research materials in this study were generated using large language models, including ChatGPT-4o, LLaMA3 70B, MedItron:7b, Gemma, Mistral, and MedLLaMA2. These texts were not modified or edited by the researchers and were used in their original form for the purpose of evaluating the appropriateness and validity of artificial intelligence&#x2013;generated medical communication. We thank Honyaku Center, Inc, for English language editing. Generative artificial intelligence tools were used during the preparation of this manuscript solely for language editing and wording refinement to improve readability and clarity of the English text. All outputs were reviewed and edited by the authors, who take full responsibility for the final content of the manuscript.</p></ack><notes><sec><title>Funding</title><p>TO received grants from Chiba Foundation for Health Promotion and Disease Prevention. The authors received no specific funding for this study.</p></sec><sec><title>Data Availability</title><p>The datasets used and analyzed in this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>NH, TO, and AH were involved in study concept and design, statistical analysis and interpretation of data, drafting of the manuscript, and critical revision of the manuscript for important intellectual content. AH performed computations to extract the necessary data. All the other authors interpreted the data and critically reviewed the manuscript for important intellectual content. All the authors have read and approved the final version of the manuscript.</p></fn><fn fn-type="conflict"><p>TN is the chief executive officer of Smart119 Inc, where he owns stock. Smart119, Inc, had no role in the study design, data analysis, or manuscript preparation. 
The authors declare no conflicts of interest.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI </term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">PICS</term><def><p>post&#x2013;intensive care syndrome</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnston</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kelly</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Skidmore</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wells</surname><given-names>GA</given-names> </name></person-group><article-title>Systematic reviews of clinical practice guidelines: a methodological guide</article-title><source>J Clin Epidemiol</source><year>2019</year><month>04</month><volume>108</volume><fpage>64</fpage><lpage>76</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2018.11.030</pub-id><pub-id pub-id-type="medline">30529647</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Andrews</surname><given-names>J</given-names> </name><name name-style="western"><surname>Guyatt</surname><given-names>G</given-names> </name><name name-style="western"><surname>Oxman</surname><given-names>AD</given-names> </name><etal/></person-group><article-title>GRADE guidelines: 14. 
Going from evidence to recommendations: the significance and presentation of recommendations</article-title><source>J Clin Epidemiol</source><year>2013</year><month>07</month><volume>66</volume><issue>7</issue><fpage>719</fpage><lpage>725</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2012.03.013</pub-id><pub-id pub-id-type="medline">23312392</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Papadakos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Samoil</surname><given-names>D</given-names> </name><name name-style="western"><surname>Giannopoulos</surname><given-names>E</given-names> </name><etal/></person-group><article-title>The cost of patient education materials development: opportunities to identify value and priorities</article-title><source>J Cancer Educ</source><year>2022</year><month>06</month><volume>37</volume><issue>3</issue><fpage>834</fpage><lpage>842</lpage><pub-id pub-id-type="doi">10.1007/s13187-020-01893-0</pub-id><pub-id pub-id-type="medline">33037573</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oami</surname><given-names>T</given-names> </name><name name-style="western"><surname>Okada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Nakada</surname><given-names>TA</given-names> </name></person-group><article-title>Performance of a large language model in screening citations</article-title><source>JAMA Netw Open</source><year>2024</year><month>07</month><day>1</day><volume>7</volume><issue>7</issue><fpage>e2420496</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.20496</pub-id><pub-id pub-id-type="medline">38976267</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walker</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Ghani</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Kuemmerli</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Reliability of medical information provided by ChatGPT: assessment against clinical guidelines and patient information quality instrument</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>30</day><volume>25</volume><fpage>e47479</fpage><pub-id pub-id-type="doi">10.2196/47479</pub-id><pub-id pub-id-type="medline">37389908</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Preiksaitis</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ashenburg</surname><given-names>N</given-names> </name><name name-style="western"><surname>Bunney</surname><given-names>G</given-names> </name><etal/></person-group><article-title>The role of large language models in transforming emergency medicine: scoping review</article-title><source>JMIR Med Inform</source><year>2024</year><month>05</month><day>10</day><volume>12</volume><fpage>e53787</fpage><pub-id pub-id-type="doi">10.2196/53787</pub-id><pub-id pub-id-type="medline">38728687</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zaretsky</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Baskharoun</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Generative artificial intelligence to transform inpatient discharge summaries to patient-friendly language and format</article-title><source>JAMA Netw Open</source><year>2024</year><month>03</month><day>4</day><volume>7</volume><issue>3</issue><fpage>e240357</fpage><pub-id 
pub-id-type="doi">10.1001/jamanetworkopen.2024.0357</pub-id><pub-id pub-id-type="medline">38466307</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xiong</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Jia</surname><given-names>K</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bi</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for large language models: a survey</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 18, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2312.10997</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>McCoy</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>A</given-names> </name></person-group><article-title>Improving large language model applications in biomedicine with retrieval-augmented generation: a systematic review, meta-analysis, and clinical development guidelines</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>04</month><day>1</day><volume>32</volume><issue>4</issue><fpage>605</fpage><lpage>615</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf008</pub-id><pub-id pub-id-type="medline">39812777</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Prescott</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Angus</surname><given-names>DC</given-names> </name></person-group><article-title>Enhancing recovery from sepsis: a review</article-title><source>JAMA</source><year>2018</year><month>01</month><day>2</day><volume>319</volume><issue>1</issue><fpage>62</fpage><lpage>75</lpage><pub-id pub-id-type="doi">10.1001/jama.2017.17687</pub-id><pub-id pub-id-type="medline">29297082</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Davidson</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bienvenu</surname><given-names>OJ</given-names> </name></person-group><article-title>Family response to critical illness: postintensive care syndrome-family</article-title><source>Crit Care Med</source><year>2012</year><month>02</month><volume>40</volume><issue>2</issue><fpage>618</fpage><lpage>624</lpage><pub-id pub-id-type="doi">10.1097/CCM.0b013e318236ebf9</pub-id><pub-id pub-id-type="medline">22080636</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Inoue</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hatakeyama</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kondo</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Post-intensive care syndrome: its pathophysiology, prevention, and future directions</article-title><source>Acute Med Surg</source><year>2019</year><volume>6</volume><issue>3</issue><fpage>233</fpage><lpage>246</lpage><pub-id 
pub-id-type="doi">10.1002/ams2.415</pub-id><pub-id pub-id-type="medline">31304024</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheng</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Comparison of artificial intelligence-generated and physician-generated patient education materials on early diabetic kidney disease</article-title><source>Front Endocrinol (Lausanne)</source><year>2025</year><volume>16</volume><fpage>1559265</fpage><pub-id pub-id-type="doi">10.3389/fendo.2025.1559265</pub-id><pub-id pub-id-type="medline">40331140</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shime</surname><given-names>N</given-names> </name><name name-style="western"><surname>Nakada</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Yatabe</surname><given-names>T</given-names> </name><etal/></person-group><article-title>The Japanese clinical practice guidelines for management of sepsis and septic shock 2024</article-title><source>J Intensive Care</source><year>2025</year><month>03</month><day>14</day><volume>13</volume><issue>1</issue><fpage>15</fpage><pub-id pub-id-type="doi">10.1186/s40560-025-00776-0</pub-id><pub-id pub-id-type="medline">40087807</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>GPT-4 vs. human translators: a comprehensive evaluation of translation quality across languages, domains, and expertise levels</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 4, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.03658</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="report"><article-title>Ethical guidelines for medical and biological research involving human subjects</article-title><year>2021</year><month>03</month><day>23</day><access-date>2026-04-14</access-date><publisher-name>Ministry of Education, Culture, Sports, Science and Technology; Ministry of Health, Labour and Welfare; Ministry of Economy, Trade and Industry</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.mext.go.jp/content/20250325-mxt_life-000035486-01.pdf">https://www.mext.go.jp/content/20250325-mxt_life-000035486-01.pdf</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abdullahi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>R</given-names> </name><name name-style="western"><surname>Eickhoff</surname><given-names>C</given-names> </name></person-group><article-title>Learning to make rare and complex diagnoses with generative AI assistance: qualitative study of popular large language models</article-title><source>JMIR Med 
Educ</source><year>2024</year><month>02</month><day>13</day><volume>10</volume><fpage>e51391</fpage><pub-id pub-id-type="doi">10.2196/51391</pub-id><pub-id pub-id-type="medline">38349725</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Bressem</surname><given-names>KK</given-names> </name><name name-style="western"><surname>Busch</surname><given-names>F</given-names> </name><name name-style="western"><surname>Nebelung</surname><given-names>S</given-names> </name><name name-style="western"><surname>Truhn</surname><given-names>D</given-names> </name></person-group><article-title>Comparative analysis of multimodal large language model performance on clinical vignette questions</article-title><source>JAMA</source><year>2024</year><month>04</month><day>16</day><volume>331</volume><issue>15</issue><fpage>1320</fpage><lpage>1321</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.27861</pub-id><pub-id pub-id-type="medline">38497956</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wilhelm</surname><given-names>TI</given-names> </name><name name-style="western"><surname>Roos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kaczmarczyk</surname><given-names>R</given-names> </name></person-group><article-title>Large language models for therapy recommendations across 3 clinical specialties: comparative study</article-title><source>J Med Internet Res</source><year>2023</year><month>10</month><day>30</day><volume>25</volume><fpage>e49324</fpage><pub-id pub-id-type="doi">10.2196/49324</pub-id><pub-id 
pub-id-type="medline">37902826</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Bhasuran</surname><given-names>B</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Quality of answers of generative large language models versus peer users for interpreting laboratory test results for lay patients: evaluation study</article-title><source>J Med Internet Res</source><year>2024</year><month>04</month><day>17</day><volume>26</volume><fpage>e56655</fpage><pub-id pub-id-type="doi">10.2196/56655</pub-id><pub-id pub-id-type="medline">38630520</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Prompt engineering in consistency and reliability with the evidence-based guideline for LLMs</article-title><source>NPJ Digit Med</source><year>2024</year><month>02</month><day>20</day><volume>7</volume><issue>1</issue><fpage>41</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01029-4</pub-id><pub-id pub-id-type="medline">38378899</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Min</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Deng</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Towards understanding chain-of-thought prompting: an empirical study of what matters</article-title><conf-name>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 9-14, 2023</conf-date><conf-loc>Toronto, Canada</conf-loc><fpage>2717</fpage><lpage>2739</lpage><pub-id pub-id-type="doi">10.18653/v1/2023.acl-long.153</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kresevic</surname><given-names>S</given-names> </name><name name-style="western"><surname>Giuffr&#x00E8;</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ajcevic</surname><given-names>M</given-names> </name><name name-style="western"><surname>Accardo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Croc&#x00E8;</surname><given-names>LS</given-names> </name><name name-style="western"><surname>Shung</surname><given-names>DL</given-names> </name></person-group><article-title>Optimization of hepatological clinical guidelines interpretation by large language models: a retrieval augmented generation-based framework</article-title><source>NPJ Digit Med</source><year>2024</year><month>04</month><day>23</day><volume>7</volume><issue>1</issue><fpage>102</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01091-y</pub-id><pub-id pub-id-type="medline">38654102</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shalaby</surname><given-names>R</given-names> </name><name name-style="western"><surname>Vuong</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hrabok</surname><given-names>M</given-names> 
</name><etal/></person-group><article-title>Gender differences in satisfaction with a text messaging program (Text4Hope) and anticipated receptivity to technology-based health support during the COVID-19 pandemic: cross-sectional survey study</article-title><source>JMIR mHealth uHealth</source><year>2021</year><month>04</month><day>15</day><volume>9</volume><issue>4</issue><fpage>e24184</fpage><pub-id pub-id-type="doi">10.2196/24184</pub-id><pub-id pub-id-type="medline">33750738</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Robinson</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sands</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Physician vs. AI-generated messages in urology: evaluation of accuracy, completeness, and preference by patients and physicians</article-title><source>World J Urol</source><year>2024</year><month>12</month><day>27</day><volume>43</volume><issue>1</issue><fpage>48</fpage><pub-id pub-id-type="doi">10.1007/s00345-024-05399-y</pub-id><pub-id pub-id-type="medline">39729119</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dauphin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Siefert</surname><given-names>C</given-names> </name></person-group><article-title>From Llama to language: prompt-engineering allows general-purpose artificial intelligence to rate narratives like expert psychologists</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1398885</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1398885</pub-id><pub-id 
pub-id-type="medline">39981191</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lechien</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Maniaci</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gengler</surname><given-names>I</given-names> </name><name name-style="western"><surname>Hans</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chiesa-Estomba</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Vaira</surname><given-names>LA</given-names> </name></person-group><article-title>Validity and reliability of an instrument evaluating the performance of intelligent chatbot: the artificial intelligence performance instrument (AIPI)</article-title><source>Eur Arch Otorhinolaryngol</source><year>2024</year><month>04</month><volume>281</volume><issue>4</issue><fpage>2063</fpage><lpage>2079</lpage><pub-id pub-id-type="doi">10.1007/s00405-023-08219-y</pub-id><pub-id pub-id-type="medline">37698703</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nishio</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ohde</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yamaji</surname><given-names>N</given-names> </name><name name-style="western"><surname>Takahashi</surname><given-names>O</given-names> </name></person-group><article-title>Factors associated with patients&#x2019; ratings of hospitals among Japanese inpatients: a cross-sectional study</article-title><source>JMA J</source><year>2023</year><month>04</month><day>14</day><volume>6</volume><issue>2</issue><fpage>148</fpage><lpage>155</lpage><pub-id 
pub-id-type="doi">10.31662/jmaj.2022-0176</pub-id><pub-id pub-id-type="medline">37179724</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Han</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shahzad</surname><given-names>F</given-names> </name></person-group><article-title>Digital pathways to healthcare: a systematic review for unveiling the trends and insights in online health information-seeking behavior</article-title><source>Front Public Health</source><year>2025</year><volume>13</volume><fpage>1497025</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2025.1497025</pub-id><pub-id pub-id-type="medline">40013047</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompt design.</p><media xlink:href="formative_v10i1e81606_app1.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Generated content of 72 texts.</p><media xlink:href="formative_v10i1e81606_app2.xlsx" xlink:title="XLSX File, 41 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Generated content of 16 texts.</p><media xlink:href="formative_v10i1e81606_app3.xlsx" xlink:title="XLSX File, 21 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Generated content of 9 texts.</p><media xlink:href="formative_v10i1e81606_app4.docx" xlink:title="DOCX File, 26 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Detailed screening and expert selection process.</p><media 
xlink:href="formative_v10i1e81606_app5.docx" xlink:title="DOCX File, 24 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Role-based comparison of large language model (LLM) evaluation scores for the 9 selected texts. Evaluation scores generated through LLM-based assessment were compared according to simulated evaluator roles: (A) a family member of an intensive care unit patient and (B) a medical doctor. The forest plots show the mean scores for each evaluation item across the 9 selected texts, with error bars representing 95% bootstrap CIs. Evaluation items related to informational quality are presented in the upper panels, whereas safety-related items are shown in the lower panels to facilitate visual distinction. Blue diamonds indicate the highest-scoring configuration among the evaluated texts, and red squares indicate the lowest-scoring configuration.</p><media xlink:href="formative_v10i1e81606_app6.png" xlink:title="PNG File, 4266 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Subgroup analyses of total evaluation scores according to evaluator characteristics.</p><media xlink:href="formative_v10i1e81606_app7.png" xlink:title="PNG File, 336 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Comparison of evaluation scores between all 72 generated texts and the final 9 selected texts. Evaluation scores generated through large language model&#x2013;based assessment were compared between all 72 generated texts and the final subset of 9 selected texts, based on evaluations performed from the perspectives of (A) a simulated family member of an intensive care unit patient and (B) a simulated medical doctor. The forest plots show the mean scores for each evaluation item, with error bars representing 95% bootstrap CIs. 
Evaluation items related to informational quality are presented in the upper panels, whereas safety-related items are shown in the lower panels to facilitate visual distinction. Blue markers indicate the mean scores of the selected subset of 9 texts, whereas orange markers indicate the overall mean scores across all 72 generated texts.</p><media xlink:href="formative_v10i1e81606_app8.png" xlink:title="PNG File, 4198 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Large language model (LLM)&#x2013;based patient-role evaluation results for all 72 generated texts. This table presents the evaluation results of all 72 generated texts assessed by an LLM simulating a patient role across 6 predefined criteria. The texts are ranked in descending order according to their total scores. The final subset of 9 selected texts is highlighted in orange, and their corresponding ranking positions among all 72 texts are indicated.</p><media xlink:href="formative_v10i1e81606_app9.xlsx" xlink:title="XLSX File, 25 KB"/></supplementary-material><supplementary-material id="app10"><label>Multimedia Appendix 10</label><p>Large language model (LLM)&#x2013;based physician-role evaluation results for all 72 generated texts. This table presents the evaluation results of all 72 generated texts assessed by an LLM simulating a physician role across 6 predefined criteria. The texts are ranked in descending order according to their total scores. 
The final subset of 9 selected texts is highlighted in orange, and their corresponding ranking positions among all 72 texts are indicated.</p><media xlink:href="formative_v10i1e81606_app10.xlsx" xlink:title="XLSX File, 25 KB"/></supplementary-material><supplementary-material id="app11"><label>Multimedia Appendix 11</label><p>Mixed-effects model analysis of large language model&#x2013;based patient-role evaluations across all 72 texts.</p><media xlink:href="formative_v10i1e81606_app11.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app12"><label>Multimedia Appendix 12</label><p>Mixed-effects model analysis of large language model&#x2013;based physician-role evaluations across all 72 texts.</p><media xlink:href="formative_v10i1e81606_app12.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app13"><label>Multimedia Appendix 13</label><p>Sensitivity analyses using large language model&#x2013;based patient-role evaluations of all 72 texts (mean of 3 outputs).</p><media xlink:href="formative_v10i1e81606_app13.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app14"><label>Multimedia Appendix 14</label><p>Sensitivity analyses using large language model&#x2013;based physician-role evaluations of all 72 texts (mean of 3 outputs).</p><media xlink:href="formative_v10i1e81606_app14.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>