<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="letter" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JFR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id>
      <journal-title>JMIR Formative Research</journal-title>
      <issn pub-type="epub">2561-326X</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i1e56165</article-id>
      <article-id pub-id-type="pmid">38848553</article-id>
      <article-id pub-id-type="doi">10.2196/56165</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Research Letter</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Research Letter</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Clinical Accuracy, Relevance, Clarity, and Emotional Sensitivity of Large Language Models to Surgical Patient Questions: Cross-Sectional Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>de Azevedo Cardoso</surname>
            <given-names>Taiane</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chrimes</surname>
            <given-names>Dillon</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhu</surname>
            <given-names>Lingxuan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhu</surname>
            <given-names>Fang</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chekeni</surname>
            <given-names>Faraaz</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Dagli</surname>
            <given-names>Mert Marcel</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Neurosurgery</institution>
            <institution>University of Pennsylvania Perelman School of Medicine</institution>
            <addr-line>801 Spruce Street</addr-line>
            <addr-line>Philadelphia, PA, 19106</addr-line>
            <country>United States</country>
            <phone>1 2672306493</phone>
            <email>marcel.dagli@pennmedicine.upenn.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5201-6720</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Oettl</surname>
            <given-names>Felix Conrad</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9721-685X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Gujral</surname>
            <given-names>Jaskeerat</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-3626-0306</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Malhotra</surname>
            <given-names>Kashish</given-names>
          </name>
          <degrees>MBBS</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6985-5731</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Ghenbot</surname>
            <given-names>Yohannes</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4382-9372</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Yoon</surname>
            <given-names>Jang W</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2705-0406</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Ozturk</surname>
            <given-names>Ali K</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6474-9422</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Welch</surname>
            <given-names>William C</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1665-7985</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Neurosurgery</institution>
        <institution>University of Pennsylvania Perelman School of Medicine</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Orthopedic Surgery</institution>
        <institution>Hospital for Special Surgery</institution>
        <addr-line>New York, NY</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Orthopedic Surgery</institution>
        <institution>Schulthess Clinic</institution>
        <addr-line>Zurich</addr-line>
        <country>Switzerland</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Institute of Applied Health Research</institution>
        <institution>University of Birmingham</institution>
        <addr-line>Birmingham</addr-line>
        <country>United Kingdom</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Mert Marcel Dagli <email>marcel.dagli@pennmedicine.upenn.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>7</day>
        <month>6</month>
        <year>2024</year>
      </pub-date>
      <volume>8</volume>
      <elocation-id>e56165</elocation-id>
      <history>
        <date date-type="received">
          <day>8</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>26</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>14</day>
          <month>3</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>23</day>
          <month>4</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Mert Marcel Dagli, Felix Conrad Oettl, Jaskeerat Gujral, Kashish Malhotra, Yohannes Ghenbot, Jang W Yoon, Ali K Ozturk, William C Welch. Originally published in JMIR Formative Research (https://formative.jmir.org), 07.06.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on https://formative.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://formative.jmir.org/2024/1/e56165" xlink:type="simple"/>
      <abstract>
        <p>This cross-sectional study evaluates the clinical accuracy, relevance, clarity, and emotional sensitivity of responses to inquiries from patients undergoing surgery provided by large language models (LLMs), highlighting their potential as adjunct tools in patient communication and education. Our findings demonstrated high performance of LLMs across accuracy, relevance, clarity, and emotional sensitivity, with Anthropic’s Claude 2 outperforming OpenAI’s ChatGPT and Google’s Bard, suggesting LLMs’ potential to serve as complementary tools for enhanced information delivery and patient-surgeon interaction.</p>
      </abstract>
      <kwd-group>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
        <kwd>natural language processing</kwd>
        <kwd>NLP</kwd>
        <kwd>large language model</kwd>
        <kwd>LLM</kwd>
        <kwd>generative AI</kwd>
        <kwd>cross-sectional study</kwd>
        <kwd>health information</kwd>
        <kwd>patient education</kwd>
        <kwd>clinical accuracy</kwd>
        <kwd>emotional sensitivity</kwd>
        <kwd>surgical patient</kwd>
        <kwd>surgery</kwd>
        <kwd>surgical</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Recent advances in natural language processing (NLP) have produced large language model (LLM) applications, such as OpenAI’s ChatGPT, that have captivated a worldwide audience [<xref ref-type="bibr" rid="ref1">1</xref>]. They have permeated the health care sector, offering several benefits [<xref ref-type="bibr" rid="ref2">2</xref>]. While LLMs have immense potential in improving clinical practice and patient outcomes, their role has not been completely established [<xref ref-type="bibr" rid="ref3">3</xref>]. Often, patients who require surgery struggle with complex, anxiety-inducing questions [<xref ref-type="bibr" rid="ref4">4</xref>]. Thus, counseling during preoperative workup is crucial for obtaining informed consent, establishing trust, and ensuring presurgical optimization to improve patient outcomes. This process, being resource-intensive and involving numerous conversations, often delays communication, causing significant frustration for patients [<xref ref-type="bibr" rid="ref5">5</xref>]. Therefore, the importance of clear, adequate, and timely information delivery cannot be overemphasized. LLMs with chat features could improve preoperative communication; however, LLMs’ ability to answer patients’ surgical questions has not been extensively studied. Thus, this study aims to assess LLMs’ potential and proficiency in responding to questions from patients undergoing surgery.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>In formulating our questionnaire, we used the input of 3 neurosurgical attendings, focusing on common general patient inquiries regarding surgery. We presented 38 patient questions in web sessions to 3 publicly accessible LLMs: ChatGPT (GPT-4; OpenAI), Claude 2 (Anthropic), and Bard (Google) on August 16, 2023 (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Questions had 4 central themes: the nature and rationale of a surgery, preoperative concerns, procedural aspects, and postoperative considerations. Each reply from the LLMs was reviewed by 2 independent blinded reviewers (MMD and FCO, research fellows with medical doctorates who had not completed postgraduate clinical training). A 5-point Likert scale was used to assess accuracy, relevance, and clarity of responses [<xref ref-type="bibr" rid="ref6">6</xref>]. Emotional sensitivity was evaluated on a 7-point Likert scale to increase discriminatory power [<xref ref-type="bibr" rid="ref7">7</xref>]. Assessment of data normality used the Shapiro-Wilk test. Homogeneity of variances (homoscedasticity) across groups was evaluated via the Levene test. For nonparametric analysis, the Kruskal-Wallis test was used to discern differences among groups. Subsequent pairwise comparisons were facilitated by the post hoc Dunn test. In instances where parametric assumptions were upheld, a 1-way ANOVA was conducted, followed by post hoc analysis with the Tukey honestly significant difference (HSD) test. <italic>P</italic> values from the post hoc analysis were adjusted for multiplicity with Bonferroni correction. Additionally, weighted percentage agreement (WPA) was used to determine agreement between raters. All statistical analyses used Python (version 3.7; Python Software Foundation).</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The study qualified for institutional review board exemption as it exclusively used questions sourced from surgeon input, with no direct patient involvement.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>Shapiro-Wilk testing indicated nonnormality (<italic>P</italic>&#60;.05; <xref ref-type="table" rid="table1">Table 1</xref>) for accuracy, relevance, and clarity scores. Levene testing revealed heteroscedasticity for relevance (<italic>F</italic><sub>2</sub>=5.009; <italic>P</italic>=.01). The Kruskal-Wallis test showed significant differences in the distribution of accuracy (<italic>H</italic>=27.464; <italic>P</italic>&#60;.001), relevance (<italic>H</italic>=29.074; <italic>P</italic>&#60;.001), and clarity (<italic>H</italic>=32.745; <italic>P</italic>&#60;.001). The post hoc Dunn test demonstrated that Claude 2’s responses were significantly more highly rated than ChatGPT’s or Bard’s for accuracy, relevance, and clarity (<italic>P</italic>&#60;.05). There were no significant differences between ChatGPT and Bard except in clarity (<italic>Z</italic>=1.972; <italic>P</italic>=.04). ANOVA showed significant differences in emotional sensitivity (<italic>F</italic><sub>2,111</sub>=10.799; <italic>P</italic>&#60;.001). The post hoc Tukey HSD test revealed significantly higher emotional sensitivity scores for Claude 2 compared to ChatGPT and Bard (<italic>P</italic>&#60;.05). WPA was highest for Claude 2, followed by ChatGPT and Bard (<xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref>).</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Results of normality test (Shapiro-Wilk), homoscedasticity test (Levene), nonparametric test (Kruskal-Wallis), post hoc pairwise comparison of nonparametric data (Dunn test with Bonferroni correction), parametric test (ANOVA), and post hoc pairwise comparison of parametric data (Tukey honestly significant difference [HSD] test with Bonferroni correction).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="30"/>
          <col width="540"/>
          <col width="200"/>
          <col width="200"/>
          <thead>
            <tr valign="top">
              <td colspan="3">Test</td>
              <td>Value</td>
              <td><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="5">
                <bold>Shapiro-Wilk (<italic>W</italic> statistic)</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="4">
                <bold>Accuracy</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT</td>
              <td>0.862</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Claude 2</td>
              <td>0.711</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Bard</td>
              <td>0.87</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="4">
                <bold>Relevance</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT</td>
              <td>0.845</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Claude 2</td>
              <td>0.604</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Bard</td>
              <td>0.917</td>
              <td>.01</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="4">
                <bold>Clarity</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT</td>
              <td>0.886</td>
              <td>.01</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Claude 2</td>
              <td>0.747</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Bard</td>
              <td>0.933</td>
              <td>.02</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="4">
                <bold>Emotional sensitivity</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT</td>
              <td>0.965</td>
              <td>.27</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Claude 2</td>
              <td>0.953</td>
              <td>.11</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Bard</td>
              <td>0.959</td>
              <td>.18</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Levene (<italic>F</italic><sub>2</sub> statistic)</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">Accuracy</td>
              <td>2.144</td>
              <td>.12</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">Relevance</td>
              <td>5.009</td>
              <td>.01</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">Clarity</td>
              <td>1.918</td>
              <td>.15</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">Emotional sensitivity</td>
              <td>0.184</td>
              <td>.83</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Kruskal-Wallis (<italic>H</italic> statistic)</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">Accuracy</td>
              <td>27.363</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">Relevance</td>
              <td>29.074</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">Clarity</td>
              <td>32.745</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Dunn test with Bonferroni (<italic>Z</italic> statistic)</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="4">
                <bold>Accuracy</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT vs Claude 2</td>
              <td>–2.546</td>
              <td>.01</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT vs Bard</td>
              <td>1.56</td>
              <td>.15</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Claude 2 vs Bard</td>
              <td>4.106</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="4">
                <bold>Relevance</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT vs Claude 2</td>
              <td>–2.872</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT vs Bard</td>
              <td>1.235</td>
              <td>.34</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Claude 2 vs Bard</td>
              <td>4.107</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="4">
                <bold>Clarity</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT vs Claude 2</td>
              <td>–2.546</td>
              <td>.01</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>ChatGPT vs Bard</td>
              <td>1.972</td>
              <td>.04</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Claude 2 vs Bard</td>
              <td>4.518</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td colspan="3"><italic>F</italic> statistic (<italic>df</italic>) from ANOVA (for emotional sensitivity)</td>
              <td>10.799 (2,111)</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Tukey HSD test with Bonferroni (emotional sensitivity; <italic>Q</italic> statistic)</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">ChatGPT vs Claude 2</td>
              <td>–0.974</td>
              <td>&#60;.001</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">Bard vs ChatGPT</td>
              <td>0.21</td>
              <td>.60</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="2">Claude 2 vs Bard</td>
              <td>0.763</td>
              <td>.01</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Adjusted percentage average ratings of large language model responses. Adjusted average percentage ratings were calculated as the mean of normalized scores using the following formula to scale responses uniformly from 0% to 100%: adjusted percentage rating = ((actual Likert score – 1) / (Likert scale maximum – 1)) × 100%.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="160"/>
          <col width="140"/>
          <col width="140"/>
          <col width="0"/>
          <col width="140"/>
          <col width="140"/>
          <col width="0"/>
          <col width="140"/>
          <col width="140"/>
          <thead>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">ChatGPT</td>
              <td colspan="3">Claude 2</td>
              <td colspan="2">Bard</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Likert score, mean (SD)</td>
              <td>Adjusted average Likert rating (%), mean (SD)</td>
              <td colspan="2">Likert score, mean (SD)</td>
              <td>Adjusted average Likert rating (%), mean (SD)</td>
              <td colspan="2">Likert score, mean (SD)</td>
              <td>Adjusted average Likert rating (%), mean (SD)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Accuracy</td>
              <td>4.2 (0.55)</td>
              <td>79.93 (13.8)</td>
              <td colspan="2">4.61 (0.58)</td>
              <td>90.13 (14.58)</td>
              <td colspan="2">3.76 (0.85)</td>
              <td>69.08 (21.3)</td>
            </tr>
            <tr valign="top">
              <td>Relevance</td>
              <td>4.28 (0.64)</td>
              <td>81.91 (16.1)</td>
              <td colspan="2">4.76 (0.4)</td>
              <td>94.08 (9.96)</td>
              <td colspan="2">4.04 (0.67)</td>
              <td>75.99 (16.79)</td>
            </tr>
            <tr valign="top">
              <td>Clarity</td>
              <td>4.24 (0.61)</td>
              <td>80.92 (16.1)</td>
              <td colspan="2">4.68 (0.38)</td>
              <td>92.11 (9.38)</td>
              <td colspan="2">3.86 (0.64)</td>
              <td>71.38 (15.89)</td>
            </tr>
            <tr valign="top">
              <td>Emotional sensitivity</td>
              <td>4.49 (1)</td>
              <td>58.11 (16.61)</td>
              <td colspan="2">5.46 (0.92)</td>
              <td>74.34 (15.3)</td>
              <td colspan="2">4.7 (0.97)</td>
              <td>61.62 (16.16)</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Weighted percentage agreement (WPA) point estimates.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="190"/>
          <col width="270"/>
          <col width="270"/>
          <col width="270"/>
          <thead>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>ChatGPT, WPA (95% CI)</td>
              <td>Claude 2, WPA (95% CI)</td>
              <td>Bard, WPA (95% CI)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Accuracy</td>
              <td>80.26 (67.61-92.92)</td>
              <td>86.84 (76.09-97.59)</td>
              <td>71.05 (56.63-85.47)</td>
            </tr>
            <tr valign="top">
              <td>Relevance</td>
              <td>76.32 (62.8-89.83)</td>
              <td>97.37 (92.28-102.46)</td>
              <td>71.05 (56.63-85.47)</td>
            </tr>
            <tr valign="top">
              <td>Clarity</td>
              <td>72.37 (58.15-86.59)</td>
              <td>94.74 (87.64-101.84)</td>
              <td>60.53 (44.98-76.07)</td>
            </tr>
            <tr valign="top">
              <td>Emotional sensitivity</td>
              <td>68.42 (53.64-83.2)</td>
              <td>77.63 (64.38-90.88)</td>
              <td>67.11 (52.17-82.04)</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our investigation revealed potential for using LLMs in patient education. Claude 2 had significantly higher percentage average ratings of above 90% for accuracy (<italic>P</italic>=.004 and <italic>P</italic>&#60;.001), relevance (<italic>P</italic>&#60;.001), and clarity (<italic>P</italic>=.004 and <italic>P</italic>&#60;.001) compared to ChatGPT and Bard. It also scored significantly better on emotional sensitivity than ChatGPT and Bard (<italic>P</italic>&#60;.001 and <italic>P</italic>=.01), with 74.3%. In a study parallel to ours, Sezgin et al [<xref ref-type="bibr" rid="ref8">8</xref>] assessed the clinical accuracy of LLMs in the context of postpartum depression, demonstrating their efficacy in providing clinically accurate information, a finding that complements our study’s illustration of LLMs’ potential in patient education and engagement. By providing accurate and timely information, LLMs can potentially alleviate patient concerns.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>The study’s limitations include the absence of direct patient input when formulating the questionnaire, the lack of repeated zero-shot questioning, which might have revealed variability in responses, and the absence of a dedicated analysis of overtly inaccurate “hallucinations.” The principal challenge for LLM deployment in clinical settings lies in regulatory approval and secure integration within health care systems [<xref ref-type="bibr" rid="ref9">9</xref>]. We are actively conceptualizing a randomized clinical trial controlling for these limitations to investigate LLM and surgeon responses as rated by patients and surgeons.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>While surgeons remain indispensable in patient education, LLMs can potentially serve as a complementary tool, enhancing information delivery and supporting patient-surgeon interactions.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Responses to surgical patient questions.</p>
        <media xlink:href="formative_v8i1e56165_app1.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 88 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">HSD</term>
          <def>
            <p>honestly significant difference</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">WPA</term>
          <def>
            <p>weighted percentage agreement</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>All data generated or analyzed during this study are included in this published article and its multimedia appendix.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>WCW is the guarantor of the study. MMD and WCW led conceptualization, data acquisition, analysis, drafting, and revision of the manuscript. JG and KM contributed to data acquisition, analysis, and drafting. Blinded scoring was performed by MMD and FCO. All authors contributed to analysis, interpretation, and drafting. JWY, AKO, and WCW contributed critical guidance at all stages of the study. The manuscript was reviewed and edited, and its final version was approved, by all authors.</p>
      </fn>
      <fn fn-type="conflict">
        <p>In the past 36 months, AKO has received consulting fees from Medacta and Johnson and Johnson (with no relation to this work) and has served as an E2M ad hoc reviewer for the Journal of Neurosurgery Publishing Group (JNS PG). Additionally, within the same period, JWY has received a grant from Pacira and Johnson and Johnson; has received consulting fees from Medyssey, TrackX, Richard Wolf, and Johnson and Johnson; holds patents planned, issued, or pending with Kinesiometrics (co-founder) and MedCyclops (co-founder); and has served in a leadership role on the Scientific Program Committee of the AANS/CNS Joint Section on Disorders of the Spine and Peripheral Nerves (with no relation to this work). All other authors report no conflicts of interest.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roose</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>The brilliance and weirdness of ChatGPT</article-title>
          <source>New York Times</source>
          <access-date>2024-04-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nytimes.com/2022/12/05/technology/chatgpt-ai-twitter.html">https://www.nytimes.com/2022/12/05/technology/chatgpt-ai-twitter.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davenport</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kalakota</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The potential for artificial intelligence in healthcare</article-title>
          <source>Future Healthc J</source>
          <year>2019</year>
          <month>06</month>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>94</fpage>
          <lpage>98</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2514-6645(24)01059-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.7861/futurehosp.6-2-94</pub-id>
          <pub-id pub-id-type="medline">31363513</pub-id>
          <pub-id pub-id-type="pii">S2514-6645(24)01059-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC6616181</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mofatteh</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Neurosurgery and artificial intelligence</article-title>
          <source>AIMS Neurosci</source>
          <year>2021</year>
          <volume>8</volume>
          <issue>4</issue>
          <fpage>477</fpage>
          <lpage>495</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34877400"/>
          </comment>
          <pub-id pub-id-type="doi">10.3934/Neuroscience.2021025</pub-id>
          <pub-id pub-id-type="medline">34877400</pub-id>
          <pub-id pub-id-type="pii">neurosci-08-04-025</pub-id>
          <pub-id pub-id-type="pmcid">PMC8611194</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wongkietkachorn</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wongkietkachorn</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Rhunsiri</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Preoperative needs-based education to reduce anxiety, increase satisfaction, and decrease time spent in day surgery: a randomized controlled trial</article-title>
          <source>World J Surg</source>
          <year>2018</year>
          <month>03</month>
          <volume>42</volume>
          <issue>3</issue>
          <fpage>666</fpage>
          <lpage>674</lpage>
          <pub-id pub-id-type="doi">10.1007/s00268-017-4207-0</pub-id>
          <pub-id pub-id-type="medline">28875242</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00268-017-4207-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Weinman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dale</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Doctor-patient communication and patient satisfaction: a review</article-title>
          <source>Fam Pract</source>
          <year>1998</year>
          <month>10</month>
          <volume>15</volume>
          <issue>5</issue>
          <fpage>480</fpage>
          <lpage>492</lpage>
          <pub-id pub-id-type="doi">10.1093/fampra/15.5.480</pub-id>
          <pub-id pub-id-type="medline">9848436</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sullivan</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Artino</surname>
              <given-names>AR</given-names>
            </name>
          </person-group>
          <article-title>Analyzing and interpreting data from Likert-type scales</article-title>
          <source>J Grad Med Educ</source>
          <year>2013</year>
          <month>12</month>
          <volume>5</volume>
          <issue>4</issue>
          <fpage>541</fpage>
          <lpage>542</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24454995"/>
          </comment>
          <pub-id pub-id-type="doi">10.4300/JGME-5-4-18</pub-id>
          <pub-id pub-id-type="medline">24454995</pub-id>
          <pub-id pub-id-type="pii">JGME-5-4-18</pub-id>
          <pub-id pub-id-type="pmcid">PMC3886444</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Preston</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Colman</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>Optimal number of response categories in rating scales: reliability, validity, discriminating power, and respondent preferences</article-title>
          <source>Acta Psychol (Amst)</source>
          <year>2000</year>
          <month>03</month>
          <volume>104</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>15</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://core.ac.uk/reader/41996672?utm_source=linkout"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/s0001-6918(99)00050-5</pub-id>
          <pub-id pub-id-type="medline">10769936</pub-id>
          <pub-id pub-id-type="pii">S0001-6918(99)00050-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sezgin</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Chekeni</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Keim</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Clinical accuracy of large language models and Google search responses to postpartum depression questions: cross-sectional study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>09</month>
          <day>11</day>
          <volume>25</volume>
          <fpage>e49240</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e49240/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/49240</pub-id>
          <pub-id pub-id-type="medline">37695668</pub-id>
          <pub-id pub-id-type="pii">v25i1e49240</pub-id>
          <pub-id pub-id-type="pmcid">PMC10520763</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Amisha</surname>
            </name>
            <name name-style="western">
              <surname>Malik</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Pathania</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rathaur</surname>
              <given-names>VK</given-names>
            </name>
          </person-group>
          <article-title>Overview of artificial intelligence in medicine</article-title>
          <source>J Family Med Prim Care</source>
          <year>2019</year>
          <month>07</month>
          <volume>8</volume>
          <issue>7</issue>
          <fpage>2328</fpage>
          <lpage>2331</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jfmpc.com/article.asp?issn=2249-4863;year=2019;volume=8;issue=7;spage=2328;epage=2331;aulast=Amisha%2C"/>
          </comment>
          <pub-id pub-id-type="doi">10.4103/jfmpc.jfmpc_440_19</pub-id>
          <pub-id pub-id-type="medline">31463251</pub-id>
          <pub-id pub-id-type="pii">JFMPC-8-2328</pub-id>
          <pub-id pub-id-type="pmcid">PMC6691444</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
