<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e78138</article-id><article-id pub-id-type="doi">10.2196/78138</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Comparing Human and AI Therapists in Behavioral Activation for Depression: Cross-Sectional Questionnaire Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Napiwotzki</surname><given-names>Inka</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Laue</surname><given-names>Julian</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Caldarone</surname><given-names>Flora</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Idahl</surname><given-names>Maximilian</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hadler</surname><given-names>Uwe</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Amrani</surname><given-names>Haithem</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hildt</surname><given-names>Elisabeth</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kahl</surname><given-names>Kai G</given-names></name><degrees>Prof Dr Med</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nejdl</surname><given-names>Wolfgang</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Clinic for Psychiatry, Social Psychiatry and Psychotherapy, Department of Medizinische Hochschule Hannover</institution><addr-line>Carl-Neuberg-Str 1</addr-line><addr-line>Hannover</addr-line><country>Germany</country></aff><aff id="aff2"><institution>L3S Research Center, Leibniz University Hannover</institution><addr-line>Hannover</addr-line><country>Germany</country></aff><aff id="aff3"><institution>Department of Humanities, Arts, and Social Sciences, Illinois Institute of Technology</institution><addr-line>Chicago</addr-line><addr-line>IL</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib><contrib 
contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hamdan</surname><given-names>Achmad</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Thomas</surname><given-names>Alex Thomas</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Foroutan</surname><given-names>Behzad</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Oloruntoba</surname><given-names>Oluwafemi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Inka Napiwotzki, MSc, Clinic for Psychiatry, Social Psychiatry and Psychotherapy, Department of Medizinische Hochschule Hannover, Carl-Neuberg-Str 1, Hannover, 30625, Germany, 49 511 532 2495; <email>napiwotzki.inka@mh-hannover.de</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>4</day><month>12</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e78138</elocation-id><history><date date-type="received"><day>28</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>25</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>25</day><month>09</month><year>2025</year></date></history><copyright-statement>&#x00A9; Inka Napiwotzki, Julian Laue, Flora Caldarone, Maximilian Idahl, Uwe Hadler, Haithem Amrani, Elisabeth Hildt, Kai G Kahl, Wolfgang Nejdl. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 4.12.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e78138"/><abstract><sec><title>Background</title><p>Large language models (LLMs) have rapidly advanced across numerous fields, including mental health care. A shortage of trained therapists and mental health care providers has driven informal use of LLMs for therapeutic support. However, their clinical utility remains poorly defined.</p></sec><sec><title>Objective</title><p>This study aimed to systematically evaluate and compare the therapeutic knowledge and single-turn response capabilities of LLMs versus psychotherapists in training in the context of behavioral activation (BA) therapy for depression, and to assess how both groups&#x2019; performance changed when provided with structured therapeutic training materials.</p></sec><sec sec-type="methods"><title>Methods</title><p>Six LLMs and 8 human participants completed a questionnaire on depression and BA with 20 multiple-choice items and 10 therapy scenarios, each with 3 open-ended items, that postulated empathic response, use of validation strategies, and theory of mind capabilities. 
Human participants completed the questionnaire before and after a 5-hour workshop and 5-week period with learning materials. The LLMs received identical training content as context during the second test. All open-ended questions were rated on 5-point scales by 2 experts.</p></sec><sec sec-type="results"><title>Results</title><p>At baseline, the LLMs demonstrated higher knowledge scores than human participants (61.0 vs 52.0 out of 100 points) and were rated higher in empathy (<italic>U</italic>=2.0; <italic>P</italic>=.005; <italic>r</italic>=0.917), validation quality (<italic>U</italic>=2.5; <italic>P</italic>=.006; <italic>r</italic>=0.896), anticipation of cognition (<italic>U</italic>=0.0; <italic>P</italic>=.002; <italic>r</italic>=1.000), and anticipation of emotion (<italic>U</italic>=0.0; <italic>P</italic>=.002; <italic>r</italic>=1.000). Following BA training, the LLMs maintained their performance advantage across multiple-choice and open-ended items.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The results suggest that LLMs may generate high-quality therapeutic single-turn responses that integrate clinical knowledge with empathetic communication. The findings hint at LLMs&#x2019; potential as valuable tools in mental health care, although further clinical trials are needed to evaluate their performance in ongoing therapeutic relationships and clinical outcomes.</p></sec></abstract><kwd-group><kwd>large language model</kwd><kwd>mental health</kwd><kwd>behavioral activation</kwd><kwd>empathy</kwd><kwd>digital therapeutics</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Health care systems worldwide face a critical challenge in addressing a growing mental health crisis: while evidence-based treatments like cognitive behavioral therapy (CBT) exist, there is an acute shortage of trained professionals to deliver them. 
This gap affects 1 in 8 people globally who live with mental disorders, with numbers rising since the COVID-19 pandemic. Mental disorders have devastating consequences. Beyond reduced work productivity, affected individuals experience reduced social participation, physical health complications, and premature mortality [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Recent research has demonstrated the potential of large language models (LLMs) in mental health care applications [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. While LLMs offer more sophisticated and natural language understanding capabilities than earlier rule-based systems, their practical implementation in therapeutic contexts remains largely unexplored [<xref ref-type="bibr" rid="ref7">7</xref>]. However, informal therapeutic use of LLMs is already occurring. A recent study of the Replika chatbot (Luka, Inc) found users engaging in therapeutic conversations, with some reporting crisis prevention benefits [<xref ref-type="bibr" rid="ref8">8</xref>]. These findings align with informal user discussions across social media platforms, where individuals frequently describe using general-purpose LLM platforms like ChatGPT (OpenAI) for emotional support and mental health conversations, despite these models not being designed or validated for therapeutic use (Mirzae, T, unpublished data, October 2025). 
This spontaneous adoption, combined with LLMs&#x2019; known risks and susceptibility to errors, underscores the critical and urgent need for rigorous evaluation to ensure their safe and effective application in therapeutic dialogue [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Researchers have explored various approaches to enhance LLMs&#x2019; therapeutic capabilities&#x2014;from fine-tuning models on therapy-specific datasets to applying few-shot learning with therapist-client examples and adapting self-critique techniques [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. However, these studies predominantly relied on automated evaluation methods, often using one LLM to evaluate another [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. This methodological limitation points to the need for comprehensive human expert assessment.</p><p>To address these limitations, we present a systematic evaluation comparing 6 LLMs with 8 psychotherapists in training. Our assessment consists of 2 components. Multiple-choice questions test knowledge on depression, therapy principles, and behavioral activation (BA), an effective therapeutic method within CBT for treating depression [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. We focused on depression as it is one of the most common mental disorders [<xref ref-type="bibr" rid="ref1">1</xref>]. Through open-ended questions, we evaluated responses to client statements, assessing empathy, use of validation strategies, and the ability to anticipate a client&#x2019;s emotions and cognition. We evaluated how single-turn performance changes when LLMs are provided with therapeutic background information about BA principles and techniques, comparing this to the improvement observed in therapists after formal BA training. 
<xref ref-type="fig" rid="figure1">Figure 1</xref> provides a visual overview of our approach. This parallel assessment reveals whether additional context enhances LLM capabilities and quantifies learning effects in human therapists.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Experimental design for comparing psychotherapists in training with large language models (LLMs). Study methodology comparing psychotherapists in training (n=8, top) and LLMs (n=6, bottom) on therapeutic capabilities. Both groups completed identical assessments with multiple-choice knowledge questions and open-ended therapeutic scenarios. Human participants underwent a 5-week learning period including a BA workshop, while LLMs received equivalent information as added context during the posttest assessment. BA: behavioral activation; CBT: cognitive behavioral therapy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e78138_fig01.png"/></fig><p>In summary, our contributions are threefold: (1) a comprehensive evaluation questionnaire combining 20 multiple-choice and 10 open-ended questions that will be publicly available to foster further research in this domain; (2) an evaluation of LLMs in their default, publicly accessible form&#x2014;the way most users currently interact with these systems; and (3) the first direct comparison (to the best of our knowledge) between LLMs and human psychotherapists in training in both therapeutic knowledge and response capabilities.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Participants</title><p>Participants were recruited through the mailing list of the Clinic for Psychiatry, Social Psychiatry, and Psychotherapy, as well as the Educational Institute for Psychotherapy. 
In total, the mailing list consisted of 102 psychotherapists in training; 10 people answered, of whom 2 dropped out of participation because they could not participate on the day of the workshop. Thus, the human sample consisted of 8 psychotherapists in training from Hannover Medical School. Six participants were in their second year of training, 1 in the first year, and 1 in the third year. At the time of this study, 7 participants were actively conducting therapy sessions with outpatients. The sample included 7 female and 1 male participant, with age ranging from 25 to 32 (mean 27.88, SD 2.2) years. At the beginning of the questionnaire, participants were asked to rate their prior knowledge of BA on a scale of 1 to 5. Their mean self-rating was 3 (SD 1.87). The LLM sample consisted of 6 different LLMs. We investigated the most well-known LLMs at the time of conducting the study, namely GPT-4 (OpenAI), GPT-4o (OpenAI), Gemini Pro 1.5 (Google), Claude Opus (Anthropic), Llama-3 70B Instruct (Meta), and Command R+ (Cohere) [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref21">21</xref>]. All models were accessed via OpenRouter [<xref ref-type="bibr" rid="ref22">22</xref>] to enable a unified API interface.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was reviewed and approved by the Research Ethics Committee at the Medical University Hanover (OE 9515; approved on December 18, 2023). All participants were adults (&#x2265;18 years). Written informed consent was obtained from all participants after the aims, procedures, risks, and benefits were explained. Participants could withdraw at any time without penalty. We safeguarded privacy and confidentiality by pseudonymization, secure servers, and restricted access. No identifying information is reported. Participants received credit points for attending the workshop and 250 EUR (US $271.30) compensation. 
The study adhered to the Declaration of Helsinki and applicable local and national regulations.</p></sec><sec id="s2-3"><title>Variables</title><p>In Germany, psychotherapy is conducted by professionals who are trained in scientifically established methods to diagnose, heal, or mitigate disorders with pathological significance [<xref ref-type="bibr" rid="ref23">23</xref>]. We assessed knowledge about depressive disorders, BA, and CBT with and without contextual information to evaluate baseline knowledge and learning capacity in both human therapists and LLMs.</p><p>The client-therapist working alliance represents a critical factor in symptom reduction. Essential therapist characteristics, particularly empathy and theory of mind capabilities&#x2014;the ability to anticipate and comprehend a patient&#x2019;s emotional and cognitive states&#x2014;substantially influence the development of a therapeutic alliance [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref27">27</xref>]. In exploring these capabilities, Rogers [<xref ref-type="bibr" rid="ref28">28</xref>] conceptualized psychotherapeutic empathy as a dual process encompassing both client understanding and the effective communication of this understanding, highlighting its cognitive and emotional components.</p><p>Validation strategies are fundamental communicative and alliance-building tools for affirming client experiences, communicating empathy, and fostering therapeutic engagement [<xref ref-type="bibr" rid="ref29">29</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. 
Our evaluation framework incorporates 6 validation strategies from dialectic behavioral therapy: attentiveness (actively listening and focusing on the client&#x2019;s words and emotions), intermodal communication (rephrasing within the same modality, eg, &#x201C;I was very sad&#x201D; &#x2192; &#x201C;So, you were affected emotionally&#x201D;), crossmodal communication (rephrasing across different modalities), biographical reference (connecting current responses to personal history), present-moment validation (affirming the appropriateness of current feelings or actions), and radical genuineness (sharing personal reflections, eg, &#x201C;I would have felt the same&#x201D;) [<xref ref-type="bibr" rid="ref34">34</xref>]. These strategies provided structured criteria for evaluating validation strategy choice in the open-ended questions, assessing empathic quality, validation effectiveness, and accuracy in estimating fictional patients&#x2019; cognitive and emotional states.</p></sec><sec id="s2-4"><title>Questionnaire</title><p>We designed a pen-and-paper test in German consisting of 20 multiple-choice questions to assess knowledge on depression, practical knowledge on psychotherapy, and specific knowledge on BA. The overall score was calculated by awarding 1 point for each correctly selected or omitted answer and &#x2212;1 point for incorrect selections and omissions. The maximum attainable score was 100 points for all questions, 50 points for knowledge on depression, 10 for practical knowledge, and 40 for specific knowledge on BA.</p><p>The second section entailed 10 case scenarios of psychotherapy sessions with 3 open-ended subquestions each. In the case scenarios, a fictional patient showed a reaction to a certain situation (eg, crying after a difficult therapeutic task could not be completed). First, participants were asked which emotions and cognitions they would anticipate in the patient and provide a rationale for their conclusions. 
In the second subquestion, the participants assumed the role of the therapist and had to outline how they would continue with the therapy. Third, they were asked to formulate an answer to the patient. Responses were rated on 5 dimensions: theory of mind, empathic response, adherence to psychotherapy principles, adherence to BA techniques, and fidelity of the chosen validation strategies. The 5 rating dimensions comprised 87 subquestions in total. With 14 participants, each blinded expert evaluated 1218 individual items (87 subquestions &#x00D7; 14 participants).</p><p>Translated sample items from the open-ended case scenarios are provided in Tables S1-S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The complete questionnaire, including all items in both German and English translation, is available in a public repository [<xref ref-type="bibr" rid="ref35">35</xref>].</p></sec><sec id="s2-5"><title>Procedure</title><p>Human participants completed the initial questionnaire on paper (90 min) before attending a 5-hour training session on BA. The training covered BA therapeutic rationale and methods, with participants receiving handouts containing session materials and fictional therapy transcripts. These transcripts illustrated potential client-therapist challenges and provided supplementary BA information. Participants had 5 weeks to study the materials before completing the questionnaire again. The testing period was April 26 to May 31, 2024.</p><p>For the LLM evaluation, we accessed all models through OpenRouter with temperature set to 0 to ensure reproducibility, while maintaining all other model parameters at their default values. Each interaction began with the system message &#x201C;Du bist ein Experte im Bereich Psychotherapie&#x201D; (&#x201C;You are an expert in psychotherapy&#x201D;), which remained identical for pretest and posttest assessments across all models. 
For multiple-choice questions, models were instructed to list only correct answer options. For the case scenarios, we used a chat-based format where each model&#x2019;s previous responses were preserved as distinct messages in the conversation history, rather than concatenating them into a single prompt. This meant that for each new question, the model had access to the full conversation history including its previous responses within the same case. All LLM responses were generated between April and May 2024. To ensure standardized formatting for the expert review, we manually transferred the model outputs to a Microsoft Word document, correcting any formatting inconsistencies while preserving the original content. This process created a uniform presentation format for the experts&#x2019; blind evaluation.</p><p>The same training materials provided to human participants, including session transcripts and additional BA information, were input as context for the LLMs&#x2019; posttest evaluation. Due to Llama 3.0&#x2019;s limited context window of 8k tokens, it was the only model for which only a condensed summary of the BA training materials was provided. All pretest and posttest responses from both humans and LLMs were randomly aggregated and blindly rated by 2 licensed psychotherapists with expertise in BA. The raters worked independently and were blind to each other&#x2019;s ratings.</p></sec><sec id="s2-6"><title>Analysis</title><p>Adherence, empathy, and goodness of fit of anticipated emotions and cognitions were rated on a scale of 1 (= &#x201C;not complied&#x201D;) to 5 (= &#x201C;completely complied&#x201D;). The mean was calculated for each group (human vs LLM) for the pretest and the posttest over all 10 questions. The theory of mind score is the mean of goodness of fit of anticipated emotions and cognitions. Six validation strategies were predefined from dialectic behavioral therapy, and the experts identified which strategies were applied in the answers. 
Quality of validation was rated on a scale of 1 (= &#x201C;insufficient&#x201D;) to 5 (= &#x201C;optimal&#x201D;), and the total number of applied strategies was counted for each group (human vs LLM). To ensure interrater reliability, the 2 experts jointly developed and agreed upon definitions for each rating level before beginning the evaluation. Statistical analyses used Wilcoxon signed-rank tests for within-group comparisons and Mann-Whitney <italic>U</italic> tests for between-group comparisons with a Bonferroni correction for multiple testing. The rank biserial correlation was used as an effect-size measure for both tests. The intraclass correlation coefficient was calculated to assess agreement between the 2 raters. The raters scored all responses independently and blind to each other&#x2019;s ratings; this was intentional to preserve independent judgment.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p><xref ref-type="table" rid="table1">Table 1</xref> presents multiple-choice test scores for LLMs (n=6) and human psychotherapists in training (n=8) across 3 knowledge domains: depression, general therapy, and BA knowledge. 
At baseline, both groups demonstrated moderate domain knowledge, with LLMs showing numerically higher mean total scores (61.0, SD 7.46 vs 52.0, SD 15.43 points; <italic>U</italic>=16.5; <italic>P</italic>=.37; <italic>r</italic>=0.313).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Mean multiple-choice assessment scores of psychotherapists in training and large language models (LLMs).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category (score range)</td><td align="left" valign="bottom">Pretest (human), mean (SD)</td><td align="left" valign="bottom">Posttest<break/>(human), mean (SD)</td><td align="left" valign="bottom">Pretest<break/>(LLM), mean (SD)</td><td align="left" valign="bottom">Posttest<break/>(LLM), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Depression (0&#x2010;50)</td><td align="left" valign="top">20 (9.43)</td><td align="left" valign="top">22 (9.64)</td><td align="left" valign="top">24.33 (1.8)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">21.67 (3.35)</td></tr><tr><td align="left" valign="top">General knowledge (0&#x2010;10)</td><td align="left" valign="top">7 (2.24)</td><td align="left" valign="top">8 (2)</td><td align="left" valign="top">8.33 (1.37)</td><td align="left" valign="top">10 (0)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">Behavioral activation (0&#x2010;40)</td><td align="left" valign="top">25 (6.24)</td><td align="left" valign="top">24.75 (7.93)</td><td align="left" valign="top">28.33 (6.47)</td><td align="left" valign="top">32.67 (5.62)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">Total (0&#x2010;100)</td><td align="left" valign="top">52 (15.43)</td><td align="left" valign="top">54.75 (16.31)</td><td align="left" valign="top">61 (7.46)</td><td align="left" 
valign="top">64.33 (8.9)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Values indicate highest scores in each category. There were no statistically significant differences between groups or between pretest and posttest scores at <italic>P</italic>&#x003C;.00625. </p></fn></table-wrap-foot></table-wrap><p>Following BA training, both groups demonstrated improvements under their respective testing conditions. Human participants, tested after a 5-week study period, showed a modest increase in mean total scores (52.0, SD 15.43 vs 54.75, SD 16.31 points; <italic>W</italic>=16.0; <italic>P</italic>=.84; <italic>r</italic>=0.111) with gains in depression knowledge (20.0 vs 22.0 points; <italic>W</italic>=9.0; <italic>P</italic>=.40; <italic>r</italic>=0.500). LLMs, evaluated with training materials as additional context, similarly showed enhanced performance with total scores improving (61.0, SD 7.46 vs 64.33, SD 8.9 points; <italic>W</italic>=5.0; <italic>P</italic>=.49; <italic>r</italic>=0.524), attained maximum scores in general therapy knowledge (8.33, SD 1.37 vs 10.0, SD 0.0 points; <italic>W</italic>=0; <italic>P</italic>=.06; <italic>r</italic>=1.000), and showed improvement in BA knowledge (28.33, SD 6.47 vs 32.67, SD 5.62 points; <italic>W</italic>=1.0; <italic>P</italic>=.08; <italic>r</italic>=0.905), though their depression knowledge showed a slight decrease (24.33, SD 1.8 vs 21.67, SD 3.35 points; <italic>W</italic>=4.0; <italic>P</italic>=.34; <italic>r</italic>=&#x2212;0.619). Notably, while LLMs improved markedly in BA knowledge, human performance in this domain remained stable (25.0, SD 6.24 vs 24.75, SD 7.93 points; <italic>W</italic>=15.0; <italic>P</italic>=.74; <italic>r</italic>=&#x2212;0.167). We observed no significant difference between pretest and posttest scores within either group. 
All statistical comparisons used a significance threshold of <italic>P</italic>&#x003C;.00625.</p><p><xref ref-type="fig" rid="figure2">Figure 2</xref> shows multiple-choice pretest and posttest scores for the 6 LLMs. Proprietary models demonstrated higher performance both before and after integrating additional context, with mean scores improving from 63.0 (SD 7.68) to 70.5 (SD 1.61) points, while open-source models declined from 57.0 (SD 5) to 52.0 (SD 2) points. The limited sample size of 4 proprietary and 2 open-source models restricts formal statistical analysis, as the Mann-Whitney <italic>U</italic> test would yield a minimum <italic>P</italic>=.13, and the Wilcoxon signed-rank tests within groups would lead to a minimum 2-sided <italic>P</italic> value of .25 (proprietary) and .50 (open source). Both exceed conventional significance thresholds. Nonetheless, our data reveal distinct performance patterns among the proprietary models (solid bars). GPT-4 and GPT-4o improved from 66.0 to 72.0 points, Gemini Pro 1.5 showed the largest gain (50.0 vs 68.0 points), and Claude Opus remained at 70.0 points, all converging to 68 to 72 points at posttest. In contrast, both open-source models (hatched bars) showed declining scores, with Llama-3 70B Instruct falling from 62.0 to 54.0 points and Command R+ from 52.0 to 50.0 points. This preliminary observation indicates a performance difference, with proprietary models scoring 18.5 points higher at posttest than open-source alternatives.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Multiple-choice assessment scores of large language models (LLMs) before and after adding behavioral activation context. 
Performance comparison of proprietary (solid bars) and open-source (hatched bars) LLMs on the multiple-choice knowledge assessment before (pretest) and after (posttest) receiving BA training materials as additional context.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e78138_fig02.png"/></fig><p>In addition to the multiple-choice assessment, an open-ended questionnaire was used to evaluate more nuanced therapeutic skills. <xref ref-type="table" rid="table2">Table 2</xref> shows the average scores for human and LLM responses. Across all pretest and posttest evaluations, LLMs achieved higher average scores than human participants on 5 psychological metrics (rated on a scale of 1 to 5). Pretest between-group comparisons showed significantly higher LLM performance for anticipation of cognition (<italic>U</italic>=0.0; <italic>P</italic>=.002; <italic>r</italic>=1.000) and anticipation of emotion (<italic>U</italic>=0.0; <italic>P</italic>=.002; <italic>r</italic>=1.000), with nonsignificant trends for adherence (<italic>U</italic>=4.0; <italic>P</italic>=.008; <italic>r</italic>=0.833), empathy (<italic>U</italic>=2.0; <italic>P</italic>=.005; <italic>r</italic>=0.917), and validation quality (<italic>U</italic>=2.5; <italic>P</italic>=.006; <italic>r</italic>=0.896). Both groups demonstrated improvements from pretest to posttest in adherence, empathy, and validation quality, with the largest gains in adherence (humans: 2.79, SD 0.14 vs 3.54, SD 0.43; LLMs: 3.31, SD 0.12 vs 3.96, SD 0.06). Within-group analyses revealed no significant improvements after correction, though adherence showed trends for both humans (<italic>W</italic>=1.0; <italic>P</italic>=.02; <italic>r</italic>=0.944) and LLMs (<italic>W</italic>=0.0; <italic>P</italic>=.03; <italic>r</italic>=1.000). 
At posttest, between-group differences were significant for adherence (<italic>U</italic>=2.0; <italic>P</italic>=.003; <italic>r</italic>=0.917) and empathy (<italic>U</italic>=1.5; <italic>P</italic>=.004; <italic>r</italic>=0.938), with nonsignificant trends for the other metrics after correction. LLMs exhibited decreases in anticipation of cognition (4.54, SD 0.30 vs 4.43, SD 0.42) and anticipation of emotion (4.69, SD 0.12 vs 4.40, SD 0.37).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Mean assessment scores from the open-ended question evaluation of psychotherapists in training and large language models (LLMs).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Pretest (human), mean (SD)</td><td align="left" valign="bottom">Posttest<break/>(human), mean (SD)</td><td align="left" valign="bottom">Pretest<break/>(LLM), mean (SD)</td><td align="left" valign="bottom">Posttest<break/>(LLM), mean (SD)</td><td align="left" valign="bottom">ICC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Adherence</td><td align="left" valign="top">2.79 (0.14)</td><td align="left" valign="top">3.54 (0.09)</td><td align="left" valign="top">3.31 (0.12)</td><td align="left" valign="top">3.96 (0.06)<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.55</td></tr><tr><td align="left" valign="top">Empathy</td><td align="left" valign="top">3.14 (0.43)</td><td align="left" valign="top">3.64 (0.29)</td><td align="left" valign="top">4.33 (0.09)</td><td align="left" valign="top">4.35 (0.26)<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.78</td></tr><tr><td align="left" valign="top">Validation quality</td><td align="left" valign="top">2.78 (0.22)</td><td align="left" valign="top">3.43 (0.29)</td><td 
align="left" valign="top">3.78 (0.14)</td><td align="left" valign="top">4.08 (0.13)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.60</td></tr><tr><td align="left" valign="top">Anticipation of cognition</td><td align="left" valign="top">3.53 (0.26)</td><td align="left" valign="top">4.01 (0.26)</td><td align="left" valign="top">4.54 (0.30)<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">4.43 (0.42)</td><td align="left" valign="top">0.67</td></tr><tr><td align="left" valign="top">Anticipation of emotion</td><td align="left" valign="top">3.36 (0.16)</td><td align="left" valign="top">3.80 (0.24)</td><td align="left" valign="top">4.69 (0.13)<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">4.40 (0.37)</td><td align="left" valign="top">0.76</td></tr><tr><td align="left" valign="top">Average across categories</td><td align="left" valign="top">3.18 (0.10)</td><td align="left" valign="top">3.72 (0.17)</td><td align="left" valign="top">4.21 (0.07)</td><td align="left" valign="top">4.28 (0.14)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.67</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>ICC: intraclass correlation coefficient.</p></fn><fn id="table2fn2"><p><sup>b</sup>Indicates a significant difference between human and LLM scores at the same testing point (<italic>P</italic>&#x003C;.005).</p></fn><fn id="table2fn3"><p><sup>c</sup>Values represent the highest score in each category across all conditions.</p></fn></table-wrap-foot></table-wrap><p>Validation strategy use showed broadly comparable patterns between LLMs and humans, with both groups reducing overall use by approximately 5%. 
Statistical analyses used Wilcoxon signed-rank tests for within-group comparisons and Mann-Whitney <italic>U</italic> tests for between-group comparisons, with Bonferroni correction applied (significance threshold <italic>P</italic>&#x003C;.004). V1 (attentiveness) remained stable across both groups with no significant within-group changes. Both groups demonstrated similar patterns for V2 (intermodal), V3 (crossmodal), and V5 (present). V3 showed the largest increases in both groups, rising 133% for humans and 160% for LLMs, though within-group analyses revealed only trends toward significance (humans: <italic>W</italic>=3.0; <italic>P</italic>=.03; <italic>r</italic>=0.833; LLMs: <italic>W</italic>=3.0; <italic>P</italic>=.11; <italic>r</italic>=0.714). V5 decreased substantially in both groups, dropping 57% for humans and 67% for LLMs (humans: <italic>W</italic>=5.0; <italic>P</italic>=.07; <italic>r</italic>=&#x2212;0.722; LLMs: <italic>W</italic>=1.5; <italic>P</italic>=.09; <italic>r</italic>=&#x2212;0.857). The clearest divergence appeared in V6 (radical genuineness), where humans increased use by 38% while LLMs decreased by 14% (humans: <italic>W</italic>=12.0; <italic>P</italic>=.39; <italic>r</italic>=0.333; LLMs: <italic>W</italic>=7.5; <italic>P</italic>=.52; <italic>r</italic>=&#x2212;0.286). Between-group comparisons at T1 (postintervention) showed a trend toward difference for V4 (biography; <italic>U</italic>=7.0; <italic>P</italic>=.01; <italic>r</italic>=0.708), although this did not survive Bonferroni correction. 
After correction, no comparisons reached statistical significance, indicating these patterns should be interpreted as trends rather than definitive effects (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Number of applied validation strategies of psychotherapists in training and large language models (LLMs).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Validation strategy</td><td align="left" valign="bottom">Pretest (human), n</td><td align="left" valign="bottom">Posttest<break/>(human), n</td><td align="left" valign="bottom">Change in human responses (%)</td><td align="left" valign="bottom">Pretest<break/>(LLM), n</td><td align="left" valign="bottom">Posttest<break/>(LLM), n</td><td align="left" valign="bottom">Change in LLM responses (%)</td></tr></thead><tbody><tr><td align="left" valign="top">V1 (attentiveness)</td><td align="left" valign="top">39</td><td align="left" valign="top">39</td><td align="left" valign="top">0</td><td align="left" valign="top">30</td><td align="left" valign="top">30</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">V2 (intermodal communication)</td><td align="left" valign="top">11</td><td align="left" valign="top">6</td><td align="left" valign="top">&#x2212;45</td><td align="left" valign="top">11</td><td align="left" valign="top">8</td><td align="left" valign="top">&#x2212;27</td></tr><tr><td align="left" valign="top">V3 (crossmodal communication)</td><td align="left" valign="top">6</td><td align="left" valign="top">14</td><td align="left" valign="top">+133</td><td align="left" valign="top">5</td><td align="left" valign="top">13</td><td align="left" valign="top">+160</td></tr><tr><td align="left" valign="top">V4 (reference to client biography)</td><td align="left" valign="top">10</td><td align="left" valign="top">8</td><td align="left" valign="top">&#x2212;20</td><td 
align="left" valign="top">10</td><td align="left" valign="top">11</td><td align="left" valign="top">+10</td></tr><tr><td align="left" valign="top">V5 (reference to the present)</td><td align="left" valign="top">14</td><td align="left" valign="top">6</td><td align="left" valign="top">&#x2212;57</td><td align="left" valign="top">12</td><td align="left" valign="top">4</td><td align="left" valign="top">&#x2212;67</td></tr><tr><td align="left" valign="top">V6 (radical genuineness)</td><td align="left" valign="top">8</td><td align="left" valign="top">11</td><td align="left" valign="top">+38</td><td align="left" valign="top">14</td><td align="left" valign="top">12</td><td align="left" valign="top">&#x2212;14</td></tr><tr><td align="left" valign="top">Sum over all strategies</td><td align="left" valign="top">88</td><td align="left" valign="top">84</td><td align="left" valign="top">&#x2212;5</td><td align="left" valign="top">82</td><td align="left" valign="top">78</td><td align="left" valign="top">&#x2212;5</td></tr></tbody></table></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study set out to evaluate whether LLMs could demonstrate therapeutic competencies comparable to psychotherapists in training, particularly in the domains of knowledge acquisition, therapeutic alliance building, and empathic communication. Consistent with recent studies showing that users perceive LLMs like Replika and ChatGPT as emotionally supportive despite not yet being designed for therapy [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref36">36</xref>], our findings indicated that LLMs received moderately higher ratings in certain therapeutic skills, including empathetic communication and anticipation of emotional and cognitive states. 
Of particular interest was the LLMs&#x2019; performance in domains traditionally associated with human capabilities: empathetic communication and emotional understanding. The observed differences between LLMs and human trainees in pretest expert-rated empathy (<italic>U</italic>=2.0; <italic>P</italic>=.005; <italic>r</italic>=0.917), anticipation of cognition (<italic>U</italic>=0.0; <italic>P</italic>=.002; <italic>r</italic>=1.000), and anticipation of emotion (<italic>U</italic>=0.0; <italic>P</italic>=.002; <italic>r</italic>=1.000) reveal an important distinction between producing seemingly empathetic responses and the processes behind them. Although LLMs do not experience emotions, they generated responses that trained evaluators rated as empathetic and emotionally aware, similar to human trainees. While these findings echo prior reports that LLMs can simulate empathic communication [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref12">12</xref>], they also raise questions of whether simulated empathy translates into effective therapeutic alliances with real patients, indicating the necessity for future research on how patients experience and respond to therapeutic interactions with LLM systems compared with human therapists.</p><p>The convergence of proprietary models&#x2019; performance on the multiple-choice assessment after exposure to training materials, reaching scores between 68 and 72 points, aligns with research findings suggesting that LLMs can rapidly integrate structured therapeutic information through few-shot prompting or fine-tuning [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>] and may indicate similar underlying approaches to processing and integrating therapeutic knowledge across different proprietary LLMs. 
In contrast, the declining performance of open-source models points to potential limitations in their ability to effectively integrate and apply additional context while maintaining consistent performance for the investigated models.</p><p>These differences between proprietary and open-source models have important implications for potential therapeutic applications. The stability and convergence of proprietary models suggest they may offer more reliable performance in therapeutic contexts. However, their proprietary nature raises questions about implementation costs in real-world mental health applications, as health care providers would need to rely on more costly external APIs. The performance gap between proprietary and open-source models might present a trade-off between reliability and cost that requires careful consideration in the development of LLM-assisted mental health interventions.</p><p>Our analysis of validation strategies revealed important insights into their use patterns in both LLMs and human trainees. Across both human and LLM responses, the fundamental strategy of attentiveness remained consistent, aligning with its essential role in therapeutic alliance formation as established in previous research [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. Crossmodal and intermodal communication strategies were rarely used simultaneously, likely due to the constrained nature of single-response evaluations. Subsequent training led to an increase in crossmodal communication, suggesting successful integration when explicitly emphasized in training materials. 
However, the observed decrease in present-moment references may indicate a shift toward more structured therapeutic interventions, as participants potentially prioritized demonstrating newly acquired technical skills over moment-to-moment emotional attunement.</p></sec><sec id="s4-2"><title>Limitations</title><p>While our findings demonstrate strong performance from LLMs across multiple therapeutic competencies, these results should be interpreted within several important constraints. First, our evaluation was limited to single-turn therapeutic responses, affecting ecological validity, since real therapeutic relationships require maintaining consistent dialogue across sessions, building upon accumulated client information, and adapting therapeutic approaches as client needs evolve. Future research needs to examine LLMs&#x2019; performance in multi-turn therapeutic dialogues, their ability to maintain consistent client understanding across sessions, and their adaptation to changing therapeutic needs. Additionally, the sample of 8 psychotherapists in training is too small to draw reliable inferences regarding the broader population of psychotherapists in training. The testing material should be administered to larger samples of psychotherapists and psychotherapists in training. More expert ratings might also enhance variability and thus offer more scope for interpretation.</p><p>The questionnaire was conducted in a paper-and-pencil format, which might account for shorter answers in the human sample, thus affecting the rating. The rapid evolution of AI models also affects the longevity of our findings. Even during this study&#x2019;s preparation, more sophisticated models like Anthropic&#x2019;s Claude 4.0 Sonnet and OpenAI&#x2019;s GPT-5 were released, highlighting how quickly such comparative analyses can become outdated. 
The development of standardized evaluation frameworks that can account for the rapid evolution of LLM capabilities while maintaining rigorous assessment standards will be crucial for ongoing comparative analyses. Such frameworks should particularly focus on longitudinal therapeutic interactions and consistent application of therapeutic principles over time.</p><p>Another key methodological limitation affects our ability to make direct comparisons. The LLMs had access to all training materials during testing, while human participants had to rely on retained knowledge from their training period. This created an asymmetric testing environment similar to comparing open-book and closed-book exam performance. An open-book examination should be conducted and might also reflect a more realistic scenario for psychotherapists in training, since it is closer to a real-life therapeutic session, where therapists have access to literature, data, and the possibility to look up information if they need to.</p></sec><sec id="s4-3"><title>Conclusions</title><p>This study provides evidence that LLMs can generate high-quality, single-turn therapeutic responses that effectively combine clinical knowledge with empathetic communication. Our findings reveal that LLMs often matched psychotherapists in training in knowledge assessments and practical therapeutic alliance-building skills. While LLMs cannot experience genuine empathy, their ability to produce responses rated as empathetic by licensed psychotherapists highlights the distinction between internal emotional experience and effective therapeutic communication.</p><p>Our findings suggest clinical applications for LLMs as supportive tools in mental health care, potentially addressing the critical shortage of trained professionals and demand in low-resource settings. They could supplement care or expand access to mental health care. 
However, such applications must be guided by rigorous safeguards, as the distinction between simulated and genuine human empathy has direct implications for therapeutic authenticity and patient outcomes.</p><p>Differences between proprietary and open-source LLMs emphasize issues of accessibility, equity, and sustainability. Though our results carefully suggest that proprietary models offer more reliable performance, their closed nature and cost structures risk exacerbating inequalities in access to high-quality digital mental health tools. Addressing these trade-offs will be critical in planning AI integration.</p><p>As mental health care faces growing demand, LLMs may serve as valuable supplements to human expertise, though their implementation requires careful consideration of ethical implications, therapeutic authenticity, and clinical outcomes.</p></sec></sec></body><back><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available in the GitHub repository cited in the Questionnaire section.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BA</term><def><p>behavioral activation</p></def></def-item><def-item><term id="abb2">CBT</term><def><p>cognitive behavioral therapy</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Mental disorders</article-title><source>World Health Organization</source><access-date>2025-02-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/news-room/fact-sheets/detail/mental-disorders">https://www.who.int/news-room/fact-sheets/detail/mental-disorders</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation 
citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Bloom</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Cafiero</surname><given-names>ET</given-names> </name><name name-style="western"><surname>Jan&#x00E9;-Llopis</surname><given-names>E</given-names> </name><etal/></person-group><article-title>The global economic burden of non-communicable diseases</article-title><source>World Economic Forum</source><year>2011</year><month>09</month><access-date>2025-11-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www3.weforum.org/docs/WEF_Harvard_HE_GlobalEconomicBurdenNonCommunicableDiseases_2011.pdf">https://www3.weforum.org/docs/WEF_Harvard_HE_GlobalEconomicBurdenNonCommunicableDiseases_2011.pdf</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cambridge</surname><given-names>OR</given-names> </name><name name-style="western"><surname>Knight</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Mills</surname><given-names>N</given-names> </name><name name-style="western"><surname>Baune</surname><given-names>BT</given-names> </name></person-group><article-title>The clinical relationship between cognitive impairment and psychosocial functioning in major depressive disorder: a systematic review</article-title><source>Psychiatry Res</source><year>2018</year><month>11</month><volume>269</volume><fpage>157</fpage><lpage>171</lpage><pub-id pub-id-type="doi">10.1016/j.psychres.2018.08.033</pub-id><pub-id pub-id-type="medline">30149273</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rajan</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>McKee</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rangarajan</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Association of symptoms of depression with cardiovascular disease and mortality in low-, middle-, and high-income countries</article-title><source>JAMA Psychiatry</source><year>2020</year><month>10</month><day>1</day><volume>77</volume><issue>10</issue><fpage>1052</fpage><lpage>1063</lpage><pub-id pub-id-type="doi">10.1001/jamapsychiatry.2020.1351</pub-id><pub-id pub-id-type="medline">32520341</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sharma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>IW</given-names> </name><name name-style="western"><surname>Miner</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Atkins</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Althoff</surname><given-names>T</given-names> </name></person-group><article-title>Human&#x2013;AI collaboration enables more empathic conversations in text-based peer-to-peer mental health support</article-title><source>Nat Mach Intell</source><year>2023</year><volume>5</volume><issue>1</issue><fpage>46</fpage><lpage>57</lpage><pub-id pub-id-type="doi">10.1038/s42256-022-00593-2</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Leonte</surname><given-names>KG</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>ML</given-names> </name><etal/></person-group><article-title>Large language models outperform mental and 
medical health care professionals in identifying obsessive-compulsive disorder</article-title><source>NPJ Digit Med</source><year>2024</year><month>07</month><day>19</day><volume>7</volume><issue>1</issue><fpage>193</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01181-x</pub-id><pub-id pub-id-type="medline">39030292</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abd-Alrazaq</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Alajlani</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ali</surname><given-names>N</given-names> </name><name name-style="western"><surname>Denecke</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bewick</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Househ</surname><given-names>M</given-names> </name></person-group><article-title>Perceptions and opinions of patients about mental health chatbots: scoping review</article-title><source>J Med Internet Res</source><year>2021</year><month>01</month><day>13</day><volume>23</volume><issue>1</issue><fpage>e17828</fpage><pub-id pub-id-type="doi">10.2196/17828</pub-id><pub-id pub-id-type="medline">33439133</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maples</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cerit</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vishwanath</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pea</surname><given-names>R</given-names> </name></person-group><article-title>Loneliness and suicide mitigation for students using GPT3-enabled chatbots</article-title><source>Npj Ment Health 
Res</source><year>2024</year><month>01</month><day>22</day><volume>3</volume><issue>1</issue><fpage>4</fpage><pub-id pub-id-type="doi">10.1038/s44184-023-00047-6</pub-id><pub-id pub-id-type="medline">38609517</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallegos</surname><given-names>IO</given-names> </name><name name-style="western"><surname>Rossi</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Barrow</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Bias and fairness in large language models: a survey</article-title><source>Comput Linguist</source><year>2024</year><month>09</month><day>1</day><volume>50</volume><issue>3</issue><fpage>1097</fpage><lpage>1179</lpage><pub-id pub-id-type="doi">10.1162/coli_a_00524</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kankanhalli</surname><given-names>M</given-names> </name></person-group><article-title>Hallucination is inevitable: an innate limitation of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 22, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.11817</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Na</surname><given-names>H</given-names> </name></person-group><article-title>CBT-LLM: a Chinese large language model for cognitive behavioral therapy-based mental health question answering</article-title><source>arXiv</source><comment>Preprint posted 
online on  Mar 24, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.16008</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Han</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Koh</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Seo</surname><given-names>HT</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>DS</given-names> </name></person-group><article-title>Enhancing psychotherapy counseling: a data augmentation pipeline leveraging large language models for counseling conversations</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 13, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.08718</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Bouneffouf</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cecchi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Varshney</surname><given-names>K</given-names> </name></person-group><article-title>Towards healthy AI: large language models need therapists too</article-title><access-date>2025-11-10</access-date><conf-name>Proceedings of the 4th Workshop on Trustworthy Natural Language Processing (TrustNLP '24)</conf-name><conf-date>Jun 21-22, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><fpage>61</fpage><lpage>70</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://aclanthology.org/2024.trustnlp-1">https://aclanthology.org/2024.trustnlp-1</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Li</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>T</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name></person-group><article-title>Chatcounselor: a large language models for mental health support</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 27, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.15461</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tindall</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mikocka-Walus</surname><given-names>A</given-names> </name><name name-style="western"><surname>McMillan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hewitt</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gascoyne</surname><given-names>S</given-names> </name></person-group><article-title>Is behavioural activation effective in the treatment of depression in young people? 
A systematic review and meta-analysis</article-title><source>Psychol Psychother</source><year>2017</year><month>12</month><volume>90</volume><issue>4</issue><fpage>770</fpage><lpage>796</lpage><pub-id pub-id-type="doi">10.1111/papt.12121</pub-id><pub-id pub-id-type="medline">28299896</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Uphoff</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ekers</surname><given-names>D</given-names> </name><name name-style="western"><surname>Robertson</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Behavioural activation therapy for depression in adults</article-title><source>Cochrane Database Syst Rev</source><year>2020</year><month>07</month><day>6</day><volume>7</volume><issue>7</issue><fpage>CD013305</fpage><pub-id pub-id-type="doi">10.1002/14651858.CD013305.pub2</pub-id><pub-id pub-id-type="medline">32628293</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on Mar 4, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Georgiev</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>VI</given-names> </name><etal/></person-group><article-title>Gemini 1.5: unlocking multimodal understanding across millions of tokens of 
context</article-title><source>arXiv</source><comment>Preprint posted online on Mar 8, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.05530</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>Introducing the next generation of Claude</article-title><source>Anthropic PBC</source><access-date>2025-11-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/news/claude-3-family">https://www.anthropic.com/news/claude-3-family</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>Introducing Command R+: a scalable LLM built for business</article-title><source>Cohere</source><access-date>2025-02-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cohere.com/blog/command-r-plus-microsoft-azure">https://cohere.com/blog/command-r-plus-microsoft-azure</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>The unified interface for LLMs</article-title><source>OpenRouter</source><access-date>2025-11-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openrouter.ai/">https://openrouter.ai/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation 
citation-type="web"><article-title>PsychThG - Gesetz &#x00FC;ber den Beruf der Psychotherapeutin und des Psychotherapeuten</article-title><source>Bundesministerium der Justiz und f&#x00FC;r Verbraucherschutz</source><access-date>2025-02-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.gesetze-im-internet.de/psychthg_2020/BJNR160410019.html">https://www.gesetze-im-internet.de/psychthg_2020/BJNR160410019.html</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Del Re</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Fl&#x00FC;ckiger</surname><given-names>C</given-names> </name><name name-style="western"><surname>Horvath</surname><given-names>AO</given-names> </name><name name-style="western"><surname>Wampold</surname><given-names>BE</given-names> </name></person-group><article-title>Examining therapist effects in the alliance-outcome relationship: a multilevel meta-analysis</article-title><source>J Consult Clin Psychol</source><year>2021</year><month>05</month><volume>89</volume><issue>5</issue><fpage>371</fpage><lpage>378</lpage><pub-id pub-id-type="doi">10.1037/ccp0000637</pub-id><pub-id pub-id-type="medline">33829817</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wampold</surname><given-names>BE</given-names> </name><name name-style="western"><surname>Fl&#x00FC;ckiger</surname><given-names>C</given-names> </name></person-group><article-title>The alliance in mental health care: conceptualization, evidence and clinical applications</article-title><source>World Psychiatry</source><year>2023</year><month>02</month><volume>22</volume><issue>1</issue><fpage>25</fpage><lpage>41</lpage><pub-id pub-id-type="doi">10.1002/wps.21035</pub-id><pub-id 
pub-id-type="medline">36640398</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name></person-group><article-title>Therapist&#x2019;s empathy, attachment, and therapeutic alliance: neurobiological perspective</article-title><source>Int J Psychol Behav Anal</source><year>2018</year><volume>4</volume><issue>1</issue><fpage>1</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.15344/2455-3867/2018/140</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McClintock</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Anderson</surname><given-names>T</given-names> </name><name name-style="western"><surname>Patterson</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Wing</surname><given-names>EH</given-names> </name></person-group><article-title>Early psychotherapeutic empathy, alliance, and client outcome: preliminary evidence of indirect effects</article-title><source>J Clin Psychol</source><year>2018</year><month>06</month><volume>74</volume><issue>6</issue><fpage>839</fpage><lpage>848</lpage><pub-id pub-id-type="doi">10.1002/jclp.22568</pub-id><pub-id pub-id-type="medline">29364509</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rogers</surname><given-names>CR</given-names> </name></person-group><article-title>Empathic: an unappreciated way of being</article-title><source>Couns Psychol</source><year>1975</year><month>06</month><volume>5</volume><issue>2</issue><fpage>2</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1177/001100007500500202</pub-id></nlm-citation></ref><ref 
id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Klee</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Abeles</surname><given-names>N</given-names> </name><name name-style="western"><surname>Muller</surname><given-names>RT</given-names> </name></person-group><article-title>Therapeutic alliance: early indicators, course, and outcome</article-title><source>Psychotherapy (Chic)</source><year>1990</year><volume>27</volume><issue>2</issue><fpage>166</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.1037/0033-3204.27.2.166</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Horvath</surname><given-names>AO</given-names> </name></person-group><article-title>The therapeutic relationship: from transference to alliance</article-title><source>J Clin Psychol</source><year>2000</year><volume>56</volume><issue>2</issue><fpage>163</fpage><lpage>173</lpage><pub-id pub-id-type="doi">10.1002/(sici)1097-4679(200002)56:2&#x003C;163::aid-jclp3&#x003E;3.0.co;2-d</pub-id><pub-id pub-id-type="medline">10718600</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Horvath</surname><given-names>AO</given-names> </name></person-group><article-title>The therapeutic alliance: concepts, research and training</article-title><source>Aust Psychol</source><year>2001</year><month>07</month><day>1</day><volume>36</volume><issue>2</issue><fpage>170</fpage><lpage>176</lpage><pub-id pub-id-type="doi">10.1080/00050060108259650</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ackerman</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Hilsenroth</surname><given-names>MJ</given-names> </name></person-group><article-title>A review of therapist characteristics and techniques positively impacting the therapeutic alliance</article-title><source>Clin Psychol Rev</source><year>2003</year><month>02</month><volume>23</volume><issue>1</issue><fpage>1</fpage><lpage>33</lpage><pub-id pub-id-type="doi">10.1016/s0272-7358(02)00146-0</pub-id><pub-id pub-id-type="medline">12559992</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Summers</surname><given-names>RF</given-names> </name><name name-style="western"><surname>Barber</surname><given-names>JP</given-names> </name></person-group><article-title>Therapeutic alliance as a measurable psychotherapy skill</article-title><source>Acad Psychiatry</source><year>2003</year><volume>27</volume><issue>3</issue><fpage>160</fpage><lpage>165</lpage><pub-id pub-id-type="doi">10.1176/appi.ap.27.3.160</pub-id><pub-id pub-id-type="medline">12969839</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="book"><person-group person-group-type="editor"><name name-style="western"><surname>Sutor</surname><given-names>M</given-names> </name></person-group><source>Die Dialektisch Behaviorale Therapie (DBT): Neue DBT-Orientierte Diagnose&#x00FC;bergreifende Konzepte - Schwerpunkt Skills-Training</source><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-3-662-64627-4</pub-id><pub-id pub-id-type="other">9783662646274</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>Single turn evaluation of LLMs and human therapists trainees in BA knowledge and 
response</article-title><source>GitHub</source><access-date>2025-11-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/Julianlaue/Single-Turn-Evaluation-of-LLMs-and-human-therapists-trainees-in-BA-Knowledge-and-Response">https://github.com/Julianlaue/Single-Turn-Evaluation-of-LLMs-and-human-therapists-trainees-in-BA-Knowledge-and-Response</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tilley</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Besada</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xiang</surname><given-names>Y</given-names> </name></person-group><article-title>&#x201C;Shaping ChatGPT into my digital therapist&#x201D;: a thematic analysis of social media discourse on using generative artificial intelligence for mental health</article-title><source>Digit Health</source><year>2025</year><volume>11</volume><fpage>20552076251351088</fpage><pub-id pub-id-type="doi">10.1177/20552076251351088</pub-id><pub-id pub-id-type="medline">40656852</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Responses of Claude Opus to open-ended questions of case scenarios 2, 6, 7, and 10.</p><media xlink:href="formative_v9i1e78138_app1.docx" xlink:title="DOCX File, 22 KB"/></supplementary-material></app-group></back></article>