<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e89077</article-id><article-id pub-id-type="doi">10.2196/89077</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Fine-Tuning Large Language Models for Motivational Interviewing in Health Behavior Change: Development and Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Hu</surname><given-names>Runze</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Yang</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Yihang</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kong</surname><given-names>Jingqi</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Luo</surname><given-names>Jiahui</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Wenyu</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Jing</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Jingyao</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zeng</surname><given-names>Huiqun</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Lei</given-names></name><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Liu</surname><given-names>Zheng</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Maternal and Child Health, School of Public Health, Peking University</institution><addr-line>No. 
38 Xueyuan Road, Haidian District</addr-line><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff2"><institution>Peking University China Center for Health Development Studies, School of Public Health, Peking University</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff3"><institution>Taomi AI4Health Lab</institution><addr-line>Beijing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Steenstra</surname><given-names>Ivan</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Balan</surname><given-names>Ivan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Karnes</surname><given-names>Sasha</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Zheng Liu, Department of Maternal and Child Health, School of Public Health, Peking University, No. 38 Xueyuan Road, Haidian District, Beijing, 100191, China, 86 010-82801222; <email>liuzheng@bjmu.edu.cn</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>24</day><month>6</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e89077</elocation-id><history><date date-type="received"><day>06</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>22</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>25</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Runze Hu, Yang Yang, Yihang Yang, Jingqi Kong, Jiahui Luo, Wenyu Yang, Jing Chen, Jingyao Liu, Huiqun Zeng, Lei Zhang, Zheng Liu. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 24.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e89077"/><abstract><sec><title>Background</title><p>Motivational interviewing (MI) is an effective counseling approach for promoting health behavior change, but its scalability is constrained by the need for highly trained human counselors. Large language models (LLMs) may provide a scalable way to support MI counseling, but evidence remains limited, especially for Chinese MI resources and evaluations based on standardized MI fidelity frameworks.</p></sec><sec><title>Objective</title><p>This study aimed to develop Chinese large language models for motivational interviewing (MI-LLMs) and evaluate whether MI-focused fine-tuning could improve their ability to generate counseling responses consistent with MI principles.</p></sec><sec sec-type="methods"><title>Methods</title><p>We first curated 5 publicly available Chinese psychological counseling datasets and assessed sampled conversations in terms of comprehensiveness, professionalism, authenticity, and safety. The 2 highest-scoring datasets, CPsyCounD and PsyDTCorpus, were selected for MI-style data construction. 
Using GPT-4 with a structured MI-informed prompt, we transformed 2040 multiturn counseling conversations into MI-style dialogs. Among these, 2000 dialogs were used for training and 40 for testing. Three Chinese-capable open-source LLMs (Baichuan2-7B-Chat, ChatGLM-4-9B-Chat, and Llama-3-8B-Chinese-Chat-v2) were fine-tuned with low-rank adaptation on the training dataset and were referred to as MI-LLMs. Automatic evaluation was conducted on the testing dataset using Bilingual Evaluation Understudy&#x2013;4 (BLEU-4) and Recall-Oriented Understudy for Gisting Evaluation (ROUGE) metrics. Manual evaluation was conducted using the Motivational Interviewing Treatment Integrity Coding Manual 4.2.1. Thirty simulated counseling dialogs generated by the MI-LLMs were compared with 30 real MI dialogs sampled from AnnoMI and translated into Chinese. Two trained graduate student raters coded global scores and behavior counts, from which summary scores were subsequently calculated.</p></sec><sec sec-type="results"><title>Results</title><p>In automatic evaluation, fine-tuning substantially improved BLEU-4 and ROUGE scores across all 3 models compared with the base models. In manual evaluation, the MI-LLMs achieved technical and relational global scores, as well as total MI-adherent ratios, that approached those of real MI dialogs. The MI-LLM based on ChatGLM-4-9B-Chat showed the strongest overall global performance. However, MI-LLMs produced fewer complex reflections and had lower reflection-to-question ratios than real MI dialogs.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study provides preliminary evidence that MI-focused fine-tuning can help Chinese LLMs acquire core counseling behaviors consistent with MI principles. It also offers a scalable approach for constructing MI-style dialog resources in Chinese. Nevertheless, current MI-LLMs should be regarded as early-stage tools for supporting, rather than replacing, human counselors. 
Future work should expand real MI training data and strengthen the complex reflective skills of MI-LLMs. Further studies are needed to evaluate their effectiveness, acceptability, and safety in health behavior change settings in the real world.</p></sec></abstract><kwd-group><kwd>motivational interviewing</kwd><kwd>large language models</kwd><kwd>health education</kwd><kwd>behavior change</kwd><kwd>fine-tuning</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Chronic diseases, such as cardiovascular disease, diabetes, and obesity, have become significant global health challenges [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Health behavior promotion is fundamental to the prevention and management of most of these chronic diseases. However, achieving long-term maintenance of behavior change through such interventions remains challenging [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. One line of research highlights the importance of enhancing individuals&#x2019; intrinsic motivation [<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>To this end, motivational interviewing (MI), an evidence-based counseling approach to facilitating behavior change [<xref ref-type="bibr" rid="ref10">10</xref>], has been proven effective in promoting health behavior change across a variety of behaviors [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>], such as smoking cessation [<xref ref-type="bibr" rid="ref13">13</xref>], weight loss [<xref ref-type="bibr" rid="ref14">14</xref>], substance use [<xref ref-type="bibr" rid="ref15">15</xref>], physical activity [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], and chronic disease self-management [<xref ref-type="bibr" rid="ref18">18</xref>]. 
However, MI&#x2019;s effectiveness depends strongly on the professional therapist&#x2019;s experiences and skills, which require extensive training and continuous practice over several years [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. These demanding training requirements not only constrain the scalability of MI but also make it difficult for practitioners to maintain Motivational Interviewing Treatment Integrity (MITI) over time [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>The rise of large language models (LLMs), such as GPT-4, has introduced a promising tool for addressing these challenges. LLMs have demonstrated advanced natural language processing capabilities and have been successfully applied in a variety of health-related tasks [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. However, existing research on the integration of MI with artificial intelligence or conversational agents faces several key limitations, including small, self-constructed datasets [<xref ref-type="bibr" rid="ref24">24</xref>], rule-based system architectures [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref28">28</xref>], and outcome evaluation that is not adherent to standardized MI fidelity frameworks [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>In light of these limitations, this study aimed to (1) provide a scalable pathway for constructing a Chinese MI-style dialog dataset, (2) train fine-tuned LLMs to master the core skills of MI, and (3) assess the MI abilities of fine-tuned LLMs using both automatic and manual evaluation based on the MITI Coding Manual 4.2.1 [<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> 
shows the overall design of this study. We first collected available Chinese psychological counseling dialog datasets through online searches and evaluated randomly sampled conversations to select high-quality sources. Since these datasets did not contain structured MI content, we designed an MI-informed prompt to transform ordinary psychological counseling dialogs into MI-style multiturn dialogs. Using the resulting MI-style dataset, we fine-tuned 3 LLMs to develop the large language models for motivational interviewing (MI-LLMs). Finally, through both automatic and manual evaluations, we comprehensively assessed whether the MI-LLMs showed initial potential to promote healthy behaviors by conducting MI.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overall design of this study. MI: motivational interviewing; MI-LLM: large language model for motivational interviewing.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e89077_fig01.png"/></fig></sec><sec id="s2-2"><title>Screening and Evaluating the Counseling Dialog Dataset</title><p>We searched for Chinese psychological counseling dialog datasets on platforms including Kaggle, GitHub, Hugging Face, and OpenDataLab. Afterward, we randomly selected 50 conversations from each dataset and followed CPsyCounD&#x2019;s automated evaluation methodology to evaluate the quality of these conversations [<xref ref-type="bibr" rid="ref31">31</xref>]. Specifically, the evaluation criteria covered 4 aspects: comprehensiveness, professionalism, authenticity, and safety. Details of these 4 aspects of the evaluation criteria are shown in Table S1 of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
In the interest of data diversity, we decided to proceed with the top 2 datasets based on their aggregate scores.</p></sec><sec id="s2-3"><title>Transcription of the Counseling Dialogs</title><p>According to an authoritative book in the MI field [<xref ref-type="bibr" rid="ref10">10</xref>], we summarized the 4 basic tasks and interview techniques of MI to prepare a transcription prompt for GPT-4, a closed-source proprietary large language model developed by OpenAI, to transcribe Chinese psychological counseling conversations into MI-style dialogs. The transcription prompt template was strictly developed based on prompt engineering principles [<xref ref-type="bibr" rid="ref32">32</xref>], ensuring both structural rigor and functional appropriateness. Specifically, it was structured around the following key components: role, task objective, the 4 core tasks of MI, key techniques of MI, transformation steps, guiding principles, and an output format example. The full prompt is shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. After transcription, all generated dialogs were manually screened to remove replies that were irrelevant, incoherent, unsafe, or clearly inconsistent with MI principles. These transcriptions based on GPT-4 were conducted in Beijing, China, between January 1, 2025, and February 28, 2025.</p></sec><sec id="s2-4"><title>Original Model Information</title><p>Three open-source LLMs with strong Chinese language capabilities&#x2014;Baichuan2-7B-Chat, ChatGLM-4-9B-Chat, and Llama-3-8B-Chinese-Chat-v2&#x2014;were downloaded from Hugging Face for fine-tuning and testing.</p><p>Baichuan2-7B-Chat [<xref ref-type="bibr" rid="ref33">33</xref>] is an open-source LLM launched by Baichuan Intelligence in December 2023, which was trained with a high-quality corpus of 2.6 trillion tokens. 
ChatGLM-4-9B-Chat [<xref ref-type="bibr" rid="ref34">34</xref>] is an open-source version of the pretrained model in the GLM-4 series, launched by Smart Spectrum AI in June 2024, and has outperformed Meta-Llama-3-8B across multiple benchmark datasets. Meta-Llama-3-8B [<xref ref-type="bibr" rid="ref35">35</xref>] is an open-source LLM released by Meta in April 2024, using a high-quality corpus of 15 trillion tokens for pretraining with excellent performance. However, since only 5% of the training corpus of Meta-Llama-3-8B is non-English, it often answers Chinese prompts in English during inference. Instead of using the original Meta-Llama-3-8B model, we used Llama-3-8B-Chinese-Chat-v2 [<xref ref-type="bibr" rid="ref36">36</xref>], a model based on Meta-Llama-3-8B, which provides a further 100,000 tokens of Chinese corpus for preference training, so that the model&#x2019;s Chinese language ability was greatly improved.</p><p>To evaluate the models&#x2019; language comprehension and reasoning capabilities, we compiled the models&#x2019; scores on the following 3 key benchmarks. First, massive multitask language understanding (MMLU) is a large-scale multitask benchmark mainly used for evaluating English language models across 57 diverse tasks, measuring broad language understanding and reasoning abilities. Second, Chinese MMLU (CMMLU) is a multitask benchmark for evaluating comprehensive Chinese language understanding, including commonsense reasoning, mathematical reasoning, and sentiment analysis. Third, Chinese evaluation (C-Eval) is a benchmark designed to assess Chinese language understanding and generation capabilities. It covers multiple tasks such as reading comprehension, reasoning, and translation.</p><p>All benchmarks consist solely of multiple-choice questions. Model performance is reported as accuracy; therefore, higher scores indicate better model capability. 
For better comparison, we also collected the publicly reported scores of GPT-4 and Meta-Llama-3-8B on these benchmarks.</p><p>The specific scores for each model are shown in <xref ref-type="table" rid="table1">Table 1</xref>. The results showed that ChatGLM-4-9B-Chat performed strongly on C-Eval and CMMLU, demonstrating superior capability in Chinese-language applications. Although Llama-3-8B-Chinese-Chat-v2 and Baichuan2-7B-Chat achieved relatively lower scores, they still exhibited sufficient Chinese comprehension and reasoning abilities to serve as base models for fine-tuning in this study.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Models&#x2019; scores on 3 key benchmarks and the corresponding testing methods.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">MMLU<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">CMMLU<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom">C-Eval<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="bottom">Evaluation setting</td><td align="left" valign="bottom">Source</td></tr></thead><tbody><tr><td align="left" valign="top">Baichuan2-7B-Chat<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">52.9</td><td align="left" valign="top">55.0</td><td align="left" valign="top">54.4</td><td align="left" valign="top">5-shot</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref33">33</xref>]</td></tr><tr><td align="left" valign="top">ChatGLM-4-9B-Chat<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">72.4</td><td align="left" valign="top">75.1</td><td align="left" valign="top">75.6</td><td align="left" valign="top">Not mentioned</td><td 
align="left" valign="top">[<xref ref-type="bibr" rid="ref34">34</xref>]</td></tr><tr><td align="left" valign="top">Llama-3-8B-Chinese-Chat-v2<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">63.7</td><td align="left" valign="top">52.4</td><td align="left" valign="top">49.8</td><td align="left" valign="top">5-shot</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref37">37</xref>]</td></tr><tr><td align="left" valign="top">GPT-4<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td><td align="left" valign="top">83.9</td><td align="left" valign="top">70.3</td><td align="left" valign="top">68.4</td><td align="left" valign="top">5-shot<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref33">33</xref>]</td></tr><tr><td align="left" valign="top">Meta-Llama-3-8B<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td><td align="left" valign="top">65</td><td align="left" valign="top">50.8</td><td align="left" valign="top">49.4</td><td align="left" valign="top">5-shot</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref37">37</xref>]</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>MMLU: massive multitask language understanding.</p></fn><fn id="table1fn2"><p><sup>b</sup>CMMLU: Chinese massive multitask language understanding.</p></fn><fn id="table1fn3"><p><sup>c</sup>C-Eval: Chinese evaluation.</p></fn><fn id="table1fn4"><p><sup>d</sup>Models fine-tuned in this study.</p></fn><fn id="table1fn5"><p><sup>e</sup>Models included for comparison.</p></fn><fn id="table1fn6"><p><sup>f</sup>5-shot means that when the model performs a task, it is provided with 5 examples as references or learning guides. 
These examples help the model recognize patterns or rules in the task, thereby improving its reasoning and predictions.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-5"><title>Methods of Fine-Tuning</title><p>Our study used the integrated tool LLaMA-Factory [<xref ref-type="bibr" rid="ref38">38</xref>], a unified framework specifically designed for efficient fine-tuning, evaluation, and deployment of LLMs. In addition, low-rank adaptation (LoRA) was adopted to fine-tune the LLMs. LoRA improves training efficiency by adding trainable LoRA matrices to the original model while keeping most pretrained parameters frozen, thereby reducing memory consumption compared with full-parameter fine-tuning. During training, we set the following key hyperparameters: a learning rate of 1.0&#x00D7;10<sup>&#x2013;4</sup>, which was gradually reduced using a cosine learning rate scheduler; a per-device training batch size of 1; and a gradient accumulation step size of 8 to achieve a larger effective batch size with limited memory. The number of training epochs was set to 3 to avoid overfitting; the warmup ratio was set to 0.1 to gradually increase the learning rate; and BF16 precision was used to accelerate training and reduce memory usage. During the training process, logs were recorded every 10 steps, the model was saved every 500 steps, and the change curve of the loss function was plotted.</p><p>Through fine-tuning the 3 models (Baichuan2-7B-Chat, ChatGLM-4-9B-Chat, and Llama-3-8B-Chinese-Chat-v2), we obtained corresponding domain-specific models. Based on the initials of MI and LLM, we named them MI-LLMs.</p></sec><sec id="s2-6"><title>Automatic and Manual Evaluation</title><sec id="s2-6-1"><title>Overview</title><p>After fine-tuning, we evaluated the MI-LLMs using automatic generation metrics and manual MITI-based coding. 
The inference and evaluation runs for the base models and fine-tuned MI-LLMs were conducted on the High-Performance Computing Platform of Peking University in Beijing, China, between February 1, 2025, and March 31, 2025.</p></sec><sec id="s2-6-2"><title>Automatic Evaluation</title><p>For automatic evaluation, we constructed round-based test samples to compare each model-generated counselor response with the corresponding reference counselor response. This design allowed us to evaluate model performance at each client-counselor exchange while preserving the prior dialog history as context.</p><p>Our test dataset consisted of complete, transcribed MI-style psychological counseling dialogs, with each dialog corresponding to 1 full client-counselor session. We refer to these full dialogs as test dialogs. When constructing the round-based test samples, we split each transcribed MI-style dialog into rounds, taking each client volley as the model input and using all preceding rounds as dialog history to provide context. One round was defined as a single client-counselor exchange. For every round-level test sample, we attached a fixed instruction prompt: &#x201C;You are a psychological counselor with 20 years of experience. Your aim is to help clients solve psychological problems through professional Motivational Interviewing counseling.&#x201D;</p><p>Formally, for a test dialog with <italic>i</italic> rounds (ie, <italic>i</italic> client-counselor pairs), we denote it as {(<italic>q<sub>k</sub>,r<sub>k</sub></italic>) | <italic>k</italic> = 1,2,...,<italic>i</italic>}, where <italic>q<sub>k</sub></italic> and <italic>r<sub>k</sub></italic> are the client&#x2019;s volley and the counselor&#x2019;s reply at round <italic>k</italic>, respectively. 
At round <italic>k</italic>, the dialog history is defined as:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>h</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>r</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2223;</mml:mo><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>k</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo fence="false" stretchy="false">}</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>h<sub>k</sub></italic> represents all previous client queries and counselor responses. 
Conditioning on the fixed MI prompt <italic>P</italic><sub>MI</sub>, the model generates a response <italic>rˆ<sub>k</sub></italic> according to:</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mover><mml:mi>r</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mi>k</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">L</mml:mi><mml:mi mathvariant="normal">L</mml:mi><mml:mi mathvariant="normal">M</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">M</mml:mi><mml:mi mathvariant="normal">I</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd><mml:mtd><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">L</mml:mi><mml:mi mathvariant="normal">L</mml:mi><mml:mi mathvariant="normal">M</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">M</mml:mi><mml:mi 
mathvariant="normal">I</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd><mml:mtd><mml:mn>1</mml:mn><mml:mo>&#x003C;</mml:mo><mml:mi>k</mml:mi><mml:mo>&#x2264;</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"/></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>f</italic><sub>LLM</sub>(&#x00B7;) denotes the inference process of the language model. Thus, each test dialog in the test dataset gives rise to <italic>i</italic> round-level test samples in an Alpaca-style triplet form (instruction, input, and output, with the input including the dialog history and the most recent volley from the client).</p><p>For example, in a 3-round smoking-cessation conversation, round 1 uses the client&#x2019;s first volley (&#x201C;Hello, I feel that I have been under a lot of pressure recently, and I can&#x2019;t help smoking&#x201D;) as the input, together with the fixed MI prompt, and the corresponding counselor reply as the target output. In round 2, the input consists of the client&#x2019;s second volley (&#x201C;Yes, I really want to quit smoking... but I just can&#x2019;t quit it&#x201D;), the same fixed prompt, and the full dialog history from round 1 (the client&#x2019;s first volley and the counselor&#x2019;s response), with the counselor&#x2019;s second reply as the target. In round 3, the client&#x2019;s third volley, expressing worry about withdrawal reactions, is combined with the same fixed prompt and the cumulative history from rounds 1 and 2, and the counselor&#x2019;s third reply serves as the target output. 
A detailed example of this round-based division is shown in Table S2 of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>We first fed the round-based test samples constructed above into each model to obtain its generated response, <italic>r&#x0302;<sub>k</sub></italic>, for every round <italic>k</italic>. For automatic evaluation, we then compared each generated response <italic>r&#x0302;<sub>k</sub></italic> with its corresponding reference counselor response <italic>r<sub>k</sub></italic>. The outputs of the original base models and the MI-LLMs were treated as generated texts, while the counselor&#x2019;s volleys in the test dataset served as reference texts. Using LLaMA-Factory, we computed Bilingual Evaluation Understudy&#x2013;4 (BLEU-4), Recall-Oriented Understudy for Gisting Evaluation&#x2013;1 (ROUGE-1), ROUGE-2, and ROUGE-L between each generated response and its corresponding reference. These automatic metrics were used to quantify the improvement in the models&#x2019; ability to produce MI-consistent counseling responses that more closely match real counselors&#x2019; language and better handle multiturn psychological dialogs. Detailed definitions and implementation settings of the automatic metrics are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-6-3"><title>Manual Evaluation</title><p>In the manual evaluation, 2 researchers role-played standardized clients. Specifically, we predefined for each standardized client the target behavior to be changed, the specific manifestations of that behavior, the underlying reasons for the behavior (with the provision that only by addressing these reasons would the motivation for change be triggered), and the client&#x2019;s baseline readiness to change. 
The 2 researchers were each randomly assigned several standardized client profiles and then conducted the dialogs with the LLMs according to their assigned profiles. For each MI-LLM, we obtained 10 independent and complete multiround conversations, resulting in a total of 30 simulated counseling dialogs across the 3 MI-LLMs.</p><p>In addition, we used real MI dialogs from the AnnoMI dataset as a reference set [<xref ref-type="bibr" rid="ref39">39</xref>]. AnnoMI is a publicly available collection of expert-annotated MI dialogs, originally derived from audiorecorded counseling sessions and presented in the form of textual transcripts. We randomly sampled 30 dialogs from AnnoMI, which were originally in English, and translated them into Chinese to ensure language consistency with the conversations generated by MI-LLMs and to facilitate uniform manual evaluation.</p><p>Before manual evaluation, all dialogs were deidentified, formatted into the same transcript structure, and randomly ordered. The raters were not informed whether each dialog came from a real MI session or an MI-LLM&#x2013;generated interaction. In total, 60 written multiround transcripts, including 30 simulated counseling dialogs and 30 real MI dialogs, were manually evaluated. We also categorized and counted the topics of the 2 sets of dialogs.</p><p>Manual coding was conducted according to the MITI Coding Manual 4.2.1 [<xref ref-type="bibr" rid="ref30">30</xref>]. The evaluation indicators included (1) global scores on 4 dimensions: cultivating change talk, softening sustain talk, partnership, and empathy; and (2) behavior counts, including giving information, persuading with permission, asking questions, simple reflections, complex reflections, affirming, seeking collaboration, emphasizing autonomy, persuading, and confronting. After completing the assessment of these indicators, we used the following formulas to calculate summary scores for quantitative evaluation. 
The calculation method for each summary score is shown in <xref ref-type="table" rid="table2">Table 2</xref>. All MITI coding was carried out by 2 graduate students: 1 in psychology and 1 in medicine. To ensure coding accuracy, both raters first independently downloaded and studied the MITI Coding Manual 4.2.1. They then completed 2 online training sessions led by a trainer certified by the Motivational Interviewing Network of Trainers. The first session reviewed the MITI manual with illustrative examples. In the second session, the raters coded 4 standardized sample dialogs (141 volleys in total). For global scores of 4 dialogs, the Pearson correlations between each rater and the reference standard scores were 0.64 and 0.65, respectively, while the interrater correlation was 0.84. For Cohen &#x03BA; of behavior coding across the 141 volleys, the &#x03BA; values between each rater and the standard codes exceeded 0.58, and the interrater &#x03BA; reached 0.69. Based on these results, the trainer confirmed that the raters were adequately prepared to proceed with formal coding.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Method for calculating summary scores in manual evaluation.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Indicator</td><td align="left" valign="bottom">Calculating method</td></tr></thead><tbody><tr><td align="left" valign="top">Complex reflections ratio</td><td align="left" valign="top">=Complex reflections/(Simple reflections+Complex reflections)</td></tr><tr><td align="left" valign="top">Reflection-to-question ratio (R:Q)</td><td align="left" valign="top">=Total reflections/(Total questions)=(Simple reflections+Complex reflections)/(Total questions)</td></tr><tr><td align="left" valign="top">Technical global</td><td align="left" valign="top">=(Cultivating change talk+Softening sustain talk)/2</td></tr><tr><td align="left" valign="top">Relational global</td><td
align="left" valign="top">=(Partnership+Empathy)/2</td></tr><tr><td align="left" valign="top">Total MI-adherent ratio<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">=(Seeking collaboration+Affirming+Emphasizing autonomy)/(Seeking collaboration+Affirming+Emphasizing autonomy+Confronting+Persuading)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Motivational Interviewing Treatment Integrity Coding Manual 4.2.1 used 2 indicators to evaluate the conformity and nonconformity of motivational interviewing (MI).</p></fn></table-wrap-foot></table-wrap><p>For the real MI dialogs and the dialogs generated by each MI-LLM, we separately calculated the mean and SD of the global scores, behavior counts, and summary scores. These descriptive statistics allowed us to directly compare the counseling performance of human MI practitioners with that of the 3 MI-LLMs. To evaluate the conformity and nonconformity of MI, the MITI Coding Manual 4.2.1 uses the following 2 count-based indicators:</p><p>Total MI-adherent=Seeking collaboration+Affirming+Emphasizing autonomy</p><p>Total MI-nonadherent=Confronting+Persuading</p><p>However, these 2 indicators are related to the length of the dialog. The longer the dialog, the larger the values of these 2 indicators might be. Therefore, we adopted a ratio to normalize and compare the conformity of MI across different dialog lengths:</p><p>Total MI-adherent ratio=Total MI-adherent/(Total MI-adherent+Total MI-nonadherent)</p></sec></sec><sec id="s2-7"><title>Reporting</title><p>We used the CHART (Chatbot Assessment Reporting Tool) checklist (<xref ref-type="supplementary-material" rid="app2">Checklist 1</xref>) to ensure that our reporting was comprehensive [<xref ref-type="bibr" rid="ref40">40</xref>].</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>Ethics board review was not sought for this study because it did not involve the recruitment of external human participants, clinical intervention, identifiable private information, or confidential patient records. 
The study used publicly available and deidentified dialog datasets in accordance with their public availability and citation requirements. According to Article 32 of the Measures for Ethical Review of Life Science and Medical Research Involving Humans [<xref ref-type="bibr" rid="ref41">41</xref>], research using legally obtained publicly available data or anonymized information may be exempt from ethics review when it does not cause harm to humans and does not involve sensitive personal information or commercial interests. In the manual evaluation, researchers role-played standardized clients using predefined hypothetical profiles solely for model testing. Therefore, informed consent from external participants was not applicable.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Results of Screening Chinese Psychological Counseling Dialog Datasets</title><p>We obtained 5 Chinese psychological counseling dialog datasets: CPsyCounD [<xref ref-type="bibr" rid="ref31">31</xref>], Emotional First Aid Dataset [<xref ref-type="bibr" rid="ref42">42</xref>], Psy-Insight [<xref ref-type="bibr" rid="ref43">43</xref>], PsyDTCorpus [<xref ref-type="bibr" rid="ref44">44</xref>], and Smilechat [<xref ref-type="bibr" rid="ref45">45</xref>]. The quality of these datasets was evaluated from 4 perspectives: comprehensiveness, professionalism, authenticity, and safety. The results are shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. Among the 5 datasets, CPsyCounD and PsyDTCorpus achieved the 2 highest overall scores. 
Therefore, we constructed the training and test datasets based on these 2 datasets.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The evaluation results of GPT-4 on the scores of 4 perspectives and the total score across 5 Chinese psychological counseling dialog datasets.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e89077_fig02.png"/></fig></sec><sec id="s3-2"><title>Construction of MI-Style Datasets</title><p>First, we extracted 1520 multiround dialogs from CPsyCounD and 520 dialogs from PsyDTCorpus, based on their optimal quality compared with the other 3 datasets. Second, we transcribed them into MI-style dialogs by using prompt engineering with GPT-4, obtaining 2040 transcribed multiturn dialogs in total. Third, from these 2040 transcribed multiturn dialogs, we randomly selected 2000 as the training dataset and the remaining 40 as the test dataset.</p><p>Specifically, for CPsyCounD, PsyDTCorpus, the training dataset, and the test dataset, we calculated the average number of dialog rounds, the average number of words in each client volley, and the average number of words in each counselor volley. 
The results are shown in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Statistical analysis of dialog features in CPsyCounD, PsyDTCorpus, and the motivational interviewing&#x2013;style training and test datasets.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Datasets</td><td align="left" valign="bottom">Dialog rounds, mean (SD)</td><td align="left" valign="bottom">Words per counselor volley, mean (SD)</td><td align="left" valign="bottom">Words per client volley, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">CPsyCounD</td><td align="left" valign="top">7.65 (2.07)</td><td align="left" valign="top">29.95 (18.22)</td><td align="left" valign="top">47.76 (20.78)</td></tr><tr><td align="left" valign="top">PsyDTCorpus</td><td align="left" valign="top">18.08 (3.15)</td><td align="left" valign="top">28.67 (11.41)</td><td align="left" valign="top">53.74 (17.96)</td></tr><tr><td align="left" valign="top">Training dataset</td><td align="left" valign="top">7.97 (4.56)</td><td align="left" valign="top">29.08 (16.38)</td><td align="left" valign="top">57.33 (17.20)</td></tr><tr><td align="left" valign="top">Test dataset</td><td align="left" valign="top">7.53 (4.12)</td><td align="left" valign="top">29.21 (18.62)</td><td align="left" valign="top">56.42 (15.90)</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Results of Automatic Evaluation</title><p>We divided the 40 test dialogs into 367 round-based samples to evaluate the fine-tuning effect of the model. As shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, all fine-tuned MI-LLMs performed better than their corresponding base models across BLEU-4 and ROUGE metrics. 
Specifically, the BLEU-4 score of the Baichuan2-7B-Chat&#x2013;based MI-LLM increased from 2.10 for the original Baichuan2-7B-Chat to 5.84; for the ChatGLM-4-9B-Chat&#x2013;based MI-LLM, it increased from 1.77 to 6.47; and for the Llama-3-8B-Chinese-Chat-v2&#x2013;based MI-LLM, it increased from 2.39 to 5.79. Regarding the ROUGE series metrics (ROUGE-1, ROUGE-2, and ROUGE-L), the performance of the MI-LLMs was likewise consistently better than that of the original models.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>The automatic evaluation results of large language models for motivational interviewing (MI-LLMs) and the original model on the test datasets.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e89077_fig03.png"/></fig></sec><sec id="s3-4"><title>Results of Manual Evaluation</title><p>Because the actual temporal duration was not directly comparable between transcript-based real MI dialogs and text-based LLM-generated dialogs, we compared dialog length using the average number of dialog rounds and the average volley length of clients and counselors. As shown in <xref ref-type="table" rid="table4">Table 4</xref>, the real MI dialogs contained more dialog rounds than the simulated counseling dialogs generated by the 3 MI-LLMs. Differences were also observed in volley length. 
Real MI dialogs showed shorter counselor volleys but longer client volleys on average compared to the simulated dialogs.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Statistical analysis of dialog features in real MI<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> dialogs and simulated counseling dialogs.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Source</td><td align="left" valign="bottom">Dialogs, n</td><td align="left" valign="bottom">Dialog rounds, mean (SD)</td><td align="left" valign="bottom">Words per counselor volley, mean (SD)</td><td align="left" valign="bottom">Words per client volley, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Real MI dialogs</td><td align="left" valign="top">30</td><td align="left" valign="top">57.93 (65.95)</td><td align="left" valign="top">24.26 (13.04)</td><td align="left" valign="top">21.38 (9.74)</td></tr><tr><td align="left" valign="top" colspan="5">Simulated counseling dialogs from 3 MI-LLMs<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Baichuan2-7B-Chat&#x2013;based</td><td align="left" valign="top">10</td><td align="left" valign="top">18.30 (7.27)</td><td align="left" valign="top">49.11 (6.57)</td><td align="left" valign="top">14.80 (5.67)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ChatGLM-4-9B-Chat&#x2013;based</td><td align="left" valign="top">10</td><td align="left" valign="top">17.40 (5.17)</td><td align="left" valign="top">52.28 (10.30)</td><td align="left" valign="top">11.45 (4.93)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama-3-8B-Chinese-Chat-v2&#x2013;based</td><td align="left" 
valign="top">10</td><td align="left" valign="top">18.90 (5.80)</td><td align="left" valign="top">43.79 (8.70)</td><td align="left" valign="top">10.71 (5.80)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>MI: motivational interviewing.</p></fn><fn id="table4fn2"><p><sup>b</sup>MI-LLM: large language model for motivational interviewing.</p></fn></table-wrap-foot></table-wrap><p>These findings suggest differences in turn-taking and volley structure between real MI and MI-LLM-generated dialogs. However, because the real MI dialogs were translated speech-derived transcripts, whereas the MI-LLM dialogs were originally generated as Chinese text&#x2013;based interactions, the differences in <xref ref-type="table" rid="table4">Table 4</xref> may also reflect language, translation, topic, dataset source, and communication-modality effects rather than human-LLM differences alone.</p><p>We categorized the topics of 30 real MI dialogs and 30 simulated counseling dialogs from 3 MI-LLMs. Both the real MI dialogs and the simulated counseling dialogs generated by the MI-LLMs primarily focused on behavior change. In particular, topics such as weight loss or diet management, increasing exercise, and reducing dependence on external objects or substances (eg, mobile phones, alcohol, or other addictive behaviors) were represented in both sets of dialogs. 
Details are shown in <xref ref-type="table" rid="table5">Table 5</xref>.</p><p>Based on the MITI framework, the manual evaluation results for simulated counseling dialogs generated by MI-LLMs and for real MI dialogs&#x2014;including the mean and SD for each indicator&#x2014;are summarized in <xref ref-type="table" rid="table6">Table 6</xref>.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Topics of real MI<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup> dialogs and simulated counseling dialogs from MI-LLMs<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Topics</td><td align="left" valign="bottom">Counts</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Real MI dialogs</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Increasing exercise</td><td align="left" valign="top">6</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chronic disease self-management</td><td align="left" valign="top">6</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Weight loss or diet management</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Reducing recidivism</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Reducing drug use</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Reducing alcohol use</td><td align="left" 
valign="top">2</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Quitting smoking</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top" colspan="2">Simulated counseling dialogs from 3 MI-LLMs</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Weight loss or diet management</td><td align="left" valign="top">4&#x00D7;3</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Reducing mobile phone use</td><td align="left" valign="top">2&#x00D7;3</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Improving insomnia</td><td align="left" valign="top">2&#x00D7;3</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Increasing exercise</td><td align="left" valign="top">1&#x00D7;3</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Reducing excessive consumption</td><td align="left" valign="top">1&#x00D7;3</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>MI: motivational interviewing.</p></fn><fn id="table5fn2"><p><sup>b</sup>MI-LLM: large language model for motivational interviewing.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>The manual evaluation results of global scores, behavior counts, and summary scores.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Evaluation indicators</td><td 
align="left" valign="bottom" colspan="3">Simulated counseling dialogs from MI-LLMs<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup></td><td align="left" valign="bottom">Real MI<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> dialogs</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Baichuan2-7B-Chat&#x2013;based</td><td align="left" valign="bottom">ChatGLM-4-9B-Chat&#x2013;based</td><td align="left" valign="bottom">Llama-3-8B-Chinese-Chat-v2&#x2013;based</td><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Global scores, mean (SD)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cultivating change talk</td><td align="left" valign="top">4.00 (0.00)</td><td align="left" valign="top">4.00 (0.00)</td><td align="left" valign="top">4.00 (0.00)</td><td align="left" valign="top">3.97 (0.89)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Softening sustain talk</td><td align="left" valign="top">3.89 (0.33)</td><td align="left" valign="top">4.00 (0.00)</td><td align="left" valign="top">3.90 (0.32)</td><td align="left" valign="top">3.87 (0.86)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Empathy</td><td align="left" valign="top">3.78 (0.44)</td><td align="left" valign="top">4.00 (0.00)</td><td align="left" valign="top">3.80 (0.67)</td><td align="left" valign="top">4.07 (1.11)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Partnership</td><td align="left" valign="top">3.44 (0.88)</td><td align="left" valign="top">4.00 (0.00)</td><td align="left" valign="top">4.00 (0.67)</td><td align="left" valign="top">3.83 (0.95)</td></tr><tr><td align="left"
valign="top" colspan="5">Behavior counts, mean (SD)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Giving information</td><td align="left" valign="top">3.22 (1.56)</td><td align="left" valign="top">4.00 (2.75)</td><td align="left" valign="top">4.50 (1.78)</td><td align="left" valign="top">1.07 (1.48)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Persuading with permission</td><td align="left" valign="top">0.33 (0.50)</td><td align="left" valign="top">0.00 (0.00)</td><td align="left" valign="top">0.10 (0.32)</td><td align="left" valign="top">0.80 (1.56)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Asking questions</td><td align="left" valign="top">15.56 (7.24)</td><td align="left" valign="top">14.20 (5.15)</td><td align="left" valign="top">16.00 (5.10)</td><td align="left" valign="top">13.43 (11.68)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Simple reflections</td><td align="left" valign="top">13.50 (6.08)</td><td align="left" valign="top">13.25 (4.58)</td><td align="left" valign="top">13.10 (3.77)</td><td align="left" valign="top">14.17 (23.76)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complex reflections</td><td align="left" valign="top">6.00 (2.52)</td><td align="left" valign="top">4.25 (2.03)</td><td align="left" valign="top">5.00 (2.33)</td><td align="left" valign="top">6.13 (8.54)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Total reflections</td><td align="left" valign="top">19.50 (8.50)</td><td align="left" valign="top">17.50 (6.09)</td><td align="left"
valign="top">18.10 (5.74)</td><td align="left" valign="top">20.30 (28.15)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Affirming</td><td align="left" valign="top">4.44 (1.81)</td><td align="left" valign="top">3.75 (0.79)</td><td align="left" valign="top">3.35 (1.20)</td><td align="left" valign="top">3.30 (3.84)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Seeking collaboration</td><td align="left" valign="top">1.78 (0.57)</td><td align="left" valign="top">1.60 (0.52)</td><td align="left" valign="top">1.40 (0.70)</td><td align="left" valign="top">1.53 (1.89)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emphasizing autonomy</td><td align="left" valign="top">1.06 (0.39)</td><td align="left" valign="top">1.25 (0.26)</td><td align="left" valign="top">0.45 (0.50)</td><td align="left" valign="top">0.60 (0.86)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Persuading</td><td align="left" valign="top">3.83 (2.78)</td><td align="left" valign="top">2.30 (1.58)</td><td align="left" valign="top">3.40 (1.26)</td><td align="left" valign="top">1.57 (1.65)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Confronting</td><td align="left" valign="top">0.11 (0.22)</td><td align="left" valign="top">0.00 (0.00)</td><td align="left" valign="top">0.00 (0.00)</td><td align="left" valign="top">0.10 (0.40)</td></tr><tr><td align="left" valign="top" colspan="5">Summary scores, mean (SD)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Complex reflections ratio</td><td align="left" valign="top">0.31 (0.03)</td><td
align="left" valign="top">0.24 (0.08)</td><td align="left" valign="top">0.26 (0.06)</td><td align="left" valign="top">0.37 (0.35)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Reflection-to-question ratio</td><td align="left" valign="top">1.27 (0.09)</td><td align="left" valign="top">1.25 (0.10)</td><td align="left" valign="top">1.11 (0.14)</td><td align="left" valign="top">1.44 (0.93)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Technical global</td><td align="left" valign="top">3.94 (0.17)</td><td align="left" valign="top">4.00 (0.00)</td><td align="left" valign="top">3.95 (0.16)</td><td align="left" valign="top">3.92 (0.80)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Relational global</td><td align="left" valign="top">3.61 (0.65)</td><td align="left" valign="top">4.00 (0.00)</td><td align="left" valign="top">3.90 (0.46)</td><td align="left" valign="top">3.95 (0.95)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Total MI-adherent ratio</td><td align="left" valign="top">0.75 (0.08)</td><td align="left" valign="top">0.78 (0.12)</td><td align="left" valign="top">0.72 (0.06)</td><td align="left" valign="top">0.73 (0.33)</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>MI-LLM: large language model for motivational interviewing.</p></fn><fn id="table6fn2"><p><sup>b</sup>MI: motivational interviewing.</p></fn></table-wrap-foot></table-wrap><p>In terms of global scores, the simulated dialogs all received consistently high global ratings on cultivating change talk, softening sustain talk, empathy, and partnership, with mean scores mostly around or above 3.5 and limited variability. 
Among them, the MI-LLM based on ChatGLM-4-9B-Chat achieved the strongest overall global performance. It reached a mean of 4.00 on all 4 global items and obtained Technical Global and Relational Global scores of 4.00, slightly surpassing the other 2 MI-LLMs and the real MI dialogs. Real MI dialogs achieved empathy scores that were marginally higher than those of the MI-LLMs, indicating that experienced human counselors still hold an advantage in conveying empathic understanding. For cultivating change talk and partnership, however, the average scores of human counselors were similar to those of MI-LLMs, suggesting that the models can already approximate human-level performance on key MI-consistent global dimensions.</p><p>In terms of behavior counts, asking questions and simple reflections were the 2 most frequent behaviors in both MI-LLM-generated and real MI dialogs. Complex reflections occurred most often in real MI dialogs, with Baichuan2-7B-Chat&#x2013;based and Llama-3-8B-Chinese-Chat-v2&#x2013;based MI-LLMs producing somewhat more complex reflections than the ChatGLM-4-9B-Chat&#x2013;based model, but still slightly fewer than human counselors on average.</p><p>Compared with real MI dialogs, MI-LLMs produced substantially more instances of giving information and slightly more affirming behaviors, indicating a strong tendency to support clients through explanations and positive feedback. Certain MI-adherent behaviors, such as seeking collaboration and emphasizing autonomy, appeared at least as frequently in MI-LLM simulations as in real dialogs, whereas persuading with permission was more common in real MI dialogs. 
MI-nonadherent behaviors, including persuading and confronting, were infrequent in both groups, and confronting was almost absent.</p><p>The dispersion of behavior counts also differed between groups: dialogs generated by MI-LLMs showed relatively stable behavior frequencies across conversations, as reflected by smaller SD values for several behavior count indicators, whereas real MI dialogs exhibited greater variability between dialogs, reflecting the flexible adaptation of strategies in real clinical practice.</p><p>In terms of summary scores, real MI dialogs showed a clear advantage in complex reflective skills. The complex reflections ratio was highest in real MI dialogs, exceeding the ratios observed in all 3 MI-LLMs, which suggests that human counselors are more likely to use reflections that add inference, meaning, or emotional elaboration rather than merely repeating content. Similarly, the reflection-to-question (R:Q) ratio was higher in real MI dialogs than in any of the MI-LLM-generated dialogs, indicating that human practitioners rely relatively more on reflective listening than on asking questions.</p><p>In contrast, MI-LLMs performed very well on MI adherence. The total MI-adherent ratio was highest for the ChatGLM-4-9B-Chat&#x2013;based MI-LLM, followed by the Baichuan2-7B-Chat&#x2013;based model, both at or above the level observed in real MI dialogs, while the Llama-3-8B-Chinese-Chat-v2&#x2013;based model was slightly lower but still close. 
Taken together, these results suggest that MI-LLMs are highly capable of following MI-consistent guidelines and avoiding nonadherent behaviors, whereas human counselors retain an advantage in more nuanced skills, particularly the use of complex reflections and maintaining a higher balance of reflections over questions.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This exploratory study tested the feasibility of developing MI-LLMs through dataset screening, MI-style data construction, model fine-tuning, and multilevel evaluation. We first transformed high-quality Chinese psychological counseling dialogs into MI-style multiturn conversations using an MI-informed GPT-4 prompt, providing a scalable approach for constructing MI training data in a resource-limited language context. Based on this dataset, 3 open-source Chinese-capable LLMs were fine-tuned and evaluated. Both automatic and manual evaluations suggested that MI-LLMs acquired several core MI-consistent techniques and principles. They achieved technical and relational global scores and MI-adherent ratios close to those of real MI dialogs, although human practitioners still showed advantages in complex reflections and R:Q balance.</p></sec><sec id="s4-2"><title>Interpretation of Findings</title><p>Our findings first suggest that MI-style data construction is a feasible strategy for addressing the shortage of high-quality MI dialog corpora, especially in non-English contexts. Similar to recent efforts such as Korean MI and IC-AnnoMI [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref46">46</xref>], this study shows that LLM-assisted data augmentation can be used to expand MI-related dialog resources when expert-annotated MI corpora are limited.</p><p>Second, the results support the value of task-specific fine-tuning for improving the MI-consistent behaviors of general-purpose LLMs. 
Across all 3 base models, fine-tuning improved automatic generation metrics, and the manual MITI&#x2013;based evaluation showed that MI-LLMs achieved global scores and summary scores similar to those of real MI dialogs. These findings are consistent with recent evidence that fine-tuning can improve the quality of automated therapy delivered by LLMs [<xref ref-type="bibr" rid="ref47">47</xref>]. However, the improvements were more evident for basic MI-consistent behaviors than for advanced MI skills. Compared with human counselors, MI-LLMs still showed lower complex reflection ratios and lower R:Q ratios, suggesting that fine-tuning may help models learn MI-adherent language patterns more easily than deeper reflective listening and session-level counseling strategies.</p><p>Third, our findings add to emerging evidence that LLMs can generate responses broadly aligned with MI principles and may be adapted for scalable health behavior support [<xref ref-type="bibr" rid="ref48">48</xref>]. Prior studies have shown that LLM-generated MI responses can be contextually appropriate and that fully generative MI chatbots may be feasible in health behavior domains [<xref ref-type="bibr" rid="ref49">49</xref>]. In line with this literature, our results suggest that MI-LLMs may serve as a promising supplement for health behavior change support. Nevertheless, their limitations in complex reflections and R:Q balance indicate that current MI-LLMs should not yet be viewed as replacements for trained human counselors, but rather as early-stage tools requiring further optimization, safety evaluation, and real-world validation.</p></sec><sec id="s4-3"><title>Strengths and Limitations</title><p>This study has several strengths. First, it addresses an important language and resource gap by developing and evaluating Chinese MI-LLMs in a field still dominated by English-language datasets and systems. 
Second, the study proposes a practical and scalable data construction strategy by transforming existing Chinese psychological counseling dialogs into MI-style multiturn conversations. Third, 3 open-source LLMs with Chinese language capability were fine-tuned and evaluated, which improves the robustness of the findings compared with a single-model design. Fourth, the evaluation combined automatic generation metrics with manual MITI&#x2013;based coding, allowing us to assess not only text similarity but also clinically meaningful MI process indicators. Finally, the comparison with real MI dialogs from AnnoMI provided a more interpretable benchmark than model-to-model comparisons alone and helped identify specific skill gaps, especially in complex reflections and R:Q balance.</p><p>Several limitations should be noted. First, the training corpus was relatively small and was constructed through GPT-4-assisted transformation, which may limit the authenticity and generalizability of the generated dialogs. Second, the evaluation was based on researcher-simulated clients and written transcripts. Although dialogs were deidentified, standardized, randomly ordered, and rated without disclosing their source, complete blinding could not be guaranteed because raters might have inferred the source from observable dialog features. The comparison between translated English AnnoMI dialogs and originally Chinese MI-LLM dialogs may have been influenced by language, cultural, and communication-pattern differences. In addition, the use of written transcripts without audio or nonverbal cues, together with the small number of raters, means that the MITI-based results should be interpreted as preliminary process-based evidence rather than definitive evidence of clinical MI competence. 
Third, this study was not validated in real-world counseling or health behavior change settings, which limits the strength of evidence regarding the effectiveness, acceptability, and safety of MI-LLMs in actual use. Therefore, we could not assess how users might respond to MI delivered by nonhuman agents over time, including potential psychosocial risks. Although MI-LLMs may generate language that appears consistent with MI principles, text-based MI consistency should not be equated with human-delivered MI or a genuine counseling relationship, particularly because MI-LLMs lack access to tone of voice, facial expressions, body language, and other interpersonal cues that support face-to-face counseling. Future studies should evaluate potential unintended effects, including loneliness, emotional dependence, overreliance, reduced help-seeking from humans, and changes in real-world social support.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In conclusion, this study provides preliminary evidence that MI-oriented fine-tuning can enable Chinese LLMs to produce counseling responses that approximate several core MI-consistent behaviors on MITI-based process measures. The findings suggest a scalable route for building MI dialog resources and developing AI-assisted tools for health behavior support. At the same time, current MI-LLMs remain limited in complex reflective listening, session-level strategy, real-world effectiveness, interpretability, and safety assurance. 
Future work should expand and diversify real MI training data, incorporate expert feedback or reinforcement learning from human preferences [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>], evaluate models in rigorous real-world trials, including randomized controlled trials comparing MI-LLMs with human-delivered MI or usual care, and develop safeguards that make MI-LLMs more transparent, controllable, and clinically responsible.</p></sec></sec></body><back><ack><p>This study was supported by the High-Performance Computing Platform of Peking University. The authors thank the staff involved in this study. In accordance with the GAIDeT taxonomy (2025), the authors declare the use of ChatGPT-4 under full human supervision for code generation, optimization, visualization, proofreading, editing, and translation during the research and manuscript preparation. Responsibility for the final manuscript lies entirely with the authors. Generative artificial intelligence tools are not listed as authors.</p></ack><notes><sec><title>Funding</title><p>This study was funded by the National Natural Science Foundation of China (82373694) and Beijing Natural Science Foundation (7262165).</p></sec><sec><title>Data Availability</title><p>Upon publication, all code and processed data necessary to reproduce the main findings will be made publicly available on GitHub [<xref ref-type="bibr" rid="ref52">52</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: ZL, RH, LZ</p><p>Data curation: RH, Yihang Yang, JK, JL, WY</p><p>Formal analysis: RH, JC, JL, HZ</p><p>Investigation: RH</p><p>Methodology: RH, Yang Yang, LZ</p><p>Supervision: ZL, LZ</p><p>Writing&#x2014;original draft: RH</p><p>Writing&#x2014;review and editing: RH, ZL</p><p>ZL and LZ contributed equally. 
These authors jointly supervised the work.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BLEU-4</term><def><p>Bilingual Evaluation Understudy&#x2013;4</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">LoRA</term><def><p>low-rank adaptation</p></def></def-item><def-item><term id="abb4">MI</term><def><p>motivational interviewing</p></def></def-item><def-item><term id="abb5">MI-LLM</term><def><p>large language model for motivational interviewing</p></def></def-item><def-item><term id="abb6">MITI</term><def><p>Motivational Interviewing Treatment Integrity</p></def></def-item><def-item><term id="abb7">MMLU</term><def><p>massive multitask language understanding</p></def></def-item><def-item><term id="abb8">R:Q</term><def><p>reflection-to-question</p></def></def-item><def-item><term id="abb9">ROUGE</term><def><p>Recall-Oriented Understudy for Gisting Evaluation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Budreviciute</surname><given-names>A</given-names> </name><name name-style="western"><surname>Damiati</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sabir</surname><given-names>DK</given-names> </name><etal/></person-group><article-title>Management and prevention strategies for non-communicable diseases (NCDs) and their risk factors</article-title><source>Front Public Health</source><year>2020</year><volume>8</volume><fpage>574111</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2020.574111</pub-id><pub-id pub-id-type="medline">33324597</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Noncommunicable 
diseases</article-title><source>World Health Organization</source><access-date>2025-03-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/news-room/fact-sheets/detail/noncommunicable-diseases">https://www.who.int/news-room/fact-sheets/detail/noncommunicable-diseases</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fjeldsoe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Neuhaus</surname><given-names>M</given-names> </name><name name-style="western"><surname>Winkler</surname><given-names>E</given-names> </name><name name-style="western"><surname>Eakin</surname><given-names>E</given-names> </name></person-group><article-title>Systematic review of maintenance of behavior change following physical activity and dietary interventions</article-title><source>Health Psychol</source><year>2011</year><month>01</month><volume>30</volume><issue>1</issue><fpage>99</fpage><lpage>109</lpage><pub-id pub-id-type="doi">10.1037/a0021974</pub-id><pub-id pub-id-type="medline">21299298</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kwasnicka</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dombrowski</surname><given-names>SU</given-names> </name><name name-style="western"><surname>White</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sniehotta</surname><given-names>F</given-names> </name></person-group><article-title>Theoretical explanations for maintenance of behaviour change: a systematic review of behaviour theories</article-title><source>Health Psychol Rev</source><year>2016</year><month>09</month><volume>10</volume><issue>3</issue><fpage>277</fpage><lpage>296</lpage><pub-id 
pub-id-type="doi">10.1080/17437199.2016.1151372</pub-id><pub-id pub-id-type="medline">26854092</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adab</surname><given-names>P</given-names> </name><name name-style="western"><surname>Pallan</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Lancashire</surname><given-names>ER</given-names> </name><etal/></person-group><article-title>Effectiveness of a childhood obesity prevention programme delivered through schools, targeting 6 and 7 year olds: cluster randomised controlled trial (WAVES study)</article-title><source>BMJ</source><year>2018</year><month>02</month><day>7</day><volume>360</volume><fpage>k211</fpage><pub-id pub-id-type="doi">10.1136/bmj.k211</pub-id><pub-id pub-id-type="medline">29437667</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ahern</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Wheeler</surname><given-names>GM</given-names> </name><name name-style="western"><surname>Aveyard</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Extended and standard duration weight-loss programme referrals for adults in primary care (WRAP): a randomised controlled trial</article-title><source>Lancet</source><year>2017</year><month>06</month><day>3</day><volume>389</volume><issue>10085</issue><fpage>2214</fpage><lpage>2225</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(17)30647-5</pub-id><pub-id pub-id-type="medline">28478041</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jay</surname><given-names>M</given-names> 
</name></person-group><article-title>Weight loss maintenance remains challenging&#x2014;even with personalized support</article-title><source>JAMA Netw Open</source><year>2025</year><month>09</month><day>2</day><volume>8</volume><issue>9</issue><fpage>e2532689</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.32689</pub-id><pub-id pub-id-type="medline">40982286</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Murray</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Brennan</surname><given-names>SF</given-names> </name><name name-style="western"><surname>French</surname><given-names>DP</given-names> </name><name name-style="western"><surname>Patterson</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Kee</surname><given-names>F</given-names> </name><name name-style="western"><surname>Hunter</surname><given-names>RF</given-names> </name></person-group><article-title>Effectiveness of physical activity interventions in achieving behaviour change maintenance in young and middle aged adults: a systematic review and meta-analysis</article-title><source>Soc Sci Med</source><year>2017</year><month>11</month><volume>192</volume><fpage>125</fpage><lpage>133</lpage><pub-id pub-id-type="doi">10.1016/j.socscimed.2017.09.021</pub-id><pub-id pub-id-type="medline">28965003</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sheeran</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Avishai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Villegas</surname><given-names>ME</given-names> </name><name 
name-style="western"><surname>Rothman</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>WMP</given-names> </name></person-group><article-title>Does increasing autonomous motivation or perceived competence lead to health behavior change? A meta-analysis</article-title><source>Health Psychol</source><year>2021</year><month>10</month><volume>40</volume><issue>10</issue><fpage>706</fpage><lpage>716</lpage><pub-id pub-id-type="doi">10.1037/hea0001111</pub-id><pub-id pub-id-type="medline">34881939</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Rollnick</surname><given-names>S</given-names> </name></person-group><source>Motivational Interviewing: Helping People Change and Grow</source><year>2023</year><edition>4</edition><publisher-name>The Guilford Press</publisher-name><pub-id pub-id-type="other">9781462552795</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Frost</surname><given-names>H</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>P</given-names> </name><name name-style="western"><surname>Maxwell</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Effectiveness of motivational interviewing on adult behaviour change in health and social care settings: a systematic review of reviews</article-title><source>PLoS ONE</source><year>2018</year><volume>13</volume><issue>10</issue><fpage>e0204890</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0204890</pub-id><pub-id pub-id-type="medline">30335780</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bischof</surname><given-names>G</given-names> </name><name name-style="western"><surname>Bischof</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rumpf</surname><given-names>HJ</given-names> </name></person-group><article-title>Motivational interviewing: an evidence-based approach for use in medical practice</article-title><source>Dtsch Arztebl Int</source><year>2021</year><month>02</month><day>19</day><volume>118</volume><issue>7</issue><fpage>109</fpage><lpage>115</lpage><pub-id pub-id-type="doi">10.3238/arztebl.m2021.0014</pub-id><pub-id pub-id-type="medline">33835006</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lindson-Hawley</surname><given-names>N</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>TP</given-names> </name><name name-style="western"><surname>Begh</surname><given-names>R</given-names> </name></person-group><article-title>Motivational interviewing for smoking cessation</article-title><source>Cochrane Database Syst Rev</source><year>2015</year><month>03</month><day>2</day><volume>2</volume><issue>3</issue><fpage>CD006936</fpage><pub-id pub-id-type="doi">10.1002/14651858.CD006936.pub3</pub-id><pub-id pub-id-type="medline">25726920</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barnes</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Ivezaj</surname><given-names>V</given-names> </name></person-group><article-title>A systematic review of motivational interviewing for weight loss among adults in primary care</article-title><source>Obes 
Rev</source><year>2015</year><month>04</month><volume>16</volume><issue>4</issue><fpage>304</fpage><lpage>318</lpage><pub-id pub-id-type="doi">10.1111/obr.12264</pub-id><pub-id pub-id-type="medline">25752449</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schwenker</surname><given-names>R</given-names> </name><name name-style="western"><surname>Dietrich</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Hirpa</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Motivational interviewing for substance use reduction</article-title><source>Cochrane Database Syst Rev</source><year>2023</year><month>12</month><day>12</day><volume>12</volume><issue>12</issue><fpage>CD008063</fpage><pub-id pub-id-type="doi">10.1002/14651858.CD008063.pub3</pub-id><pub-id pub-id-type="medline">38084817</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sinha</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kirk</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Effectiveness of behavioural interventions with motivational interviewing on physical activity outcomes in adults: systematic review and meta-analysis</article-title><source>BMJ</source><year>2024</year><month>07</month><day>10</day><volume>386</volume><fpage>e078713</fpage><pub-id pub-id-type="doi">10.1136/bmj-2023-078713</pub-id><pub-id pub-id-type="medline">38986547</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Harkin</surname><given-names>K</given-names> </name><name name-style="western"><surname>Apostolopoulos</surname><given-names>V</given-names> </name><name name-style="western"><surname>Tangalakis</surname><given-names>K</given-names> </name><name name-style="western"><surname>Irvine</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tripodi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Feehan</surname><given-names>J</given-names> </name></person-group><article-title>The impact of motivational interviewing on behavioural change and health outcomes in cancer patients and survivors. A systematic review and meta-analysis</article-title><source>Maturitas</source><year>2023</year><month>04</month><volume>170</volume><fpage>9</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.1016/j.maturitas.2023.01.004</pub-id><pub-id pub-id-type="medline">36736204</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ekong</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kavookjian</surname><given-names>J</given-names> </name></person-group><article-title>Motivational interviewing and outcomes in adults with type 2 diabetes: a systematic review</article-title><source>Patient Educ Couns</source><year>2016</year><month>06</month><volume>99</volume><issue>6</issue><fpage>944</fpage><lpage>952</lpage><pub-id pub-id-type="doi">10.1016/j.pec.2015.11.022</pub-id><pub-id pub-id-type="medline">26699083</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Moyers</surname><given-names>TB</given-names> 
</name></person-group><article-title>Motivational interviewing and the clinical science of Carl Rogers</article-title><source>J Consult Clin Psychol</source><year>2017</year><month>08</month><volume>85</volume><issue>8</issue><fpage>757</fpage><lpage>766</lpage><pub-id pub-id-type="doi">10.1037/ccp0000179</pub-id><pub-id pub-id-type="medline">28726479</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mitcheson</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bhavsar</surname><given-names>K</given-names> </name><name name-style="western"><surname>McCambridge</surname><given-names>J</given-names> </name></person-group><article-title>Randomized trial of training and supervision in motivational interviewing with adolescent drug treatment practitioners</article-title><source>J Subst Abuse Treat</source><year>2009</year><month>07</month><volume>37</volume><issue>1</issue><fpage>73</fpage><lpage>78</lpage><pub-id pub-id-type="doi">10.1016/j.jsat.2008.11.001</pub-id><pub-id pub-id-type="medline">19150203</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bahri</surname><given-names>AA</given-names> </name></person-group><article-title>Motivational interviewing to promote healthy lifestyle behaviors: evidence, implementation, and digital applications</article-title><source>J Multidiscip Healthc</source><year>2025</year><volume>18</volume><fpage>6629</fpage><lpage>6642</lpage><pub-id pub-id-type="doi">10.2147/JMDH.S557957</pub-id><pub-id pub-id-type="medline">41104302</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Lai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Thygesen</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Farrington</surname><given-names>J</given-names> </name><name name-style="western"><surname>Keen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name></person-group><article-title>Large language models for mental health applications: systematic review</article-title><source>JMIR Ment Health</source><year>2024</year><month>10</month><day>18</day><volume>11</volume><fpage>e57400</fpage><pub-id pub-id-type="doi">10.2196/57400</pub-id><pub-id pub-id-type="medline">39423368</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>RZ</given-names> </name><name name-style="western"><surname>Zhuang</surname><given-names>YX</given-names> </name><etal/></person-group><article-title>Natural language processing chatbot-based interventions for improvement of diet, physical activity, and tobacco smoking behaviors: systematic review</article-title><source>JMIR Mhealth Uhealth</source><year>2025</year><month>06</month><day>11</day><volume>13</volume><fpage>e66403</fpage><pub-id pub-id-type="doi">10.2196/66403</pub-id><pub-id pub-id-type="medline">40503914</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>Y</given-names> 
</name><etal/></person-group><article-title>KMI: a dataset of Korean motivational interviewing dialogues for psychotherapy</article-title><conf-name>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics</conf-name><conf-date>Apr 29 to May 4, 2025</conf-date><pub-id pub-id-type="doi">10.18653/v1/2025.naacl-long.541</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Almusharraf</surname><given-names>F</given-names> </name><name name-style="western"><surname>Rose</surname><given-names>J</given-names> </name><name name-style="western"><surname>Selby</surname><given-names>P</given-names> </name></person-group><article-title>Engaging unmotivated smokers to move toward quitting: design of motivational interviewing-based chatbot through iterative interactions</article-title><source>J Med Internet Res</source><year>2020</year><month>11</month><day>3</day><volume>22</volume><issue>11</issue><fpage>e20251</fpage><pub-id pub-id-type="doi">10.2196/20251</pub-id><pub-id pub-id-type="medline">33141095</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Leeuwis</surname><given-names>L</given-names> </name><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Hi, I&#x2019;m Cecil (y) the smoking cessation chatbot: the effectiveness of motivational interviewing and confrontational counseling chatbots and the moderating role of the need for autonomy and self-efficacy</article-title><source>Chatbot Research and Design</source><year>2022</year><publisher-name>Springer</publisher-name><fpage>3</fpage><lpage>17</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-25581-6_1</pub-id></nlm-citation></ref><ref 
id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Basar</surname><given-names>E</given-names> </name><name name-style="western"><surname>Wiers</surname><given-names>RW</given-names> </name><name name-style="western"><surname>Antheunis</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Krahmer</surname><given-names>E</given-names> </name></person-group><article-title>Can chatbots help to motivate smoking cessation? A study on the effectiveness of motivational interviewing on engagement and therapeutic alliance</article-title><source>BMC Public Health</source><year>2022</year><month>04</month><day>12</day><volume>22</volume><issue>1</issue><fpage>726</fpage><pub-id pub-id-type="doi">10.1186/s12889-022-13115-x</pub-id><pub-id pub-id-type="medline">35413887</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Karve</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Calpey</surname><given-names>J</given-names> </name><name name-style="western"><surname>Machado</surname><given-names>C</given-names> </name><name name-style="western"><surname>Knecht</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mejia</surname><given-names>MC</given-names> </name></person-group><article-title>New doc on the block: scoping review of AI systems delivering motivational interviewing for health behavior change</article-title><source>J Med Internet Res</source><year>2025</year><month>09</month><day>16</day><volume>27</volume><fpage>e78417</fpage><pub-id pub-id-type="doi">10.2196/78417</pub-id><pub-id pub-id-type="medline">40957014</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>KC</given-names> </name><name name-style="western"><surname>Bressington</surname><given-names>D</given-names> </name><etal/></person-group><article-title>A theory and evidence-based artificial intelligence-driven motivational digital assistant to decrease vaccine hesitancy: intervention development and validation</article-title><source>Vaccines (Basel)</source><year>2024</year><month>06</month><day>25</day><volume>12</volume><issue>7</issue><fpage>708</fpage><pub-id pub-id-type="doi">10.3390/vaccines12070708</pub-id><pub-id pub-id-type="medline">39066346</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moyers</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Rowell</surname><given-names>LN</given-names> </name><name name-style="western"><surname>Manuel</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Ernst</surname><given-names>D</given-names> </name><name name-style="western"><surname>Houck</surname><given-names>JM</given-names> </name></person-group><article-title>The Motivational Interviewing Treatment Integrity code (MITI 4): rationale, preliminary reliability and validity</article-title><source>J Subst Abuse Treat</source><year>2016</year><month>06</month><volume>65</volume><fpage>36</fpage><lpage>42</lpage><pub-id pub-id-type="doi">10.1016/j.jsat.2016.01.001</pub-id><pub-id pub-id-type="medline">26874558</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>M</given-names> </name><etal/></person-group><article-title>CPsyCoun: a report-based multi-turn dialogue reconstruction and evaluation framework for Chinese psychological counseling</article-title><conf-name>Findings of the Association for Computational Linguistics ACL 2024</conf-name><conf-date>Aug 11-16, 2024</conf-date><pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.830</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>White</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Hays</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sandborn</surname><given-names>M</given-names> </name><name name-style="western"><surname>Olea</surname><given-names>C</given-names> </name></person-group><article-title>A prompt pattern catalog to enhance prompt engineering with ChatGPT</article-title><access-date>2026-05-30</access-date><conf-name>Proceedings of the 30th Conference on Pattern Languages of Programs (PLoP &#x2019;23)</conf-name><conf-date>Oct 22-25, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/10.5555/3721041.3721046">https://dl.acm.org/doi/10.5555/3721041.3721046</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Baichuan 2: 
open large-scale language models</article-title><source>arXiv</source><comment>Preprint posted online on Sep 19, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.10305</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zeng</surname><given-names>A</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>D</given-names> </name><etal/></person-group><article-title>ChatGLM: a family of large language models from GLM-130B to GLM-4 all tools</article-title><source>arXiv</source><comment>Preprint posted online on Jun 18, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.12793</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="web"><article-title>shenzhi-wang/Llama3-8B-Chinese-Chat</article-title><source>Hugging Face</source><access-date>2025-11-28</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat">https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Cui</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>X</given-names> </name></person-group><article-title>Efficient and effective text encoding for Chinese LLaMA and Alpaca</article-title><source>arXiv</source><comment>Preprint posted online on Apr 17, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.08177</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ye</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Z</given-names> </name></person-group><article-title>LlamaFactory: unified efficient fine-tuning of 100+ language models</article-title><conf-name>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Aug 11-16, 2024</conf-date><pub-id pub-id-type="doi">10.18653/v1/2024.acl-demos.38</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Balloccu</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Kumar</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Anno-MI: a dataset of expert-annotated counselling dialogues</article-title><conf-name>ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name><conf-date>May 23-27, 2022</conf-date><pub-id pub-id-type="doi">10.1109/ICASSP43922.2022.9746035</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>CHART Collaborative</collab></person-group><article-title>Reporting guideline for chatbot health advice studies: the Chatbot Assessment Reporting Tool (CHART) statement</article-title><source>BMJ Med</source><year>2025</year><volume>4</volume><issue>1</issue><fpage>e001632</fpage><pub-id pub-id-type="doi">10.1136/bmjmed-2025-001632</pub-id><pub-id pub-id-type="medline">40761518</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="report"><person-group person-group-type="author"><collab>Ministry of Education of the People&#x2019;s Republic of China, Ministry of Science and Technology of the People&#x2019;s Republic of China, National Administration of Traditional Chinese Medicine</collab></person-group><article-title>Measures for ethical review of life science and medical research involving humans [Report in Chinese]</article-title><year>2023</year><access-date>2026-06-05</access-date><publisher-name>National Health Commission of the People&#x2019;s Republic of China</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.nhc.gov.cn/wjw/c100375/202302/902b4a1dc3af4aba862a6387e6e376dc.shtml">https://www.nhc.gov.cn/wjw/c100375/202302/902b4a1dc3af4aba862a6387e6e376dc.shtml</ext-link></comment></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation 
citation-type="web"><article-title>chatopera/efaqa-corpus-zh</article-title><source>GitHub</source><access-date>2025-11-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/chatopera/efaqa-corpus-zh?tab=readme-ov-file">https://github.com/chatopera/efaqa-corpus-zh?tab=readme-ov-file</ext-link></comment></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="web"><article-title>ckqqqq/psy-insight: psy-insight dataset and project repository</article-title><source>GitHub</source><access-date>2025-12-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/ckqqqq/Psy-Insight">https://github.com/ckqqqq/Psy-Insight</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xing</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>X</given-names> </name></person-group><article-title>PsyDT: using LLMs to construct the digital twin of psychological counselor with personalized counseling style for psychological counseling</article-title><conf-name>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 27 to Aug 1, 2025</conf-date><pub-id pub-id-type="doi">10.18653/v1/2025.acl-long.55</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Qiu</surname><given-names>H</given-names> </name><name name-style="western"><surname>He</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Li</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lan</surname><given-names>Z</given-names> </name></person-group><article-title>SMILE: single-turn to multi-turn inclusive language expansion via ChatGPT for mental health support</article-title><conf-name>Findings of the Association for Computational Linguistics: EMNLP 2024</conf-name><conf-date>Nov 12-16, 2024</conf-date><pub-id pub-id-type="doi">10.18653/v1/2024.findings-emnlp.34</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>V</given-names> </name><name name-style="western"><surname>Rajwat</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Medda</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ntoutsi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Recupero</surname><given-names>DR</given-names> </name></person-group><article-title>Unlocking LLMs: addressing scarce data and bias challenges in mental health and therapeutic counselling</article-title><access-date>2026-05-30</access-date><conf-name>Proceedings of the First International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security</conf-name><conf-date>Jul 29-30, 2024</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.nlpaics-1.26/">https://aclanthology.org/2024.nlpaics-1.26/</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yosef</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zisquit</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Cohen</surname><given-names>B</given-names> </name><name name-style="western"><surname>Klomek</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Bar</surname><given-names>K</given-names> </name><name name-style="western"><surname>Friedman</surname><given-names>D</given-names> </name></person-group><article-title>The impact of fine-tuning LLMs on the quality of automated therapy assessed by digital patients</article-title><source>Npj Ment Health Res</source><year>2025</year><month>09</month><day>13</day><volume>4</volume><issue>1</issue><fpage>43</fpage><pub-id pub-id-type="doi">10.1038/s44184-025-00159-1</pub-id><pub-id pub-id-type="medline">40946097</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Teferra</surname><given-names>BG</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Johny</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Alignment of large language model responses with human therapists in motivational interviewing</article-title><source>JAMA Netw Open</source><year>2026</year><month>03</month><day>2</day><volume>9</volume><issue>3</issue><fpage>e262750</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2026.2750</pub-id><pub-id pub-id-type="medline">41870428</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mahmood</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ali</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A fully generative motivational 
interviewing counsellor chatbot for moving smokers towards the decision to quit</article-title><conf-name>Findings of the Association for Computational Linguistics: ACL 2025</conf-name><conf-date>Jul 27 to Aug 1, 2025</conf-date><pub-id pub-id-type="doi">10.18653/v1/2025.findings-acl.1283</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Christiano</surname><given-names>PF</given-names> </name><name name-style="western"><surname>Leike</surname><given-names>J</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Martic</surname><given-names>M</given-names> </name><name name-style="western"><surname>Legg</surname><given-names>S</given-names> </name><name name-style="western"><surname>Amodei</surname><given-names>D</given-names> </name></person-group><article-title>Deep reinforcement learning from human preferences</article-title><access-date>2026-05-30</access-date><conf-name>Proceedings of the 31st International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 4-9, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/10.5555/3294996.3295184">https://dl.acm.org/doi/10.5555/3294996.3295184</ext-link></comment></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ouyang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Almeida</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wainwright</surname><given-names>CL</given-names> 
</name></person-group><article-title>Training language models to follow instructions with human feedback</article-title><access-date>2026-05-30</access-date><conf-name>Proceedings of the 36th International Conference on Neural Information Processing Systems (NeurIPS 2022)</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2022/hash/b1efde53be364a73914f58805a001731-Abstract-Conference.html">https://proceedings.neurips.cc/paper_files/paper/2022/hash/b1efde53be364a73914f58805a001731-Abstract-Conference.html</ext-link></comment></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="web"><article-title>CbyerDragon/MI-LLM</article-title><source>GitHub</source><access-date>2026-06-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/CbyerDragon/MI-LLM">https://github.com/CbyerDragon/MI-LLM</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Counseling datasets evaluation criteria, transcription prompts, and automatic evaluation details for large language models for motivational interviewing (MI-LLMs).</p><media xlink:href="formative_v10i1e89077_app1.docx" xlink:title="DOCX File, 36 KB"/></supplementary-material><supplementary-material id="app2"><label>Checklist 1</label><p>CHART checklist.</p><media xlink:href="formative_v10i1e89077_app2.docx" xlink:title="DOCX File, 22 KB"/></supplementary-material></app-group></back></article>