<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e75421</article-id><article-id pub-id-type="doi">10.2196/75421</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>A Behavioral Science-Informed Agentic Workflow for Personalized Nutrition Coaching: Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Yang</surname><given-names>Eric</given-names></name><degrees>MBI</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Garcia</surname><given-names>Tomas</given-names></name><degrees>MPA</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Williams</surname><given-names>Hannah G</given-names></name><degrees>d.MBA</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kumar</surname><given-names>Bhawesh</given-names></name><degrees>MS</degrees><xref ref-type="aff" 
rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ram&#x00E9;</surname><given-names>Martin</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rivera</surname><given-names>Eileen</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ma</surname><given-names>Yiran</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Amar</surname><given-names>Jonathan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Catalani</surname><given-names>Caricia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jia</surname><given-names>Yugang</given-names></name><degrees>MPH, PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Verily Life Sciences</institution><addr-line>2999 Olympus Blvd, Ste 1000</addr-line><addr-line>Dallas</addr-line><addr-line>TX</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Bucher</surname><given-names>Amy</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Martins</surname><given-names>Ana</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Clynes</surname><given-names>Sasha</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Eric Yang, MBI, Verily Life Sciences, 2999 Olympus Blvd, Ste 1000, Dallas, TX, 75019, United States, 1 650-495-7100; 
<email>eryang@verily.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>24</day><month>9</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e75421</elocation-id><history><date date-type="received"><day>03</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>25</day><month>07</month><year>2025</year></date><date date-type="accepted"><day>14</day><month>08</month><year>2025</year></date></history><copyright-statement>&#x00A9; Eric Yang, Tomas Garcia, Hannah G Williams, Bhawesh Kumar, Martin Ram&#x00E9;, Eileen Rivera, Yiran Ma, Jonathan Amar, Caricia Catalani, Yugang Jia. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 24.9.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e75421"/><abstract><sec><title>Background</title><p>Effective management of cardiometabolic conditions requires sustained positive nutrition habits, often hindered by complex and individualized barriers. 
Direct human management is simply not scalable, and deterministic automated approaches to nutrition coaching may lack the personalization needed to address these diverse challenges.</p></sec><sec><title>Objective</title><p>We report the development and validation of a novel large language model (LLM)-powered agentic workflow designed to provide personalized nutrition coaching by directly identifying and mitigating patient-specific barriers.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used behavioral science principles to create a comprehensive workflow that can map nutrition-related barriers to corresponding evidence-based strategies. First, a specialized LLM agent intentionally probed for and identified root causes of a patient&#x2019;s dietary struggles. Subsequently, a separate LLM agent delivered tailored tactics that were designed to overcome those specific barriers. We conducted a user study with individuals with cardiometabolic conditions (N=16) to inform our workflow design and then validated our approach through an additional user study (n=6). We also conducted a large-scale simulation study, grounding on real patient vignettes and expert-validated metrics, where human experts evaluated the system&#x2019;s performance across multiple scenarios and domains.</p></sec><sec sec-type="results"><title>Results</title><p>In our user study, the system accurately identified barriers and provided personalized guidance. Five out of 6 participants agreed that the LLM agent helped them recognize obstacles preventing them from being healthier, and all participants strongly agreed that the advice felt personalized to their situation. In our simulation study, experts agreed that the LLM agent accurately identified primary barriers in more than 90% of cases (27 or 28/30). 
Additionally, experts determined that the workflow delivered personalized and actionable tactics empathetically, with average ratings of 4.17&#x2010;4.79 on a 5-point Likert scale.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our findings demonstrate the potential of this LLM-powered agentic workflow to improve nutrition coaching by providing personalized, scalable, and behaviorally informed interventions.</p></sec></abstract><kwd-group><kwd>behavioral science</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>nutritional coaching</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>The increasing prevalence of cardiometabolic diseases such as diabetes mellitus and hypertension poses a significant global health challenge [<xref ref-type="bibr" rid="ref1">1</xref>]. Effective management of these conditions hinges on sustained lifestyle modifications, with nutrition playing a central role [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. Digital health interventions, including mobile apps and online platforms, have emerged as readily accessible solutions. These platforms offer nutrition guidance, meal planning tools, and sometimes one-on-one interactions with human experts who aim to promote long-term adherence [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. However, traditional coaching models often face barriers like limited accessibility, high costs, and difficulties in scaling hyper-personalization.</p><p>The integration of large language models (LLMs) into digital health nutrition coaching presents an exciting opportunity. LLMs can revolutionize personalized support, optimize intervention effectiveness, and improve access to care. 
These models can engage in natural language conversations, enabling them to provide dynamic interactions beyond traditional static information delivery. Notably, their conversational ability allows LLMs to clarify user intent, answer nutrition-related questions with contextual nuance, provide tailored recommendations that align with individual preferences and needs, and adapt their approach based on user feedback [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. This personalized conversational approach, combined with the accessibility and scalability of digital platforms, holds immense promise for transforming nutrition coaching and improving the health of individuals around the world. In addition, recent research has showcased LLMs&#x2019; potential in gaining user acceptance and delivering accurate domain-specific knowledge, specifically in nutrition applications addressing cardiometabolic conditions [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>Yet, simply delivering nutrition information through general foundation models is unlikely to achieve sustained behavior change. Leveraging the principles of behavioral science is critical for designing sustainable interventions that address the physical, psychological, and social factors influencing dietary choices. Frameworks such as the capability-opportunity-motivation-behavior (COM-B) model are instrumental in understanding the multidimensional factors that contribute to current behavioral patterns [<xref ref-type="bibr" rid="ref14">14</xref>]. 
Once the factors preventing behavioral change are better understood, additional frameworks such as the Behavioral Change Wheel, the Behavioral Change Taxonomy (BCT) of 93 hierarchically clustered techniques, and the Easy, Attractive, Social, and Timely (EAST) framework can provide the right approach and tactics on how to promote change [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. These frameworks offer insights into designing interventions that leverage psychological principles to enhance engagement, simplify behavior, make actions rewarding, and trigger positive associations. Combining these complementary frameworks, addressing both the root causes and offering solutions, is crucial to realize LLMs&#x2019; potential in delivering sustainable behavioral change.</p></sec><sec id="s1-2"><title>Related Work</title><p>The application of LLMs specifically to nutrition is a rapidly emerging field. Foundational work has demonstrated their utility for core nutritional analysis tasks, such as generating personalized meal plans based on caloric targets and decomposing compound ingredients for precise analysis [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Building on these capabilities, a second stream of research focuses on developing interactive, conversational nutrition assistants. Systems like ChatDiet, for example, use an orchestrator with personal and population data models to deliver highly personalized food recommendations [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Other work explores multimodality, such as the Purrfessor chatbot which integrates visual meal analysis with conversational advice to enhance user engagement [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. 
However, while these systems show increasing sophistication in processing nutritional data, their design often overlooks the principles required for sustained behavior change.</p><p>Recent research has also explored the potential of LLMs for digital coaching in health behavior change [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref26">26</xref>]. Some literature has reported on interventions that often rely on high-level motivational strategies, utilizing LLMs to deliver advice primarily through broad motivational interviewing techniques or empathetic tones [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. Others have laid groundwork to infuse behavior science principles to the guidance provided by LLMs, incorporating frameworks like COM-B to identify barriers, but their reliance on single conversational turns for barrier classification may limit the depth of their assessments [<xref ref-type="bibr" rid="ref22">22</xref>]. A separate group of studies has explored LLMs&#x2019; ability to leverage external tools effectively to foster user engagement and self-reflection [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], or has investigated LLM capabilities to segment users in order to set up different action courses [<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>These separate points of focus in the relevant literature highlight one-by-one the multiple components required for effective full coaching strategies, but several opportunities remain to enhance the integration of behavioral science principles with LLM capabilities, in order to gather individualized user insights and ultimately inform targeted, actionable behavior change tactics. 
There may be benefits in framing the coaching strategy as a succession of focus points, starting with quantifying barriers from a behavioral science perspective using iterative motivational probing processes, followed by offering specific tactics tailored to individual needs. Overall, there is significant potential for a nuanced integration of behavioral science expertise in the design and development of LLM-powered digital coaching interventions to support more effective and sustainable behavior change.</p><p>Our aim in this study was to introduce a novel approach in leveraging LLM-powered nutrition coaching for cardiometabolic condition management by advancing the integration of behavioral science principles and developing a comprehensive, scalable, and expert- and user-validated framework. Our research directly addressed current limitations in the field by creating a multi-agent conversational workflow powered by a deep understanding of nutrition-related barriers and corresponding strategies that directly mitigated the barriers. Through motivational probing, our approach directly identified the root causes of dietary behavior rather than addressing the surface-level symptoms. It fostered a personalized coaching experience, moving beyond high-level motivational techniques and offering targeted tactics supported by behavioral science research. Our approach also enabled a learning system, where barrier-strategy mappings could be tailored to specific individual context, habits, and tactic adoption. We summarize the steps in our study as follows.</p></sec><sec id="s1-3"><title>Study Overview</title><sec id="s1-3-1"><title>User Research and Literature Review for Comprehensive Barrier Identification and Strategy Mapping</title><p>We conducted a user research study to identify common nutrition-related barriers experienced by cardiometabolic patients. In addition, we performed a comprehensive literature review of academic research papers and internal reports. 
We then synthesized these barriers into an overarching set of main barriers encompassing all aspects of nutrition goal achievement. Furthermore, we mapped these barriers to a comprehensive set of strategies and behavioral science tactics, enabling a tailored approach to addressing barrier-specific challenges. This went beyond existing research by providing a more holistic and nuanced understanding of nutrition barriers and their corresponding solutions, paving the way for targeted interventions.</p></sec><sec id="s1-3-2"><title>Multi-Agent Workflow Design for Personalized Coaching</title><p>Building on our comprehensive barrier and strategy mapping and insights from user research, we designed a multi-agent workflow where a specialist LLM agent was tasked with probing and classifying barriers through conversation, while another specialist LLM agent carried out strategies and offered specific tactics. We opted for a multi-agent approach after preceding work (E Yang, MBI, unpublished data, September 2024) showed suboptimal results using a single agent across all the coaching tasks (barrier identification and strategy execution). The present results show that this multi-agent approach improved upon other existing single-agent strategies.</p></sec><sec id="s1-3-3"><title>Real-World Validation: Collecting Impressions From Cardiometabolic Populations</title><p>To ensure practical application and effectiveness, we validated our workflow with participants representative of the cardiometabolic populations we aim to serve. This direct validation provided evidence for the efficacy of our system in addressing real-world nutrition challenges, enhancing the credibility and impact of our research. 
The real-world validation set our work apart from purely simulation frameworks.</p></sec><sec id="s1-3-4"><title>Benchmarking and Expert Annotation</title><p>Along with behavioral science experts, we used a granular benchmark for evaluating the performance of our LLM agents in various stages, including barrier identification, tactic delivery, and overall conversational attributes. Expert annotations on these metrics provided a rigorous evaluation of our system&#x2019;s performance.</p></sec><sec id="s1-3-5"><title>Simulation-Based Evaluation at Scale</title><p>We curated real patient vignettes from our user study to generate large-scale simulated conversations of various realistic barrier situations. These data were then evaluated by LLM auto-evaluators, allowing for a scalable and accurate assessment of our system&#x2019;s performance across a diverse range of scenarios. This simulation-based evaluation approach provided a scalable method for evaluating the generalizability of our LLM-powered coaching system, setting a standard for evaluating similar artificial intelligence (AI)-driven interventions.</p></sec></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>Participants provided informed consent and were compensated at the rate of US $100 per hour. Verily Life Sciences research committee performed the ethical review and approved the study plan, declaring the research and resulting dataset exempt. For privacy and confidentiality protection, data were deidentified for analysis and subsequent publication.</p></sec><sec id="s2-2"><title>User Research Study</title><sec id="s2-2-1"><title>Research Objectives</title><p>Conducting a user research study was crucial for designing a coaching workflow with the specific needs of individuals with cardiometabolic conditions in mind. 
By directly involving the target population, we aimed to gain a deeper understanding of realistic nutrition challenges and preferences, ensuring that the system is designed to provide relevant, practical, and effective guidance.</p><p>This research study aimed to uncover key user insights necessary for building a patient-centered AI coaching workflow. Specifically, we sought to answer 2 main research questions: (1) What user motivations, characteristics, and challenges must our workflow understand and adapt to in order to be effective? (2) How can an AI coach&#x2019;s character and conversational patterns inspire trust and engagement while delivering behavioral science strategies?</p></sec><sec id="s2-2-2"><title>Participants</title><p>A total of 16 participants were recruited through third-party services following a purposeful sampling strategy that selected for a recent cardiometabolic diagnosis and, beyond that, for diversity in demographics, US geographic regions, health conditions, attitudes toward health care and technology, and familiarity and comfort with AI. Participants were recruited for sessions across a 12-week period.</p></sec><sec id="s2-2-3"><title>Procedure</title><p>Each participant took part in one-on-one, semi-structured qualitative interviews led by the research team. These interviews focused on participants&#x2019; medical histories, the barriers they encountered when working toward their health goals, and their responses to different conversational approaches, as well as their familiarity and general perceptions about AI. Participants shared their experiences and reflected on the specific challenges they faced in managing their health, while also discussing strategies they had found useful. 
The first step in our research plan, before addressing the second research objective, consisted of participants engaging in interactive exercises where they conversed with 2 types of LLM agents: a supportive agent and an assertive agent (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, section A). The supportive agent facilitated conversation by treating the user as the expert on their body and experience. This agent encouraged self-reflection and demonstrated high levels of compassion, curiosity, and affirmation. Its tone was plain-spoken, easygoing, and patient, kindly taking direction from the user. In contrast, the assertive agent adopted a directive approach, positioning itself as the resident expert with high energy, authoritative knowledge, and a strategic mindset. This agent aimed to empower the user by being assertive and eager to motivate change. Both agents were powered by Gemini-1.5 Pro, a commercially available model known for its conversational and reasoning capabilities [<xref ref-type="bibr" rid="ref27">27</xref>]. Their character and tone were guided by specific phrasing and conversational styles in single prompts. From this exercise, we selected the agent type to be carried forward to assemble our agentic workflow. The sessions were conducted on a web-based chat interface, under the observation of user-experience researchers; AI agents were described to participants as &#x201C;genies&#x201D; that could work autonomously or in collaboration with clinicians.</p><p>To analyze the output of these user study sessions, analysts used a modified grounded approach to qualitative analysis, using prior behavior change theories and user experience expertise in the data analysis process. The process began with qualitative coding where analysts labeled sections of data with short words and phrases, capturing what was being communicated. 
Then, similar codes were grouped together into broader categories that represent recurring patterns. These themes included promoters and detractors of trust and engagement, attitudes toward AI, needs, and expectations. Then, analysts reviewed and refined each theme to ensure fit with matching data and accuracy. Assessing each theme and groups of themes enabled the development of key insights on preferred conversational style. Overall, this approach allowed for a focused exploration of specific aspects of AI desirability and behavior change motivation, while grounding observations on the data itself.</p></sec></sec><sec id="s2-3"><title>Behavioral Science Agentic Workflow Design</title><sec id="s2-3-1"><title>Curation of Barriers and Behavioral Science Strategies</title><p>Recognizing that the barriers described by the participants in the user study may not be all-encompassing, we additionally researched the challenges faced by people with cardiometabolic conditions in the literature and marketplace. We analyzed numerous research papers and reports, covering areas like nutrition, medication adherence, exercise, and goal setting. This helped us identify over 100 total barriers individuals experience. Next, we used affinity mapping to organize these barriers, grouping them based on common themes and patterns. By prioritizing the most frequent and impactful challenges, we narrowed our focus to 28 key barriers (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> section B).</p><p>To uncover effective strategies and tactics for addressing these barriers, we conducted a comprehensive review of existing behavioral science frameworks and developed a solutions repository. First, we examined established frameworks, seeking strategies and tactics with proven efficacy in overcoming similar barriers within the health domain. 
Our review encompassed well-known models like the COM-B model, the BCT Taxonomy, the EAST framework, and other behavioral change models [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>] (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> section B). Following this extensive mapping exercise, we curated a final selection of a repository containing over 50 strategies and 100 tactics that are mapped to the 28 barriers. This repository comprehensively linked each identified barrier to a range of potential solutions, offering guidance on optimal implementation to maximize impact. There are popular frameworks and pre-defined sequences for intervention design, which strictly connect the COM-B model to the Behavior Change Wheel to guide the selection of &#x201C;intervention functions&#x201D; and &#x201C;policy domains.&#x201D; Although we recognize the value of such approaches, we deliberately chose not to explicitly link our workflow to any singular pathways at this stage. We found that the incorporation of other tools, like the 93 Behavioral Change Techniques Taxonomy v1 (BCTv1) and the EAST framework, could be not only suitable but even more appropriate for our population of interest and for our agent capabilities. Ultimately, this decision allowed for greater flexibility and innovation to optimize and personalize the way our agent responded to our population of interest. We could explore the full potential of our future capabilities, ensuring no constraints by any predetermined intervention pathways. 
A few examples of the mapping of barriers, strategies, and tactics are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Examples of mapping of barriers, strategies, and tactics.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Barriers</td><td align="left" valign="bottom">Strategies<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">Tactics<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Decision fatigue: The mental exhaustion from making too many choices can lead to poor, suboptimal future decisions. For example, &#x201C;As a working parent I have to deal with so many things that I don&#x2019;t want to think about cooking when I get home.&#x201D; [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">Heuristics: Mental heuristics are cognitive shortcuts our brains use to simplify complex situations and make quick decisions [<xref ref-type="bibr" rid="ref29">29</xref>].</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Rules of thumb: Offer the user a set of actionable principles to help them automate tasks. For example, &#x201C;Always fill one third of your plate with lean protein.&#x201D;</p></list-item><list-item><p>Default: Encourage the user to pick an option and use it as a default to save time and mental effort. For example, &#x201C;Let&#x2019;s set Tuesday as a kale salad day.&#x201D; [<xref ref-type="bibr" rid="ref29">29</xref>]</p></list-item></list></td></tr><tr><td align="left" valign="top">Present bias: It is the user&#x2019;s tendency to overvalue immediate rewards over larger future rewards. For example, &#x201C;It&#x2019;s the weekend! 
Let&#x2019;s start with dieting next week.&#x201D; [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">Future self. Future self is a vivid and emotional connection with a future version of ourselves that affects our intention to engage in a future behavior [<xref ref-type="bibr" rid="ref31">31</xref>].</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Mental rehearsal of successful performance: Encourage the user to practice visualizing themselves successfully performing the desired behavior in realistic scenarios. For example, suggest they vividly imagine eating greens and feeling light to continue with the day [<xref ref-type="bibr" rid="ref15">15</xref>].</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Strategy: a high-level behavioral science concept aimed to mitigate a barrier.</p></fn><fn id="table1fn2"><p><sup>b</sup>Tactic: concrete, tangible, and actionable step to execute on a given strategy.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3-2"><title>Multi-Agent Workflow Description</title><p>This section details the architecture of the multi-agent LLM coaching workflow, aiming to provide tailored nutrition guidance to cardiometabolic patients by identifying and addressing their unique barriers to achieving their nutrition goals. As shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>, the system operated through 2 core agents, seamlessly integrated to offer a unified user experience: the barrier identification agent and the strategy execution agent. Both agents were powered by Gemini-1.5 Pro and were prompted to converse in the supportive manner preferred by participants in the user study. We developed the system in Python 3.9 using the Vertex AI API for Gemini. 
The specific instructions for each agent were defined in system prompts passed during the model&#x2019;s instantiation, and we used a chat session object to maintain the conversational context.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Summary of study steps. (A) The multi-agent artificial intelligence (AI) coach workflow was infused with behavioral science principles to address patients&#x2019; barriers toward their nutrition goals. The workflow consisted of 2 core agents, the barrier identification agent and the strategy execution agent. (B) To assess the performance of our workflow at scale, we developed patient simulators that portray validated nutrition vignettes drawn from real cardiometabolic patient profiles. (C) Real cardiometabolic patients and patient simulators interacted with our multi-agent AI coach workflow. The conversation experiences were evaluated via survey, expert annotation of dialogs, and were additionally compared to a baseline large language model (LLM) via auto-evaluation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e75421_fig01.png"/></fig><p>The barrier identification agent initiated conversations by inquiring about the user&#x2019;s current nutrition goals and progress. It then employed motivational interviewing techniques to explore the specifics of the user&#x2019;s nutrition habits, with a focus on identifying the most prominent barrier that may be hindering progress toward their stated goal. The agent was equipped with a predefined taxonomy of 28 barrier concepts, provided within the prompt alongside detailed descriptions and examples. Through iterative dialog, the agent analyzed the user&#x2019;s responses to classify the identified struggles into one of the barrier concepts. While a user could exhibit multiple barrier concepts, the agent was instructed to prioritize and focus on the most prominent one displayed by the user. 
By tasking the agent with the identification of a primary barrier (as opposed to relying on users themselves), we circumvented burdening users with triaging technical barrier concepts with which they may be unfamiliar, or sorting through voluminous associated tactic lists that may be overwhelming. Once the agent determined that sufficient information had been gathered for barrier classification, it generated an internal summary of the conversation and the identified barrier concept, which was then relayed to the strategy execution agent. The prompt used by the barrier identification agent can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, section C.1.</p><p>The strategy execution agent received the conversation summary and the identified barrier concept from the barrier identification agent. The agent relied on a structure where each barrier was mapped to a strategy (a high-level behavioral science concept aimed to mitigate a barrier) and, ultimately, to a list of tactics (concrete, tangible, and actionable steps to execute on that strategy) to be carried out by the strategy execution agent. Therefore, each barrier concept eventually mapped to a set of potential tactics, each with examples and an associated execution sequence outlining mandatory and optional tactics, as well as their prioritization. This way, once the agent received the input about the barrier, it retrieved the corresponding tactics and execution sequences from a predefined table. The agent then engaged in further conversation with the user, drawing upon the preceding conversation summary and the prescribed tactics. For the optional tactics, the agent dynamically adapted its offerings based on user responses and perceived acceptability and effectiveness of the previously deployed tactics. 
The conversation concluded when the agent determined that the user was sufficiently equipped with the necessary tools to overcome their identified barrier and progress toward their nutrition goals. The prompt used by the strategy execution agent can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, section C.2.</p><p>Although the back-end functionality relied on 2 distinct core agents, agent orchestration ensured that the user interacted with the system as a single, continuous AI-powered nutrition coach. This seamless transition between barrier identification and strategy execution was facilitated by the orchestration of information exchange between agents. Crucially, conversation summaries and key outputs, including identified barriers related to nutrition, were explicitly transferred between the agents, ensuring the preservation of all necessary context while excluding irrelevant banter. This modular design allowed each agent to specialize in its respective task, enabling more focused model improvements for both barrier identification and strategy execution. An example of the agent orchestration can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, section D.2.</p></sec><sec id="s2-3-3"><title>Evaluation of User Impressions of Workflow</title><sec id="s2-3-3-1"><title>Research Objectives</title><p>Having implemented the AI coaching workflow grounded in the insights from the initial user research, we conducted further sessions to evaluate users&#x2019; impressions of the workflow through direct interactions. This phase aimed to test the real-world applicability and accuracy of the system&#x2019;s barrier identification and strategy execution capabilities, providing a critical assessment of how well the workflow translates theoretical principles into practical outcomes. 
Engaging with users also helped us uncover any usability or communication insights that may not have been apparent in agentic workflow and prompt development. Key questions guiding this evaluation were as follows:</p></sec><sec id="s2-3-3-2"><title>Effectiveness of Behavioral Science-Informed Workflow</title><p>How did users who are managing health concerns respond to our workflow infused with behavioral science frameworks? How effectively did our workflow help users identify obstacles that prevent them from being healthier? Were the strategies and tactics offered by our workflow easy for users to put into action? How did our workflow impact users&#x2019; motivation and confidence to make positive changes in their health?</p></sec><sec id="s2-3-3-3"><title>Building Trust and Engagement Through AI Interactions</title><p>Did our AI coach&#x2019;s character and personality inspire trust and engagement? To what degree did users feel supported by our coaching workflow?</p></sec><sec id="s2-3-3-4"><title>Participants</title><p>We selected a subset of 6 participants out of the 16 in the user research study described in the prior section, for direct conversational interaction with our LLM coaching workflow in a single session. This scaled-down sample allowed us to conduct the study in a timely fashion. The subset was selected on the basis of diversity with respect to AI comfort levels and stated motivation levels. The participants were primed to think of challenges they might face achieving a nutrition goal they had set, and to channel that while interacting with our coaching workflow. After the experience, the participants were asked to reflect on their interactions through qualitative interviews and to complete a customized survey for this study. 
Analysts used a modified-grounded approach to qualitative analysis.</p></sec></sec></sec><sec id="s2-4"><title>Simulation Study</title><sec id="s2-4-1"><title>Patient Vignettes</title><p>To assess the performance of our multi-agent LLM coaching workflow at a larger scale, we conducted a simulation study with the high-level workflow illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>. Leveraging patient profiles curated from the user research study that contained patient lifestyle context and medical history, we identified the prominent nutrition barriers for each profile, aligning them with our predefined taxonomy of 28 barrier concepts. This involved a manual process of translating patients&#x2019; self-reported nutrition struggles into corresponding barrier concepts. Subsequently, for each identified barrier within each profile, we crafted a detailed patient nutrition vignette paragraph with a separate base Gemini model. These vignettes aimed to vividly depict the specific manifestation of that barrier within the context of the individual&#x2019;s profile. 
Providing prewritten nutrition vignettes allowed us to:</p></sec><sec id="s2-4-2"><title>Focus the Patient Simulator</title><p>Our approach enabled the downstream patient simulator to concentrate on portraying the specific barrier and engaging in a meaningful conversation, without the need to generate complex nutrition narratives dynamically.</p></sec><sec id="s2-4-3"><title>Enhance Control and Consistency</title><p>We enabled greater control over the simulated conversations, minimizing the risk of inconsistencies or incoherent narratives that might arise from on-the-fly story generation.</p></sec><sec id="s2-4-4"><title>Isolate Barrier Identification Impact</title><sec id="s2-4-4-1"><title>Overview</title><p>Our approach allowed us to isolate and assess the system&#x2019;s ability to accurately identify and address specific barriers, independent of other confounding factors within a patient&#x2019;s profile.</p><p>In total, we simulated a wide range of scenarios across diverse patient profiles, nutrition goals, and barriers, resulting in 187 total simulated vignettes. We provide a sample vignette generated from a barrier concept for a patient profile in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, section D.1.</p><p>To ensure the quality and relevance of the generated vignettes with respect to the simulated behavioral barrier, we leveraged OpenAI&#x2019;s GPT-4o (gpt-4o-2024-08-06) model in an LLM-as-judges framework, also known as auto-evaluation [<xref ref-type="bibr" rid="ref32">32</xref>]. The GPT-4o model was intentionally chosen as the vignettes were generated by a Gemini model. The auto-evaluator received two inputs: the target barrier to be simulated and the generated patient vignette. 
It then evaluated the vignette across four dimensions: evidence, realism, completeness, and leakage.</p></sec><sec id="s2-4-4-2"><title>Evidence (High/Medium/Low)</title><p>The extent to which the vignette provided clear indications that the patient&#x2019;s behavior or thoughts were influenced by the target barrier.</p></sec><sec id="s2-4-4-3"><title>Realism (High/Medium/Low)</title><p>The plausibility and believability of the depiction of the target barrier, reflecting how it might manifest in a real person&#x2019;s life.</p></sec><sec id="s2-4-4-4"><title>Completeness (High/Medium/Low)</title><p>The sufficiency of details provided in the vignette to fully understand the impact of the target barrier on the patient&#x2019;s ability to achieve their nutrition goals.</p></sec><sec id="s2-4-4-5"><title>Leakage (Yes/No)</title><p>Whether the vignette directly mentioned the technical term of the target barrier. It was important that vignettes do not explicitly contain the technical terms, which may skew the barrier identification task down the line.</p></sec></sec></sec><sec id="s2-5"><title>Conversation Simulation</title><p>Having established a robust set of patient vignettes, we proceeded to simulate dialogs between a Gemini-powered patient simulator and our LLM coaching workflow for each of the 153 higher quality vignettes. The patient simulator was provided with two key inputs: the generated vignette from the corresponding patient profile and barrier, and the communication style observed for that patient during the user research study. Notably, the patient simulator&#x2019;s nutrition barriers were only portrayed via the vignette without explicit mentioning of any technical behavioral science barrier terms. The patient simulator, guided by these inputs, engaged in a conversation with the AI coach. 
The dialog continued until the AI coach determined, based on its internal logic and the patient&#x2019;s responses, that the patient had reached a higher level of preparedness to address their nutrition-related challenges. This endpoint represented a complete coaching interaction within the simulation framework. An example of simulated conversations can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, section D.2.</p></sec><sec id="s2-6"><title>Expert Assessment</title><sec id="s2-6-1"><title>Overview</title><p>Evaluating the quality and effectiveness of the simulated coaching conversations was a crucial step in assessing the overall performance of our multi-agent LLM workflow. We leaned on our behavioral science expertise to develop an evaluation rubric encompassing 5 key dimensions: barrier identification accuracy, tactic comprehensiveness, tactic personalization, tactic actionability, and conversation empathy. These dimensions were chosen to reflect the core competencies required for effective digital coaching in the context of motivating dietary behavioral change.</p></sec><sec id="s2-6-2"><title>Barrier Identification Accuracy</title><p>Given the list of 28 nutrition barrier concepts, the AI coach identified the correct patient barrier. Accurate identification of the patient&#x2019;s primary barrier is paramount for delivering tailored and effective interventions. Rating options: &#x201C;Yes&#x201D; or &#x201C;No.&#x201D;</p></sec><sec id="s2-6-3"><title>Tactic Comprehensiveness</title><p>The AI coach delivered all the instructed primary tactics to the patient via clear descriptions or relevant examples in an understandable way. Ensuring the delivery of all intended coaching tactics is crucial for maximizing the potential impact of the intervention. 
Rating options: &#x201C;Yes&#x201D; or &#x201C;No.&#x201D;</p></sec><sec id="s2-6-4"><title>Tactic Personalization</title><p>For the tactics that were delivered, the AI coach made the tactics personalized to the patient&#x2019;s unique context. Personalizing coaching tactics to the individual&#x2019;s unique context and circumstances enhances engagement and promotes behavior change. Rating options: 5-point Likert scale.</p></sec><sec id="s2-6-5"><title>Tactic Actionability</title><p>The AI coach discussed actionable steps to help patients overcome their barriers toward their nutrition goals. Providing clear, actionable steps empowers patients to translate intentions into concrete behaviors. Rating options: 5-point Likert scale.</p></sec><sec id="s2-6-6"><title>Conversation Empathy</title><p>The AI coach provided encouragement and motivation to the patient empathetically. Expressing empathy and providing emotional support fosters a positive and trusting coach-patient relationship. Rating options: 5-point Likert scale.</p><p>The corresponding conversations that were simulated for the 50 randomly selected generated vignettes were assessed by human experts with extensive academic training and professional experience in behavioral science. Specifically, two behavioral science experts independently labeled 30 simulated conversations each, with 10 overlapping conversations used to assess inter-rater reliability.</p></sec></sec><sec id="s2-7"><title>Comparative Study</title><p>To evaluate the advantages of our multi-agent LLM workflow infused with behavioral science principles, we conducted a comparative study against a single base Gemini model that lacked explicit behavioral science knowledge infusion and did not employ a multi-agent approach. The base model was instructed to assist patients in overcoming their nutrition challenges by leveraging its inherent capabilities to apply motivational interviewing and behavioral science tactics without structured guidance. 
We simulated coaching conversations with the same patient simulator used previously.</p><p>Then, a GPT-4o auto-evaluator was employed to determine preference between the 2 conversation sets based on behavioral science criteria. To ensure a fair comparison, we measured the conversation lengths across both conversation sets, ensuring that they contained a comparable number of characters to control for LLM preference for longer contexts. The order in which the conversations were presented to the evaluator was alternated to additionally control for any position bias.</p><p>To generate an additional reference data point, the same two behavioral science experts from the Expert Assessment exercise independently and blindly evaluated a randomly shuffled subset of 30 pairs (10 overlapping) of these simulated conversations, namely, conversations from our multi-agent LLM workflow and from the basic Gemini model.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>User Research Study</title><p>We enrolled N=16 participants in this portion of our study (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Characteristics of the study participants (N=16).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Value, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Sex</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">8 (50)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">8 (50)</td></tr><tr><td align="left" valign="top">Age (y)</td><td align="left" valign="top"/></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x003C;45</td><td align="left" valign="top">5 (31.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>45&#x2010;65</td><td align="left" valign="top">9 (56.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x003E;65</td><td align="left" valign="top">2 (12.5)</td></tr><tr><td align="left" valign="top">Race/ethnicity</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>American Indian/Alaskan Native</td><td align="left" valign="top">1 (6.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Black/African American</td><td align="left" valign="top">6 (37.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hispanic/Latino</td><td align="left" valign="top">2 (12.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>White</td><td align="left" valign="top">7 (43.8)</td></tr><tr><td align="left" valign="top">Recent<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> cardiometabolic diagnoses</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hypertension</td><td align="left" valign="top">9 (56.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Prediabetes</td><td align="left" valign="top">4 (25)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Type 2 diabetes</td><td align="left" valign="top">10 (62.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hyperlipidemia</td><td align="left" valign="top">2 (12.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Obesity</td><td align="left" valign="top">15 (93.8)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Diagnosis within the last 6 months, n=13; diagnosis more than 6 months ago, n=3.</p></fn></table-wrap-foot></table-wrap><p>The user research study revealed crucial insights into the barriers and motivations that should be integrated into the design of the AI coaching workflow. Participants shared detailed accounts of their physical, psychological, and social challenges in achieving their health goals. Common themes that emerged included balancing competing priorities, dealing with physical limitations, and struggling with low self-efficacy. For example, participants frequently expressed feeling overwhelmed by the demands of daily life, which made it difficult to sustain motivation to work on their health over time.</p><p>Additionally, participants highlighted the importance of feeling understood and empowered by a coach. A preference emerged for the supportive agent, as most participants felt more comfortable and engaged with its empathetic approach. They noted that the supportive agent fostered trust and provided a space for self-reflection. Although a few participants appreciated the assertive agent&#x2019;s high-energy style, the majority found it to be less aligned with their desire for a collaborative and self-empowering experience. 
This feedback was instrumental in refining the AI coaching workflow, which ultimately prioritized the compassionate, user-driven style of the supportive agent to promote long-term engagement and behavior change.</p><p>Informed by these findings, we established two guiding principles for the agentic workflow design. First, the workflow must move beyond generic advice to probe and understand the root causes of a user&#x2019;s barriers, ensuring that subsequent strategies are practical, incremental, and tailored to their personal context. Second, every interaction must be delivered through the supportive, conversational style that participants confirmed was essential for fostering trust and engagement.</p></sec><sec id="s3-2"><title>User Impressions of Workflow</title><sec id="s3-2-1"><title>Effectiveness of Behavioral Science-Informed Workflow</title><p>The AI coaching workflow, grounded in behavioral science principles, was found to be effective in addressing barriers users have toward healthier habits. First, the AI coach was successful in helping users identify specific barriers to their health. Five out of 6 participants agreed that the assistant helped them recognize obstacles that prevented them from being healthier. Participants also indicated that their interactions with the AI coach were informative. When asked whether they learned something new about their health habits, 4 out of 6 participants strongly agreed, and the remaining somewhat agreed with the sentiment. As a participant reflected, the agent&#x2019;s approach of leading with additional questions and building from user experiences enabled &#x201C;more refined, bite-sized&#x201D; understanding, enhancing quality of the nutrition motivations and barriers insights collected. Furthermore, participants responded positively to the strategies and tactics provided by the AI coach. 
All participants strongly agreed that the AI coach&#x2019;s advice felt personalized to their situation, reflecting its ability to adapt to users&#x2019; unique needs. Notably, all participants agreed that the AI coach&#x2019;s advice was easy to put into action, highlighting its practicality. Participants appreciated the focus on manageable, small changes, with one stating, &#x201C;Making small changes makes a big difference,&#x201D; highlighting the perceived effectiveness of the behavior change strategies recommended by the AI coach. The AI coach&#x2019;s ability to offer novel suggestions tailored to individual preferences was highlighted by participants, such as when one participant appreciated the recommendation to swap quinoa porridge for oatmeal, and another found value in being advised on alternatives to high-sugar protein drinks. The impact of these strategies on user motivation was significant, with all participants agreeing that the AI coach increased their confidence to make positive changes in their health. Each participant also reported feeling more motivated after their conversations due to excitement about having a plan to implement manageable, novel, small changes.</p></sec><sec id="s3-2-2"><title>Building Engagement Through AI Interactions</title><p>Overall, the AI coaching workflow fostered relationship building and engagement through its conversational style and empathetic personality. All participants strongly agreed that they felt supported by the AI coach and were interested in having future conversations about their health, indicating a high level of engagement. 
One participant noted, &#x201C;I could sit there and talk to that thing all day.&#x201D; Furthermore, participants frequently described the AI coach as having a &#x201C;friendly&#x201D; and &#x201C;human-like&#x201D; demeanor, with one participant mentioning, &#x201C;It did feel like you&#x2019;d spoken with somebody, like you had an actual conversation with somebody.&#x201D; As one participant noted, the AI coach &#x201C;seemed to be friendly, kind of human-like,&#x201D; creating a sense of connection and understanding that facilitated deeper discussions about their nutrition habits. The AI coach&#x2019;s ability to create a supportive environment was evident, with all participants agreeing that they felt comfortable discussing their health issues with it. This sense of comfort was further supported by the finding that the great majority of participants strongly agreed that their conversations were engaging. The AI coach&#x2019;s empathetic and person-centered communication style not only facilitated open dialog but also encouraged continued user interaction, which is crucial for sustained engagement and positive health outcomes. The full survey results can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, section E.</p></sec></sec><sec id="s3-3"><title>Simulation Study</title><sec id="s3-3-1"><title>Assessment of the Patient Vignette Auto-Evaluator</title><p>To calibrate the auto-evaluator&#x2019;s performance, we conducted an adversarial analysis. We evaluated the auto-evaluator on a patient vignette paired with a randomly chosen behavioral barrier different from the one simulated in the vignette. This mismatch was designed to assess the auto-evaluator&#x2019;s sensitivity to inconsistencies between the intended barrier and the generated narrative. We hypothesized that scores for these adversarial cases would be significantly lower, reflecting the lack of alignment between the vignette and the incorrect barrier. 
The results of this analysis are presented in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Auto-evaluator performance on matched (N=187) and mismatched vignettes (N=187).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">Matched vignette, n (%)</td><td align="left" valign="bottom" colspan="3">Mismatched vignette, n (%)</td></tr><tr><td align="left" valign="top">Dimension</td><td align="left" valign="top">High</td><td align="left" valign="top">Medium</td><td align="left" valign="top">Low</td><td align="left" valign="top">High</td><td align="left" valign="top">Medium</td><td align="left" valign="top">Low</td></tr></thead><tbody><tr><td align="left" valign="top">Evidence</td><td align="left" valign="top">184 (98.4)</td><td align="left" valign="top">3 (1.6)</td><td align="left" valign="top">0</td><td align="left" valign="top">36 (19.3)</td><td align="left" valign="top">32 (17.1)</td><td align="left" valign="top">119 (63.6)</td></tr><tr><td align="left" valign="top">Realism</td><td align="left" valign="top">187 (100)</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">84 (44.9)</td><td align="left" valign="top">94 (50.3)</td><td align="left" valign="top">9 (4.8)</td></tr><tr><td align="left" valign="top">Completeness</td><td align="left" valign="top">156 (83.4)</td><td align="left" valign="top">31 (16.6)</td><td align="left" valign="top">0</td><td align="left" valign="top">19 (10.2)</td><td align="left" valign="top">46 (24.6)</td><td align="left" valign="top">122 (65.2)</td></tr></tbody></table></table-wrap><p>Of the 187 simulated vignettes, 153 of them received high marks across evidence, realism, and completeness dimensions by the auto-evaluator. These higher quality vignettes were selected for downstream conversation simulation. 
As hypothesized, the auto-evaluator assigned significantly lower scores to the mismatched vignettes across all dimensions. This indicates the auto-evaluator&#x2019;s ability to discern between accurately and inaccurately represented barriers, supporting its validity for assessing vignette quality. In addition, there was no barrier term concept leakage across all cases. Our vignette generation and validation processes provided a robust foundation for our simulation study, ensuring realistic and diverse scenarios for evaluating the effectiveness of our multi-agent LLM coaching workflow.</p></sec><sec id="s3-3-2"><title>Expert Assessment</title><p>The evaluation of the simulated coaching conversations by human experts provided important insights into the performance of our multi-agent LLM workflow. Full results are shown in <xref ref-type="table" rid="table4">Table 4</xref>. For Barrier Identification Accuracy, the experts agreed that the AI coach accurately identified the primary patient barrier in at least 90% of cases (in 27/30 cases for 1 reviewer and 28/30 for the other), and the cases with a low accuracy score were due to another barrier being deemed more prominent than the one selected by the agent, not to an inability of the agent to identify a barrier. For Tactic Comprehensiveness, the AI coach successfully delivered all instructed primary tactics in 70% of conversations (21/30) labeled by expert 1 and 90% of conversations (27/30) labeled by expert 2. The inter-rater reliability for both dimensions was high, with agreement percentages of 80%, indicating strong consistency between the experts&#x2019; assessments. In addition, the agentic workflow received high scores across the remaining dimensions. For Tactic Personalization, the AI coach demonstrated a strong ability to tailor its coaching tactics to the individual&#x2019;s unique context, with average ratings of 4.38 and 4.79 on a 5-point Likert scale. 
Tactic Actionability was also rated highly, with average scores of 4.17 and 4.59, reflecting the clarity and feasibility of the steps recommended by the AI coach. Finally, Conversation Empathy received high marks, with average scores of 4.58 and 4.76, indicating that the AI coach was perceived as empathetic and supportive, effectively fostering a positive coaching relationship. While Expert 2 gave slightly higher ratings on average, the absolute difference in ratings for overlapping cases was minimal, reinforcing the reliability of the evaluations. These results highlight the AI coach&#x2019;s capacity to deliver personalized, actionable, and empathetic coaching aligned with core behavioral science principles.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Expert evaluation on simulated conversations.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Expert 1 average rating (n=30)</td><td align="left" valign="bottom">Expert 2 average rating<break/>(n=30)</td><td align="left" valign="bottom">Interrater reliability (%) and average absolute difference in rating</td></tr></thead><tbody><tr><td align="left" valign="top">Dimension (Yes=1; No=0)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Barrier identification accuracy</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.90</td><td align="left" valign="top">80<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tactic comprehensiveness</td><td align="left" valign="top">0.70</td><td align="left" valign="top">0.90</td><td align="left" valign="top">80<sup><xref 
ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">Dimension (5-point Likert), mean (SD)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tactic personalization</td><td align="left" valign="top">4.38 (0.94)</td><td align="left" valign="top">4.79 (0.49)</td><td align="left" valign="top">0.78<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tactic actionability</td><td align="left" valign="top">4.17 (1.10)</td><td align="left" valign="top">4.59 (0.63)</td><td align="left" valign="top">0.89<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Conversation empathy</td><td align="left" valign="top">4.58 (0.73)</td><td align="left" valign="top">4.76 (0.44)</td><td align="left" valign="top">0.56<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Values correspond to interrater reliability.</p></fn><fn id="table4fn2"><p><sup>b</sup>Values correspond to average absolute difference in rating.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3-3"><title>Comparative Study</title><p>As seen in <xref ref-type="table" rid="table5">Table 5</xref>, the results of the comparative study demonstrated a preference for the behavioral science-informed workflow. The GPT-4o auto-evaluator preferred the conversations generated by the multi-agent workflow in 102 out of 153 (66.7%) cases, compared to 51 out of 153 (33.3%) cases for the single-agent base model. 
The blinded human expert evaluation of a subset of these conversations produced results consistent with the auto-evaluator, with both experts showing preference for the multi-agent workflow (in 66.7%-73.3% of cases) as compared with the single-agent base model (26.7%-33.3% of cases). To ensure a fair comparison, we report the conversation lengths, with average character counts of 3825 for the multi-agent workflow and 3904 for the base model. In addition to alternating the order presented to the auto-evaluator, the mitigation of length bias was important for ensuring that the auto-evaluator&#x2019;s preferences were based on the quality of behavioral science content rather than superficial factors.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Comparative evaluation on simulated conversations with behavioral science agentic workflow and base Gemini.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2"/><td align="left" valign="bottom" colspan="2">No. of conversations preferred, n (%)</td><td align="left" valign="bottom" rowspan="2">Conversation character 
length<break/>(N=153), mean (SD)</td></tr><tr><td align="left" valign="bottom">Autoevaluation (N=153)</td><td align="left" valign="bottom">Human expert review (N=30)</td></tr></thead><tbody><tr><td align="left" valign="top">Behavioral science agentic workflow</td><td align="left" valign="top">102 (66.7)</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Expert 1: 22 (73.3)</p></list-item><list-item><p>Expert 2: 20 (66.7)</p></list-item></list></td><td align="left" valign="top">3825 (1678)</td></tr><tr><td align="left" valign="top">Base Gemini</td><td align="left" valign="top">51 (33.3)</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Expert 1: 8 (26.7)</p></list-item><list-item><p>Expert 2: 10 (33.3)</p></list-item></list></td><td align="left" valign="top">3904 (2056)</td></tr></tbody></table></table-wrap><p>Anecdotally, our review of the conversations revealed that the base model frequently provided more generic advice, often suggesting alternative goals or broad solutions rather than leveraging nuanced behavioral science tactics tailored to overcome barriers to the patient&#x2019;s original nutrition goals. In contrast, the multi-agent workflow generated more specific and contextually relevant strategies, highlighting the benefits of structured behavioral science integration in digital coaching conversations.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Overall Learnings</title><p>Our findings offer important implications for the integration of behavioral science principles into multi-agent AI coaching workflows, particularly for managing cardiometabolic conditions. The results demonstrate that our novel workflow, which utilized comprehensive barrier identification and strategy mapping, has the potential to significantly enhance the effectiveness of digital coaching interventions. 
The real-world user study confirmed that the workflow provides relevant, personalized support that resonates with users, fostering trust and engagement. Additionally, the introduction of structured patient simulators allowed us to systematically evaluate AI&#x2019;s performance across diverse scenarios, providing a scalable method to refine and validate the system&#x2019;s approach. By moving beyond surface-level motivational techniques, our approach directly targeted the root causes of nutrition-related behaviors, offering personalized and tailored coaching experiences. The strong preference for our workflow, as evidenced by both human expert evaluations and auto-evaluation, underscores the potential of structured, behaviorally informed AI coaching systems to deliver more nuanced and relevant guidance. This study sets a precedent for the development of agentic workflows that can effectively adapt to individual patient contexts, fostering sustained engagement and positive health outcomes through a deep understanding of patient-specific barriers and strategies.</p></sec><sec id="s4-2"><title>System Design Implications Beyond This Research Work</title><p>The implications of our findings extend beyond the cardiometabolic nutrition domain, suggesting that personalized AI coaching systems have the potential to play a transformative role in the future of digital health interventions. By aligning coaching systems with individual patient needs, these AI models can improve patient engagement, which is often a major barrier in digital health tools. Our results reinforce the growing importance of integrating behavioral science into AI systems to ensure that the interventions proactively help patients navigate their health goals in a structured, effective way. 
This also opens the door for AI to contribute more meaningfully to other areas of health management, such as physical activity management, mental health support, and preventive care.</p></sec><sec id="s4-3"><title>Limitations and Future Research Opportunities</title><p>Our study has several limitations that highlight opportunities for future research. First, the sample size for both the user research study and human expert evaluation was relatively small, which may limit the generalizability of the findings. Future research should replicate these studies with larger, more diverse populations over time to confirm the effectiveness of the multi-agent LLM workflow in broader contexts. Future research could also refine barrier identification by introducing direct questioning by the agent to the user about whether they feel equipped to overcome their barriers, rather than relying on conversational inferences. In addition, although the use of auto-evaluation enables scalable assessment, there are known biases when using LLMs as judges despite our best efforts to mitigate them [<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]. Incorporating a diverse set of human evaluators could provide richer insights into the system&#x2019;s real-world applicability and user experience. Moreover, considering feedback mechanisms and information retrieval approaches will create additional opportunities to improve the workflow. For instance, user feedback loops can enable reinforcement learning techniques to improve model performance, and retrieval-augmented generation approaches can ensure that the agent&#x2019;s knowledge base is up to date and based on the latest evidence [<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]. 
This optimization research could focus on the long-term impact of AI-driven coaching, measuring (and testing strategies for improvement of) long-term adherence and durability of healthy nutrition habits.</p><p>Another layer of future research during real-world deployment could focus on the need for ongoing oversight to ensure safety, efficacy, and ethical compliance, with the goal of determining whether (or to what extent) it would be possible for the system to function with minimal supervision [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. In this regard, studies that monitor specific use cases and potential risks, particularly the occurrence of hallucinations, and investigate mitigation strategies are warranted. Finally, real-world operational challenges related to scalability, integration into health care systems, and ensuring equitable access must be carefully evaluated to maximize the positive impact of AI-driven coaching on diverse populations [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref42">42</xref>].</p></sec><sec id="s4-4"><title>Conclusions</title><p>In conclusion, this study presents a novel multi-agent LLM workflow that leverages behavioral science principles to enhance digital coaching for nutrition management among individuals with cardiometabolic conditions. Our approach, validated through expert assessments, real-world user studies, and large-scale simulation-based evaluations, provides strong evidence for the effectiveness of AI-driven, personalized coaching systems that go beyond generic advice to offer tailored, actionable, and empathetic guidance. 
Although further research is needed to validate these findings across larger and more diverse populations, our results pave the way for the development of scalable, behaviorally informed AI systems that can support meaningful and sustained health behavior change, addressing both current limitations and future possibilities in digital health interventions.</p></sec></sec></body><back><ack><p>This study was sponsored by Verily Life Sciences. Verily Life Sciences was responsible for data collection. Authors were fully responsible for the data analysis and interpretation presented herein and the writing of this article. EY, TG, HGW, BK, MR, ER, and JA had access to the raw data. The authors had access to the full dataset for the study and reviewed and approved the final manuscript for submission.</p><p>All authors report employment and equity ownership in Verily Life Sciences.</p></ack><notes><sec><title>Data Availability</title><p>The data supporting this study are not accessible for sharing.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: CC, EY, HGW, TG, YJ</p><p>Investigation: CC, EY, HGW, TG, YM</p><p>Data curation: CC, EY, HGW, TG, YM</p><p>Formal analysis: BK, ER, EY, HGW, JA, MR, TG</p><p>Supervision: YJ</p><p>Writing - original draft: EY</p><p>Writing - review and editing: BK, CC, ER, EY, HGW, JA, MR, TG, YM, YJ</p></fn><fn fn-type="conflict"><p>All authors report employment and equity ownership in Verily Life Sciences.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BCT</term><def><p>Behavioral Change Taxonomy</p></def></def-item><def-item><term id="abb3">COM-B</term><def><p>capability-opportunity-motivation-behavior</p></def></def-item><def-item><term id="abb4">EAST</term><def><p>Easy, Attractive, Social, and Timely</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language 
models</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Becker</surname><given-names>T</given-names> </name><name name-style="western"><surname>Majmundar</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>KM</given-names> </name></person-group><source>High and Rising Mortality Rates among Working Age Adults</source><publisher-name>National Academies of Sciences</publisher-name><pub-id pub-id-type="doi">10.17226/25976</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mozaffarian</surname><given-names>D</given-names> </name><name name-style="western"><surname>Aspry</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Garfield</surname><given-names>K</given-names> </name><etal/></person-group><article-title>&#x201C;Food is medicine&#x201D; strategies for nutrition security and cardiometabolic health equity: JACC state-of-the-art review</article-title><source>J Am Coll Cardiol</source><year>2024</year><month>02</month><day>27</day><volume>83</volume><issue>8</issue><fpage>843</fpage><lpage>864</lpage><pub-id pub-id-type="doi">10.1016/j.jacc.2023.12.023</pub-id><pub-id pub-id-type="medline">38383100</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bermingham</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Linenberg</surname><given-names>I</given-names> </name><name name-style="western"><surname>Polidori</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Effects of a personalized nutrition program on 
cardiometabolic health: a randomized controlled trial</article-title><source>Nat Med</source><year>2024</year><month>07</month><volume>30</volume><issue>7</issue><fpage>1888</fpage><lpage>1897</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-02951-6</pub-id><pub-id pub-id-type="medline">38714898</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Majithia</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Kusiak</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Armento Lee</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Glycemic outcomes in adults with type 2 diabetes participating in a continuous glucose monitor-driven virtual diabetes clinic: prospective trial</article-title><source>J Med Internet Res</source><year>2020</year><month>08</month><day>28</day><volume>22</volume><issue>8</issue><fpage>e21778</fpage><pub-id pub-id-type="doi">10.2196/21778</pub-id><pub-id pub-id-type="medline">32856597</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azelton</surname><given-names>KR</given-names> </name><name name-style="western"><surname>Crowley</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Vence</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Digital health coaching for type 2 diabetes: randomized controlled trial of healthy at home</article-title><source>Front Digit Health</source><year>2021</year><volume>3</volume><fpage>764735</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2021.764735</pub-id><pub-id pub-id-type="medline">34901926</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Lim</surname><given-names>WX</given-names> </name><name name-style="western"><surname>Fook-Chong</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Gan</surname><given-names>WH</given-names> </name></person-group><article-title>The outcomes of app-based health coaching to improve dietary behavior among nurses in a tertiary hospital: pilot intervention study</article-title><source>JMIR Nurs</source><year>2022</year><month>07</month><day>15</day><volume>5</volume><issue>1</issue><fpage>e36811</fpage><pub-id pub-id-type="doi">10.2196/36811</pub-id><pub-id pub-id-type="medline">35838811</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Palepu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schaekermann</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Towards conversational diagnostic AI</article-title><source>ArXiv</source><comment>Preprint posted online on Jan 11, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.05654</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A study of generative large language model for medical research and healthcare</article-title><source>NPJ Digit Med</source><year>2023</year><volume>6</volume><issue>1</issue><fpage>210</fpage><pub-id pub-id-type="medline">37973919</pub-id><pub-id 
pub-id-type="doi">10.1038/s41746-023-00958-w</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Berry</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Chwyl</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>G</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Forman</surname><given-names>EM</given-names> </name></person-group><article-title>Comparing large language model AI and human-generated coaching messages for behavioral weight loss</article-title><source>J Technol Behav Sci</source><year>2025</year><pub-id pub-id-type="doi">10.1007/s41347-025-00491-5</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ong</surname><given-names>QC</given-names> </name><name 
name-style="western"><surname>Ang</surname><given-names>CS</given-names> </name><name name-style="western"><surname>Chee</surname><given-names>DZY</given-names> </name><etal/></person-group><article-title>Advancing health coaching: a comparative study of large language model and health coaches</article-title><source>Artif Intell Med</source><year>2024</year><month>11</month><volume>157</volume><fpage>103004</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2024.103004</pub-id><pub-id pub-id-type="medline">39454500</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Szymanski</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wimer</surname><given-names>BL</given-names> </name><name name-style="western"><surname>Anuyah</surname><given-names>O</given-names> </name><name name-style="western"><surname>Eicher-Miller</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Metoyer</surname><given-names>RA</given-names> </name></person-group><article-title>Integrating expertise in LLMs: crafting a customized nutrition assistant with refined template instructions</article-title><year>2024</year><month>05</month><day>11</day><conf-name>CHI &#x2019;24</conf-name><conf-loc>Honolulu HI USA</conf-loc><fpage>1</fpage><lpage>22</lpage><pub-id pub-id-type="doi">10.1145/3613904.3641924</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lan</surname><given-names>W</given-names> </name><etal/></person-group><article-title>An AI dietitian for type 2 diabetes mellitus management based on large language and image 
recognition models: preclinical concept validation study</article-title><source>J Med Internet Res</source><year>2023</year><month>11</month><day>9</day><volume>25</volume><fpage>e51300</fpage><pub-id pub-id-type="doi">10.2196/51300</pub-id><pub-id pub-id-type="medline">37943581</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Michie</surname><given-names>S</given-names> </name><name name-style="western"><surname>van Stralen</surname><given-names>MM</given-names> </name><name name-style="western"><surname>West</surname><given-names>R</given-names> </name></person-group><article-title>The behaviour change wheel: a new method for characterising and designing behaviour change interventions</article-title><source>Implement Sci</source><year>2011</year><month>04</month><day>23</day><volume>6</volume><fpage>42</fpage><pub-id pub-id-type="doi">10.1186/1748-5908-6-42</pub-id><pub-id pub-id-type="medline">21513547</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Michie</surname><given-names>S</given-names> </name><name name-style="western"><surname>Richardson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Johnston</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The behavior change technique taxonomy (v1) of 93 hierarchically clustered techniques: building an international consensus for the reporting of behavior change interventions</article-title><source>Ann Behav Med</source><year>2013</year><month>08</month><volume>46</volume><issue>1</issue><fpage>81</fpage><lpage>95</lpage><pub-id pub-id-type="doi">10.1007/s12160-013-9486-6</pub-id><pub-id pub-id-type="medline">23512568</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation 
citation-type="web"><article-title>BIT</article-title><source>EAST: Four simple ways to apply behavioural insights</source><year>2014</year><access-date>2024-08-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.bi.team/publications/east-four-simple-ways-to-apply-behavioural-insights">https://www.bi.team/publications/east-four-simple-ways-to-apply-behavioural-insights</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopitar</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bedra&#x010D;</surname><given-names>L</given-names> </name><name name-style="western"><surname>Strath</surname><given-names>LJ</given-names> </name><name name-style="western"><surname>Bian</surname><given-names>J</given-names> </name><name name-style="western"><surname>Stiglic</surname><given-names>G</given-names> </name></person-group><article-title>Improving personalized meal planning with large language models: identifying and decomposing compound ingredients</article-title><source>Nutrients</source><year>2025</year><month>04</month><day>29</day><volume>17</volume><issue>9</issue><fpage>1492</fpage><pub-id pub-id-type="doi">10.3390/nu17091492</pub-id><pub-id pub-id-type="medline">40362801</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Khamesian</surname><given-names>S</given-names> </name><name name-style="western"><surname>Arefeen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Carpenter</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Ghasemzadeh</surname><given-names>H</given-names> </name></person-group><article-title>NutriGen: personalized meal plan generator leveraging large language models to enhance dietary 
and nutritional adherence</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 28, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.20601</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Khatibi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Nagesh</surname><given-names>N</given-names> </name><etal/></person-group><article-title>ChatDiet: empowering personalized nutrition-oriented food recommender chatbots through an LLM-augmented framework</article-title><source>Smart Health (2014)</source><year>2024</year><month>06</month><volume>32</volume><fpage>100465</fpage><pub-id pub-id-type="doi">10.1016/j.smhl.2024.100465</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>D</given-names> </name></person-group><article-title>Purrfessor: a fine-tuned multimodal llava diet health chatbot</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 22, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2411.14925</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>J&#x00F6;rke</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Sapkota</surname><given-names>S</given-names> </name><name name-style="western"><surname>Warkenthien</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Supporting physical activity behavior change with LLM-based conversational agents</article-title><source>ArXiv</source><comment>Preprint posted online on  May 9, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.06061</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hegde</surname><given-names>N</given-names> </name><name name-style="western"><surname>Vardhan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Nathani</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Infusing behavior science into large language models for activity coaching</article-title><source>PLOS Digit Health</source><year>2024</year><month>04</month><volume>3</volume><issue>4</issue><fpage>e0000431</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000431</pub-id><pub-id pub-id-type="medline">38564502</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Meywirth</surname><given-names>S</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Mandviwalla</surname><given-names>M</given-names> </name><name name-style="western"><surname>S&#x00F6;llner</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tuunanen</surname><given-names>T</given-names> </name></person-group><article-title>Designing a large language model-based coaching intervention for lifestyle behavior change</article-title><source>Lecture Notes in Computer 
Science</source><year>2024</year><volume>14621</volume><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-3-031-61175-9_6</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yoo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bernuy</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Large language model agents for improving engagement with behavior change interventions: application to digital mindfulness</article-title><source>ArXiv</source><comment>Preprint posted online on  Jul 3, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.13067</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dao</surname><given-names>D</given-names> </name><name name-style="western"><surname>Teo</surname><given-names>JYC</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>HD</given-names> </name></person-group><article-title>LLM-powered multimodal AI conversations for diabetes prevention</article-title><year>2024</year><month>06</month><day>10</day><conf-name>ICMR &#x2019;24</conf-name><conf-loc>Phuket Thailand</conf-loc><fpage>1</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.1145/3643479.3662049</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bak</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chin</surname><given-names>J</given-names> 
</name></person-group><article-title>The potential and limitations of large language models in identification of the states of motivations for facilitating health behavior change</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>2047</fpage><lpage>2053</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae057</pub-id><pub-id pub-id-type="medline">38527272</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Team</surname><given-names>G</given-names> </name><name name-style="western"><surname>Anil</surname><given-names>R</given-names> </name><name name-style="western"><surname>Borgeaud</surname><given-names>S</given-names> </name><name name-style="western"><surname>Alayrac</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Soricut</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Gemini: a family of highly capable multimodal models</article-title><source>ArXiv</source><comment>Preprint posted online on  Dec 19, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2312.11805</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pignatiello</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Hickman</surname><given-names>RL</given-names>  <suffix>Jr</suffix></name></person-group><article-title>Decision fatigue: a conceptual analysis</article-title><source>J Health 
Psychol</source><year>2020</year><month>01</month><volume>25</volume><issue>1</issue><fpage>123</fpage><lpage>135</lpage><pub-id pub-id-type="doi">10.1177/1359105318763510</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moorhouse</surname><given-names>A</given-names> </name></person-group><article-title>Decision fatigue: less is more when making choices with patients</article-title><source>Br J Gen Pract</source><year>2020</year><month>08</month><volume>70</volume><issue>697</issue><fpage>399</fpage><pub-id pub-id-type="doi">10.3399/bjgp20X711989</pub-id><pub-id pub-id-type="medline">32732206</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chakraborty</surname><given-names>A</given-names> </name></person-group><article-title>Present bias</article-title><source>Econometrica</source><year>2021</year><volume>89</volume><issue>4</issue><fpage>1921</fpage><lpage>1961</lpage><pub-id pub-id-type="doi">10.3982/ECTA16467</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hershfield</surname><given-names>HE</given-names> </name><name name-style="western"><surname>Goldstein</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Sharpe</surname><given-names>WF</given-names> </name><etal/></person-group><article-title>Increasing saving behavior through age-progressed renderings of the future self</article-title><source>J Mark Res</source><year>2011</year><month>11</month><volume>48</volume><fpage>S23</fpage><lpage>S37</lpage><pub-id pub-id-type="doi">10.1509/jmkr.48.SPL.S23</pub-id><pub-id pub-id-type="medline">24634544</pub-id></nlm-citation></ref><ref 
id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Hello GPT-4o</article-title><source>OpenAI</source><year>2024</year><access-date>2024-08-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/hello-gpt-4o">https://openai.com/index/hello-gpt-4o</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Large language models are not fair evaluators</article-title><source>ArXiv</source><comment>Preprint posted online on  May 29, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.17926</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>H</given-names> </name><name name-style="western"><surname>He</surname><given-names>S</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Han</surname><given-names>M</given-names> </name></person-group><article-title>Systematic evaluation of LLM-as-a-judge in LLM alignment tasks: explainable metrics and diverse prompt templates</article-title><source>ArXiv</source><comment>Preprint posted online on  Aug 23, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2408.13006</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><name name-style="western"><surname>Thakur</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Choudhary</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ramayapally</surname><given-names>VS</given-names> </name><name name-style="western"><surname>Vaidyanathan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hupkes</surname><given-names>D</given-names> </name></person-group><article-title>Judging the judges: evaluating alignment and vulnerabilities in LLMs-as-judges</article-title><source>ArXiv</source><comment>Preprint posted online on  Jun 18, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.12624</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zack</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lehman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Suzgun</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Assessing the potential of GPT-4 to perpetuate racial and gender biases in health care: a model evaluation study</article-title><source>Lancet Digit Health</source><year>2024</year><month>01</month><volume>6</volume><issue>1</issue><fpage>e12</fpage><lpage>e22</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00225-X</pub-id><pub-id pub-id-type="medline">38123252</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>The revolution of generative artificial intelligence in psychology: the interweaving of behavior, consciousness, and ethics</article-title><source>Acta Psychol (Amst)</source><year>2024</year><month>11</month><volume>251</volume><fpage>104593</fpage><pub-id pub-id-type="doi">10.1016/j.actpsy.2024.104593</pub-id><pub-id pub-id-type="medline">39522296</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>E</given-names> </name><name name-style="western"><surname>Amar</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>B</given-names> </name><name name-style="western"><surname>Jia</surname><given-names>Y</given-names> </name></person-group><article-title>The geometry of queries: query-based innovations in retrieval-augmented generation</article-title><source>ArXiv</source><comment>Preprint posted online on  Jul 25, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.18044</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anisuzzaman</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Malins</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Friedman</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Attia</surname><given-names>ZI</given-names> </name></person-group><article-title>Fine-tuning large language models for specialized use cases</article-title><source>Mayo Clin Proc Digit 
Health</source><year>2025</year><month>03</month><volume>3</volume><issue>1</issue><fpage>100184</fpage><pub-id pub-id-type="doi">10.1016/j.mcpdig.2024.11.005</pub-id><pub-id pub-id-type="medline">40206998</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pfohl</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Cole-Lewis</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sayres</surname><given-names>R</given-names> </name><etal/></person-group><article-title>A toolbox for surfacing health equity harms and biases in large language models</article-title><source>Nat Med</source><year>2024</year><month>12</month><volume>30</volume><issue>12</issue><fpage>3590</fpage><lpage>3600</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03258-2</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayoub</surname><given-names>NF</given-names> </name><name name-style="western"><surname>Balakrishnan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ayoub</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Barrett</surname><given-names>TF</given-names> </name><name name-style="western"><surname>David</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Gray</surname><given-names>ST</given-names> </name></person-group><article-title>Inherent bias in large language models: a random sampling analysis</article-title><source>Mayo Clin Proc Digit Health</source><year>2024</year><month>06</month><volume>2</volume><issue>2</issue><fpage>186</fpage><lpage>191</lpage><pub-id pub-id-type="doi">10.1016/j.mcpdig.2024.03.003</pub-id><pub-id 
pub-id-type="medline">40207170</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Freyer</surname><given-names>O</given-names> </name><name name-style="western"><surname>Wiest</surname><given-names>IC</given-names> </name><name name-style="western"><surname>Kather</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>S</given-names> </name></person-group><article-title>A future role for health applications of large language models depends on regulators enforcing safety standards</article-title><source>Lancet Digit Health</source><year>2024</year><month>09</month><volume>6</volume><issue>9</issue><fpage>e662</fpage><lpage>e672</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00124-9</pub-id><pub-id pub-id-type="medline">39179311</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1 </label><p>Supplementary methods and results.</p><media xlink:href="formative_v9i1e75421_app1.docx" xlink:title="DOCX File, 30 KB"/></supplementary-material></app-group></back></article>