<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v8i1e63262</article-id><article-id pub-id-type="doi">10.2196/63262</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Moving Toward Meaningful Evaluations of Monitoring in e-Mental Health Based on the Case of a Web-Based Grief Service for Older Mourners: Mixed Methods Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Brandl</surname><given-names>Lena</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jansen-Kosterink</surname><given-names>Stephanie</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Brodbeck</surname><given-names>Jeannette</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" 
rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jacinto</surname><given-names>Sofia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mooser</surname><given-names>Bettina</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Heylen</surname><given-names>Dirk</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Human Media Interaction group, University of Twente</institution>, <addr-line>Drienerlolaan 5</addr-line>, <addr-line>Enschede</addr-line>, <country>Netherlands</country></aff><aff id="aff2"><institution>Roessingh Research and Development</institution>, <addr-line>Enschede</addr-line>, <country>Netherlands</country></aff><aff id="aff3"><institution>Biomedical Signals and Systems, University of Twente</institution>, <addr-line>Enschede</addr-line>, <country>Netherlands</country></aff><aff id="aff4"><institution>Institute for Psychology, University of Bern</institution>, <addr-line>Bern</addr-line>, <country>Switzerland</country></aff><aff id="aff5"><institution>School of Social Work, University of Applied Sciences and Arts Northwestern Switzerland</institution>, <addr-line>Olten</addr-line>, <country>Switzerland</country></aff><aff id="aff6"><institution>Centro de Investiga&#x00E7;&#x00E3;o e Interven&#x00E7;&#x00E3;o Social, Instituto Universit&#x00E1;rio de Lisboa</institution>, <addr-line>Lisboa</addr-line>, <country>Portugal</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib 
contrib-type="reviewer"><name name-style="western"><surname>Wu</surname><given-names>Chaoyi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Sakal</surname><given-names>Collin</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Lena Brandl, MSc, Human Media Interaction group, University of Twente, Drienerlolaan 5, Enschede, 7522NB, Netherlands, 31 534893740; <email>l.brandl@utwente.nl</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>28</day><month>11</month><year>2024</year></pub-date><volume>8</volume><elocation-id>e63262</elocation-id><history><date date-type="received"><day>15</day><month>06</month><year>2024</year></date><date date-type="rev-recd"><day>06</day><month>10</month><year>2024</year></date><date date-type="accepted"><day>10</day><month>10</month><year>2024</year></date></history><copyright-statement>&#x00A9; Lena Brandl, Stephanie Jansen-Kosterink, Jeannette Brodbeck, Sofia Jacinto, Bettina Mooser, Dirk Heylen. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 28.11.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2024/1/e63262"/><abstract><sec><title>Background</title><p>Artificial intelligence (AI) tools hold much promise for mental health care by increasing the scalability and accessibility of care. However, current development and evaluation practices of AI tools limit their meaningfulness for health care contexts and therefore also the practical usefulness of such tools for professionals and clients alike.</p></sec><sec><title>Objective</title><p>The aim of this study is to demonstrate the evaluation of an AI monitoring tool that detects the need for more intensive care in a web-based grief intervention for older mourners who have lost their spouse, with the goal of moving toward meaningful evaluation of AI tools in e-mental health.</p></sec><sec sec-type="methods"><title>Methods</title><p>We leveraged the insights from 3 evaluation approaches: (1) the <italic>F</italic><sub>1</sub>-score evaluated the tool&#x2019;s capacity to classify user monitoring parameters as either in need of more intensive support or recommendable to continue using the web-based grief intervention as is; (2) we used linear regression to assess the predictive value of users&#x2019; monitoring parameters for clinical changes in grief, depression, and loneliness over the course of a 10-week intervention; and (3) we collected qualitative experience data from e-coaches (N=4) who incorporated the monitoring in their weekly email guidance during the 10-week intervention.</p></sec><sec sec-type="results"><title>Results</title><p>Based on n=174 binary recommendation decisions, the <italic>F</italic><sub>1</sub>-score of the monitoring tool was 0.91. 
Due to minimal change in depression and loneliness scores after the 10-week intervention, only 1 linear regression was conducted. The difference score in grief before and after the intervention was included as a dependent variable. Participants&#x2019; (N=21) mean score on the self-report monitoring and the estimated slope of individually fitted growth curves and its standard error (ie, participants&#x2019; response pattern to the monitoring questions) were used as predictors. Only the mean monitoring score exhibited predictive value for the observed change in grief (<italic>b</italic>=1.19, SE 0.33; <italic>t</italic><sub>16</sub>=3.58, <italic>P</italic>=.002). The e-coaches appreciated the monitoring tool as an opportunity to confirm their initial impression about intervention participants, personalize their email guidance, and detect when participants&#x2019; mental health deteriorated during the intervention.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The monitoring tool evaluated in this paper identified a need for more intensive support reasonably well in a nonclinical sample of older mourners, had some predictive value for the change in grief symptoms during a 10-week intervention, and was appreciated as an additional source of mental health information by e-coaches who supported mourners during the intervention. Each evaluation approach in this paper came with its own set of limitations, including (1) skewed class distributions in prediction tasks based on real-life health data and (2) choosing meaningful statistical analyses based on clinical trial designs that are not targeted at evaluating AI tools. 
However, combining multiple evaluation methods facilitates drawing meaningful conclusions about the clinical value of AI monitoring tools for their intended mental health context.</p></sec></abstract><kwd-group><kwd>e-mental health</kwd><kwd>digital mental health service</kwd><kwd>mental health</kwd><kwd>digital health</kwd><kwd>internet intervention</kwd><kwd>monitoring mental health</kwd><kwd>monitor</kwd><kwd>e-coach</kwd><kwd>coaching</kwd><kwd>grieve</kwd><kwd>mourn</kwd><kwd>old</kwd><kwd>affective states</kwd><kwd>artificial intelligence</kwd><kwd>predictive</kwd><kwd>repeatedly measured predictors in regression</kwd><kwd>fuzzy cognitive map</kwd><kwd>algorithm</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Artificial intelligence (AI) tools hold much promise for mental health care by increasing the scalability and accessibility of care [<xref ref-type="bibr" rid="ref1">1</xref>]. They have the potential to identify warning signs of serious mental health problems earlier than current mental health care systems allow and deliver timely (digital) mental care, potentially preventing the full onset of mental health disorders or limiting the severity with which they impair people&#x2019;s lives [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. For example, Sakal et al [<xref ref-type="bibr" rid="ref3">3</xref>] described the development and evaluation of an AI-based screening tool for geriatric depression in Chinese older adults. Taking into account cultural response biases to traditional depression screening tools, the tool focused on less emotionally sensitive demographic and quality of life predictors such as health status compared to 3 years ago, hearing status, income, and average hours of sleep per night in the previous month. 
The tool was found to perform well during validation and the authors explained the importance of the nonsensitive nature of the questions used by the screening tool for early detection of geriatric depression in the Chinese aging population. The tool represents a means for Chinese public health officials to fight the growing mental health treatment gap in the country. Likewise, Zhang et al [<xref ref-type="bibr" rid="ref4">4</xref>] leveraged AI to extensively analyze behavior-related and physiological risk factors for suicide in middle- and older-aged individuals who participated in the UK Biobank population-based cohort that was recruited between 2006 and 2010. The use of AI and advanced statistical tools enabled the authors to systematically identify and rank 246 behavior-related and 200 physiological factors and identify 58 robust predictors for suicide risk. The authors explained that the gained insights unravel new potential avenues for targeted suicide prevention.</p><p>Despite such promising examples of how AI tools can contribute to increasing the scalability, accessibility, and effectiveness of mental health care, AI tools are still considered to be in a proof-of-concept stage rather than having a clinical impact on mental health care [<xref ref-type="bibr" rid="ref5">5</xref>]. Tornero-Costa et al [<xref ref-type="bibr" rid="ref6">6</xref>] described a mismatch between clinical trial designs that are common in mental health care and desired data qualities for AI development, which are often difficult to reconcile in terms of time, money, and human resources. AI tools and clinical trials have fundamentally divergent data sampling considerations, specifically concerning exclusion criteria that are common in clinical trials to limit the influence of confounders on tested clinical outcomes, or due to safety considerations. 
However, given large enough sample sizes, confounders improve the generalizability of AI models, which makes them a necessary element in any representative dataset. In mental health care, AI tools are currently often developed in retrospection as secondary outcomes of clinical trials and are based on clinical data collected for purposes other than model development [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>In addition, current AI model engineering approaches for mental health are criticized for their focus on perfecting model performance without providing practical clinical value [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Whiting and Fazel [<xref ref-type="bibr" rid="ref7">7</xref>] explained in their recent clinical meta review on the accuracy of prediction models for detecting suicide risk that only a few models are developed with independent clinical validation or piloting in mind. Model developers tend to neglect the clinical meaning of the association between predictors and model outcomes and are not transparent regarding the decision-making process leading to the selection of model parameters [<xref ref-type="bibr" rid="ref6">6</xref>]. Furthermore, current practices favor model evaluation metrics such as predictive accuracy without explaining how they are linked to a clinical decision. In the specific context of suicide risk detection, the authors advocate that prediction models should be compared to unstructured clinical assessments of suicide risk to investigate the incremental benefit of these tools in supporting clinician decision-making. Ultimately, suicide prediction is challenging for both data-driven prediction models and clinical practitioners, as is any mental health prediction task. 
To build AI tools in mental health care with a clinical impact, we need to start developing and evaluating models whose outcomes can be clearly linked to clinical decision-making and whose roles in clinical practice are well-defined.</p><p>In this paper, we evaluate a mental health monitoring tool in an e-mental health service for older mourners by combining the insights from 3 evaluation approaches. We encountered some challenges that are common in AI evaluation studies and showcase how these affect the clinical meaningfulness of our obtained results. We exemplify the need for AI tools in mental health care to go beyond classical AI evaluation metrics and statistical approaches in clinical research to have an impact. The next section briefly introduces the monitoring tool and the e-mental health service in which it is embedded before describing our evaluation approach in more detail. The e-mental health service for which the monitoring tool was developed supports older mourners in processing the loss of their spouse. We conclude with a discussion of the encountered evaluation challenges and some suggestions on how to move the development of impactful AI tools in mental health care forward.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Background: The Monitoring Tool</title><p>The monitoring tool that we evaluated in this study is implemented in a web-based grief service for older mourners who have lost their spouse. The grief service consists of 10 content modules (eg, unraveling myths and truths about grief) and exercises and activity suggestions that help the mourner process the loss and foster positive mental and physical well-being (eg, writing a farewell letter to the deceased spouse, reconnecting with one&#x2019;s hobbies) [<xref ref-type="bibr" rid="ref9">9</xref>]. 
The monitoring tool complements the service with a biweekly mental health self-check and by analyzing whether it is advisable for the user to seek offline (professional) support. It has 2 components: a mental health user profile and a decision-making component. The mental health user profile consists of 2 self-report questionnaires, an initial risk assessment (IRA) and a continuous risk assessment (CRA). The IRA represents an initial assessment of the user&#x2019;s affective state and grief symptoms when they start using the web-based grief service and controls for risk factors such as whether the loss has been violent (eg, their partner committed suicide). The CRA assesses the extent to which the mourner experiences psychological suffering. The decision-making component consists of a set of rules that determines whether the user exceeds a suicidal threshold; it also includes a fuzzy cognitive map (FCM) decision algorithm. It arrives at the decision to display either a recommendation to seek offline support or an encouragement to continue using the grief service as is. Filling in the CRA is optional; the grief program can be used without it. The development of the monitoring tool&#x2014;including its parameter selection, the construction of the 2 monitoring questionnaires (IRA and CRA), and an initial error analysis based on fictitious scenarios&#x2014;is described in detail in Brandl et al [<xref ref-type="bibr" rid="ref10">10</xref>].</p></sec><sec id="s2-2"><title>Evaluation Context: Randomized Controlled Trial and e-Coach Focus Group</title><p>The current evaluation of the mental health monitoring tool is based on an ongoing randomized controlled trial (RCT) that started in March 2022 in Switzerland [<xref ref-type="bibr" rid="ref9">9</xref>]. 
The primary aim of the RCT is to investigate the clinical efficacy of the previously described web-based grief service, while the secondary aim is to examine which delivery format of the web-based grief service (standardized vs self-tailored) is associated with better clinical outcomes. At the time of writing this paper, the RCT is ongoing until the needed sample size to test the 2 delivery formats of the service is achieved. Our evaluation approach uses the data of older mourners (&#x2265;60 years old) who participated in the RCT. Participants were recruited from the general population and had experienced the loss of their partner at least 1 month before the RCT. A more extensive list of inclusion and exclusion criteria can be found in the dedicated study protocol of the RCT [<xref ref-type="bibr" rid="ref9">9</xref>]. During the RCT, 4 e-coaches provided guidance in the form of a weekly email with short, personalized feedback and support. The e-coaches were encouraged to include participants&#x2019; self-reported mood and therapeutic progress and the outcome of the monitoring tool in the guidance that they provided.</p></sec><sec id="s2-3"><title>Evaluation Approach</title><sec id="s2-3-1"><title>Overview</title><p>To evaluate the monitoring tool, we did the following: (1) assessed the classification performance of the monitoring decision algorithm using the <italic>F</italic><sub>1</sub>-metric; (2) investigated the predictive value of participants&#x2019; monitoring responses for their clinical change in grief, depression, and loneliness after the 10-week RCT; and (3) collected qualitative user experience data to explore the tool&#x2019;s suitability for clinical practice from trained e-coaches who used the monitoring for their work during the RCT. 
For the first step, the classification performance was assessed using ground truth classification labels provided by the e-coaches for the tool&#x2019;s binary outcome (recommendation to seek support vs encouragement to continue using the grief service as is). The e-coaches determined the ground truth labels based on their professional assessment given the participant&#x2019;s progress in the e-mental health service, their biweekly monitoring responses, weekly email exchanges with the participant, and a clinical interview at the beginning of the RCT. The monitoring&#x2019;s suggested classification was visible to the e-coaches alongside participants&#x2019; raw monitoring responses to facilitate the e-coach&#x2019;s understanding of the classification. If the e-coach&#x2019;s assessment diverged from the outcome of the monitoring tool, they provided a brief textual explanation about their rationale. The ground truth labels for the monitoring predictions were provided by the e-coaches upon request at the time of conducting this analysis. For the second step, we explored the predictive value of the CRA by relating CRA scores to the difference in clinical measurements before and after the RCT. For the third step, we focused on the e-coaches&#x2019; experiences with the monitoring tool during the RCT. A web-based focus group was conducted in which the 4 e-coaches discussed how they used the monitoring tool in their role as e-coaches, how they experienced having the tool at their disposal, and how they think such a tool could be most useful for mourners who use the web-based grief service and for e-coaches such as themselves.</p></sec><sec id="s2-3-2"><title>Measures</title><p>The CRA is a multidimensional scale that measures hopelessness, grief symptoms, social isolation, and psychological crisis with 2 items each on a 4-point Likert scale. The items assess the frequency of emotional suffering in the past 2 weeks ranging from 0 (Not at all) to 3 (Every day). 
The CRA also measures therapeutic progress on a 4-point Likert scale ranging from 0 (Strongly disagree) to 3 (Strongly agree). The CRA serves as input for the FCM algorithm as part of the decision-making process in the monitoring model. Its development is described in more detail in Brandl et al [<xref ref-type="bibr" rid="ref10">10</xref>]. A copy of the scale is included in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. For this study, the CRA scores of RCT participants were retrieved from data logs of the grief program. The three clinical measures (grief, loneliness, depression) were assessed at three measurement moments during the RCT via a web-based surveying tool: (1) prior to starting the web-based grief program (<italic>t</italic><sub>0</sub>), (2) after completing the 10-week intervention (<italic>t</italic><sub>1</sub>), and (3) twenty weeks after starting the intervention program (<italic>t</italic><sub>2</sub>). For the current evaluation of the monitoring tool, we only take the first 2 measurement moments into account. The clinical measures include an assessment of the mourner&#x2019;s (1) grief symptoms using the Texas Revised Inventory of Grief [<xref ref-type="bibr" rid="ref11">11</xref>], (2) depressive symptoms using the Patient Health Questionnaire-9 [<xref ref-type="bibr" rid="ref12">12</xref>], and (3) loneliness via the de Jong Gierveld Loneliness Scale [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p></sec><sec id="s2-3-3"><title>Data Inclusion</title><p>For the evaluation of the classification performance, our first approach, we included any monitoring decision for which the e-coaches provided a ground truth label. 
For assessing the predictive value of the CRA for clinical change during the RCT, however, we only included participants who had (1) completed the 10-week intervention and (2) filled in the clinical measurements (depression, grief, loneliness) at baseline and 10 weeks after starting the intervention. We did not expect the delivery format of the grief program (self-tailored vs standardized) or the fact that participants in the waitlist control condition received access to the intervention only after 12 weeks to impact the decisions of the monitoring algorithm. Likewise, we did not expect the delivery format or the waitlist control condition to affect the relation between how participants filled in the CRA and the clinical outcomes. Therefore, we included participants from all arms of the RCT in this analysis. Specifically, we included CRA scores from RCT weeks 2-10. In week 2, the CRA was administered for the first time.</p></sec></sec><sec id="s2-4"><title>Ethical Considerations</title><p>The RCT based on which we evaluated the current AI monitoring tool of a web-based grief service received medical ethical approval from the Medical Ethical Committee of Northwestern and Central Switzerland (Business Administration System for Ethics Committees number 2021&#x2010;02221) and is registered at ClinicalTrials.gov (NCT05280041). All older mourners who participated in the RCT signed an informed consent form that was approved by the Medical Ethical Committee of Northwestern and Central Switzerland, allowing the secondary analysis of their monitoring data for the purpose of evaluating the web-based grief service with no further consent required. In addition, the e-coaches provided written informed consent prior to participation in the focus group. All analyses involving the data of RCT subjects were conducted on an anonymized dataset where each participant was represented by an arbitrary code that is not related to their identity. 
The e-coach focus group was recorded and automatically transcribed using Microsoft Teams. After checking the correctness of the automatic transcription, the recording was deleted, and the transcription was deidentified. All subsequent analyses were conducted using the deidentified focus group transcription. Participants did not receive (financial) compensation for participating in this research. We refer the reader to the dedicated RCT study protocol for more detailed information about its ethical review process [<xref ref-type="bibr" rid="ref9">9</xref>].</p></sec><sec id="s2-5"><title>Data Analyses</title><sec id="s2-5-1"><title>Analysis I: Classification Evaluation</title><p>Our data were sampled from a nonclinical population. Therefore, we expected few (true) help-seeking recommendations in the sample. This has implications for choosing an appropriate classification evaluation metric [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Regarding terminology, in the binary classification problem at hand (recommending to seek offline support versus recommending to continue using the service as is), we chose the less frequent class, recommending help-seeking, as the positive outcome class and recommending to continue using the service as the negative class, as recommended for imbalanced classification problems [<xref ref-type="bibr" rid="ref16">16</xref>]. The <italic>F</italic>-measure is used when there is no clear preference for either minimizing false positives (someone receives an unjustified recommendation to seek support) or false negatives (someone who needs support does not receive a recommendation to seek support) because both are regarded as equally important for determining the classifier&#x2019;s performance. 
The <italic>F</italic><sub>1</sub>-score is the harmonic mean between the true positive rate (recall) of a classifier and its precision:</p><disp-formula id="E4"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mspace width="thinmathspace"/><mml:mrow><mml:mo fence="true" stretchy="true" symmetric="true" maxsize="1.2em" minsize="1.2em">/</mml:mo></mml:mrow><mml:mspace width="thinmathspace"/><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">f</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">n</mml:mi><mml:mi 
mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">g</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mrow><mml:mo fence="true" stretchy="true" symmetric="true" maxsize="1.2em" minsize="1.2em">/</mml:mo></mml:mrow><mml:mspace width="thinmathspace"/><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi 
mathvariant="normal">s</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">f</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mspace width="thinmathspace"/><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x2217;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mo>&#x2217;</mml:mo><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mspace width="thinmathspace"/><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mspace 
width="thinmathspace"/><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:math></disp-formula><p>The <italic>F</italic><sub>1</sub>-score is bounded to the interval [0<italic>,</italic>1], where 1 represents maximum precision and recall and 0 represents zero precision and recall. All calculations were performed using Python 3.11 [<xref ref-type="bibr" rid="ref17">17</xref>].</p></sec><sec id="s2-5-2"><title>Analysis II: Predicting Clinical Change Using Monitoring Measurements</title><p>The reliability of the CRA questionnaire was assessed using Spearman-Brown <italic>&#x03C1;<sub>SP</sub></italic> coefficients for its five 2-item subscales: psychological crisis, hopelessness, grief symptoms, social isolation, and therapeutic progress [<xref ref-type="bibr" rid="ref18">18</xref>]. As pointed out by Tavakol and Dennick [<xref ref-type="bibr" rid="ref19">19</xref>], if a scale measures several constructs, it is recommended that reliability is assessed separately for each construct. Since each CRA construct is measured using 2 items, Spearman-Brown coefficients were deemed the most appropriate method for assessing reliability [<xref ref-type="bibr" rid="ref18">18</xref>]. 
To investigate the relation between the CRA and the clinical outcomes of the RCT, we conducted 3 linear regression analyses, with the difference in clinical outcomes before and after the grief intervention as the dependent variable and the parameters of an individually fitted linear growth curve (the estimate of the linear coefficient, ie, the slope of the linear curve, and its standard error) and mean CRA scores as independent variables, as suggested by Welten et al [<xref ref-type="bibr" rid="ref20">20</xref>] for repeatedly measured predictors. Analyses were performed using Python 3.11 and R (version 4.3.2; R Foundation for Statistical Computing) [<xref ref-type="bibr" rid="ref21">21</xref>].</p></sec><sec id="s2-5-3"><title>Analysis III: e-Coaches&#x2019; Experience With the Monitoring Tool</title><p>An inductive coding scheme was developed and applied in ATLAS.ti [<xref ref-type="bibr" rid="ref22">22</xref>] to the transcript of the e-coaches&#x2019; focus group about their experiences with the monitoring tool during the RCT. The coding scheme was developed and applied by one researcher and verified by a second researcher. Any discrepancies were discussed until agreement was reached. An exemplary code is <italic>monitoringExp</italic>, which summarizes experiences and thoughts that the e-coaches had about having the monitoring at their disposal.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Analysis I: Classification Evaluation</title><sec id="s3-1-1"><title>Participants</title><p>The data of 44 RCT participants were included in the assessment of the classification performance of the monitoring module. 
On average, these 44 participants filled in 4.02 (SD 2.2) of the 5 biweekly CRA questionnaires during the 10-week intervention, amounting to 174 monitoring decisions that were labeled by hand by the e-coaches.</p></sec><sec id="s3-1-2"><title>Confusion Matrix and <italic>F</italic><sub>1</sub>-score</title><p><xref ref-type="table" rid="table1">Table 1</xref> shows the confusion matrix for the monitoring algorithm&#x2019;s decision-making. Most labeled monitoring decisions (n=168) were true negatives, reflecting that detecting the need for professional intervention in a web-based grief service is an extremely imbalanced classification problem. Taking a closer look at the only false negative classification, the e-coach explains that they disagreed with not recommending additional support because the participant indicated an exacerbation of psychosomatic symptoms (eg, heart pounding) in their email exchange with the e-coach as well as a lack of future perspective. The FCM does not include psychosomatic symptoms in its decision-making, but it does include a measure of &#x201C;lack of future perspective&#x201D; (hopelessness). Three of the 5 true positives occurred in the initial monitoring assessment where additional risk factors are assessed, such as a recent inpatient treatment for a psychological condition. Only 2 of the 3 participants exhibited such risk factors; those were also named by the e-coach as reasons why they agreed with the recommendation to seek additional support. The remaining true positives represented moments of elevated emotional suffering as reflected by the participants&#x2019; monitoring responses in the CRA. 
The monitoring algorithm&#x2019;s <italic>F</italic><sub>1</sub>-score was 0.91.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Confusion matrix of the monitoring decision algorithm that either recommends help-seeking or to continue using the mental health service with no change.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Positive class (help-seeking recommendation)</td><td align="left" valign="bottom">Negative class (no help-seeking recommendation)</td></tr></thead><tbody><tr><td align="left" valign="top">True prediction</td><td align="left" valign="top">5</td><td align="left" valign="top">168</td></tr><tr><td align="left" valign="top">False prediction</td><td align="left" valign="top">0</td><td align="left" valign="top">1</td></tr></tbody></table></table-wrap></sec></sec><sec id="s3-2"><title>Analysis II: Predictive Value CRA for Clinical Change</title><sec id="s3-2-1"><title>Participants</title><p>Since we only included participants who had both completed the 10-week intervention and filled in the clinical measurements (depression, grief, loneliness) at baseline and 10 weeks after starting the intervention, 21 participants were included in the analysis that assesses the predictive value of the CRA. Participants&#x2019; mean age was 60.1 (SD 11.4) years. Of the 21 participants, 18 were female and 3 were male. Using Spearman-Brown coefficients <italic>&#x03C1;<sub>SP</sub></italic> to assess the reliability of the 5 CRA subscales resulted in <italic>&#x03C1;<sub>SP</sub></italic>=0.74 for the hopelessness subscale, <italic>&#x03C1;<sub>SP</sub></italic>=0.70 for the grief symptoms and therapeutic progress subscales, <italic>&#x03C1;<sub>SP</sub></italic>=0.66 for the psychological crisis construct, and <italic>&#x03C1;<sub>SP</sub></italic>=0.14 for the social isolation subscale. 
<italic>&#x03C1;<sub>SP</sub></italic> scores between 0.5 and 0.7 are considered fair and scores between 0.7 and 0.9 are considered good [<xref ref-type="bibr" rid="ref23">23</xref>]. Further investigation into the low Spearman-Brown coefficient <italic>&#x03C1;<sub>SP</sub></italic> for the social isolation construct revealed that the 2 items in the subscale correlated poorly (Pearson <italic>r</italic>=0.08).</p></sec><sec id="s3-2-2"><title>Linear Regression Analysis</title><p><xref ref-type="table" rid="table2">Table 2</xref> shows the descriptives of the dependent and independent variables in the regression analysis. Depression and loneliness measurements before the RCT (<italic>t</italic><sub>0</sub>) and after the RCT (<italic>t</italic><sub>1</sub>) differed little, making it difficult to reliably fit a model using either as the dependent variable. We therefore decided to only conduct 1 regression analysis, with the difference in grief scores before and after the RCT as the dependent variable. Not everyone filled in the CRA regularly, resulting in n=84 CRA measurements that were included in the analysis. <xref ref-type="fig" rid="figure1">Figure 1</xref> shows a subset of the fitted individual growth curves; the entire set is included in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. <xref ref-type="table" rid="table3">Table 3</xref> summarizes the results of the linear regression analysis with the difference in grief before and after the RCT as the dependent variable and individual CRA growth curves and CRA means as predictors. Overall, the regression model fit the observed data well (<italic>R</italic><sup>2</sup>=0.45; <italic>F</italic><sub>3,16</sub>=4.42, <italic>P</italic>=.019). 
Neither the slope of the individually fitted CRA curves (<italic>B</italic>=&#x2212;1.18, SE 2.21; <italic>t</italic><sub>16</sub>=&#x2212;0.54, <italic>P</italic>=.60) nor their standard error (<italic>B</italic>=&#x2212;4.45, SE 3.03; <italic>t</italic><sub>16</sub>=&#x2212;1.47, <italic>P</italic>=.16) predicted how grief symptoms changed during the RCT. CRA mean scores did have predictive value for how mourners&#x2019; grief scores changed during the intervention (<italic>B</italic>=1.19, SE 0.33; <italic>t</italic><sub>16</sub>=3.58, <italic>P</italic>=.002). We checked statistical assumptions visually, including normality and homoscedasticity of residuals, and found none to be violated.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Descriptives of the independent and dependent variables in the regression analysis.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Variable</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Median (min, max)</td><td align="left" valign="bottom">Scale range [reference]</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Continuous risk assessment total score (n=84)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">8.68 (4.16)</td><td align="left" valign="top">9.0 (1.0, 21.0)</td><td align="left" valign="top">0&#x2010;24 [<xref ref-type="bibr" rid="ref10">10</xref>]</td></tr><tr><td align="left" valign="top" colspan="5"><italic><bold>t</bold></italic><sub><bold>1</bold></sub> <bold>&#x2013;</bold> <italic><bold>t</bold></italic><sub><bold>0</bold></sub> <bold>difference (N=21)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Grief</td><td align="left" valign="top">&#x2212;2.9 (5.37)</td><td align="left" valign="top">&#x2212;2.0 (&#x2212;12.0, 6.0)</td><td align="left" 
valign="top">5-80 [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Depression</td><td align="left" valign="top">&#x2212;0.76 (2.83)</td><td align="left" valign="top">0.0 (&#x2212;7.0, 5.0)</td><td align="left" valign="top">0-27 [<xref ref-type="bibr" rid="ref12">12</xref>]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Loneliness</td><td align="left" valign="top">&#x2212;0.29 (1.27)</td><td align="left" valign="top">0.0 (&#x2212;3.0, 3.0)</td><td align="left" valign="top">0-6 [<xref ref-type="bibr" rid="ref25">25</xref>]</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Of 105 possible continuous risk assessment measurements, 21 (20%) were missing.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Subset of fitted linear individual growth curves that served as predictor variables in the regression analysis. CRA: continuous risk assessment.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v8i1e63262_fig01.png"/></fig><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Summary of the linear regression analysis with individually fitted growth curve parameters and continuous risk assessment mean scores as independent and difference in grief before and after the 10-week web-based grief intervention as dependent variables.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"><italic>B</italic> (95% CI)</td><td align="left" valign="bottom"><italic>t</italic> test (<italic>df</italic>)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Intercept</td><td align="char" char="." 
valign="top">&#x2212;11.37 (&#x2212;16.82 to &#x2212;5.92)</td><td align="char" char="." valign="top">&#x2212;4.42 (16)</td><td align="char" char="." valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Slope of growth curve</td><td align="char" char="." valign="top">&#x2212;1.18 (&#x2212;5.87 to 3.5)</td><td align="char" char="." valign="top">&#x2212;0.54 (16)</td><td align="char" char="." valign="top">.60</td></tr><tr><td align="left" valign="top">Standard error of the growth curve slope</td><td align="char" char="." valign="top">&#x2212;4.45 (&#x2212;10.88 to 1.98)</td><td align="char" char="." valign="top">&#x2212;1.47 (16)</td><td align="char" char="." valign="top">.16</td></tr><tr><td align="left" valign="top">Continuous risk assessment mean</td><td align="char" char="." valign="top">1.19 (0.48 to 1.89)</td><td align="char" char="." valign="top">3.58 (16)</td><td align="char" char="." valign="top">.002</td></tr></tbody></table></table-wrap></sec></sec><sec id="s3-3"><title>Analysis III: e-Coaches&#x2019; Experience With the Monitoring Tool</title><p>All e-coaches that provided guidance during the RCT (N=4) participated in the online focus group to discuss their experience with the monitoring tool. The e-coach team consisted of 1 trained psychological therapist and 3 final-year clinical psychology students who partook in the e-coaching as part of their training. Their mean age was 26.9 (SD 2.69) years. All e-coaches were female. The final-year students were trained to provide email guidance and were closely supervised by a trained psychotherapist. Before the start of the RCT, the e-coaches discussed how they would use the monitoring tool to check on participants&#x2019; health regularly. 
To determine a deterioration in a participant&#x2019;s mental health, the e-coaches took into account the mourner&#x2019;s CRA responses, recommendations suggested by the monitoring algorithm, their impression of the mourner from the clinical interview (eg, knowledge about the death anniversary date of the deceased), and the mourner&#x2019;s weekly email communication, if available. The e-coaches gave the most weight to recent CRA responses and also considered whether there was a pattern in the response behavior. One e-coach explained that they incorporated the monitoring responses into their weekly guidance emails for unresponsive participants to personalize their contact with them. All e-coaches reported that they used the monitoring to confirm their existing impressions of participants:</p><disp-quote><p>I think we regarded it as a kind of a safety option, to check how people are feeling and how it aligns with our impression of the person and the remaining contact we have with them. And for us to reflect, did we overlook anything or forget to ask anything?</p><attrib>e-coach 1</attrib></disp-quote><p>The e-coaches unanimously experienced having another source of information as helpful, especially for participants who otherwise communicated little with them during the RCT. The e-coaches experienced being able to monitor participants&#x2019; progress with the grief service as supportive and reassuring:</p><disp-quote><p>Whether they [the mourner] made progress or deteriorated, a kind of support for recognizing if anything were to happen. Maybe if they [the mourner] did not tell us, to have another chance at detecting it.</p><attrib>e-coach 3</attrib></disp-quote><p>Another e-coach agreed that they were curious about how participants responded in the monitoring, especially when mourners worked on intervention content that the e-coaches knew to be challenging for some mourners, such as writing a farewell letter to the deceased spouse. 
For future versions of the monitoring tool, the e-coaches suggested providing feedback directly to the mourner, such as regular written summaries and recommendations to seek additional support in times of crisis. This could support the mourner&#x2019;s reflection about their affective states and encourage them to seek offline support proactively instead of waiting until an e-coach advises them to seek support. In addition, the e-coaches expressed that they would prefer to receive warning messages (eg, via SMS text message) whenever the condition of a mourner deteriorated drastically to facilitate immediate intervention.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study is situated in the rapidly emerging field of AI tools for mental health care and evaluated a monitoring module in a web-based grief intervention for older mourners with the aim of guiding them to offline support if their mental health deteriorated. We leveraged the insights from 3 evaluation approaches and encountered 3 main challenges when trying to come up with satisfactory and clear conclusions about &#x201C;how well&#x201D; a monitoring module such as the one evaluated in this study performs.</p><p>First, many clinical classification problems are (extremely) imbalanced, meaning that the class for which correct classification is crucial (eg, recommending help-seeking, detecting a tumor) is underrepresented in real-life datasets [<xref ref-type="bibr" rid="ref15">15</xref>]. Although there are evaluation metrics, including the <italic>F</italic><sub>1</sub>-metric [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref26">26</xref>] that we used in this study, that can mitigate class imbalance to some extent [<xref ref-type="bibr" rid="ref27">27</xref>], the clinical meaningfulness of obtained results should still be appraised critically. 
Complementing the evaluation with qualitative accounts from clinical practice is in line with Whiting and Fazel&#x2019;s [<xref ref-type="bibr" rid="ref7">7</xref>] suggestion to consider the incremental benefit of AI tools in clinical practice and stress that any thorough evaluation of a monitoring tool should go beyond quantifiable accuracies and statistics. A monitoring tool that does not match the needs and preferences of its users and the clinical context in which it is used will ultimately not be used, regardless of its classification performance [<xref ref-type="bibr" rid="ref28">28</xref>]. The qualitative evaluation of the tool revealed that the e-coaches envisioned the tool not only as a regular mental health check but also as an emergency detection tool for short-term psychological crisis. A more appropriate approach to evaluating the latter would be to investigate CRA measures around an episode of psychological crisis. However, the low prevalence of psychological crisis in our data makes any evaluation targeted at detecting emergencies impossible. The tool should be evaluated in a clinical sample in which short-term psychological crisis is expected to arise more frequently to investigate its suitability as an emergency detection tool.</p><p>Finding appropriate statistical approaches to evaluate AI tools for clinical practice using real-life mental health data represents a second challenge. Despite the mixed results obtained in this study, we argue that statistical approaches that allow for the explicit modeling of individual differences should receive more attention in future evaluations of AI tools in mental health care. Individual growth curve predictors are recommended when distinct developmental patterns are expected across outcome groups&#x2014;in our case, we expected distinct patterns for each individual participant [<xref ref-type="bibr" rid="ref20">20</xref>]. 
Grief and its experienced intensity are inherently individual [<xref ref-type="bibr" rid="ref29">29</xref>], suggesting from a clinical point of view that individually optimized growth curves are a suitable means of analysis. In this study, individually fitted CRA growth curves captured participants&#x2019; response patterns variably well. Participants&#x2019; response patterns may require more complex functions (eg, quadratic, cubic) than linearly fitted curves. Another reason why the estimated growth curves fit participants&#x2019; response patterns variably well is missing values [<xref ref-type="bibr" rid="ref20">20</xref>]. The reliability of the underlying measurement tool likewise impacts the fit of estimated growth curves. The social isolation subscale needs revision, as its 2 items were poorly correlated. The 2 items capture 2 different dimensions of being socially isolated: the feeling of being a burden to others and active social withdrawal behavior. It is difficult to reliably assess 2 dimensions of a construct using only 2 items. The construction of 2-item scales is generally discouraged in terms of reliability [<xref ref-type="bibr" rid="ref18">18</xref>]; however, to limit the burden of filling in mental health checks regularly as part of a digital mental health service, short self-assessment tools are needed. 
In this context, incorporating less obtrusive assessment methods in digital mental health services, including sensing technologies [<xref ref-type="bibr" rid="ref30">30</xref>] and natural language processing [<xref ref-type="bibr" rid="ref31">31</xref>], to complement self-report monitoring of clients&#x2019; mental states should be considered in the future.</p><p>To move toward well-developed monitoring systems in e-mental health, we recommend clear and early decision-making about (1) the responsibilities of the monitoring tool in the e-mental health application (and which responsibilities the tool does not have) and (2) what it takes to evaluate the tool in a satisfactory way so that it can live up to these responsibilities and contribute in a meaningful way to clinical practice. Currently, AI tools are often developed as secondary goals to the development of a new e-mental health application [<xref ref-type="bibr" rid="ref6">6</xref>], which represents the third identified challenge since it limits time and effort invested into their development and evaluation. Extracting clinically meaningful results using common methods for evaluating AI tools is complex. Hence, such tools cannot afford ambiguities regarding their capabilities and responsibilities that further complicate the evaluation process.</p></sec><sec id="s4-2"><title>Limitations</title><p>This study has limitations. First, the e-coaches that provided the ground truth labels for assessing the classification performance of the monitoring tool had access to the tool&#x2019;s suggested decisions at the time of labeling participants&#x2019; monitoring response patterns as either &#x201C;advisable to seek support&#x201D; or &#x201C;fine to continue using the grief service as is.&#x201D; Having access to the tool&#x2019;s suggestions may have biased the e-coaches&#x2019; ground truth labels in favor of the monitoring tool. 
However, the ground truth labels were provided by the coaches upon request at the time of conducting the analyses in this study, after RCT participants that were included in this study had completed the 10-week grief intervention. Therefore, in practice, the e-coaches revisited participants&#x2019; monitoring responses retrospectively, as well as their own initial decision-making during participants&#x2019; participation in the RCT. The retrospective nature of the labeling task likely limited the potentially introduced bias because the e-coaches had the knowledge of the participants&#x2019; trial outcome at their disposal, solidifying the truth of the provided labels. With regard to providing clinically meaningful insights, a second limitation of this study is the small sample size in the regression analysis and the small number of psychological crisis events in the classification evaluation analysis that we mentioned earlier. Any (clinical) conclusions based on the obtained results should be drawn with caution.</p></sec><sec id="s4-3"><title>Conclusion</title><p>In recent years, the demand for high-quality and accessible mental health care has been increasing. Digital mental health self-help services have the potential to support today&#x2019;s health care systems in meeting care demands and their potential is further increased by leveraging the benefits of emerging AI tools, including monitoring tools that track users&#x2019; affective states and guide them toward offline support if their mental health warrants professional intervention. Such AI tools come with challenges that must be addressed systematically before they have an impact in clinical practice. 
These challenges include finding meaningful evaluation approaches in the face of (1) (extremely) imbalanced real-life clinical datasets, (2) ambiguous demands and expectations regarding the capabilities and responsibilities of such tools in e-mental health, and (3) priority misalignments between evaluation approaches for AI tools and the overarching goals of clinical trials in which their evaluation is usually embedded. We hope to contribute to an enhanced awareness about these challenges and to the development of evaluation approaches for AI tools in e-mental health that facilitate their introduction into clinical practice.</p></sec></sec></body><back><ack><p>The research leading to these results was carried out under the AAL Programme under project number AAL-2019-6-168-CP, with funding from the European Union and the national funding agencies of the Netherlands, Portugal, and Switzerland: The Netherlands Organisation for Health Research and Development (ZonMW), Funda&#x00E7;&#x00E3;o para a Ci&#x00EA;ncia e Tecnologia (FCT), and Innosuisse&#x2013;Swiss Innovation Agency, respectively.</p></ack><notes><sec><title>Data Availability</title><p>The datasets collected during the randomized controlled trial that are analyzed in this study are currently not publicly available and access requests should be directed to the principal investigator of the trial. The trial is registered at ClinicalTrials.gov (NCT0528004), where the contact details of the principal investigator can be found. 
The e-coach focus group data are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CRA</term><def><p>continuous risk assessment</p></def></def-item><def-item><term id="abb3">FCM</term><def><p>fuzzy cognitive map</p></def></def-item><def-item><term id="abb4">IRA</term><def><p>initial risk assessment</p></def></def-item><def-item><term id="abb5">RCT</term><def><p>randomized controlled trial</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Teachman</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Silverman</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Werntz</surname><given-names>A</given-names> </name></person-group><article-title>Digital mental health services: moving from promise to results</article-title><source>Cogn Behav Pract</source><year>2022</year><volume>29</volume><issue>1</issue><fpage>97</fpage><lpage>104</lpage><pub-id pub-id-type="doi">10.1016/j.cbpra.2021.06.014</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mu&#x00F1;oz</surname><given-names>RF</given-names> </name></person-group><article-title>Harnessing psychology and technology to contribute to making health care a universal human right</article-title><source>Cogn Behav Pract</source><year>2022</year><month>02</month><volume>29</volume><issue>1</issue><fpage>4</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1016/j.cbpra.2019.07.003</pub-id></nlm-citation></ref><ref 
id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sakal</surname><given-names>C</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xiang</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name></person-group><article-title>Development and validation of the Chinese Geriatric Depression Risk calculator (CGD-risk): a screening tool to identify elderly Chinese with depression</article-title><source>J Affect Disord</source><year>2022</year><month>12</month><day>15</day><volume>319</volume><fpage>428</fpage><lpage>436</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2022.09.034</pub-id><pub-id pub-id-type="medline">36184985</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>B</given-names> </name><name name-style="western"><surname>You</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rolls</surname><given-names>ET</given-names> </name><etal/></person-group><article-title>Identifying behaviour-related and physiological risk factors for suicide attempts in the UK Biobank</article-title><source>Nat Hum Behav</source><year>2024</year><month>09</month><volume>8</volume><issue>9</issue><fpage>1784</fpage><lpage>1797</lpage><pub-id pub-id-type="doi">10.1038/s41562-024-01903-x</pub-id><pub-id pub-id-type="medline">38956227</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thieme</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hanratty</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Lyons</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Designing human-centered AI for mental health: developing clinically relevant applications for online CBT treatment</article-title><source>ACM Trans Comput-Hum Interact</source><year>2023</year><month>04</month><day>30</day><volume>30</volume><issue>2</issue><fpage>1</fpage><lpage>50</lpage><pub-id pub-id-type="doi">10.1145/3564752</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tornero-Costa</surname><given-names>R</given-names> </name><name name-style="western"><surname>Martinez-Millana</surname><given-names>A</given-names> </name><name name-style="western"><surname>Azzopardi-Muscat</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lazeri</surname><given-names>L</given-names> </name><name name-style="western"><surname>Traver</surname><given-names>V</given-names> </name><name name-style="western"><surname>Novillo-Ortiz</surname><given-names>D</given-names> </name></person-group><article-title>Methodological and quality flaws in the use of artificial intelligence in mental health research: systematic review</article-title><source>JMIR Ment Health</source><year>2023</year><month>02</month><day>2</day><volume>10</volume><fpage>e42045</fpage><pub-id pub-id-type="doi">10.2196/42045</pub-id><pub-id pub-id-type="medline">36729567</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Whiting</surname><given-names>D</given-names> </name><name name-style="western"><surname>Fazel</surname><given-names>S</given-names> </name></person-group><article-title>How accurate are suicide risk prediction models? 
Asking the right questions for clinical practice</article-title><source>Evid Based Ment Health</source><year>2019</year><month>08</month><volume>22</volume><issue>3</issue><fpage>125</fpage><lpage>128</lpage><pub-id pub-id-type="doi">10.1136/ebmental-2019-300102</pub-id><pub-id pub-id-type="medline">31248976</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cabitza</surname><given-names>F</given-names> </name><name name-style="western"><surname>Campagner</surname><given-names>A</given-names> </name></person-group><article-title>The need to separate the wheat from the chaff in medical informatics: introducing a comprehensive checklist for the (self)-assessment of medical AI studies</article-title><source>Int J Med Inform</source><year>2021</year><month>09</month><volume>153</volume><fpage>104510</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2021.104510</pub-id><pub-id pub-id-type="medline">34108105</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brodbeck</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jacinto</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gouveia</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A web-based self-help intervention for coping with the loss of a partner: protocol for randomized controlled trials in 3 countries</article-title><source>JMIR Res Protoc</source><year>2022</year><month>11</month><day>30</day><volume>11</volume><issue>11</issue><fpage>e37827</fpage><pub-id pub-id-type="doi">10.2196/37827</pub-id><pub-id pub-id-type="medline">36449341</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Brandl</surname><given-names>L</given-names> </name><name name-style="western"><surname>van Velsen</surname><given-names>L</given-names> </name><name name-style="western"><surname>Brodbeck</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jacinto</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hofs</surname><given-names>D</given-names> </name><name name-style="western"><surname>Heylen</surname><given-names>D</given-names> </name></person-group><article-title>Developing an eMental health monitoring module for older mourners using fuzzy cognitive maps</article-title><source>Digit Health</source><year>2023</year><volume>9</volume><fpage>20552076231183549</fpage><pub-id pub-id-type="doi">10.1177/20552076231183549</pub-id><pub-id pub-id-type="medline">37361430</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Futterman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>LW</given-names> </name><name name-style="western"><surname>Gallagher-Thompson</surname><given-names>D</given-names> </name></person-group><article-title>Factorial validity of the Texas Revised Inventory of Grief-Present scale among bereaved older adults</article-title><source>Psychol Assess</source><year>2010</year><month>09</month><volume>22</volume><issue>3</issue><fpage>675</fpage><lpage>687</lpage><pub-id pub-id-type="doi">10.1037/a0019914</pub-id><pub-id pub-id-type="medline">20822280</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Martin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rief</surname><given-names>W</given-names> </name><name name-style="western"><surname>Klaiberg</surname><given-names>A</given-names> </name><name name-style="western"><surname>Braehler</surname><given-names>E</given-names> </name></person-group><article-title>Validity of the Brief Patient Health Questionnaire Mood Scale (PHQ-9) in the general population</article-title><source>Gen Hosp Psychiatry</source><year>2006</year><volume>28</volume><issue>1</issue><fpage>71</fpage><lpage>77</lpage><pub-id pub-id-type="doi">10.1016/j.genhosppsych.2005.07.003</pub-id><pub-id pub-id-type="medline">16377369</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Jong-Gierveld</surname><given-names>J</given-names> </name></person-group><article-title>Developing and testing a model of loneliness</article-title><source>J Pers Soc Psychol</source><year>1987</year><volume>53</volume><issue>1</issue><fpage>119</fpage><lpage>128</lpage><pub-id pub-id-type="doi">10.1037/0022-3514.53.1.119</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>De Jong Gierveld</surname><given-names>J</given-names> </name><name name-style="western"><surname>Van Tilburg</surname><given-names>T</given-names> </name></person-group><article-title>The De Jong Gierveld short scales for emotional and social loneliness: tested on data from 7 countries in the UN generations and gender surveys</article-title><source>Eur J Ageing</source><year>2010</year><month>06</month><volume>7</volume><issue>2</issue><fpage>121</fpage><lpage>130</lpage><pub-id pub-id-type="doi">10.1007/s10433-010-0144-6</pub-id><pub-id 
pub-id-type="medline">20730083</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>AKC</given-names> </name><name name-style="western"><surname>Kamel</surname><given-names>MS</given-names> </name></person-group><article-title>Classification of imbalanced data: a review</article-title><source>Int J Patt Recogn Artif Intell</source><year>2009</year><month>06</month><volume>23</volume><issue>4</issue><fpage>687</fpage><lpage>719</lpage><pub-id pub-id-type="doi">10.1142/S0218001409007326</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Branco</surname><given-names>P</given-names> </name><name name-style="western"><surname>Torgo</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ribeiro</surname><given-names>R</given-names> </name></person-group><article-title>A survey of predictive modelling under imbalanced distributions</article-title><source>ACM Comput Surv</source><year>2016</year><volume>49</volume><issue>2</issue><fpage>31</fpage><lpage>48</lpage><pub-id pub-id-type="doi">10.1145/2907070</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Van Rossum</surname><given-names>G</given-names> </name><name name-style="western"><surname>Drake</surname><given-names>FL</given-names> </name></person-group><source>Python 3 Reference Manual</source><year>2009</year><publisher-name>CreateSpace</publisher-name><pub-id pub-id-type="other">9781441412690</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Eisinga</surname><given-names>R</given-names> </name><name name-style="western"><surname>te Grotenhuis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pelzer</surname><given-names>B</given-names> </name></person-group><article-title>The reliability of a two-item scale: Pearson, Cronbach, or Spearman-Brown?</article-title><source>Int J Public Health</source><year>2013</year><month>08</month><volume>58</volume><issue>4</issue><fpage>637</fpage><lpage>642</lpage><pub-id pub-id-type="doi">10.1007/s00038-012-0416-3</pub-id><pub-id pub-id-type="medline">23089674</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tavakol</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dennick</surname><given-names>R</given-names> </name></person-group><article-title>Making sense of Cronbach&#x2019;s alpha</article-title><source>Int J Med Educ</source><year>2011</year><volume>2</volume><fpage>53</fpage><lpage>55</lpage><pub-id pub-id-type="doi">10.5116/ijme.4dfb.8dfd</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Welten</surname><given-names>M</given-names> </name><name name-style="western"><surname>de Kroon</surname><given-names>MLA</given-names> </name><name name-style="western"><surname>Renders</surname><given-names>CM</given-names> </name><etal/></person-group><article-title>Repeatedly measured predictors: a comparison of methods for prediction modeling</article-title><source>Diagn Progn Res</source><year>2018</year><volume>2</volume><fpage>5</fpage><pub-id pub-id-type="doi">10.1186/s41512-018-0024-7</pub-id><pub-id pub-id-type="medline">31093555</pub-id></nlm-citation></ref><ref 
id="ref21"><label>21</label><nlm-citation citation-type="web"><source>R: A language and environment for statistical computing</source><access-date>2024-11-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.R-project.org/">https://www.R-project.org/</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><source>Atlas.ti</source><access-date>2024-06-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://atlasti.com/">https://atlasti.com/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nutley</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Bur&#x00E9;n</surname><given-names>J</given-names> </name><name name-style="western"><surname>Thorell</surname><given-names>LB</given-names> </name></person-group><article-title>COVID-19 restrictions resulted in both positive and negative effects on digital media use, mental health, and lifestyle habits</article-title><source>Int J Environ Res Public Health</source><year>2023</year><month>08</month><day>16</day><volume>20</volume><issue>16</issue><fpage>6583</fpage><pub-id pub-id-type="doi">10.3390/ijerph20166583</pub-id><pub-id pub-id-type="medline">37623169</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Znoj</surname><given-names>H</given-names> </name></person-group><article-title>Texas-Revised Inventory of Grief: validation of the German version TRIG-D</article-title><source>Psychosom Konsilpsychiat</source><year>2008</year><volume>2</volume><fpage>236</fpage><lpage>239</lpage><pub-id pub-id-type="doi">10.1007/s11800-008-0131-3</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>De Jong Gierveld</surname><given-names>J</given-names> </name><name name-style="western"><surname>Van Tilburg</surname><given-names>T</given-names> </name></person-group><article-title>A 6-item scale for overall, emotional, and social loneliness: confirmatory tests on survey data</article-title><source>Res Aging</source><year>2006</year><volume>28</volume><issue>5</issue><fpage>582</fpage><lpage>598</lpage><pub-id pub-id-type="doi">10.1177/0164027506289723</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Apostolopoulos</surname><given-names>ID</given-names> </name><name name-style="western"><surname>Groumpos</surname><given-names>PP</given-names> </name></person-group><article-title>Fuzzy cognitive maps: their role in explainable artificial intelligence</article-title><source>Appl Sci (Basel)</source><year>2023</year><volume>13</volume><issue>6</issue><fpage>3412</fpage><pub-id pub-id-type="doi">10.3390/app13063412</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Raeder</surname><given-names>T</given-names> </name><name name-style="western"><surname>Forman</surname><given-names>G</given-names> </name><name name-style="western"><surname>Chawla</surname><given-names>NV</given-names> </name></person-group><article-title>Learning from imbalanced data: evaluation matters</article-title><source>Data Mining: Foundations and Intelligent Paradigms</source><year>2012</year><fpage>315</fpage><lpage>331</lpage><pub-id pub-id-type="doi">10.1007/978-3-642-23166-7_12</pub-id><pub-id pub-id-type="other">9783642231650</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Sendak</surname><given-names>M</given-names> </name><name name-style="western"><surname>Elish</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>M</given-names> </name></person-group><article-title>"The human body is a black box": supporting clinical decision-making with deep learning</article-title><year>2020</year><month>01</month><day>27</day><conf-name>2020 Conference on Fairness, Accountability, and Transparency</conf-name><conf-loc>Barcelona, Spain</conf-loc><pub-id pub-id-type="doi">10.1145/3351095.3372827</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Mancini</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Bonanno</surname><given-names>GA</given-names> </name></person-group><article-title>Loss and grief: the role of individual differences</article-title><source>Resilience and Mental Health: Challenges Across the Lifespan</source><year>2011</year><fpage>189</fpage><lpage>199</lpage><pub-id pub-id-type="doi">10.1017/CBO9780511994791.015</pub-id><pub-id pub-id-type="other">9780521898393</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abdullah</surname><given-names>S</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>T</given-names> </name></person-group><article-title>Sensing technologies for monitoring serious mental illnesses</article-title><source>IEEE MultiMedia</source><year>2018</year><volume>25</volume><issue>1</issue><fpage>61</fpage><lpage>75</lpage><pub-id pub-id-type="doi">10.1109/MMUL.2018.011921236</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Malgaroli</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hull</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Zech</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Althoff</surname><given-names>T</given-names> </name></person-group><article-title>Natural language processing for mental health interventions: a systematic review and research framework</article-title><source>Transl Psychiatry</source><year>2023</year><month>10</month><day>6</day><volume>13</volume><issue>1</issue><fpage>309</fpage><pub-id pub-id-type="doi">10.1038/s41398-023-02592-2</pub-id><pub-id pub-id-type="medline">37798296</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Continuous risk assessment questionnaire.</p><media xlink:href="formative_v8i1e63262_app1.docx" xlink:title="DOCX File, 23 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Regression analysis: individual continuous risk assessment growth curves.</p><media xlink:href="formative_v8i1e63262_app2.docx" xlink:title="DOCX File, 256 KB"/></supplementary-material></app-group></back></article>