<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e67835</article-id><article-id pub-id-type="doi">10.2196/67835</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Machine Learning Approach to Identifying Empathy Using the Vocals of Mental Health Helpline Counselors: Algorithm Development and Validation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Sanjeewa</surname><given-names>Ruvini</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Iyer</surname><given-names>Ravi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Apputhurai</surname><given-names>Pragalathan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Wickramasinghe</surname><given-names>Nilmini</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Meyer</surname><given-names>Denny</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>School of Health Sciences, Swinburne University of Technology, Hawthorn</institution><addr-line>PO Box 218, John Street</addr-line><addr-line>Melbourne</addr-line><country>Australia</country></aff><aff id="aff2"><institution>School of Computing, Engineering &#x0026; Mathematical Sciences, La Trobe University</institution><addr-line>Melbourne</addr-line><country>Australia</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gore</surname><given-names>Ross</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chowdhury</surname><given-names>Shaika</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Ruvini Sanjeewa, BSc, School of Health Sciences, Swinburne University of Technology, Hawthorn, PO Box 218, John Street, Melbourne, 3122, Australia, 61 422587030; <email>rsanjeewa@swin.edu.au</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>16</day><month>4</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e67835</elocation-id><history><date date-type="received"><day>22</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>16</day><month>02</month><year>2025</year></date><date 
date-type="accepted"><day>18</day><month>02</month><year>2025</year></date></history><copyright-statement>&#x00A9; Ruvini Sanjeewa, Ravi Iyer, Pragalathan Apputhurai, Nilmini Wickramasinghe, Denny Meyer. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 16.4.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e67835"/><abstract><sec><title>Background</title><p>This research study aimed to detect the vocal features immersed in empathic counselor speech using samples of calls to a mental health helpline service.</p></sec><sec><title>Objective</title><p>This study aimed to produce an algorithm for the identification of empathy from these features, which could act as a training guide for counselors and conversational agents who need to transmit empathy in their vocals.</p></sec><sec sec-type="methods"><title>Methods</title><p>Two annotators with a psychology background and English heritage provided empathy ratings for 57 calls involving female counselors, as well as multiple short call segments within each 
of these calls. These ratings were found to be well-correlated between the 2 raters in a sample of 6 common calls. Using vocal feature extraction from call segments and statistical variable selection methods, such as L1 penalized LASSO (Least Absolute Shrinkage and Selection Operator) and forward selection, a total of 14 significant vocal features were associated with empathic speech. Generalized additive mixed models (GAMM), binary logistic regression with splines, and random forest models were used to obtain an algorithm that differentiated between high- and low-empathy call segments.</p></sec><sec sec-type="results"><title>Results</title><p>The binary logistic regression model reported higher predictive accuracies of empathy (area under the curve [AUC]=0.617, 95% CI 0.613&#x2010;0.622) compared to the GAMM (AUC=0.605, 95% CI 0.601&#x2010;0.609) and the random forest model (AUC=0.600, 95% CI 0.595&#x2010;0.604). This difference was statistically significant, as evidenced by the nonoverlapping 95% CIs obtained for AUC. The DeLong test further validated these results, showing a significant difference in the binary logistic model compared to the random forest (D=6.443, <italic>df</italic>=186283, <italic>P</italic>&#x003C;.001) and GAMM (Z=5.846, <italic>P</italic>&#x003C;.001). These findings confirm that the binary logistic regression model outperforms the other 2 models concerning predictive accuracy for empathy classification.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study suggests that the identification of empathy from vocal features alone is challenging, and further research involving multimodal models (eg, models incorporating facial expression, words used, and vocal features) is encouraged for detecting empathy in the future. This study has several limitations, including a relatively small sample of calls and only 2 empathy raters. 
Future research should focus on accommodating multiple raters with varied backgrounds to explore these effects on perceptions of empathy. Additionally, considering counselor vocals from larger, more heterogeneous populations, including mixed-gender samples, will allow an exploration of the factors influencing the level of empathy projected in counselor voices more generally.</p></sec></abstract><kwd-group><kwd>vocal features</kwd><kwd>voice characteristics</kwd><kwd>empathy</kwd><kwd>mental health care</kwd><kwd>crisis helpline service</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Empathy is defined as experiencing the emotions (emotional empathy) and cognitions (cognitive empathy) of others and responding to them appropriately [<xref ref-type="bibr" rid="ref1">1</xref>]. Empathy is especially important for patient care, where the lived experience of the patient must be understood by responding health care professionals, while also conveying this understanding in conjunction with a desire to help the patient [<xref ref-type="bibr" rid="ref2">2</xref>]. The effectiveness of physician empathy has been shown to improve patient satisfaction and commitment to recovery while reducing anxiety and distress levels, leading to better clinical results [<xref ref-type="bibr" rid="ref3">3</xref>]. Furthermore, empathic behavior by mental health (MH) care providers reduces their own risk of burnout [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Telephone helpline services offer an effective means of supporting those who need immediate MH care [<xref ref-type="bibr" rid="ref5">5</xref>]. The demand for such services has increased dramatically since the outbreak of the COVID-19 pandemic [<xref ref-type="bibr" rid="ref6">6</xref>], increasing the expectations of counseling staff to provide support for people with complex MH concerns [<xref ref-type="bibr" rid="ref7">7</xref>]. 
As a basic counseling skill, empathy is key to successful engagement with patients in the context of complex psychosocial needs.</p><p>Besides emotional and cognitive empathy in understanding the status of a patient, contextual awareness is equally important for therapeutic engagement [<xref ref-type="bibr" rid="ref8">8</xref>]. This means that empathic responses need to be contextually appropriate by considering environmental cues, culture, demographic factors, and the specific circumstances of the patient to understand the broader context of their MH status [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. This allows counselors to tailor responses, based on context, to engage in effective communication with distressed patients, thereby delivering better outcomes [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>Verbal cues and tone of voice are crucial when communicating empathy [<xref ref-type="bibr" rid="ref11">11</xref>]. For example, reduced speech rate and lower pitch are perceived as more empathic by patients when receiving bad news from health care providers in an oncology setting [<xref ref-type="bibr" rid="ref12">12</xref>] and while actively listening to telephone callers, nurses have been found to express empathy through their choice of words, voice and intonation, projection of compassion and warmth, as well as &#x201C;tuning in&#x201D; to the caller&#x2019;s story and identifying with the caller&#x2019;s emotions [<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>Unfortunately, the global demand for MH support is not being met by the existing workforce [<xref ref-type="bibr" rid="ref14">14</xref>]. This service gap is leading to growing interest in alternative digital technological solutions. Technological innovations, such as the design of conversational agents, have demonstrated potential in facilitating effective and immediate patient care. 
However, to optimize end user acceptance, conversational agents need to display empathy [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Despite this, we have yet to identify the precise vocal features most associated with an empathic human response, and it is also not known if it is possible to categorize empathy levels using such vocal features. Thus, the aims of this paper are to (1) identify the vocal features significantly associated with empathy in a large collection of telephone helpline counseling call recordings; and (2) evaluate the accuracy of a machine learning algorithm to correctly designate short segments of each recording to categories of low and high empathy.</p><p>This study has been reported in accordance with the TRIPOD+AI (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis+Artificial Intelligence) checklist as shown in <xref ref-type="supplementary-material" rid="app7">Checklist 1</xref>.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Collection</title><p>Recordings of telephone helpline calls (n=57) were obtained from On The Line, Australia, a suicide helpline counseling service. Participants were counselors for a Suicide Call-Back Service (a national helpline service coordinated by On The Line, Australia). Calls were randomly sampled from July 1, 2019, to June 30, 2021, stratified by organizationally determined suicide risk level (high or low). The level of suicide risk of each caller had been previously assessed by counselors using the Columbia Suicide Severity Rating Scale (C-SSRS) to differentiate between calls featuring high suicide risk (with C-SSRS ratings of 6&#x2010;7) and calls with low risk of suicide (with C-SSRS ratings of 1&#x2010;2; please refer to Iyer et al [<xref ref-type="bibr" rid="ref17">17</xref>] for further details [<xref ref-type="bibr" rid="ref18">18</xref>]). 
Only the counselors&#x2019; voice recordings were used in this study. No information was provided that could be used to identify the callers or counselors for any of these calls.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was approved by the Swinburne Human Research Ethics Committee (Ref:20226835&#x2010;11907). The application was given a waiver of consent from the ethics committee for the use of nonidentifiable secondary research data. However, the research team also signed a confidentiality agreement that restricted the discussion of the contents of the recordings only within the research team. The secondary data and annotation results were saved on OneDrive for Business (Microsoft Corp) and were only accessible to the research team. No compensation was paid to participants in this study because they could not be identified by the research team.</p></sec><sec id="s2-3"><title>Annotation of Call Segments and Overall Call Empathy for Counselors</title><p>Annotations of counselor empathy were conducted by 2 independent researchers (Inge Gnatt and Sarah Dunning) using RStudio (version 2024.04.2, build 764; R Foundation) [<xref ref-type="bibr" rid="ref19">19</xref>]. The call annotators Inge Gnatt and Sarah Dunning were recruited via research team networks. Both had extensive experience working as counselors for a MH helpline service. Inge Gnatt had experience working as an annotator on a similar project. Segments of the counselor voices were selected from each call using Audacity (version 3.5.1, CMake Release Build; Muse Group &#x0026; contributors) [<xref ref-type="bibr" rid="ref20">20</xref>], ensuring that overlaps between the caller and responding counselor voices were minimized. Empathy displayed by the counselor within each call segment was rated using the Carkhuff and Truax Empathy (CTE) scale [<xref ref-type="bibr" rid="ref21">21</xref>]. 
A weekly project team meeting, attended by a clinical psychologist (Maja Nedeljkovic), was used to reconcile any disparities in ratings. A Qualtrics web-based questionnaire was also used to collect data on the overall level of empathy displayed by the responding counselor during each call and to evaluate caller distress at the commencement and conclusion of each call. <xref ref-type="fig" rid="figure1">Figure 1</xref> shows the flow of the voice analysis process during this study.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the voice analysis process for classifying empathy in counselor voices. AELS: Active-Empathic Listening Scale; AUC: area under the curve; CTE: Carkhuff and Truax Empathy; LASSO: Least Absolute Shrinkage and Selection Operator; LOOCV: Leave One Out Cross Validation; GAMM: generalized additive mixed model; N/A: not available; OTLA: On the Line Australia; PCM-f32le: pulse code modulation float; PEIS: Perceived Emotional Intelligence Scale; RS7: Rating Scale 7 (7-item).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e67835_fig01.png"/></fig></sec><sec id="s2-4"><title>Measures of Counselor Empathy and Caller Distress</title><p>The CTE scale was used to rate the audio segments selected from each call on a 5-point Likert-style scale (1=&#x201C;low empathy&#x201D; to 5=&#x201C;high empathy&#x201D;) [<xref ref-type="bibr" rid="ref21">21</xref>]. Additionally, 3 measures were used to assess overall counselor empathy for each call. These 3 scales were modified to suit counselor-caller conversations through an iterative process in which members of the research team provided independent feedback to achieve the final questionnaire. Examples were developed by the annotators for each item included in these scales, ensuring clarity and consistency of ratings. 
The details for these scales are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The measurement scales included:</p><list list-type="order"><list-item><p>The Perceived Emotional Intelligence (PEI) Scale [<xref ref-type="bibr" rid="ref22">22</xref>] identified variations in PEI in the counselor&#x2019;s vocals. The PEI is a 20-item scale with each item scored with 1=&#x201C;never or almost never true&#x201D; and 7=&#x201C;always or almost always true.&#x201D;</p></list-item><list-item><p>The Active-Empathic Listening (AEL) Scale [<xref ref-type="bibr" rid="ref23">23</xref>] was modified appropriately to produce 10 items measuring empathic listening using a 7-point Likert-style scale with 1=&#x201C;never or almost never true&#x201D; and 7=&#x201C;always or almost always true.&#x201D;</p></list-item><list-item><p>Rating Scale 7 (RS7): A single item 7-point Likert Scale [<xref ref-type="bibr" rid="ref24">24</xref>] was used to rate overall empathy with 1=&#x201C;low empathy&#x201D; and 7=&#x201C;high empathy.&#x201D;</p></list-item></list><p>Finally, at the start and end of each call the annotators assessed the level of caller distress using the distress thermometer [<xref ref-type="bibr" rid="ref25">25</xref>], a visual analogue 11-point scale (0=&#x201C;no distress&#x201D; to 10=&#x201C;extreme distress&#x201D;).</p></sec><sec id="s2-5"><title>Data Validation Through Interrater Consistency Check</title><p>Six (10%) calls were chosen at random to measure interrater reliability. The empathy ratings of the more experienced rater (Inge Gnatt) were used as the reference, against which the ratings of the second rater (Sarah Dunning) were compared. Spearman correlation [<xref ref-type="bibr" rid="ref26">26</xref>] was calculated for each of the 3 scales used to rate overall counselor empathy for each call. 
The Mann-Whitney <italic>U</italic> test was then used to check for the significance of differences between the ratings provided by the annotators.</p></sec><sec id="s2-6"><title>Relationships Between Perceptions of Empathy and Call Context</title><p>The associations between perceived counselor empathy and call context were explored using a combination of empathy ratings, caller distress at the beginning and end of the call, and caller suicide risk. Caller distress and suicide risk were correlated with perceived counselor empathy to evaluate the relationship between level of empathy and caller disposition.</p></sec><sec id="s2-7"><title>Preprocessing Stage: Audio File Format Conversion and Vocal Features Extraction</title><p>The input call recordings were obtained as 8 kHz sample rate, 8-bit depth .wav files. The encoding type of the files was transformed to PCM float format with 32-bit depth to ensure compatibility with RStudio for analysis. Vocal features (n=55) were extracted per 30-millisecond speech frames (50% overlap; Blackman windows) within each annotated segment using RStudio (version 2.7.0, <italic>Soundgen</italic> package [<xref ref-type="bibr" rid="ref27">27</xref>]).</p></sec><sec id="s2-8"><title>Removing Moderate Ratings and Binary Coding Empathy Level</title><sec id="s2-8-1"><title>Overview</title><p>The vocal segments that scored a rating of 3 out of 5 on the CTE scale were removed (n=142, 18%) from further analysis because of their neutral empathic character. 
A binary response variable was then created for each of the 643 remaining segments (190,345 speech frames of 30 ms) with an empathy rating of 4&#x2010;5 coded as high empathy (n=497 segments) and a rating of 1&#x2010;2 coded as low empathy (n=146 segments).</p></sec><sec id="s2-8-2"><title>Removing Missing Values in Speech Frames and Vocal Features</title><p>Vocal features and 30-millisecond speech frames with more than 50% missing values were removed to maintain the quality of data and improve the overall accuracy of the results. The resulting data retained 50.9% (n=28) of the original vocal features and 53.6% (n=102,021) of the original speech frames.</p></sec><sec id="s2-8-3"><title>Removing Silent Speech Frames and Normalization</title><p>The silent speech frames were also removed from each of the 643 segments leaving 95,034 speech frames in the final analysis sample. Finally, the minimum and maximum normalizing technique [<xref ref-type="bibr" rid="ref28">28</xref>] was applied to reduce the influence of background noise.</p></sec></sec><sec id="s2-9"><title>Analysis Stage</title><sec id="s2-9-1"><title>Selection of Vocal Features</title><p>Variable selection was performed to identify vocal features that were strongly associated with empathy. L1 penalized LASSO (Least Absolute Shrinkage and Selection Operator) regression was used to select the most relevant variables by shrinking the coefficients of the least relevant variables to 0 [<xref ref-type="bibr" rid="ref29">29</xref>]. Tenfold cross-validation was used to optimize the tuning parameter, lambda. Further refinement of the selected vocal features was then conducted using a forward stepwise regression model.</p></sec><sec id="s2-9-2"><title>Models for Identifying Empathy Level With Selected Vocal Features</title><p>Three methods were used to classify low and high empathy segments based on this final set of selected vocal features. 
A generalized additive mixed model (GAMM) included vocal features as fixed effects, with each call treated as a random effect. Spline functions for the selected vocal features were used to account for nonlinearity [<xref ref-type="bibr" rid="ref30">30</xref>]. The GAM function [<xref ref-type="bibr" rid="ref31">31</xref>] of package <italic>mgcv</italic> [<xref ref-type="bibr" rid="ref32">32</xref>] in RStudio was used for the analysis.</p><p>Random forest classification also allowed for nonlinear relationships using step functions while more efficiently processing large datasets [<xref ref-type="bibr" rid="ref33">33</xref>]. This has been a prominent classification model used in studies involving vocal analysis [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. The binary logistic regression model was the third model considered, again accounting for nonlinearity using splines, and was used in our study as the baseline model [<xref ref-type="bibr" rid="ref36">36</xref>].</p></sec><sec id="s2-9-3"><title>Model Evaluation</title><p>Probabilistic predictions for high versus low empathy levels were obtained for each segment using Leave One Caller Out Cross Validation. Based on these probabilities, receiver operating characteristic curves were created, and areas under the curves (AUCs) [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>] were used to compare the reliability of these models. The Youden index [<xref ref-type="bibr" rid="ref39">39</xref>] was used to decide the optimal cut point for classifying segments based on their estimated high versus low empathy probabilities.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>The following results were obtained from the 57 calls for female counselors. 
This sample of calls included 12 (12/57, 21%) calls at high risk of suicide and 45 (45/57, 79%) calls at low risk of suicide.</p></sec><sec id="s3-2"><title>Annotation of Call Segments</title><p>Using the CTE scale, 146 (18.6%) segments showed low empathy, 142 (18.1%) showed medium empathy, and 497 (63.3%) segments demonstrated high empathy.</p></sec><sec id="s3-3"><title>Overall Call Empathy Ratings</title><p><xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> provides descriptive statistics for the overall empathy ratings for the 57 calls using the 3 scales. Excellent reliability was observed for both raters, Inge Gnatt and Sarah Dunning, for the PEI and AEL scales, with Cronbach &#x03B1; values above 0.9. The descriptive statistics and the Spearman correlation statistics between the annotators are shown in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. A strong agreement between raters was observed with the PEI measure and the RS7 empathy measure approaching statistical significance.</p></sec><sec id="s3-4"><title>Relationships Between Perceptions of Empathy and Call Context</title><p>The relationships of counselor empathy ratings with the caller&#x2019;s distress at the start and end of the call and with suicide risk are shown in <xref ref-type="table" rid="table1">Table 1</xref>. While the correlations between the initial distress and the 3 empathy measures were not significant, a moderate, statistically significant negative correlation between the final distress of the caller and the empathy of the counselor was observed for both the PEI and RS7 (<italic>P</italic>&#x003C;.01 and <italic>P</italic>&#x003C;.001, respectively). The suicide risk of callers, as measured using the C-SSRS, had a statistically significant but weak positive correlation with counselor empathy across the PEI and AEL measures. 
Strong statistically significant correlations among the 3 empathy measures were found, validating the empathy measurement process.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Spearman correlation coefficients for caller distress and suicide risk with counselor empathy ratings<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">Caller context</td><td align="left" valign="bottom" colspan="3">Ratings for counselor empathy</td></tr></thead><tbody><tr><td align="left" valign="top">Empathy rating</td><td align="left" valign="top">Initial distress</td><td align="left" valign="top">Final distress</td><td align="left" valign="top">Suicide risk</td><td align="left" valign="top">PEIS<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">AELS<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">RS7<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">PEIS</td><td align="left" valign="top">&#x2013;0.011</td><td align="left" valign="top">&#x2013;0.414**</td><td align="left" valign="top">0.310*</td><td align="left" valign="top">1</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">AELS</td><td align="left" valign="top">0.102</td><td align="left" valign="top">&#x2013;0.233</td><td align="left" valign="top">0.308*</td><td align="left" valign="top">.822***</td><td align="left" valign="top">1</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">RS7</td><td align="left" valign="top">&#x2013;0.024</td><td align="left" valign="top">&#x2013;0.443***</td><td align="left" valign="top">0.055</td><td align="left" valign="top">.847***</td><td align="left" valign="top">.851***</td><td align="left" 
valign="top">1</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>***<italic>P</italic>&#x003C;.001 (2-tailed), **<italic>P</italic>&#x003C;.01 (2-tailed), and *<italic>P</italic>&#x003C;.05 (2-tailed).</p></fn><fn id="table1fn2"><p><sup>b</sup>PEIS: Perceived Emotional Intelligence Scale.</p></fn><fn id="table1fn3"><p><sup>c</sup>AELS: Active-Empathic Listening Scale.</p></fn><fn id="table1fn4"><p><sup>d</sup>RS7: Rating Scale 7 (7-item).</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Analysis Stage</title><p>The selection of vocal features using LASSO to predict high versus low empathy used a cross-validation of the training dataset to reveal an optimum Log (Lambda) parameter=&#x2013;8.542. The relationship between this parameter and the binomial deviance is shown in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. Using this lambda value, 23 vocal features were retained. These 23 vocal features were then passed on to the forward selection binary logistic regression model, which identified 16 significant vocal features as shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Results of the forward binary logistic regression model for vocal feature selection for identifying high versus low empathy in counselor voices.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Vocal features</td><td align="left" valign="bottom">Coefficient</td><td align="left" valign="bottom"><italic>Z</italic> value</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">&#x2003;Depth of amplitude</td><td align="left" valign="top">&#x2013;1.468</td><td align="left" valign="top">&#x2013;15.994</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Frequency of amplitude 
(Hz)</td><td align="left" valign="top">0.068</td><td align="left" valign="top">2.524</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top">&#x2003;Frequency of amplitude (Hz) via MS<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">&#x2013;0.462</td><td align="left" valign="top">&#x2013;12.350</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Purity of amplitude via MS</td><td align="left" valign="top">&#x2013;0.834</td><td align="left" valign="top">&#x2013;9.596</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Amplitude (dB)</td><td align="left" valign="top">&#x2013;3.391</td><td align="left" valign="top">&#x2013;51.688</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Dominant frequency (Hz)</td><td align="left" valign="top">0.985</td><td align="left" valign="top">3.136</td><td align="left" valign="top">.002</td></tr><tr><td align="left" valign="top">&#x2003;Entropy</td><td align="left" valign="top">3.47</td><td align="left" valign="top">20.843</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Shannon entropy</td><td align="left" valign="top">&#x2013;6.016</td><td align="left" valign="top">&#x2013;25.27</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Epoch</td><td align="left" valign="top">0.289</td><td align="left" valign="top">5.04</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;First formant frequency (Hz)</td><td align="left" valign="top">1.244</td><td align="left" valign="top">5.08</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;First formant width (Hz)</td><td align="left" valign="top">0.194</td><td align="left" valign="top">4.181</td><td 
align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Second formant frequency (Hz)</td><td align="left" valign="top">&#x2013;0.191</td><td align="left" valign="top">&#x2013;2.808</td><td align="left" valign="top">.005</td></tr><tr><td align="left" valign="top">&#x2003;Second formant width (Hz)</td><td align="left" valign="top">&#x2013;0.063</td><td align="left" valign="top">&#x2013;1.802</td><td align="left" valign="top">.07</td></tr><tr><td align="left" valign="top">&#x2003;Third formant frequency (Hz)</td><td align="left" valign="top">&#x2013;0.099</td><td align="left" valign="top">&#x2013;1.521</td><td align="left" valign="top">.13</td></tr><tr><td align="left" valign="top">&#x2003;Third formant width (Hz)</td><td align="left" valign="top">0.004</td><td align="left" valign="top">0.118</td><td align="left" valign="top">.91</td></tr><tr><td align="left" valign="top">&#x2003;Spectral flux</td><td align="left" valign="top">0.11</td><td align="left" valign="top">1.435</td><td align="left" valign="top">.15</td></tr><tr><td align="left" valign="top">&#x2003;HNR<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> (dB)</td><td align="left" valign="top">0.37</td><td align="left" valign="top">2.869</td><td align="left" valign="top">.004</td></tr><tr><td align="left" valign="top">&#x2003;Spectral novelty</td><td align="left" valign="top">0.035</td><td align="left" valign="top">0.715</td><td align="left" valign="top">.47</td></tr><tr><td align="left" valign="top">&#x2003;Peak frequency (Hz)</td><td align="left" valign="top">&#x2013;0.293</td><td align="left" valign="top">&#x2013;1.827</td><td align="left" valign="top">.06</td></tr><tr><td align="left" valign="top">&#x2003;25th percentile frequency (Hz)</td><td align="left" valign="top">&#x2013;1.311</td><td align="left" valign="top">&#x2013;4.720</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;50th percentile frequency 
(Hz)</td><td align="left" valign="top">&#x2013;0.262</td><td align="left" valign="top">&#x2013;1.586</td><td align="left" valign="top">.11</td></tr><tr><td align="left" valign="top">&#x2003;Spectral centroid (Hz)</td><td align="left" valign="top">5.782</td><td align="left" valign="top">15.717</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">&#x2003;Spectral slope (Hz)</td><td align="left" valign="top">&#x2013;3.944</td><td align="left" valign="top">&#x2013;17.881</td><td align="left" valign="top">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>MS: modulation spectrum.</p></fn><fn id="table2fn2"><p><sup>b</sup>HNR: harmonics-to-noise ratio.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-6"><title>Feature Extraction and Classification</title><p>The results of the GAMM are shown in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. Of the 16 selected vocal features, the GAMM used only 14. Based on the effective <italic>df</italic> values, 2 of the vocal features (Shannon entropy and the 25th percentile frequency) show a linear relationship in the GAMM. <xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the nonlinear nature of all other relationships. An AUC value of 0.605 was obtained for the GAMM [<xref ref-type="bibr" rid="ref40">40</xref>].</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The smoothed relationship between the selected vocal features and the standardized predicted level of empathy displayed in counselor speech with 95% CIs. MS: modulation spectrum.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e67835_fig02.png"/></fig><p>The binary logistic classification model showed higher AUC values and nonoverlapping CIs at 95%. 
The DeLong test confirmed that the binary logistic model outperformed both the random forest (D=6.443, <italic>df</italic>=186283, <italic>P</italic>&#x003C;.001) and GAMM (Z=5.846, <italic>P</italic>&#x003C;.001) models [<xref ref-type="bibr" rid="ref41">41</xref>]. However, this model achieved a lower classification accuracy than the other 2 methods, as shown in <xref ref-type="table" rid="table3">Table 3</xref>, suggesting that the probability cut point used for classification purposes was not ideal.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Performance comparison of the 3 classification models for identifying high versus low counselor empathy.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Classification model</td><td align="left" valign="bottom">Accuracy (%)</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> value</td><td align="left" valign="bottom">95% CIs</td></tr></thead><tbody><tr><td align="left" valign="top">GAMM<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">75</td><td align="left" valign="top">0.605</td><td align="left" valign="top">0.601&#x2010;0.609</td></tr><tr><td align="left" valign="top">Random forest</td><td align="left" valign="top">74</td><td align="left" valign="top">0.6</td><td align="left" valign="top">0.595&#x2010;0.604</td></tr><tr><td align="left" valign="top">Binary logistic regression</td><td align="left" valign="top">69</td><td align="left" valign="top">0.617</td><td align="left" valign="top">0.613&#x2010;0.622</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>GAMM: generalized additive mixed model.</p></fn></table-wrap-foot></table-wrap><p>As illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>, higher empathy was associated with higher 
values for the first formant frequency, dominant frequency, Shannon entropy, spectral slope, and harmonic-to-noise ratio. In contrast, higher empathy was associated with lower values for the 25th percentile frequency and spectral centroid. Finally, lower empathy was associated with intermediate values for depth of amplitude, amplitude, dominant frequency, epoch, and first formant frequency of speech.</p><p>The GAMM was able to differentiate between low- and high-empathy 30-millisecond segments of speech to a classification accuracy of 75%. This was superior to both random forest and binary logistic regression models (74% and 69%, respectively).</p><p>Epoch (39%), amplitude (22%), and depth of amplitude (7%) were the top 3 vocal features contributing to empathic speech in the GAMM. The vocal features that contribute the most toward the identification of empathic speech vary across the 3 methods used, as shown in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p><p>For purposes of validation, the voice algorithm was also evaluated on a synthesized dataset of female voices created from a text-to-speech application. This approach yielded significant differences between high and low empathic voices in the validation dataset for the GAMM, random forest, and binary logistic regression classification models.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study was undertaken to identify a range of vocal features that can predict the level of empathy exhibited in the recordings of female counselors and to accurately classify short segments of each recording according to low or high empathy ratings. We were successful in identifying 14 unique vocal features that significantly distinguished between low and high empathy ratings in the GAMM. 
Furthermore, we were able to successfully classify short segments of speech using these vocal features to an accuracy level of 75% using this model. Although this study considered only 57 calls, each call featured multiple segments annotated for the level of empathy, and each of these segments was further divided into 30-millisecond speech frames that were the observational points considered during the modeling process. Reducing the number of vocal features in the model led to lower AUC values, suggesting that the original model featuring 14 vocal features was not overfitted. Furthermore, the 95% AUC CIs for the binary logistic regression did not overlap those for the other 2 methods, confirming that the binary logistic regression model produced superior results.</p><p>Empathy is an important component of human interactions and social connections that promotes general well-being [<xref ref-type="bibr" rid="ref42">42</xref>]. It is an essential component of MH care support, required to enhance therapeutic alliance and rapport building. Empathy embodies the ability to understand and compassionately reflect the range of feelings and experiences communicated by others. Empathic communication relies upon verbal (words and vocal features) and nonverbal means such as body language and facial expression [<xref ref-type="bibr" rid="ref43">43</xref>]. However, it is only verbal expressions of empathy through vocal features that are the subject of this study.</p><p>The detection of empathy is traditionally based upon subjective human perception captured using standardized empathy scales or questionnaires [<xref ref-type="bibr" rid="ref44">44</xref>]. Research on detecting empathic speech through voice feature extraction has been the focus of at least 2 recent studies, both showing similar accuracies to what we have found. 
For example, the first study by Chen et al [<xref ref-type="bibr" rid="ref45">45</xref>] analyzed the acoustic prosodic features of speech recorded in YouTube (Google LLC) videos used for empathy training purposes. The use of these features resulted in classification accuracies of 59% (<italic>F</italic><sub>1</sub>-score) when differentiating between empathy and neutral categories. A second study by Alam et al [<xref ref-type="bibr" rid="ref46">46</xref>] explored acoustic and lexical features of empathic speech using annotations sourced from Italian call center conversations. In this study, classification using support vector machines yielded accuracies of 68.1%.</p><p>However, our study is unique in identifying a range of vocal features that identify the level of empathy using an ecologically valid dataset of counselor-caller conversations obtained from On the Line Australia, an Australian helpline service. Machine learning techniques are involved in both vocal feature selection and empathy classification. The 2 annotators chosen for empathy labeling purposes had a psychological background to strengthen the validity of this process. This led to the development of an algorithm that detects the human vocal features that are associated with empathic speech. This algorithm has the potential to enhance the training of counselors in the use of empathic speech and offers valuable insights into effective human communication in the MH care domain.</p></sec><sec id="s4-2"><title>Call Context and Counselor Empathy in a Crisis Helpline Setting</title><p>The analysis revealed that there is a strong negative correlation between the final distress level of the caller and the counselor&#x2019;s level of empathy, suggesting that empathic communication with a caller can lower their level of distress by the end of the call. Higher levels of empathy allow the counselors to build rapport and trust with their patients, allowing for effective emotional support during a crisis. 
These findings align with the existing literature about the benefits of empathic interactions. The suicide risk of the caller was positively related to counselor empathy. This is an indication that counselors effectively recognize crisis situations, exhibiting higher levels of empathy when speaking to callers with high risks of suicide. It also confirms that the level of empathy displayed by counselors is adapted to the situation of the caller. These relationships of counselor empathy with caller distress and suicide risk confirm the importance of counselor empathy in the context of crisis helpline services.</p></sec><sec id="s4-3"><title>Characteristics of Empathic Vocals</title><p>This study has identified several vocal features associated with empathy. The depth of amplitude in speech reflects varying levels of loudness, emphasizing the expressiveness and dynamic nature of the human vocals apparent in empathic speech. A stable, more consistent emotional delivery during speech (purity of amplitude) also helps to convey empathy. Quieter vocals (amplitude and lower tonal frequency or dominant frequency) are also associated with higher empathy.</p><p>Higher first formant frequencies are associated with the &#x201C;a&#x201D; vowel sound, which corresponds with high ratings of empathy. Additionally, a higher harmonic-to-noise ratio, indicating greater clarity and more pleasant-sounding vocals, is also associated with greater empathy. The spectral slope has a strong positive relationship to empathy, while the spectral centroid shows the opposite relationship. This indicates that a lower spectral centroid, with more low-frequency components, makes a speaker sound more empathic.</p><p>Based on these findings, it is evident that empathy in vocals is provided by a combination of multiple human vocal features, and variations in each of the features exert a different impact on empathy. 
In particular, the way that a specific threshold of loudness in the vocals decides the delivery of perceived empathy in the context of effective counseling provides compelling evidence. This further suggests that the right balance of each of these vocal features is needed, where stability, energy, and clarity play a pivotal role.</p><p>However, the study of empathy in vocals is a complex topic and has challenges. In particular, the subjective nature of empathy perception is an area that requires further study. This study relied on empathy ratings provided by 2 psychology-trained female raters of English heritage. Different results may have been obtained if raters with different cultural, social, and educational backgrounds had been included [<xref ref-type="bibr" rid="ref47">47</xref>].</p></sec><sec id="s4-4"><title>Vocal Feature Extraction and Empathy Classification</title><p>Three methods were used to study the association between empathy and relevant vocal features. A GAMM, a random forest, and a logistic regression approach with splines were fitted and compared using AUC values and using the Leave One Caller Out Cross Validation method for evaluating these classifiers. The accuracy of the empathy level classifications achieved was similar (75%, 74%, and 69%, respectively) when the Youden index was used to choose the probability cut point, as were their AUC values (0.605, 0.600, and 0.617, respectively). The GAMM and binary logistic regression with splines required significantly more computational time compared to the random forest, which used 100 trees. Despite the significant improvement in AUC value with the logistic regression with splines in comparison to the random forest model, the AUC results do not seem to be dissimilar. However, the AUC 95% CIs for the binary logistic regression do not overlap with those for the other 2 methods, confirming that the logistic regression model provides a better fit to the data. 
This was further validated by the DeLong test results showing significant differences in AUC values of the binary logistic model compared to the other 2 methods. The partial overlap between the AUC CIs of the random forest and binary GAMMs indicates no significant difference in model fit between these 2 methods. Therefore, the binary logistic classification model outperforms the other 2 methods in its ability to distinguish high-empathic speech from low-empathic speech. However, the slightly lower classification accuracy (69%) for this model suggests that the Youden method used to determine the probability cut point used for classification purposes may not be optimal.</p><p>These relatively low AUC values, also seen in other related research projects, can perhaps be partly attributed to the difficulties encountered in providing accurate ratings of empathy. The algorithms developed for detecting empathy from vocal features were reliant on the quality of the input data provided for the empathy ratings. A larger sample of raters and a larger sample of calls might have produced more reliable data, and this is recommended for future research in this area.</p><p>However, the multimodal approaches commonly used for empathy recognition through the use of words, vocals, visual signs, and psychological signals reflect the multifaceted nature of empathy [<xref ref-type="bibr" rid="ref48">48</xref>]. The importance of facial expressions for recognizing empathy is particularly emphasized in this literature, where factors such as observation time and the type of emotion expressed significantly influence the accuracy of identification [<xref ref-type="bibr" rid="ref49">49</xref>]. 
These multimodal approaches suggest that a higher accuracy in detecting empathy can be achieved when all these factors are collectively considered, rather than vocal approaches on their own.</p></sec><sec id="s4-5"><title>Limitations</title><p>The inherent differences in empathy perception among the annotators were a concern in this study. An additional analysis was conducted to explore this further, incorporating a third annotator without psychological expertise and from a different cultural background (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). The findings from this analysis produced an even lower model performance, confirming that perceptions of empathy do vary between individuals and cultures. This analysis included both female and nonfemale counselor vocals, which may also have contributed to this poorer performance. While this finding underscores the complexity of recognizing empathy, it also highlights how cultural differences, personal experience, and psychological knowledge of individuals contribute to subjective perceptions. Therefore, future research would benefit from accommodating these differences in empathy perception within the model by including multiple annotators with varied backgrounds.</p><p>Another limitation of this study was due to the very small number of available male counselor recordings (n=13). The main analysis could therefore only be conducted on female vocals, limiting the generalization of our findings. Ideally, it would have been possible to develop separate models for empathy in male and female counselors and to test whether there were significant differences between these models. Larger samples of counselor vocals would also have been preferable, providing more diversity in the data. Unfortunately, only a very small sample of male counselor calls was obtained. 
A recent study that focused on the vocal characteristics of distressed adults using machine learning techniques identified a significant difference between male and female vocal behaviors. This suggested that it would not be appropriate to use a single model to describe the voices of both male and female counselors [<xref ref-type="bibr" rid="ref50">50</xref>]. For this reason, only female counselor voices were considered in this study. Recent research on machine learning applications has explored synthesized data as an avenue to address class imbalance, and this was considered as a possible remedy to address the lower representation of male voices in this study [<xref ref-type="bibr" rid="ref51">51</xref>]. However, using synthetic speech, such as through text-to-speech applications, faces the challenge of reflecting the diversity inherent in natural human voices [<xref ref-type="bibr" rid="ref52">52</xref>]. So, this option was not considered for boosting the number of male counselor calls, so as to make an analysis of these data possible.</p><p>However, this option was used when the algorithm developed in this study was validated on an external synthesized dataset consisting of female recordings generated using a text-to-speech application. These data also showed successful outcomes for the algorithm in differentiating between high and low empathic voices in female voices. These results highlight synthetic voice augmentation as a promising future research direction in machine learning applications. Alternatively, a balanced representation of gender, as well as cultural background, empathy levels, and individual characteristics, is a necessary consideration for the counselor recordings used in future research of this nature.</p><p>A further limitation is the source of the calls used for this study. The context of a suicide helpline service is very specific, and it may be that more algorithmic success would have been possible in a less stressed environment. 
Additionally, for most of the calls, the level of empathy was assessed by a single rater. As mentioned above, it would have been preferable if a greater duplication of ratings could have been used to provide the dependent variable for the models that have been used to identify the level of empathy in counselor vocals. However, this pilot study, using machine learning techniques to identify vocal features related to empathy in female counselor voices, has shown promise. Therefore, further explorations of this approach with increases in the call sample size and a more balanced gender representation, and with more annotators to allow more duplication of ratings and more variation in annotator background while also including calls from more than 1 helpline service, will benefit future research on this topic.</p></sec><sec id="s4-6"><title>Implications of This Study</title><p>The importance of empathy in reducing the distress of callers confirms the need for the incorporation of empathic communication skills in training programs for counselors. Additionally, a statistically significant positive correlation was found between suicide risk and counselor empathy (<italic>P</italic>&#x003C;.05). This suggests that counselors tend to be more empathic toward high-risk callers. This perhaps highlights the need for counselors to adhere to a more caller-centered approach, ensuring that empathy is consistently exhibited for both low-risk and high-risk callers. Resources should perhaps be allocated equally for all callers rather than having a crisis intervention strategy that is tailored to prioritize callers needing emergency support.</p><p>To the best of the authors&#x2019; knowledge, this is the first study that has identified the unique features of human vocals that are associated with the communication of empathy in a MH care setting. The results of this study have implications for the training of counselors and psychologists working for MH-related telephone helpline services. 
Additionally, these findings can serve as a training resource for MH professionals more broadly, enhancing the quality of care provided. The engineering of empathic chatbots, especially within a triage capacity, is another significant area of research that would benefit from the findings of this study [<xref ref-type="bibr" rid="ref53">53</xref>].</p><p>However, collecting the vocal data of individuals for research purposes raises important ethical concerns. This research needs to prioritize user consent and caller privacy. It is recommended that people with lived experience in telephone counseling and MH be asked to assist with the co-design and coproduction of such research to ensure that any resulting training programs or monitoring systems are acceptable to users and meet consumer needs.</p></sec></sec></body><back><ack><p>This research was funded by Swinburne University of Technology. The data call recordings were provided by On The Line Australia who was the main research partner of this study. We acknowledge Dr Jakqui Barnfield for her support from On The Line, Australia. We also thank Inge Gnatt, BPsych (Hons), and Sarah Dunning, BPsych, for their contributions as research annotators and Dr Maja Nedeljkovic for her supervision of the annotation process.</p></ack><notes><sec><title>Data Availability</title><p>All data analyzed in this study are included in summary form under Multimedia Appendices. Our ethics approval does not allow the reporting of individual call data.</p></sec></notes><fn-group><fn fn-type="con"><p>RS developed the first draft of this study's manuscript and conducted a voice analysis, including the creation of the algorithm. RI contributed to developing the algorithm for detecting voice features that relate to empathy. RS contributed to the call annotation process, where variations in counselor empathy within the calls were identified. 
DM and RI contributed their supervision during the entire annotation process, confirming the well-being of the researchers involved. DM, RI, PA, and NW were involved in this study's manuscript revisions.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AEL</term><def><p>Active-Empathic Listening</p></def></def-item><def-item><term id="abb2">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb3">C-SSRS</term><def><p>Columbia Suicide Severity Rating Scale</p></def></def-item><def-item><term id="abb4">CTE</term><def><p>Carkhuff and Truax Empathy</p></def></def-item><def-item><term id="abb5">GAMM</term><def><p>generalized additive mixed model</p></def></def-item><def-item><term id="abb6">LASSO</term><def><p>Least Absolute Shrinkage and Selection Operator</p></def></def-item><def-item><term id="abb7">MH</term><def><p>mental health</p></def></def-item><def-item><term id="abb8">PEI</term><def><p>Perceived Emotional Intelligence</p></def></def-item><def-item><term id="abb9">RS7</term><def><p>Rating Scale 7</p></def></def-item><def-item><term id="abb10">TRIPOD+AI</term><def><p>Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis+Artificial Intelligence</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>A</given-names> </name></person-group><article-title>Cognitive empathy and emotional empathy in human behavior and evolution</article-title><source>Psychol Rec</source><year>2006</year><month>01</month><volume>56</volume><issue>1</issue><fpage>3</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.1007/BF03395534</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation 
citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hojat</surname><given-names>M</given-names> </name></person-group><source>Empathy in Health Professions Education and Patient Care</source><year>2016</year><publisher-name>Springer Cham</publisher-name><pub-id pub-id-type="other">9783319801896</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Derksen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Bensing</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lagro-Janssen</surname><given-names>A</given-names> </name></person-group><article-title>Effectiveness of empathy in general practice: a systematic review</article-title><source>Br J Gen Pract</source><year>2013</year><month>01</month><volume>63</volume><issue>606</issue><fpage>e76</fpage><lpage>84</lpage><pub-id pub-id-type="doi">10.3399/bjgp13X660814</pub-id><pub-id pub-id-type="medline">23336477</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sturzu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lala</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bisch</surname><given-names>M</given-names> </name><name name-style="western"><surname>Guitter</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dobre</surname><given-names>D</given-names> </name><name name-style="western"><surname>Schwan</surname><given-names>R</given-names> </name></person-group><article-title>Empathy and burnout - a cross-sectional study among mental healthcare providers in France</article-title><source>J Med 
Life</source><year>2019</year><volume>12</volume><issue>1</issue><fpage>21</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.25122/jml-2018-0050</pub-id><pub-id pub-id-type="medline">31123521</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peppou</surname><given-names>LE</given-names> </name><name name-style="western"><surname>Economou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Skali</surname><given-names>T</given-names> </name><name name-style="western"><surname>Papageorgiou</surname><given-names>C</given-names> </name></person-group><article-title>From economic crisis to the COVID-19 pandemic crisis: evidence from a mental health helpline in Greece</article-title><source>Eur Arch Psychiatry Clin Neurosci</source><year>2021</year><month>03</month><volume>271</volume><issue>2</issue><fpage>407</fpage><lpage>409</lpage><pub-id pub-id-type="doi">10.1007/s00406-020-01165-4</pub-id><pub-id pub-id-type="medline">32666279</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Betancourt</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Rosenberg</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Zevallos</surname><given-names>A</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Mileski</surname><given-names>M</given-names> </name></person-group><article-title>The impact of COVID-19 on telemedicine utilization across multiple service lines in the United States</article-title><source>Healthcare (Basel)</source><year>2020</year><month>10</month><day>1</day><volume>8</volume><issue>4</issue><fpage>380</fpage><pub-id 
pub-id-type="doi">10.3390/healthcare8040380</pub-id><pub-id pub-id-type="medline">33019667</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pavlova</surname><given-names>A</given-names> </name><name name-style="western"><surname>Scarth</surname><given-names>B</given-names> </name><name name-style="western"><surname>Witt</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hetrick</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fortune</surname><given-names>S</given-names> </name></person-group><article-title>COVID-19 related innovation in Aotearoa/New Zealand mental health helplines and telehealth providers - mapping solutions and discussing sustainability from the perspective of service providers</article-title><source>Front Psychiatry</source><year>2022</year><volume>13</volume><fpage>973261</fpage><pub-id pub-id-type="doi">10.3389/fpsyt.2022.973261</pub-id><pub-id pub-id-type="medline">36111308</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gibbons</surname><given-names>SB</given-names> </name></person-group><article-title>Understanding empathy as a complex construct: a review of the literature</article-title><source>Clin Soc Work J</source><year>2011</year><month>09</month><volume>39</volume><issue>3</issue><fpage>243</fpage><lpage>252</lpage><pub-id pub-id-type="doi">10.1007/s10615-010-0305-2</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Angus</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kagan</surname><given-names>F</given-names> </name></person-group><article-title>Empathic relational 
bonds and personal agency in psychotherapy: implications for psychotherapy supervision, practice, and research</article-title><source>Psychotherapy (Chic)</source><year>2007</year><month>12</month><volume>44</volume><issue>4</issue><fpage>371</fpage><lpage>377</lpage><pub-id pub-id-type="doi">10.1037/0033-3204.44.4.371</pub-id><pub-id pub-id-type="medline">22122315</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pugh</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Vetere</surname><given-names>A</given-names> </name></person-group><article-title>Lost in translation: an interpretative phenomenological analysis of mental health professionals&#x2019; experiences of empathy in clinical work with an interpreter</article-title><source>Psychol Psychother</source><year>2009</year><month>09</month><volume>82</volume><issue>Pt 3</issue><fpage>305</fpage><lpage>321</lpage><pub-id pub-id-type="doi">10.1348/147608308X397059</pub-id><pub-id pub-id-type="medline">19208292</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Le</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>CC</given-names> </name><etal/></person-group><article-title>Defining clinical empathy: a grounded theory approach from the perspective of healthcare workers and patients in a multicultural setting</article-title><source>BMJ Open</source><year>2021</year><month>09</month><day>14</day><volume>11</volume><issue>9</issue><fpage>e045224</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2020-045224</pub-id><pub-id pub-id-type="medline">34521657</pub-id></nlm-citation></ref><ref 
id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McHenry</surname><given-names>M</given-names> </name><name name-style="western"><surname>Parker</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Baile</surname><given-names>WF</given-names> </name><name name-style="western"><surname>Lenzi</surname><given-names>R</given-names> </name></person-group><article-title>Voice analysis during bad news discussion in oncology: reduced pitch, decreased speaking rate, and nonverbal communication of empathy</article-title><source>Support Care Cancer</source><year>2012</year><month>05</month><volume>20</volume><issue>5</issue><fpage>1073</fpage><lpage>1078</lpage><pub-id pub-id-type="doi">10.1007/s00520-011-1187-8</pub-id><pub-id pub-id-type="medline">21573770</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gustafsson</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Wahlberg</surname><given-names>AC</given-names> </name></person-group><article-title>The telephone nursing dialogue process: an integrative review</article-title><source>BMC Nurs</source><year>2023</year><volume>22</volume><issue>1</issue><fpage>345</fpage><pub-id pub-id-type="doi">10.1186/s12912-023-01509-0</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><article-title>Mental health atlas 2020: review of the Eastern Mediterranean region</article-title><source>World Health Organization</source><year>2022</year><access-date>2025-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://applications.emro.who.int/docs/9789292740443-eng.pdf?ua=1">https://applications.emro.who.int/docs/9789292740443-eng.pdf?ua=1</ext-link></comment></nlm-citation></ref><ref 
id="ref15"><label>15</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Callejas</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Griol</surname><given-names>D</given-names> </name></person-group><article-title>Conversational agents for mental health and wellbeing</article-title><source>Dialog Systems: A Perspective from Language, Logic and Computation</source><year>2021</year><publisher-name>Springer, Cham</publisher-name><fpage>219</fpage><lpage>244</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-61438-6_11</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pelau</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dabija</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Ene</surname><given-names>I</given-names> </name></person-group><article-title>What makes an AI device human-like? 
The role of interaction quality, empathy and perceived psychological anthropomorphic characteristics in the acceptance of artificial intelligence in the service industry</article-title><source>Comput Human Behav</source><year>2021</year><month>09</month><volume>122</volume><fpage>106855</fpage><pub-id pub-id-type="doi">10.1016/j.chb.2021.106855</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Iyer</surname><given-names>R</given-names> </name><name name-style="western"><surname>Nedeljkovic</surname><given-names>M</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>D</given-names> </name></person-group><article-title>Using voice biomarkers to classify suicide risk in adult telehealth callers: retrospective observational study</article-title><source>JMIR Ment Health</source><year>2022</year><month>08</month><day>15</day><volume>9</volume><issue>8</issue><fpage>e39807</fpage><pub-id pub-id-type="doi">10.2196/39807</pub-id><pub-id pub-id-type="medline">35969444</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Posner</surname><given-names>K</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>GK</given-names> </name><name name-style="western"><surname>Stanley</surname><given-names>B</given-names> </name><etal/></person-group><article-title>The Columbia-Suicide Severity Rating Scale: initial validity and internal consistency findings from three multisite studies with adolescents and adults</article-title><source>Am J Psychiatry</source><year>2011</year><month>12</month><volume>168</volume><issue>12</issue><fpage>1266</fpage><lpage>1277</lpage><pub-id pub-id-type="doi">10.1176/appi.ajp.2011.10111704</pub-id><pub-id 
pub-id-type="medline">22193671</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>RStudio</article-title><source>RStudio IDE (Version 2024.04.2+764)</source><year>2024</year><access-date>2025-04-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://posit.co/products/open-source/rstudio/">https://posit.co/products/open-source/rstudio/</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><article-title>Audacity 3.5.1</article-title><source>Audacity Support</source><year>2024</year><access-date>2025-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://support.audacityteam.org/additional-resources/changelog/older-versions/audacity-3.5/audacity-3.5.1">https://support.audacityteam.org/additional-resources/changelog/older-versions/audacity-3.5/audacity-3.5.1</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heck</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>CS</given-names> </name></person-group><article-title>Differential expression of empathy in a counseling analogue</article-title><source>J Couns Psychol</source><year>1973</year><volume>20</volume><issue>2</issue><fpage>101</fpage><lpage>104</lpage><pub-id pub-id-type="doi">10.1037/h0034171</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mayer</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Salovey</surname><given-names>P</given-names> </name><name name-style="western"><surname>Caruso</surname><given-names>DR</given-names> </name><name 
name-style="western"><surname>Sitarenios</surname><given-names>G</given-names> </name></person-group><article-title>Measuring emotional intelligence with the MSCEIT V2.0</article-title><source>Emotion</source><year>2003</year><month>03</month><volume>3</volume><issue>1</issue><fpage>97</fpage><lpage>105</lpage><pub-id pub-id-type="doi">10.1037/1528-3542.3.1.97</pub-id><pub-id pub-id-type="medline">12899321</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bodie</surname><given-names>GD</given-names> </name></person-group><article-title>The Active-Empathic Listening Scale (AELS): conceptualization and evidence of validity within the interpersonal domain</article-title><source>Commun Q</source><year>2011</year><month>07</month><volume>59</volume><issue>3</issue><fpage>277</fpage><lpage>295</lpage><pub-id pub-id-type="doi">10.1080/01463373.2011.583495</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Joshi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kale</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chandel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pal</surname><given-names>DK</given-names> </name></person-group><article-title>Likert Scale: explored and explained</article-title><source>BJAST</source><year>2015</year><volume>7</volume><issue>4</issue><fpage>396</fpage><lpage>403</lpage><pub-id pub-id-type="doi">10.9734/BJAST/2015/14975</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ownby</surname><given-names>KK</given-names> </name></person-group><article-title>Use of the distress 
thermometer in clinical practice</article-title><source>J Adv Pract Oncol</source><year>2019</year><month>03</month><volume>10</volume><issue>2</issue><fpage>175</fpage><lpage>179</lpage><pub-id pub-id-type="medline">31538028</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bikker</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Fitzpatrick</surname><given-names>B</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mercer</surname><given-names>SW</given-names> </name></person-group><article-title>Measuring empathic, person-centred communication in primary care nurses: validity and reliability of the Consultation and Relational Empathy (CARE) measure</article-title><source>BMC Fam Pract</source><year>2015</year><month>10</month><day>23</day><volume>16</volume><fpage>149</fpage><pub-id pub-id-type="doi">10.1186/s12875-015-0374-y</pub-id><pub-id pub-id-type="medline">26493072</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anikin</surname><given-names>A</given-names> </name></person-group><article-title>Soundgen: an open-source tool for synthesizing nonverbal vocalizations</article-title><source>Behav Res Methods</source><year>2019</year><month>04</month><volume>51</volume><issue>2</issue><fpage>778</fpage><lpage>792</lpage><pub-id pub-id-type="doi">10.3758/s13428-018-1095-7</pub-id><pub-id pub-id-type="medline">30054898</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Casale</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Russo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Scebba</surname><given-names>G</given-names> </name><name name-style="western"><surname>Serrano</surname><given-names>S</given-names> </name></person-group><article-title>Speech emotion classification using machine learning algorithms</article-title><conf-name>Proceedings of the 2nd IEEE International Conference on Semantic Computing (ICSC 2008)</conf-name><conf-date>Aug 4-7, 2008</conf-date><conf-loc>Santa Monica, CA</conf-loc><pub-id pub-id-type="doi">10.1109/ICSC.2008.43</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rasmussen</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Bro</surname><given-names>R</given-names> </name></person-group><article-title>A tutorial on the LASSO approach to sparse modeling</article-title><source>Chemometr Intell Lab Syst</source><year>2012</year><month>10</month><volume>119</volume><fpage>21</fpage><lpage>31</lpage><pub-id pub-id-type="doi">10.1016/j.chemolab.2012.10.003</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>S&#x00F3;skuthy</surname><given-names>M</given-names> </name></person-group><article-title>Generalised additive mixed models for dynamic analysis in linguistics: a practical introduction</article-title><source>arXiv</source><comment>Preprint posted online on Mar 15, 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1703.05339</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wood</surname><given-names>SN</given-names> </name><name 
name-style="western"><surname>Pya</surname><given-names>N</given-names> </name><name name-style="western"><surname>S&#x00E4;fken</surname><given-names>B</given-names> </name></person-group><article-title>Smoothing parameter and model selection for general smooth models</article-title><source>J Am Stat Assoc</source><year>2016</year><month>10</month><day>1</day><volume>111</volume><issue>516</issue><fpage>1548</fpage><lpage>1563</lpage><pub-id pub-id-type="doi">10.1080/01621459.2016.1180986</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wood</surname><given-names>SN</given-names> </name></person-group><article-title>Fast stable restricted maximum likelihood and marginal likelihood estimation of semiparametric generalized linear models</article-title><source>J R Stat Soc Series B Stat Methodol</source><year>2011</year><month>01</month><day>1</day><volume>73</volume><issue>1</issue><fpage>3</fpage><lpage>36</lpage><pub-id pub-id-type="doi">10.1111/j.1467-9868.2010.00749.x</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noroozi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Sapi&#x0144;ski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kami&#x0144;ska</surname><given-names>D</given-names> </name><name name-style="western"><surname>Anbarjafari</surname><given-names>G</given-names> </name></person-group><article-title>Vocal-based emotion recognition using random forests and decision tree</article-title><source>Int J Speech Technol</source><year>2017</year><month>06</month><volume>20</volume><issue>2</issue><fpage>239</fpage><lpage>246</lpage><pub-id pub-id-type="doi">10.1007/s10772-017-9396-2</pub-id></nlm-citation></ref><ref 
id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>JT</given-names> </name></person-group><article-title>Applying random forest classification to diagnose autism using acoustical voice-quality parameters during lexical tone production</article-title><source>Biomed Signal Process Control</source><year>2022</year><month>08</month><volume>77</volume><fpage>103811</fpage><pub-id pub-id-type="doi">10.1016/j.bspc.2022.103811</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raahul</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sapthagiri</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pankaj</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vijayarajan</surname><given-names>V</given-names> </name></person-group><article-title>Voice based gender classification using machine learning</article-title><source>IOP Conf Ser Mater Sci Eng</source><year>2017</year><month>11</month><day>1</day><volume>263</volume><fpage>042083</fpage><pub-id pub-id-type="doi">10.1088/1757-899X/263/4/042083</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacob</surname><given-names>A</given-names> </name></person-group><article-title>Modelling speech emotion recognition using logistic regression and decision trees</article-title><source>Int J Speech 
Technol</source><year>2017</year><month>12</month><volume>20</volume><issue>4</issue><fpage>897</fpage><lpage>905</lpage><pub-id pub-id-type="doi">10.1007/s10772-017-9457-6</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tafiadis</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kosma</surname><given-names>EI</given-names> </name><name name-style="western"><surname>Chronopoulos</surname><given-names>SK</given-names> </name><etal/></person-group><article-title>Voice handicap index and interpretation of the cutoff points using receiver operating characteristic curve as screening for young adult female smokers</article-title><source>J Voice</source><year>2018</year><month>01</month><volume>32</volume><issue>1</issue><fpage>64</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1016/j.jvoice.2017.03.009</pub-id><pub-id pub-id-type="medline">28392085</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rice</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>GT</given-names> </name></person-group><article-title>What does it mean when age is related to recidivism among sex offenders?</article-title><source>Law Hum Behav</source><year>2014</year><month>04</month><volume>38</volume><issue>2</issue><fpage>151</fpage><lpage>161</lpage><pub-id pub-id-type="doi">10.1037/lhb0000052</pub-id><pub-id pub-id-type="medline">23876093</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Uloza</surname><given-names>V</given-names> </name><name name-style="western"><surname>Latoszek</surname><given-names>BBV</given-names> 
</name><name name-style="western"><surname>Ulozaite-Staniene</surname><given-names>N</given-names> </name><name name-style="western"><surname>Petrauskas</surname><given-names>T</given-names> </name><name name-style="western"><surname>Maryn</surname><given-names>Y</given-names> </name></person-group><article-title>A comparison of Dysphonia Severity Index and Acoustic Voice Quality Index measures in differentiating normal and dysphonic voices</article-title><source>Eur Arch Otorhinolaryngol</source><year>2018</year><month>04</month><volume>275</volume><issue>4</issue><fpage>949</fpage><lpage>958</lpage><pub-id pub-id-type="doi">10.1007/s00405-018-4903-x</pub-id><pub-id pub-id-type="medline">29442165</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carter</surname><given-names>JV</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rai</surname><given-names>SN</given-names> </name><name name-style="western"><surname>Galandiuk</surname><given-names>S</given-names> </name></person-group><article-title>ROC-ing along: evaluation and interpretation of receiver operating characteristic curves</article-title><source>Surgery</source><year>2016</year><month>06</month><volume>159</volume><issue>6</issue><fpage>1638</fpage><lpage>1645</lpage><pub-id pub-id-type="doi">10.1016/j.surg.2015.12.029</pub-id><pub-id pub-id-type="medline">26962006</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Guizzetti</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Shu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>G</given-names> </name></person-group><article-title>Extending the DeLong algorithm for comparing areas under correlated receiver operating characteristic curves with missing data</article-title><source>Stat Med</source><year>2024</year><month>09</month><day>20</day><volume>43</volume><issue>21</issue><fpage>4148</fpage><lpage>4162</lpage><pub-id pub-id-type="doi">10.1002/sim.10172</pub-id><pub-id pub-id-type="medline">39013403</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gerdes</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Segal</surname><given-names>E</given-names> </name></person-group><article-title>Importance of empathy for social work practice: integrating new science</article-title><source>Soc Work</source><year>2011</year><month>04</month><volume>56</volume><issue>2</issue><fpage>141</fpage><lpage>148</lpage><pub-id pub-id-type="doi">10.1093/sw/56.2.141</pub-id><pub-id pub-id-type="medline">21553577</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Regenbogen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Schneider</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Finkelmeyer</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The differential contribution of facial expressions, prosody, and speech content to empathy</article-title><source>Cogn 
Emot</source><year>2012</year><month>09</month><volume>26</volume><issue>6</issue><fpage>995</fpage><lpage>1014</lpage><pub-id pub-id-type="doi">10.1080/02699931.2011.631296</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Concannon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tomalin</surname><given-names>M</given-names> </name></person-group><article-title>Measuring perceived empathy in dialogue systems</article-title><source>AI Soc</source><year>2024</year><month>10</month><volume>39</volume><issue>5</issue><fpage>2233</fpage><lpage>2247</lpage><pub-id pub-id-type="doi">10.1007/s00146-023-01715-z</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kulkarni</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Detecting empathy in speech</article-title><access-date>2025-04-09</access-date><conf-name>Interspeech 2024</conf-name><conf-date>Sep 1-5, 2024</conf-date><conf-loc>Kos, Greece</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://www.isca-archive.org/interspeech_2024">https://www.isca-archive.org/interspeech_2024</ext-link></comment><pub-id pub-id-type="doi">10.21437/Interspeech.2024-347</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alam</surname><given-names>F</given-names> </name><name name-style="western"><surname>Danieli</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Riccardi</surname><given-names>G</given-names> </name></person-group><article-title>Annotating and modeling empathy in spoken conversations</article-title><source>Comput Speech Lang</source><year>2018</year><month>07</month><volume>50</volume><fpage>40</fpage><lpage>61</lpage><pub-id pub-id-type="doi">10.1016/j.csl.2017.12.003</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Neumann</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Cao</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Culture-sex interaction and the self-report empathy in Australians and mainland Chinese</article-title><source>Front Psychol</source><year>2019</year><volume>10</volume><fpage>396</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2019.00396</pub-id><pub-id pub-id-type="medline">30914986</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Hasan</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Hossain</surname><given-names>MZ</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Krishna</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gedeon</surname><given-names>T</given-names> </name></person-group><article-title>Empathy detection from text, audiovisual, audio or physiological signals: task formulations and machine learning methods</article-title><source>arXiv</source><comment>Preprint posted online on Dec 17, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.00721</pub-id></nlm-citation></ref><ref 
id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Besel</surname><given-names>LDS</given-names> </name><name name-style="western"><surname>Yuille</surname><given-names>JC</given-names> </name></person-group><article-title>Individual differences in empathy: the role of facial expression recognition</article-title><source>Pers Individ Dif</source><year>2010</year><month>07</month><volume>49</volume><issue>2</issue><fpage>107</fpage><lpage>112</lpage><pub-id pub-id-type="doi">10.1016/j.paid.2010.03.013</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Iyer</surname><given-names>R</given-names> </name><name name-style="western"><surname>Nedeljkovic</surname><given-names>M</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>D</given-names> </name></person-group><article-title>Using vocal characteristics to classify psychological distress in adult helpline callers: retrospective observational study</article-title><source>JMIR Form Res</source><year>2022</year><month>12</month><day>19</day><volume>6</volume><issue>12</issue><fpage>e42249</fpage><pub-id pub-id-type="doi">10.2196/42249</pub-id><pub-id pub-id-type="medline">36534456</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Th&#x00F6;lke</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mantilla-Ramos</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Abdelhedi</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Class imbalance should not throw you off balance: choosing the right classifiers and performance metrics for brain decoding with imbalanced 
data</article-title><source>Neuroimage</source><year>2023</year><month>08</month><day>15</day><volume>277</volume><fpage>120253</fpage><pub-id pub-id-type="doi">10.1016/j.neuroimage.2023.120253</pub-id><pub-id pub-id-type="medline">37385392</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Minixhofer</surname><given-names>C</given-names> </name><name name-style="western"><surname>Klejch</surname><given-names>O</given-names> </name><name name-style="western"><surname>Bell</surname><given-names>P</given-names> </name></person-group><article-title>Evaluating and reducing the distance between synthetic and real speech distributions</article-title><year>2023</year><access-date>2025-03-27</access-date><conf-name>Interspeech 2023</conf-name><conf-date>Aug 20-24, 2023</conf-date><conf-loc>Dublin, Ireland</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://www.isca-archive.org/interspeech_2023/">https://www.isca-archive.org/interspeech_2023/</ext-link></comment><pub-id pub-id-type="doi">10.21437/Interspeech.2023-1978</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sanjeewa</surname><given-names>R</given-names> </name><name name-style="western"><surname>Iyer</surname><given-names>R</given-names> </name><name name-style="western"><surname>Apputhurai</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wickramasinghe</surname><given-names>N</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>D</given-names> </name></person-group><article-title>Empathic conversational agent platform designs and their evaluation in the context of mental health: systematic review</article-title><source>JMIR Ment 
Health</source><year>2024</year><month>09</month><day>9</day><volume>11</volume><fpage>e58974</fpage><pub-id pub-id-type="doi">10.2196/58974</pub-id><pub-id pub-id-type="medline">39250799</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Online questionnaires used for data collection.</p><media xlink:href="formative_v9i1e67835_app1.docx" xlink:title="DOCX File, 30 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Descriptive statistics of the analysis.</p><media xlink:href="formative_v9i1e67835_app2.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Results of consistency between the raters.</p><media xlink:href="formative_v9i1e67835_app3.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>The results of LASSO regression. LASSO: Least Absolute Shrinkage and Selection Operator.</p><media xlink:href="formative_v9i1e67835_app4.docx" xlink:title="DOCX File, 23 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Detailed results of the significance of the vocal features through GAMM, random forest, and binary logistic regression models. 
GAMM: Generalized Additive Mixed Model.</p><media xlink:href="formative_v9i1e67835_app5.docx" xlink:title="DOCX File, 28 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Impact of expanded dataset and rater diversity on voice analysis results.</p><media xlink:href="formative_v9i1e67835_app6.docx" xlink:title="DOCX File, 29 KB"/></supplementary-material><supplementary-material id="app7"><label>Checklist 1</label><p>TRIPOD+AI (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis+Artificial Intelligence) statement for reporting the modeling process.</p><media xlink:href="formative_v9i1e67835_app7.pdf" xlink:title="PDF File, 620 KB"/></supplementary-material></app-group></back></article>