<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e84110</article-id><article-id pub-id-type="doi">10.2196/84110</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Multimodal Depression Detection Through Conversational Interactions with an Emotion-Aware Social Robot: Pilot Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Liao</surname><given-names>Pu-Yu</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Su</surname><given-names>Yu-Quan</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Qian</surname><given-names>Xiaobei</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chang</surname><given-names>Yu-Ling</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Yun-Hsiang</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Fu</surname><given-names>Li-Chen</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Graduate Institute of Networking and Multimedia, National Taiwan University</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff2"><institution>Department of Computer Science &#x0026; Information Engineering, National Taiwan University</institution><addr-line>No.1, Sec. 4, Roosevelt Road</addr-line><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff3"><institution>Department of Psychology, National Taiwan University</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff4"><institution>Department of Nursing, National Taiwan University</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Schwartz</surname><given-names>Amy</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Balcarras</surname><given-names>Matthew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Li</surname><given-names>Bobo</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Borovic</surname><given-names>Mladen</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Li-Chen Fu, PhD, Department of Computer Science &#x0026; Information Engineering, National Taiwan University, No.1, Sec. 
4, Roosevelt Road, Taipei, 106319, Taiwan, 886 2 3366 3558; <email>lichen@ntu.edu.tw</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>27</day><month>4</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e84110</elocation-id><history><date date-type="received"><day>15</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>27</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>27</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Pu-Yu Liao, Yu-Quan Su, Xiaobei Qian, Yu-Ling Chang, Yun-Hsiang Lee, Li-Chen Fu. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 27.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e84110"/><abstract><sec><title>Background</title><p>Depression affects more than 300 million people worldwide and is a leading contributor to the global disease burden. 
Traditional diagnostic methods, such as structured clinical interviews, are reliable but impractical for frequent or large-scale screening. Self-report tools like the Patient Health Questionnaire-8 (PHQ-8) require disclosure and clinician oversight, limiting accessibility. Recent artificial intelligence&#x2013;based approaches leverage multimodal behavioral cues (linguistic, acoustic, and visual) for automated depression detection but remain constrained by limited adaptability, scarce annotated data, weak emotional expression in real-world settings, and the high computational cost of deployment of socially assistive robots (SARs).</p></sec><sec><title>Objective</title><p>This study introduces Depression Social Assistant Robot (DEPRESAR)-Fusion, a lightweight multimodal depression detection framework designed for natural interactions with emotion-aware SARs. The objective of this study was to enhance detection accuracy in everyday conversations while addressing the challenges of data scarcity, weak emotional cues, and computational efficiency.</p></sec><sec sec-type="methods"><title>Methods</title><p>DEPRESAR-Fusion integrates acoustic, linguistic, and visual features with an emotion-aware response module powered by large language models to adapt conversational strategies dynamically. To stimulate richer emotional expression, participants were exposed to emotionally evocative videos before SAR interactions. To overcome data scarcity, we augmented training with (1) public depression-related social media corpora and (2) synthetic samples generated via large language models. The proposed multimodal fusion architecture was evaluated on benchmark clinical datasets for both binary depression classification and PHQ-8 regression tasks. 
Performance was compared against prior multimodal baselines using root mean square error, mean absolute error, and standard classification metrics.</p></sec><sec sec-type="results"><title>Results</title><p>Participants who viewed emotional stimuli before interacting with SARs exhibited significantly higher emotional expressiveness, leading to improved model performance. Regression tasks showed lower root mean square error and mean absolute error, while classification tasks achieved significantly higher accuracy than the nonstimulus condition. DEPRESAR-Fusion outperformed prior multimodal baselines across multiple benchmark datasets, achieving state-of-the-art performance in both binary classification and PHQ-8 regression. The system maintained a lightweight architecture suitable for real-time deployment on SARs.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>DEPRESAR-Fusion demonstrates that integrating emotion induction, data augmentation, and lightweight multimodal fusion can enable accurate and scalable depression detection in naturalistic SAR interactions. By bridging the gap between structured clinical assessments and everyday conversations, this approach highlights the potential of SAR-based systems as nonintrusive, artificial intelligence&#x2013;driven tools for proactive mental health support.</p></sec></abstract><kwd-group><kwd>depression detection</kwd><kwd>socially assistive robot</kwd><kwd>SAR</kwd><kwd>multimodal fusion</kwd><kwd>synthetic data generation</kwd><kwd>emotion induction</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Depression is a pervasive mental health disorder affecting over 300 million people worldwide and consistently ranks among the leading contributors to the global disease burden [<xref ref-type="bibr" rid="ref1">1</xref>]. 
Beyond its psychological toll, depression adversely impacts behavior and physical health, increasing the risk of cardiovascular diseases and other chronic conditions [<xref ref-type="bibr" rid="ref2">2</xref>]. Early detection and timely intervention are crucial to improving outcomes; however, many individuals avoid professional help due to stigma, shame, or misconceptions [<xref ref-type="bibr" rid="ref3">3</xref>], leading to delayed treatment and a higher suicide risk [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Traditional diagnostic methods, such as structured clinical interviews and self-report questionnaires (eg, Patient Health Questionnaire-8, PHQ-8 [<xref ref-type="bibr" rid="ref5">5</xref>]), are reliable but limited in scalability. They rely on disclosure, clinician availability, and structured settings, making them impractical for frequent or large-scale screening. These limitations have motivated research into artificial intelligence (AI)&#x2013;based depression detection, which leverages multimodal behavioral cues&#x2014;including linguistic, acoustic, and visual signals&#x2014;to provide scalable and more objective assessments.</p><p>Although promising, current AI approaches face several challenges. First, most models are trained on structured clinical interviews and generalize poorly to spontaneous conversations, limiting adaptability. Second, high-quality annotated datasets remain scarce, restricting the ability to capture subtle linguistic and emotional cues. Third, emotional signals in natural settings are often weak or suppressed, reducing sensitivity without external stimulation. Finally, many state-of-the-art architectures are computationally heavy and unsuitable for real-time deployment on resource-constrained platforms such as socially assistive robots (SARs).</p><p>Existing research on depression detection primarily relies on 3 modalities: text, audio, and visual. 
Advances in machine learning and deep learning have improved predictive performance; however, challenges related to data scarcity, feature robustness, and model generalizability continue to limit clinical deployment. Although text remains the most informative modality, incorporating acoustic and visual cues provides complementary behavioral signals, which motivates multimodal approaches.</p></sec><sec id="s1-2"><title>Text-Based Depression Detection</title><p>Textual analysis plays a central role in automated depression detection, as linguistic content directly reflects psychological states. Prior studies have leveraged word embeddings and semantic representations to capture depression-related language patterns [<xref ref-type="bibr" rid="ref6">6</xref>], while affective and mental health lexicons have been introduced to enhance emotional interpretability [<xref ref-type="bibr" rid="ref7">7</xref>]. Analyses of clinical interview datasets consistently show that text-based features outperform acoustic and visual modalities in predictive accuracy [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. This advantage arises from both the respondent&#x2019;s verbal expressions and the structured conversational context shaped by interviewer prompts, enabling fine-grained semantic and emotional analysis.</p></sec><sec id="s1-3"><title>Acoustic- and Vision-Based Depression Detection</title><p>Speech-based approaches commonly extract low-level acoustic descriptors, such as Mel-frequency cepstral coefficients (MFCCs) and the extended Geneva Minimalistic Acoustic Parameter Set (eGeMAPS), to capture prosodic and voice-quality cues associated with depression [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Deep learning models, including convolutional neural networks, further learn discriminative speech representations [<xref ref-type="bibr" rid="ref10">10</xref>]. 
However, acoustic features are sensitive to speaker variability, background noise, and contextual factors, which hinder generalization.</p><p>Visual cues, including facial expressions, gaze patterns, and action units (AUs), provide additional nonverbal indicators of depressive behavior. While these features capture subtle expression dynamics, their reliability is limited in practice, as individuals with depression often exhibit subdued or inconsistent facial activity influenced by personal traits and environmental conditions.</p></sec><sec id="s1-4"><title>Multimodal Fusion and Clinical Interview Studies</title><p>Multimodal fusion frameworks integrate textual, acoustic, and visual signals to enhance robustness and sensitivity. Empirical evidence from clinical interview studies indicates that although speech and visual cues contribute auxiliary information, textual features remain the dominant predictors [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. Interview transcripts encode not only linguistic content but also interaction patterns and contextual flow, making them essential for depression assessment. Nevertheless, most existing multimodal systems rely on structured interview settings, limiting their adaptability to spontaneous, everyday conversations.</p><p>To address weak emotional expression in natural settings, recent work has emphasized emotional reactivity as a key factor in depression [<xref ref-type="bibr" rid="ref11">11</xref>]. Emotion-induction techniques help elicit observable affective responses, enriching behavioral signals and improving model interpretability.</p></sec><sec id="s1-5"><title>Emotion Induction in Depression Research</title><p>Depression involves an interaction between mood&#x2014;a persistent affective state&#x2014;and emotion, which refers to stimulus-driven responses. 
A meta-analysis of laboratory studies on major depressive disorder demonstrated attenuated emotional reactivity to both positive and negative stimuli, such as affective images and videos [<xref ref-type="bibr" rid="ref12">12</xref>]. Emotion-induction methods range from film clips and images to autobiographical recall and immersive environments. Although effective in eliciting transient emotional responses, these approaches often lack ecological validity and are rarely integrated into conversational or cross-cultural AI systems.</p></sec><sec id="s1-6"><title>Positioning Within the Related Work Landscape</title><p>Recent studies have explored multimodal and affective conversational systems. Large language model (LLM)&#x2013;based models [<xref ref-type="bibr" rid="ref13">13</xref>] excel at generating coherent and empathetic dialogue but are typically resource-intensive, lack multimodal perception, and do not incorporate explicit emotion-induction mechanisms. Multimodal fusion frameworks such as Zhang et al [<xref ref-type="bibr" rid="ref14">14</xref>] focus on integrating multiple modalities but do not prioritize lightweight deployment or hierarchical fusion. Hsieh et al [<xref ref-type="bibr" rid="ref15">15</xref>] emphasize efficiency but lack support for natural conversation, while Li et al [<xref ref-type="bibr" rid="ref16">16</xref>] introduce compact multimodal deep fusion architectures without emotion induction or support for free-form dialogue beyond structured interviews.</p><p>In contrast, our framework integrates natural conversational interaction, multimodal perception, hierarchical deep fusion, lightweight deployment, and explicit emotion induction. 
Support for natural conversation enables the analysis of unscripted dialogue; multimodal fusion enhances robustness through complementary signals; lightweight design allows deployment on resource-constrained platforms; deep fusion facilitates cross-modal interaction at multiple levels; and emotion induction actively elicits affective responses, improving detection sensitivity. To position our framework within the existing literature, we compare representative systems across 5 key dimensions: natural conversational support, multimodal fusion, lightweight deployment, hierarchical deep fusion, and emotion induction. As prior work typically focuses on only 1 or 2 of these aspects, <xref ref-type="table" rid="table1">Table 1</xref> provides a consolidated comparison to clarify the distinctive contributions of our approach.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of representative systems in affective and multimodal depression detection.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Support natural conversation</td><td align="left" valign="bottom">Multimodal fusion</td><td align="left" valign="bottom">Lightweight model (&#x003C;1 GB)</td><td align="left" valign="bottom">Deep fusion architecture</td><td align="left" valign="bottom">Emotion induction</td></tr></thead><tbody><tr><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>-based [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Zhang et al [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" 
valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Hsieh et al [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Li et al [<xref ref-type="bibr" rid="ref16">16</xref>]</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Ours</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table1fn2"><p><sup>b</sup>Not available.</p></fn></table-wrap-foot></table-wrap><p>To address these gaps, we propose Depression Social Assistant Robot (DEPRESAR)&#x2013;Fusion, a lightweight multimodal depression detection framework designed for naturalistic interaction with SARs. The framework is built upon 3 primary objectives. First, it enables emotion-aware SAR interactions by supporting spontaneous, everyday conversations as the foundation for depression detection. Second, it incorporates emotion induction and adaptive engagement mechanisms by integrating evocative video stimuli with an LLM-based conversational response module, thereby enhancing emotional expressiveness and contextual sensitivity during dialogue. 
Third, it uses a lightweight multimodal deep fusion architecture that combines linguistic, acoustic, and visual features within a resource-efficient design suitable for real-time deployment on SAR platforms.</p><p>By bridging the gap between structured clinical assessments and natural conversations, DEPRESAR-Fusion advances scalable, nonintrusive, and emotionally aware AI-driven mental health support.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>System Overview</title><p>Our proposed system, DEPRESAR-Fusion, was designed for real-time depression detection through natural user interactions (<xref ref-type="fig" rid="figure1">Figure 1</xref>). It began with an emotion-induction video to encourage spontaneous emotional responses, followed by a casual conversation with an SAR. During the interaction, multimodal data&#x2014;including speech, facial expressions, and text&#x2014;were captured and processed in real time.</p><p>Key features were extracted by a data preprocessing module, which encoded audio (eg, MFCCs and eGeMAPS), visual (eg, gaze, head pose, and facial AUs), and transcribed text data. If the user explicitly ended the conversation, the system proceeded to depression prediction. Otherwise, an LLM-powered, emotion-aware social robot dynamically responded via the emotional support conversation (ESC) framework, enhanced with self-consistency prompting to ensure reliable, context-aware, and empathetic interactions. The extracted multimodal features were then passed to a dual-head depression detection module that jointly performed binary classification and PHQ-8 score regression. 
This architecture enabled simultaneous estimation of depressive presence and severity while maintaining an engaging and supportive user experience.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the Depression Social Assistant Robot&#x2013;Fusion system, integrating emotion induction, multimodal feature extraction, large language model&#x2013;guided empathetic dialogue, and depression prediction. PHQ-8: Patient Health Questionnaire-8.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e84110_fig01.png"/></fig></sec><sec id="s2-2"><title>Datasets and Depression Assessment</title><p>Our experiments were conducted on the Extended Distress Analysis Interview Corpus (E-DAIC), an enhanced version of the Distress Analysis Interview Corpus-Wizard-of-Oz (DAIC-WOZ) dataset, both designed for multimodal depression and anxiety detection from structured clinical interviews. DAIC-WOZ consisted of semistructured interviews conducted via a Wizard-of-Oz setup and included synchronized audio, video, and transcribed speech annotated with psychological assessment scores.</p><p>E-DAIC extended DAIC-WOZ with curated splits and standardized multimodal features for machine learning research. 
It contained 189 interview sessions from 100 participants, providing audio recordings (for acoustic features such as MFCC and eGeMAPS), video recordings (for facial expression and gaze analysis), time-aligned transcripts, and expert-rated PHQ-8 scores as ground truth labels.</p><p>Depression severity was measured using the PHQ-8 [<xref ref-type="bibr" rid="ref5">5</xref>], a validated 8-item instrument ranging from 0 to 24, with scores &#x2265;10 indicating clinically significant depressive symptoms (approximately 88% sensitivity and specificity).</p><p>In this study, E-DAIC served as the primary dataset for model training and evaluation, while the DAIC-WOZ benchmark subset was used to assess generalizability and facilitate comparison with prior work.</p></sec><sec id="s2-3"><title>Data Augmentation</title><p>To enhance model generalization under limited data conditions, we implemented 2 complementary data augmentation strategies involving both text-only and multimodal synthetic data generation.</p><p>First, synthetic text-only interview samples were generated using publicly available social media depression datasets, including Reddit Self-reported Depression Diagnosis and Longitudinal Twitter Dataset of Depression in the COVID Era [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. In this pipeline, user posts were segmented into response-style utterances using spaCy to simulate interview dialogue structure. Each segmented sample was temporally aligned to fixed 3-second intervals, while nontextual modalities were padded to maintain input format consistency. 
Because these datasets provided only binary depression labels, PHQ-8 regression scores were approximated using a thresholding strategy in which depressed samples were assigned a score of 10 and nondepressed samples a score of 0.</p><p>Second, synthetic multimodal samples were generated by paraphrasing and restructuring the original structured interview transcripts from E-DAIC [<xref ref-type="bibr" rid="ref19">19</xref>]. Paraphrasing was performed using ChatGPT (GPT-4o) while preserving the original semantic meaning. Importantly, all paraphrasing was strictly applied only to the training split after dataset partitioning. Test set transcripts were never paraphrased or used in any form during synthetic data generation, ensuring that no semantic leakage occurred.</p><p>Together, these augmentation strategies improved robustness and enabled the model to better accommodate linguistic variability and multimodal diversity encountered in real-world conversational settings.</p></sec><sec id="s2-4"><title>Modality-Specific Processing</title><p>The system captured and processed 3 complementary modalities&#x2014;audio, visual, and text&#x2014;to extract depression-related features. For the audio modality, acoustic features, including MFCCs and eGeMAPS descriptors, were extracted using openSMILE [<xref ref-type="bibr" rid="ref20">20</xref>], enabling the capture of prosodic patterns and voice quality characteristics associated with depressive states.</p><p>For the visual modality, nonverbal behavioral markers, such as gaze direction, head pose, and facial AUs, were obtained using OpenFace [<xref ref-type="bibr" rid="ref21">21</xref>]. 
These features provided cues related to affective expression and social engagement.</p><p>For the textual modality, semantic representations were derived from speech transcriptions using pretrained MiniLM [<xref ref-type="bibr" rid="ref22">22</xref>] and bidirectional encoder representations from transformers [<xref ref-type="bibr" rid="ref23">23</xref>] embeddings. Both word-level and sentence-level embeddings were incorporated to capture contextual semantics and linguistic indicators of depression.</p><p>All extracted features were temporally aligned and encoded to facilitate downstream multimodal fusion and depression assessment.</p></sec><sec id="s2-5"><title>Multimodal Fusion</title><p>To integrate heterogeneous data, the system first normalized and encoded each modality into a shared embedding space. Audio and visual features were transformed using a bag-of-audio-words or bag-of-visual-words framework [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], which mapped variable-length sequences into fixed-size distributional vectors via modality-specific codebooks. These normalized embeddings were then temporally aligned and passed to an attention-based fusion module. We used both cross-attention and self-attention mechanisms [<xref ref-type="bibr" rid="ref26">26</xref>] to model intermodal and intramodal relationships: cross-attention aligned textual features with relevant nonverbal cues, while self-attention captured contextual dependencies within each modality. The fused multimodal representation was then processed by a dual-head architecture (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The classification head output the probability of clinical depression, while the regression head predicted a continuous PHQ-8 score. 
This unified design supported both categorical and continuous interpretations of depression severity, and attention-based fusion enhanced robustness to noise and missing data, improving overall performance. To handle data scarcity in regression targets, we scaled the regression loss.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Architecture of the multimodal depression detection module integrating audio, visual, and textual embeddings through attention-based fusion, supporting both binary classification and Patient Health Questionnaire-8 score regression tasks.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e84110_fig02.png"/></fig><p>Keeping the regression loss <inline-formula><mml:math id="ieqn1"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> (around 15) higher than the classification loss <italic>L<sub>cls</sub></italic> (around 0.5) during training ensured that the model gave adequate learning attention to the regression head. Training setup&#x2014;we minimized a multitask objective combining binary cross-entropy (for classification) and mean squared error (for regression): <italic>L</italic>=<italic>&#x03B1;</italic>&#x00B7;<italic>L<sub>cls</sub></italic>+<italic>&#x03B2;</italic>&#x00B7;<italic>L<sub>reg</sub></italic> with <italic>&#x03B1;</italic>=1.0, <italic>&#x03B2;</italic>=1.0. We used the Adam optimizer (learning rate 10<sup>&#x2013;5</sup>, weight decay 10<sup>&#x2013;4</sup>, batch size 8, and dropout rate 0.1).</p></sec><sec id="s2-6"><title>Emotion-Aware Social Robot</title><sec id="s2-6-1"><title>ESC Framework Overview</title><p>The ESC framework divided the emotional support process into 3 stages [<xref ref-type="bibr" rid="ref27">27</xref>]. 
This ensured that responses aligned with the user&#x2019;s emotional state and the appropriate support strategy. The exploration stage facilitated user expression by prompting open dialogue through questioning, paraphrasing, emotional reflection, and self-disclosure. The comfort stage reinforced empathy and understanding through emotional validation, shared experiences, and supportive affirmations. The action stage focused on resolution strategies through suggestions, practical information, and continued encouragement.</p></sec><sec id="s2-6-2"><title>Stage Selection via Self-Consistency</title><p>Upon receiving user input, the data preprocessing unit transcribed and extracted the last 10 conversational sentences. This context was processed by GPT-4o using a self-consistency method [<xref ref-type="bibr" rid="ref28">28</xref>] with a temperature of 0.7. This method prompted the model to generate multiple reasoning paths for ESC stage classification (exploration, comfort, and action) and selected the most frequent prediction. Self-consistency improved reliability by reducing variance across LLM outputs.</p></sec><sec id="s2-6-3"><title>The Full Pipeline</title><p>The full pipeline includes the following:</p><list list-type="order"><list-item><p>User input: Voice, text, and facial expressions were captured by the robot.</p></list-item><list-item><p>Preprocessing: The server extracted multimodal features&#x2014;audio (eg, MFCCs), visual (eg, AUs), and textual content.</p></list-item><list-item><p>ESC stage selection: GPT-4o was used to classify the interaction into 1 of the 3 ESC stages using self-consistent prompting.</p></list-item><list-item><p>Response generation: Based on the selected stage and strategy, the server generated an empathetic response using GPT-4o at a temperature of 0 for consistency.</p></list-item><list-item><p>Delivery: The response was returned and delivered by the robot via voice and screen. 
This integration ensured dynamic and context-aware emotional support, leveraging both multimodal perception and LLM-driven strategies to foster user engagement and enable real-time, supportive interaction.</p></list-item></list></sec></sec><sec id="s2-7"><title>Deployment Architecture</title><p>DEPRESAR-Fusion operated on a 2-tier architecture: an Android application running on the Kebbi robot by Nuwa Robotics and a remote Windows-based server. This structure balanced real-time interaction with computational efficiency.</p><sec id="s2-7-1"><title>Kebbi Android Application</title><p>The Kebbi robot captured user interactions through voice, facial expressions, and text. It ran the Android front-end, which managed speech recognition, transcriptions, and device expressions. It connected to the Emotion-Aware Social Robot module, with ESC-guided LLM responses generated on the server and spoken aloud by the robot.</p></sec><sec id="s2-7-2"><title>Windows Server Backend</title><p>The core inference pipeline ran on a Windows 10 Enterprise machine (Intel i7-6700, 64GB RAM). It handled preprocessing, multimodal feature extraction, ESC-based reasoning, and depression detection. The server communicated with the robot via an HTTP application programming interface.</p></sec></sec><sec id="s2-8"><title>Experimental Setup and Compared Methods</title><p>We evaluated the proposed multimodal fusion model for depression detection on the E-DAIC dataset, supplemented with synthetic data derived from the Reddit Self-reported Depression Diagnosis and Longitudinal Twitter Dataset of Depression in the COVID Era datasets to enhance generalizability. 
Performance was assessed using both classification metrics (eg, <italic>F</italic><sub>1</sub>-score) and regression metrics (eg, root mean square error [RMSE], mean absolute error [MAE], and concordance correlation coefficient [CCC]) to evaluate binary depression detection and continuous PHQ-8 score prediction, respectively.</p><p>To benchmark our approach, we compared it against a diverse set of representative unimodal and multimodal baselines spanning different architectural paradigms. Text-based approaches included the graph convolutional network proposed by Burdisso et al [<xref ref-type="bibr" rid="ref29">29</xref>], which classified depressive symptoms from interview transcripts, as well as our prior lightweight text-only model (Hsieh et al [<xref ref-type="bibr" rid="ref15">15</xref>]), which leveraged linguistic signals for depression detection.</p><p>Several multimodal deep learning frameworks were also considered. Dham et al [<xref ref-type="bibr" rid="ref30">30</xref>] integrated audio, visual, and textual features for depression severity estimation, while Li et al [<xref ref-type="bibr" rid="ref16">16</xref>] proposed a Flexible Parallel Transformer that incorporated handcrafted audiovisual features for enhanced multimodal fusion. Yin et al [<xref ref-type="bibr" rid="ref31">31</xref>] introduced a hierarchical recurrent neural network designed to capture temporal dependencies across audio, visual, and text modalities, and Zhang et al [<xref ref-type="bibr" rid="ref14">14</xref>] combined speech and linguistic features for PHQ-8 regression. 
Additionally, Lau et al [<xref ref-type="bibr" rid="ref32">32</xref>] developed a parameter-efficient bidirectional long short-term memory with attention, adapted via prefix-tuning for PHQ-8 prediction.</p><p>We further evaluated LLMs, including GPT-4o, o1, and o3-mini (OpenAI), under zero-shot settings for textual inference of depressive symptoms.</p><p>Together, these baselines cover a broad spectrum of architectures and modality configurations, enabling a rigorous and comprehensive comparison with our proposed lightweight multimodal framework.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>This study protocol was approved by the institutional review board of the National Taiwan University Hospital (202105013RINB), and written informed consent was obtained from all participants prior to participation. To protect participant privacy and confidentiality, all collected data were deidentified and securely stored, and access to the data was restricted to authorized members of the research team only. Participants received NT $200 (US $6.29) as compensation for their participation.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Regression Performance</title><p>To evaluate the ability of the models to estimate depression severity as a continuous score, we assessed regression performance using 3 standard metrics: RMSE, MAE, and CCC [<xref ref-type="bibr" rid="ref33">33</xref>]. RMSE captured the square root of the average squared difference between predicted and actual PHQ-8 scores, placing greater emphasis on larger errors. MAE measured the average absolute deviation between predictions and ground truth, providing a more interpretable indication of overall prediction error. 
CCC evaluated both precision and accuracy by assessing the degree to which predicted values conformed to the ground truth in terms of scale and location, thereby offering a more comprehensive measure of agreement.</p><p>As shown in <xref ref-type="table" rid="table2">Table 2</xref>, on the E-DAIC dataset, our proposed model achieved the best RMSE and MAE scores, outperforming all other methods, including those using larger or specialized multimodal architectures. While OpenAI&#x2019;s o3-mini attained the highest CCC, it did so at the cost of significantly more parameters and computational overhead. In contrast, our model offered competitive concordance with minimal complexity.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Regression performance on the Extended Distress Analysis Interview Corpus test split. Lower RMSE<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> or MAE<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> and higher CCC<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> indicate better results.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model (modality)</td><td align="left" valign="bottom">RMSE</td><td align="left" valign="bottom">MAE</td><td align="left" valign="bottom">CCC</td><td align="left" valign="bottom">Parameters</td></tr></thead><tbody><tr><td align="left" valign="top">OpenAI o3-mini (T<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>)</td><td align="left" valign="top">6.1452</td><td align="left" valign="top">4.4909</td><td align="left" valign="top">0.5109</td><td align="left" valign="top">8B (estimated)</td></tr><tr><td align="left" valign="top">Dham et al (A<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup>+V<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>) [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">6.06</td><td align="left" 
valign="top">5.03</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top">5M (estimated)</td></tr><tr><td align="left" valign="top">Li et al (T+A+V) [<xref ref-type="bibr" rid="ref16">16</xref>]</td><td align="left" valign="top">5.520</td><td align="left" valign="top">4.634</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Yin et al (T+A+V) [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">5.50</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.442</td><td align="left" valign="top">5M (estimated)</td></tr><tr><td align="left" valign="top">Zhang et al (T+AV) [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">6.11</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.403</td><td align="left" valign="top">7M (estimated)</td></tr><tr><td align="left" valign="top">Ours</td><td align="left" valign="top">5.4632</td><td align="left" valign="top">4.3481</td><td align="left" valign="top">0.4501</td><td align="left" valign="top">617K</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>RMSE: root mean square error.</p></fn><fn id="table2fn2"><p><sup>b</sup>MAE: mean absolute error.</p></fn><fn id="table2fn3"><p><sup>c</sup>CCC: concordance correlation coefficient.</p></fn><fn id="table2fn4"><p><sup>d</sup>T: text.</p></fn><fn id="table2fn5"><p><sup>e</sup>A: audio.</p></fn><fn id="table2fn6"><p><sup>f</sup>V: visual.</p></fn><fn id="table2fn7"><p><sup>g</sup>Not available.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table3">Table 3</xref> further validates the strength of our method, which achieved state-of-the-art RMSE and MAE results on the DAIC-WOZ benchmark. 
Notably, our model also achieves the highest CCC (0.6603), suggesting that it better aligns with ground-truth depression scores in both magnitude and scale.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Regression performance on the Distress Analysis Interview Corpus test split.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model (modality)</td><td align="left" valign="bottom">RMSE<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">MAE<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">CCC<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="bottom">Parameters</td></tr></thead><tbody><tr><td align="left" valign="top">OpenAI o3-mini (T<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>)</td><td align="left" valign="top">5.1281</td><td align="left" valign="top">3.5745</td><td align="left" valign="top">0.6295</td><td align="left" valign="top">8B (estimated)</td></tr><tr><td align="left" valign="top">Zhang et al (T+A<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup>+V<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup>) [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">4.66</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td><td align="left" valign="top">0.560</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Lau et al (T) [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">4.67</td><td align="left" valign="top">3.80</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">646K</td></tr><tr><td align="left" valign="top">Hsieh et al (T) [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="left" valign="top">&#x2014;</td><td align="left" 
valign="top">3.655</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">745K</td></tr><tr><td align="left" valign="top">Ours (T+A+V)</td><td align="left" valign="top">4.3778</td><td align="left" valign="top">3.3985</td><td align="left" valign="top">0.6603</td><td align="left" valign="top">617K</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>RMSE: root mean square error.</p></fn><fn id="table3fn2"><p><sup>b</sup>MAE: mean absolute error.</p></fn><fn id="table3fn3"><p><sup>c</sup>CCC: concordance correlation coefficient.</p></fn><fn id="table3fn4"><p><sup>d</sup>T: text.</p></fn><fn id="table3fn5"><p><sup>e</sup>A: audio.</p></fn><fn id="table3fn6"><p><sup>f</sup>V: visual.</p></fn><fn id="table3fn7"><p><sup>g</sup>Not available.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Classification Performance</title><p>As shown in <xref ref-type="table" rid="table4">Table 4</xref>, we evaluated the binary classification performance of our proposed method and several baselines on the E-DAIC test split. The objective was to distinguish between depressed and nondepressed individuals based on the standard PHQ-8 cutoff threshold (&#x2265;10). Due to the limited public availability of directly comparable classification results on other datasets, we restricted our binary classification evaluation to E-DAIC, which was widely adopted in prior research.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Classification results on the extended Distress Analysis Interview Corpus test split. 
Our model outperforms all baselines across all metrics.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model (modality)</td><td align="left" valign="bottom">Depression <italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Control <italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Macro <italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">OpenAI GPT-4o (T<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>)</td><td align="char" char="." valign="top">0.6316</td><td align="char" char="." valign="top">0.8056</td><td align="char" char="." valign="top">0.7186</td></tr><tr><td align="left" valign="top">OpenAI o1 (T)</td><td align="char" char="." valign="top">0.6667</td><td align="char" char="." valign="top">0.7941</td><td align="char" char="." valign="top">0.7304</td></tr><tr><td align="left" valign="top">OpenAI o3-mini (T)</td><td align="char" char="." valign="top">0.6977</td><td align="char" char="." valign="top">0.8060</td><td align="char" char="." valign="top">0.7518</td></tr><tr><td align="left" valign="top">Burdisso et al (T) [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="char" char="." valign="top">0.63</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.73</td></tr><tr><td align="left" valign="top">Hsieh et al (T) [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="char" char="." valign="top">0.4722</td><td align="char" char="." valign="top">0.8172</td><td align="char" char="." valign="top">0.6447</td></tr><tr><td align="left" valign="top">Ours (T+A<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>+V<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup>)</td><td align="char" char="." valign="top">0.7647</td><td align="char" char="." valign="top">0.8947</td><td align="char" char="." 
valign="top">0.8297</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>T: text.</p></fn><fn id="table4fn2"><p><sup>b</sup>A: audio.</p></fn><fn id="table4fn3"><p><sup>c</sup>V: visual.</p></fn></table-wrap-foot></table-wrap><p>To account for class imbalance, we reported 3 standard <italic>F</italic><sub>1</sub>-score&#x2013;based metrics: depression <italic>F</italic><sub>1</sub>-score, control <italic>F</italic><sub>1</sub>-score, and macro <italic>F</italic><sub>1</sub>-score. Depression <italic>F</italic><sub>1</sub>-score treated the depression class as the positive class, while control <italic>F</italic><sub>1</sub>-score treated the nondepression (control) class as the positive class. Macro <italic>F</italic><sub>1</sub>-score was computed as the unweighted average of the depression <italic>F</italic><sub>1</sub>-score and control <italic>F</italic><sub>1</sub>-score, providing a balanced evaluation across both classes.</p></sec><sec id="s3-3"><title>Analysis</title><p>The proposed model achieved state-of-the-art performance in both regression and classification tasks while maintaining an exceptionally small footprint of 617K parameters. On the E-DAIC benchmark, our model attained an RMSE of 5.46 and an MAE of 4.35, outperforming prior multimodal approaches such as Zhang et al [<xref ref-type="bibr" rid="ref14">14</xref>] (RMSE 6.11) and Yin et al [<xref ref-type="bibr" rid="ref31">31</xref>] (RMSE 5.50). Similar trends were observed on the DAIC-WOZ dataset, where our method achieved an RMSE of 4.38, an MAE of 3.40, and the highest CCC of 0.66, indicating stronger agreement with ground-truth PHQ-8 scores compared to existing baselines.</p><p>In binary classification on the E-DAIC dataset, the proposed framework achieved a macro <italic>F</italic><sub>1</sub>-score of 0.8297, substantially exceeding both traditional multimodal models and zero-shot LLMs. 
Notably, the model demonstrated balanced detection capability, with a depression <italic>F</italic><sub>1</sub>-score of 0.7647 and a control <italic>F</italic><sub>1</sub>-score of 0.8947, which was critical in clinical screening scenarios to minimize both false negatives and false positives.</p><p>Despite its compact size, the proposed model consistently outperformed parameter-heavy LLM baselines such as GPT-4o and o3-mini, which required orders of magnitude more parameters. These quantitative results highlighted three key strengths of our approach: (1) state-of-the-art accuracy with minimal parameters, establishing a new lightweight benchmark for multimodal depression detection; (2) effective joint modeling of regression and classification through a simple multitask 2-head architecture; and (3) robust generalization across both DAIC and E-DAIC datasets, supporting deployment on resource-constrained platforms such as SARs.</p></sec><sec id="s3-4"><title>Ablation Study</title><p>To quantify the contribution of each modality in our multimodal framework, we performed an ablation study by selectively disabling feature groups during both training and testing. Each ablated modality was replaced with random noise, ensuring that the architecture and learning dynamics remained consistent across settings.</p><sec id="s3-4-1"><title>Experimental Conditions</title><p>We evaluated 5 experimental configurations to analyze the contribution of each modality within the proposed framework. The original configuration activated all modalities, including sentence-level embeddings, word-level embeddings, audio features, and facial features.</p><p>In the No SentEmb configuration, sentence-level embeddings were replaced with random noise to remove high-level semantic structure while preserving the remaining modalities. 
In the No WordEmb setting, word-level embeddings were replaced with noise to eliminate low-level linguistic information.</p><p>The No Audio configuration excluded acoustic features, thereby removing prosodic cues such as tone and pitch, which were associated with depressive speech patterns. Similarly, the No Facial configuration omitted visual features, eliminating facial AUs and gaze information that provided nonverbal affective signals.</p><p>This ablation design enabled a systematic examination of the relative contributions of semantic, acoustic, and visual modalities to overall depression detection performance.</p></sec><sec id="s3-4-2"><title>Ablation Study of Modality Configurations</title><p>As shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, removing word embeddings (No WordEmb) resulted in a substantial degradation in the depression <italic>F</italic><sub>1</sub>-score, underscoring the essential role of token-level linguistic cues in depression detection. Sentence-level embeddings (No SentEmb) also had a substantial impact, confirming the importance of capturing semantic context. In contrast, removing acoustic (No Audio) or visual (No Facial) features led to moderate declines in overall performance. These findings suggested that while linguistic features were the primary drivers of detection accuracy, audio and visual modalities contributed to supplementary signals that improved robustness. Our results aligned with prior work [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>], demonstrating the dominant predictive power of text in affective computing tasks. This analysis validated the effectiveness of our multimodal fusion strategy and highlighted the complementary roles of each modality.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Ablation results across modality configurations. Word-level embeddings show the largest contribution, followed by sentence-level semantics. 
Audio and visual cues provide additive but less critical information.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e84110_fig03.png"/></fig></sec></sec><sec id="s3-5"><title>Real-World Evaluation</title><sec id="s3-5-1"><title>Robot Platform</title><p>For the real-world evaluation, the proposed DEPRESAR-Fusion framework was deployed on a commercially available SAR, the Kebbi robot. This robot is equipped with a microphone array for audio capture, an RGB camera for visual input, a display screen for visual feedback, and onboard speech synthesis for spoken responses. Multimodal inputs, including user voice and facial expressions, were captured by the robot and transmitted to a connected server for real-time feature extraction and inference. The Kebbi platform served as the interactive interface through which users engaged naturally with the system.</p></sec><sec id="s3-5-2"><title>Deployment Setup</title><p>To examine the feasibility of real-time deployment, we conducted a pilot user study involving 22 adult participants recruited through the Department of Psychology at National Taiwan University. Participants ranged in age from 19 to 55 years, with the majority between 19 and 25 years old. The gender distribution was balanced (n=11 male participants and n=11 female participants). All participants provided informed consent prior to participation.</p><p>Each participant interacted naturally with the Kebbi robot in a spoken dialogue consisting of at least 10 user-generated utterances. Following the interaction, participants completed the PHQ-8, which served as the ground truth measure of depressive symptom severity.</p><p>Participants were randomly assigned to 1 of 2 conditions. 
The emotion-induced group (n=11) viewed a targeted emotion-induction video designed to elicit affective expression prior to the interaction, whereas the control group (n=11) proceeded directly to the robot interaction without priming.</p><p>This deployment was designed as a feasibility pilot study to evaluate real-time integration and interaction robustness, rather than to provide statistically powered clinical validation.</p></sec><sec id="s3-5-3"><title>Statistical Consistency with Benchmark Dataset</title><p>To validate the external validity of our deployment, we compared PHQ-8 statistics between the DAIC-WOZ test split and our real-world sample. The class distribution (depressed vs nondepressed) and score statistics (mean and SD) show strong alignment between the 2 datasets (<xref ref-type="table" rid="table5">Tables 5</xref> and <xref ref-type="table" rid="table6">6</xref>).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Number of participants with depression and participants without depression.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">DAIC-WOZ<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup> test split</td><td align="left" valign="bottom">Real-world data</td></tr></thead><tbody><tr><td align="left" valign="top">Participants without depression</td><td align="char" char="." valign="top">33</td><td align="left" valign="top">14</td></tr><tr><td align="left" valign="top">Participants with depression</td><td align="char" char="." 
valign="top">14</td><td align="left" valign="top">8</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>DAIC-WOZ: Distress Analysis Interview Corpus-Wizard-of-Oz.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>PHQ-8<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup> score statistics.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Statistics</td><td align="left" valign="bottom">DAIC-WOZ<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> test split, mean (SD)</td><td align="left" valign="bottom">Real-world data, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Average PHQ-8 score</td><td align="left" valign="top">6.98 (6.47)</td><td align="left" valign="top">7.45 (6.40)</td></tr><tr><td align="left" valign="top">Average score (nondepressed)</td><td align="left" valign="top">3.36 (3.04)</td><td align="left" valign="top">3.50 (2.71)</td></tr><tr><td align="left" valign="top">Average score (depressed)</td><td align="left" valign="top">15.50 (3.76)</td><td align="left" valign="top">14.38 (4.44)</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>PHQ-8: Patient Health Questionnaire-8.</p></fn><fn id="table6fn2"><p><sup>b</sup>DAIC-WOZ: Distress Analysis Interview Corpus-Wizard-of-Oz.</p></fn></table-wrap-foot></table-wrap><p>The average PHQ-8 score and SD in both datasets were nearly identical, indicating that our field data were statistically comparable to the DAIC-WOZ benchmark and suitable for generalization assessment.</p></sec><sec id="s3-5-4"><title>Model Performance in the Wild</title><p>We evaluated the model&#x2019;s classification and regression performance in a real-world setting and compared it to its performance on the DAIC-WOZ test set. 
The classification model generalizes well to real-world conversations, achieving slightly improved <italic>F</italic><sub>1</sub>-scores and recall scores, despite a minor drop in overall accuracy to 0.7895 (statistically significant at paired <italic>t</italic> test, <italic>P</italic>=.02). In contrast, regression accuracy drops more noticeably, indicating that real-world data introduce greater variability in symptom expression than observed during training (<xref ref-type="table" rid="table7">Tables 7</xref><xref ref-type="table" rid="table8"/>-<xref ref-type="table" rid="table9">9</xref>).</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Classification task performance.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Classification task</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Accuracy</td></tr></thead><tbody><tr><td align="left" valign="top">DAIC<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup> test split</td><td align="char" char="." valign="top">0.6667</td><td align="char" char="." valign="top">0.8571</td><td align="char" char="." valign="top">0.7500</td><td align="char" char="." valign="top">0.8300</td></tr><tr><td align="left" valign="top">Real-world data</td><td align="char" char="." valign="top">0.7000</td><td align="char" char="." valign="top">0.8750</td><td align="char" char="." valign="top">0.7778</td><td align="char" char="." 
valign="top">0.7895</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>DAIC: Distress Analysis Interview Corpus.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Regression task performance.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Real-world data</td><td align="left" valign="bottom">RMSE<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup></td><td align="left" valign="bottom">MAE<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">DAIC<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> test split</td><td align="char" char="." valign="top">4.3778</td><td align="char" char="." valign="top">3.3985</td></tr><tr><td align="left" valign="top">Real-world data</td><td align="char" char="." valign="top">6.5252</td><td align="char" char="." valign="top">4.7217</td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>RMSE: root mean square error.</p></fn><fn id="table8fn2"><p><sup>b</sup>MAE: mean absolute error.</p></fn><fn id="table8fn3"><p><sup>c</sup>DAIC: Distress Analysis Interview Corpus.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t9" position="float"><label>Table 9.</label><caption><p>Real-world performance with and without emotion induction.</p></caption><table id="table9" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric</td><td align="left" valign="bottom">With emotion induction</td><td align="left" valign="bottom">Without emotion induction</td></tr></thead><tbody><tr><td align="left" valign="top">Regression task</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RMSE<sup><xref ref-type="table-fn" 
rid="table9fn1">a</xref></sup></td><td align="left" valign="top">5.0868</td><td align="left" valign="top">7.7650</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>MAE<sup><xref ref-type="table-fn" rid="table9fn2">b</xref></sup></td><td align="left" valign="top">3.8653</td><td align="left" valign="top">5.4853</td></tr><tr><td align="left" valign="top">Classification task</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Precision</td><td align="left" valign="top">0.6000</td><td align="left" valign="top">0.6667</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Recall</td><td align="left" valign="top">1.0000</td><td align="left" valign="top">0.8000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.7500</td><td align="left" valign="top">0.7273</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy</td><td align="left" valign="top">0.8182</td><td align="left" valign="top">0.7273</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>P</italic> value (Binomial test)</td><td align="left" valign="top">.03</td><td align="left" valign="top">.11</td></tr></tbody></table><table-wrap-foot><fn id="table9fn1"><p><sup>a</sup>RMSE: root mean square error.</p></fn><fn id="table9fn2"><p><sup>b</sup>MAE: mean absolute error.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5-5"><title>Impact of Emotion Induction</title><p>We further examined the influence of emotion induction on 
model performance by comparing results across the emotion-induced and control groups. Participants exposed to emotional stimuli before the interaction exhibited improved regression accuracy (significantly lower RMSE and MAE) and stronger classification performance. This improvement reflects the effectiveness of the proposed emotion-induction procedure as a whole, rather than isolating emotion-specific causal effects. Although the observed difference reached statistical significance (<italic>P</italic>=.033), this result should be interpreted with caution given the small sample size. The finding was intended to highlight a preliminary trend rather than to establish a definitive causal conclusion.</p></sec><sec id="s3-5-6"><title>User Feedback</title><p>We conducted a postinteraction survey with 12 Likert-scale questions (6 positive and 6 negative) to assess user experience (<xref ref-type="table" rid="table10">Table 10</xref>). Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> summarizes mean ratings across participants, with error bars indicating the SEM. Positive items (Q1-Q6) received generally higher scores than negative ones (Q7-Q12), indicating favorable perceptions of the agent&#x2019;s understanding, empathy, and clarity&#x2014;especially Q3 (&#x201C;I can understand what the robot says&#x201D;) and Q1 (perceived understanding). Negative items, particularly Q9 and Q10, scored low, reflecting disagreement with negative experiences. The paired positive or negative results confirm effective communication and emotional responsiveness. However, lower scores on Q5 and Q6 (emotional comfort or support) indicated a discrepancy between perceived understanding and perceived emotional comfort. This discrepancy was not unexpected in human-robot interaction settings. 
While the ESC framework guided the agent to adopt appropriate high-level empathetic strategies, perceived emotional comfort was also strongly influenced by factors such as vocal prosody, response timing, and nonverbal cues. These aspects were only partially addressed in the implementation, which may explain the observed gap between perceived understanding and emotional comfort.</p><table-wrap id="t10" position="float"><label>Table 10.</label><caption><p>User feedback questionnaire items.</p></caption><table id="table10" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ID</td><td align="left" valign="bottom">Question (translated from Chinese)</td></tr></thead><tbody><tr><td align="left" valign="top">Q1</td><td align="left" valign="top">I feel the robot understands what I say.</td></tr><tr><td align="left" valign="top">Q2</td><td align="left" valign="top">I feel the robot understands my emotional state.</td></tr><tr><td align="left" valign="top">Q3</td><td align="left" valign="top">I can understand what the robot says.</td></tr><tr><td align="left" valign="top">Q4</td><td align="left" valign="top">I feel the robot shows empathy.</td></tr><tr><td align="left" valign="top">Q5</td><td align="left" valign="top">I feel the robot comforts me.</td></tr><tr><td align="left" valign="top">Q6</td><td align="left" valign="top">I feel the robot fulfills my emotional support needs.</td></tr><tr><td align="left" valign="top">Q7</td><td align="left" valign="top">I feel the robot does not understand what I want to express.</td></tr><tr><td align="left" valign="top">Q8</td><td align="left" valign="top">I feel the robot cannot understand my emotional state.</td></tr><tr><td align="left" valign="top">Q9</td><td align="left" valign="top">I cannot understand what the robot is trying to express.</td></tr><tr><td align="left" valign="top">Q10</td><td align="left" valign="top">I feel the robot lacks empathy.</td></tr><tr><td align="left" valign="top">Q11</td><td 
align="left" valign="top">I feel the robot does not comfort me.</td></tr><tr><td align="left" valign="top">Q12</td><td align="left" valign="top">I feel the robot cannot fulfill my emotional support needs.</td></tr></tbody></table></table-wrap><p>This user feedback provided empirical evidence that our emotion-aware agent not only achieved effective basic communication but also demonstrated empathetic engagement, a critical factor in human-robot interaction. Unlike prior work that often focused solely on task performance, our results emphasized the agent&#x2019;s ability to foster emotional rapport, which was essential for applications in mental health support. The identified gaps in emotional comfort indicated promising directions for refining conversational tone to enhance user trust and acceptance, advancing the design of socially intelligent agents.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study introduces DEPRESAR-Fusion, a lightweight multimodal depression detection framework designed for naturalistic interaction with SARs. Across both regression and classification tasks, the proposed model demonstrated strong predictive performance while maintaining an exceptionally compact architecture (617K parameters).</p><p>On PHQ-8 regression tasks, the model achieved the lowest RMSE and MAE on the E-DAIC dataset and state-of-the-art performance on the DAIC-WOZ benchmark, while maintaining competitive CCC values. These results indicate that the proposed dual-head architecture can effectively estimate both depressive severity and categorical status within a unified framework.</p><p>In binary classification, the model achieved a macro <italic>F</italic><sub>1</sub>-score of 0.8297 on E-DAIC, demonstrating balanced performance across depressed and nondepressed classes. 
Importantly, the system maintained stable performance during real-world deployment on a commercial SAR platform, suggesting practical feasibility beyond controlled benchmark settings.</p><p>The ablation study revealed that linguistic features&#x2014;particularly word-level embeddings&#x2014;contribute the most substantial predictive signal. Acoustic and visual modalities provided complementary improvements, enhancing robustness without dominating the predictive process. Furthermore, the pilot deployment indicated that emotion induction may increase expressive signal strength and improve downstream detection performance, suggesting the practical value of integrating affect elicitation into conversational AI systems.</p></sec><sec id="s4-2"><title>Comparison with Prior Work</title><p>Our findings are consistent with prior research demonstrating the dominant predictive role of textual features in depression detection. Studies based on clinical interview transcripts have shown that linguistic content often outperforms acoustic and visual modalities in predictive accuracy [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. Our ablation results support this observation, as removing word-level embeddings resulted in the largest degradation in performance.</p><p>Compared with multimodal architectures, such as Yin et al [<xref ref-type="bibr" rid="ref31">31</xref>], Zhang et al [<xref ref-type="bibr" rid="ref14">14</xref>], and Li et al [<xref ref-type="bibr" rid="ref16">16</xref>], our model achieves lower RMSE and MAE while maintaining a substantially smaller parameter footprint. 
While transformer-based or hierarchical recurrent frameworks emphasize deep cross-modal modeling, our attention-based fusion module demonstrates that efficient structured integration can achieve competitive or superior results with significantly reduced computational cost.</p><p>Relative to parameter-efficient text-only approaches such as Lau et al [<xref ref-type="bibr" rid="ref32">32</xref>], our multimodal framework provides improved regression consistency, suggesting that incorporating behavioral cues beyond text enhances severity estimation. Additionally, compared with zero-shot LLMs (eg, GPT-4o, o3-mini), our model achieves stronger classification performance despite operating at a fraction of the parameter scale, highlighting the value of task-specific multimodal design over parameter scaling alone.</p><p>Finally, while psychological literature has established altered emotional reactivity in depression [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>], few computational systems explicitly integrate emotion induction into multimodal detection pipelines. Our pilot findings suggest that embedding affect elicitation within conversational interaction may enhance behavioral signal salience in naturalistic settings.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study has several limitations that warrant careful consideration. The real-world deployment involved a relatively small sample size (n=22), which constrains statistical power and limits the precision of effect size estimation. Although certain differences&#x2014;particularly in the emotion-induction condition&#x2014;reached statistical significance, the modest sample increases uncertainty regarding robustness and reproducibility. Larger-scale studies are necessary to confirm stability across broader populations.</p><p>Participants were primarily young adults affiliated with a university setting, resulting in a relatively homogeneous demographic profile. 
Such sampling may introduce selection bias and restrict external validity. Depression presentation varies across age groups, socioeconomic contexts, and cultural backgrounds. The generalizability of the current findings to older adults, clinical psychiatric populations, or cross-cultural environments, therefore, remains to be established.</p><p>PHQ-8 was used as the sole ground truth indicator of depressive symptom severity. While widely validated and frequently adopted in computational depression research, PHQ-8 is a self-report instrument rather than a clinician-administered diagnostic interview. Self-reported measures may be influenced by reporting bias, transient emotional states, or social desirability effects. Incorporating structured diagnostic interviews or multimethod assessment frameworks would strengthen clinical validity.</p><p>The emotion-induction protocol relied on a fixed set of emotionally evocative video stimuli. Emotional reactivity varies across individuals, and standardized stimuli may not elicit comparable affective responses in all participants. Although improved predictive performance was observed following induction, the study design does not fully disentangle whether the performance gain reflects increased emotional expressiveness, cognitive priming, or heightened engagement. More controlled experimental paradigms&#x2014;including neutral baseline conditions or counterbalanced stimulus designs&#x2014;would allow for clearer causal interpretation.</p><p>The data augmentation strategy also introduces considerations. Synthetic text samples were generated exclusively from training data to prevent information leakage; however, approximating PHQ-8 regression targets through threshold-based assignment reduces label granularity. This simplification may limit the precision of severity calibration. 
In addition, paraphrased transcripts, while semantically constrained, may differ distributionally from naturally occurring spontaneous conversations, potentially affecting generalization.</p><p>The evaluation framework focuses on cross-sectional assessment rather than longitudinal monitoring. Depression is dynamic and episodic, and a single interaction may not capture temporal symptom fluctuations. Long-term validation is needed to determine sensitivity to clinical change and suitability for continuous monitoring scenarios.</p><p>Finally, while attention mechanisms provide partial insight into modality weighting, the interpretability of specific multimodal feature contributions remains limited. Greater integration of explainability tools and clinician-in-the-loop evaluation would be necessary before large-scale clinical deployment.</p></sec><sec id="s4-4"><title>Future Directions</title><p>Future research should explore cross-cultural and multilingual generalization to broaden applicability across diverse populations. The emotion-induction mechanism may be further optimized through adaptive or reinforcement-based strategies to enhance user engagement.</p><p>In addition, incorporating continual and personalized learning mechanisms could support longitudinal adaptation without full retraining. Future work should also integrate explainability tools and conduct systematic failure case analyses to improve robustness and clinical interpretability. Finally, more controlled experimental designs, such as neutral-video conditions, are needed to better isolate emotion-specific induction effects.</p></sec><sec id="s4-5"><title>Conclusions</title><p>DEPRESAR-Fusion demonstrates that compact multimodal architectures can achieve state-of-the-art depression detection performance while remaining suitable for real-time SAR deployment. 
By integrating multimodal perception, lightweight deep fusion, and emotion-aware interaction mechanisms, this work advances practical, scalable, and field-validated approaches to AI-driven mental health support.</p></sec></sec></body><back><ack><p>The authors disclose the use of generative artificial intelligence (AI) tools during the manuscript preparation process. According to the Generative AI Delegation taxonomy (2025), generative AI was used under full human supervision to assist with language drafting, proofreading and editing, language tone refinement, translation, and reformatting.</p><p>The generative AI tool used was ChatGPT-4o (OpenAI). The AI tool was not involved in the study design, data collection, data analysis, interpretation of results, or any scientific or clinical decision-making. Responsibility for the final manuscript lies entirely with the authors. Generative AI tools are not listed as authors and do not bear responsibility for the final outcomes.</p><p>Declaration submitted by: PYL, YQS.</p></ack><notes><sec><title>Funding</title><p>This research is supported in part by National Taiwan University under the grants NTU-ICRP-115L7505 and NTU-CC-115L893401, as well as by the National Science and Technology Council under the grants NSTC 114-2223-E-002-002 and NSTC 114-2634-F-002-005.</p></sec></notes><fn-group><fn fn-type="conflict"><p>The authors declare no conflicts of interest. The authors were involved in the design and development of the system described and evaluated in this study as part of the research process. 
No commercial products were developed or marketed as part of this work, and the authors received no financial benefit related to the outcomes of this study.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AU</term><def><p>action unit</p></def></def-item><def-item><term id="abb3">CCC</term><def><p>concordance correlation coefficient</p></def></def-item><def-item><term id="abb4">DAIC-WOZ</term><def><p>Distress Analysis Interview Corpus-Wizard-of-Oz</p></def></def-item><def-item><term id="abb5">DEPRESAR</term><def><p>Depression Social Assistant Robot</p></def></def-item><def-item><term id="abb6">E-DAIC</term><def><p>Extended Distress Analysis Interview Corpus</p></def></def-item><def-item><term id="abb7">eGeMAPS</term><def><p>extended Geneva Minimalistic Acoustic Parameter Set</p></def></def-item><def-item><term id="abb8">ESC</term><def><p>emotional support conversation</p></def></def-item><def-item><term id="abb9">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb10">MAE</term><def><p>mean absolute error</p></def></def-item><def-item><term id="abb11">MFCC</term><def><p>Mel-frequency cepstral coefficient</p></def></def-item><def-item><term id="abb12">PHQ-8</term><def><p>Patient Health Questionnaire-8</p></def></def-item><def-item><term id="abb13">RMSE</term><def><p>root mean square error</p></def></def-item><def-item><term id="abb14">SAR</term><def><p>socially assistive robot</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Global health data results</article-title><source>Institute for Health Metrics and Evaluation (IHME)</source><year>2023</year><access-date>2023-03-04</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://vizhub.healthdata.org/gbd-results">https://vizhub.healthdata.org/gbd-results</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reddy</surname><given-names>MS</given-names> </name></person-group><article-title>Depression: the disorder and the burden</article-title><source>Indian J Psychol Med</source><year>2010</year><month>01</month><volume>32</volume><issue>1</issue><fpage>1</fpage><lpage>2</lpage><pub-id pub-id-type="doi">10.4103/0253-7176.70510</pub-id><pub-id pub-id-type="medline">21799550</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stein</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Stein</surname><given-names>DJ</given-names> </name></person-group><article-title>Social anxiety disorder</article-title><source>Lancet</source><year>2008</year><month>03</month><day>29</day><volume>371</volume><issue>9618</issue><fpage>1115</fpage><lpage>1125</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(08)60488-2</pub-id><pub-id pub-id-type="medline">18374843</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arnone</surname><given-names>D</given-names> </name><name name-style="western"><surname>Karmegam</surname><given-names>SR</given-names> </name><name name-style="western"><surname>&#x00D6;stlundh</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Risk of suicidal behavior in patients with major depression and bipolar disorder - a systematic review and meta-analysis of registry-based studies</article-title><source>Neurosci Biobehav 
Rev</source><year>2024</year><month>04</month><volume>159</volume><fpage>105594</fpage><pub-id pub-id-type="doi">10.1016/j.neubiorev.2024.105594</pub-id><pub-id pub-id-type="medline">38368970</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kroenke</surname><given-names>K</given-names> </name><name name-style="western"><surname>Strine</surname><given-names>TW</given-names> </name><name name-style="western"><surname>Spitzer</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>JBW</given-names> </name><name name-style="western"><surname>Berry</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Mokdad</surname><given-names>AH</given-names> </name></person-group><article-title>The PHQ-8 as a measure of current depression in the general population</article-title><source>J Affect Disord</source><year>2009</year><month>04</month><volume>114</volume><issue>1-3</issue><fpage>163</fpage><lpage>173</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2008.06.026</pub-id><pub-id pub-id-type="medline">18752852</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mallol-Ragolta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Stappen</surname><given-names>L</given-names> </name><name name-style="western"><surname>Cummins</surname><given-names>N</given-names> </name><name name-style="western"><surname>Schuller</surname><given-names>BW</given-names> </name></person-group><article-title>A hierarchical attention network-based approach for depression detection from transcribed clinical interviews</article-title><conf-name>Proceedings of the 
Annual Conference of the International Speech Communication Association (INTERSPEECH)</conf-name><conf-date>Sep 15-19, 2019</conf-date><conf-loc>Graz, Austria</conf-loc><fpage>221</fpage><lpage>225</lpage><pub-id pub-id-type="doi">10.21437/Interspeech.2019-2036</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Villatoro-Tello</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ram&#x00ED;rez-de-la-Rosa</surname><given-names>G</given-names> </name><name name-style="western"><surname>G&#x00E1;tica-P&#x00E9;rez</surname><given-names>D</given-names> </name><name name-style="western"><surname>Magimai.-Doss</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jim&#x00E9;nez-Salazar</surname><given-names>H</given-names> </name></person-group><article-title>Approximating the mental lexicon from clinical interviews as a support tool for depression detection</article-title><conf-name>Proceedings of the 2021 International Conference on Multimodal Interaction (ICMI &#x2019;21)</conf-name><conf-date>Oct 18-22, 2021</conf-date><conf-loc>Montr&#x00E9;al, QC, Canada</conf-loc><fpage>557</fpage><lpage>566</lpage><pub-id pub-id-type="doi">10.1145/3462244.3479896</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Taguchi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tachikawa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Nemoto</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Major depressive disorder discrimination using vocal acoustic features</article-title><source>J Affect 
Disord</source><year>2018</year><month>01</month><day>1</day><volume>225</volume><fpage>214</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2017.08.038</pub-id><pub-id pub-id-type="medline">28841483</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>DepAudioNet: an efficient deep model for audio based depression classification</article-title><conf-name>Proceedings of the 6th International Workshop on Audio/Visual Emotion Challenge (AVEC &#x2019;16)</conf-name><conf-date>Oct 16, 2016</conf-date><conf-loc>Amsterdam, The Netherlands</conf-loc><fpage>35</fpage><lpage>42</lpage><pub-id pub-id-type="doi">10.1145/2988257.2988267</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Rohanian</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hough</surname><given-names>J</given-names> </name><name name-style="western"><surname>Purver</surname><given-names>M</given-names> </name></person-group><article-title>Detecting depression with word-level multimodal fusion</article-title><conf-name>Proceedings of the Annual Conference of the International Speech Communication Association (INTERSPEECH)</conf-name><conf-date>Sep 15-19, 2019</conf-date><conf-loc>Graz, Austria</conf-loc><fpage>1443</fpage><lpage>1447</lpage><pub-id 
pub-id-type="doi">10.21437/Interspeech.2019-2283</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shapero</surname><given-names>BG</given-names> </name><name name-style="western"><surname>Farabaugh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Terechina</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Understanding the effects of emotional reactivity on depression and suicidal thoughts and behaviors: moderating effects of childhood adversity and resilience</article-title><source>J Affect Disord</source><year>2019</year><month>02</month><day>15</day><volume>245</volume><fpage>419</fpage><lpage>427</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2018.11.033</pub-id><pub-id pub-id-type="medline">30423470</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bylsma</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Morris</surname><given-names>BH</given-names> </name><name name-style="western"><surname>Rottenberg</surname><given-names>J</given-names> </name></person-group><article-title>A meta-analysis of emotional reactivity in major depressive disorder</article-title><source>Clin Psychol Rev</source><year>2008</year><month>04</month><volume>28</volume><issue>4</issue><fpage>676</fpage><lpage>691</lpage><pub-id pub-id-type="doi">10.1016/j.cpr.2007.10.001</pub-id><pub-id pub-id-type="medline">18006196</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Inkpen</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Kirinde Gamaarachchige</surname><given-names>P</given-names> </name></person-group><article-title>Explainable depression detection using large language models on social media data</article-title><conf-name>Proceedings of the 9th Workshop on Computational Linguistics and Clinical Psychology (CLPsych 2024)</conf-name><conf-date>Mar 21, 2024</conf-date><conf-loc>St. Julians, Malta</conf-loc><fpage>108</fpage><lpage>126</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.clpsych-1.8</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Niu</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Association between depression and clinical outcomes following percutaneous coronary intervention: a meta-analysis</article-title><source>Psychopathology</source><year>2022</year><volume>55</volume><issue>5</issue><fpage>251</fpage><lpage>257</lpage><pub-id pub-id-type="doi">10.1159/000524228</pub-id><pub-id pub-id-type="medline">35421863</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="thesis"><person-group person-group-type="author"><name name-style="western"><surname>Hsieh</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Qian</surname><given-names>XB</given-names> </name><name name-style="western"><surname>Yeh</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>LC</given-names> </name></person-group><article-title>Deep learning&#x2013;based depression detection and emotional support with social assistant robot [Master&#x2019;s 
Thesis]</article-title><year>2023</year><access-date>2026-03-25</access-date><publisher-name>National Taiwan University (NTU)</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://tdr.lib.ntu.edu.tw/jspui/handle/123456789/90807">https://tdr.lib.ntu.edu.tw/jspui/handle/123456789/90807</ext-link></comment><pub-id pub-id-type="doi">10.6342/NTU202303356</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>M</given-names> </name><etal/></person-group><article-title>FPT-former: a flexible parallel transformer of recognizing depression by using audiovisual expert-knowledge-based multimodal measures</article-title><source>Int J Intell Syst</source><year>2024</year><month>01</month><day>29</day><volume>2024</volume><fpage>1</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1155/2024/1564574</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boettcher</surname><given-names>N</given-names> </name></person-group><article-title>Studies of depression and anxiety using Reddit as a data source: scoping review</article-title><source>JMIR Ment Health</source><year>2021</year><month>11</month><day>25</day><volume>8</volume><issue>11</issue><fpage>e29487</fpage><pub-id pub-id-type="doi">10.2196/29487</pub-id><pub-id pub-id-type="medline">34842560</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Villa-Perez</surname><given-names>ME</given-names> </name><name 
name-style="western"><surname>Trejo</surname><given-names>LA</given-names> </name></person-group><article-title>Twitter dataset for mental disorders detection</article-title><source>IEEE DataPort</source><access-date>2026-03-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ieee-dataport.org/documents/twitter-dataset-mental-disorders-detection">https://ieee-dataport.org/documents/twitter-dataset-mental-disorders-detection</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ringeval</surname><given-names>F</given-names> </name><name name-style="western"><surname>Schuller</surname><given-names>B</given-names> </name><name name-style="western"><surname>Valstar</surname><given-names>M</given-names> </name><etal/></person-group><article-title>AVEC 2019 workshop and challenge: state-of-mind, detecting depression with AI, and cross-cultural affect recognition</article-title><conf-name>Proceedings of the 9th International on Audio/Visual Emotion Challenge and Workshop (AVEC &#x2019;19)</conf-name><conf-date>Oct 21, 2019</conf-date><conf-loc>Nice, France</conf-loc><fpage>3</fpage><lpage>12</lpage><pub-id pub-id-type="doi">10.1145/3347320.3357688</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Eyben</surname><given-names>F</given-names> </name><name name-style="western"><surname>W&#x00F6;llmer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Schuller</surname><given-names>B</given-names> </name></person-group><article-title>OpenSMILE: the Munich versatile and fast open-source audio feature extractor</article-title><conf-name>Proceedings of the 18th ACM International Conference on Multimedia (MM &#x2019;10)</conf-name><conf-date>Oct 25-29, 
2010</conf-date><conf-loc>Firenze (Florence), Italy</conf-loc><fpage>1459</fpage><lpage>1462</lpage><pub-id pub-id-type="doi">10.1145/1873951.1874246</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Baltru&#x0161;aitis</surname><given-names>T</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>P</given-names> </name><name name-style="western"><surname>Morency</surname><given-names>LP</given-names> </name></person-group><article-title>OpenFace: an open source facial behavior analysis toolkit</article-title><conf-name>2016 IEEE Winter Conference on Applications of Computer Vision (WACV)</conf-name><conf-date>Mar 7-10, 2016</conf-date><conf-loc>Lake Placid, NY, USA</conf-loc><fpage>1</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1109/WACV.2016.7477553</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>F</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MiniLM: deep self-attention distillation for task-agnostic compression of pre-trained transformers</article-title><conf-name>34th Conference on Neural Information Processing Systems (NeurIPS 2020)</conf-name><conf-date>Dec 6-12, 2020</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc><fpage>5776</fpage><lpage>5788</lpage><pub-id pub-id-type="doi">10.5555/3495724.3496209</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name 
name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><conf-name>North American Chapter of the Association for Computational Linguistics: Human Language Technologies 2019 (NAACL-HLT)</conf-name><conf-date>Jun 2-7, 2019</conf-date><conf-loc>Minneapolis, Minnesota</conf-loc><fpage>4171</fpage><lpage>4186</lpage><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Schmitt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ringeval</surname><given-names>F</given-names> </name><name name-style="western"><surname>Schuller</surname><given-names>B</given-names> </name></person-group><article-title>At the border of acoustics and linguistics: bag-of-audio-words for the recognition of emotions in speech</article-title><conf-name>Proceedings of the Annual Conference of the International Speech Communication Association (INTERSPEECH)</conf-name><conf-date>Sep 8-12, 2016</conf-date><conf-loc>San Francisco, USA</conf-loc><fpage>495</fpage><lpage>499</lpage><pub-id pub-id-type="doi">10.21437/Interspeech.2016-1124</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Qiao</surname><given-names>Y</given-names> </name></person-group><article-title>Bag of visual words and 
fusion methods for action recognition: comprehensive study and good practice</article-title><source>Comput Vis Image Underst</source><year>2016</year><month>09</month><volume>150</volume><fpage>109</fpage><lpage>125</lpage><pub-id pub-id-type="doi">10.1016/j.cviu.2016.03.013</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all you need</article-title><conf-name>Proceedings of the 31st International Conference on Neural Information Processing Systems (NIPS&#x2019;17)</conf-name><conf-date>Dec 4-9, 2017</conf-date><pub-id pub-id-type="doi">10.5555/3295222.3295349</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Demasi</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Towards emotional support dialog systems</article-title><conf-name>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing</conf-name><conf-date>Aug 1-6, 2021</conf-date><pub-id pub-id-type="doi">10.18653/v1/2021.acl-long.269</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name 
name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Self-consistency improves chain of thought reasoning in language models</article-title><source>arXiv</source><comment>Preprint posted online on Mar 21, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.11171</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Burdisso</surname><given-names>S</given-names> </name><name name-style="western"><surname>Villatoro-Tello</surname><given-names>E</given-names> </name><name name-style="western"><surname>Madikeri</surname><given-names>S</given-names> </name><name name-style="western"><surname>Motlicek</surname><given-names>P</given-names> </name></person-group><article-title>Node-weighted graph convolutional network for depression detection in transcribed clinical interviews</article-title><conf-name>Proceedings of the Annual Conference of the International Speech Communication Association (INTERSPEECH 2023)</conf-name><conf-date>Aug 20-24, 2023</conf-date><pub-id pub-id-type="doi">10.21437/Interspeech.2023-1923</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dham</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dhall</surname><given-names>A</given-names> </name></person-group><article-title>Depression scale recognition from audio, visual and text analysis</article-title><source>arXiv</source><comment>Preprint posted online on Sep 18, 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1709.05865</pub-id></nlm-citation></ref><ref 
id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name></person-group><article-title>A multi-modal hierarchical recurrent neural network for depression detection</article-title><conf-name>Proceedings of the 9th International on Audio/Visual Emotion Challenge and Workshop (AVEC &#x2019;19)</conf-name><conf-date>Oct 21, 2019</conf-date><pub-id pub-id-type="doi">10.1145/3347320.3357696</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lau</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>WY</given-names> </name></person-group><article-title>Automatic depression severity assessment with deep learning using parameter-efficient tuning</article-title><source>Front Psychiatry</source><year>2023</year><volume>14</volume><fpage>1160291</fpage><pub-id pub-id-type="doi">10.3389/fpsyt.2023.1160291</pub-id><pub-id pub-id-type="medline">37398577</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>LIK</given-names> </name></person-group><article-title>A concordance correlation coefficient to evaluate reproducibility</article-title><source>Biometrics</source><year>1989</year><month>03</month><volume>45</volume><issue>1</issue><fpage>255</fpage><lpage>268</lpage><pub-id 
pub-id-type="doi">10.2307/2532051</pub-id><pub-id pub-id-type="medline">2720055</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Al Hanai</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ghassemi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Glass</surname><given-names>J</given-names> </name></person-group><article-title>Detecting depression with audio/text sequence modeling of interviews</article-title><conf-name>Proceedings of the Annual Conference of the International Speech Communication Association (INTERSPEECH 2018)</conf-name><conf-date>Sep 2-6, 2018</conf-date><conf-loc>Hyderabad, India</conf-loc><fpage>1716</fpage><lpage>1720</lpage><pub-id pub-id-type="doi">10.21437/Interspeech.2018-2522</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Morales</surname><given-names>M</given-names> </name><name name-style="western"><surname>Scherer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Levitan</surname><given-names>R</given-names> </name></person-group><article-title>A linguistically-informed fusion approach for multimodal depression detection</article-title><conf-name>Proceedings of the Fifth Workshop on Computational Linguistics and Clinical Psychology: From Keyboard to Clinic</conf-name><conf-date>Jun 5, 2018</conf-date><conf-loc>New Orleans, LA, United States</conf-loc><fpage>13</fpage><lpage>24</lpage><pub-id pub-id-type="doi">10.18653/v1/W18-0602</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Mean user ratings on a 5-point Likert scale.</p><media xlink:href="formative_v10i1e84110_app1.docx" xlink:title="DOCX File, 53 
KB"/></supplementary-material></app-group></back></article>