<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e56057</article-id><article-id pub-id-type="doi">10.2196/56057</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Comparative Efficacy of MultiModal AI Methods in Screening for Major Depressive Disorder: Machine Learning Model Development Predictive Pilot Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Donghao</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Pengfei</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Xiaolong</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Qiao</surname><given-names>Runqi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Nanxi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Xiaodong</given-names></name><degrees>BD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Honggang</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Wang</surname><given-names>Gang</given-names></name><degrees>PhD,MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>School of Artificial Intelligence, Beijing University of Posts and Telecommunications</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff2"><institution>Beijing Key Laboratory of Mental Disorders, National Clinical Research Center for Mental Disorders &#x0026; National Center for Mental Disorders, Beijing Anding Hospital, Capital Medical University</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff 
id="aff3"><institution>Advanced Innovation Center for Human Brain Protection, Capital Medical University</institution><addr-line>Beijing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Arasteh</surname><given-names>Soroosh Tayebi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Shen</surname><given-names>Zhongxia</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Gang Wang, PhD,MD, Beijing Key Laboratory of Mental Disorders, National Clinical Research Center for Mental Disorders &#x0026; National Center for Mental Disorders, Beijing Anding Hospital, Capital Medical University, No 5 Ankang Lane, Xicheng District, Beijing, 100088, China, 86 15210807053; <email>gangwangdoc@ccmu.edu.cn</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>30</day><month>5</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e56057</elocation-id><history><date date-type="received"><day>23</day><month>02</month><year>2024</year></date><date date-type="rev-recd"><day>02</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>06</day><month>05</month><year>2025</year></date></history><copyright-statement>&#x00A9; Donghao Chen, Pengfei Wang, Xiaolong Zhang, Runqi Qiao, Nanxi Li, Xiaodong Zhang, Honggang Zhang, Gang Wang. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 30.5.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e56057"/><abstract><sec><title>Background</title><p>Conventional approaches for major depressive disorder (MDD) screening rely on two effective but subjective paradigms: self-rated scales and clinical interviews. Artificial intelligence (AI) can potentially contribute to psychiatry, especially through the use of objective data such as objective audiovisual signals.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate the efficacy of different paradigms using AI analysis on audiovisual signals.</p></sec><sec sec-type="methods"><title>Methods</title><p>We recruited 89 participants (mean age, 37.1 years; male: 30/89, 33.7%; female: 59/89, 66.3%), including 41 patients with MDD and 48 asymptomatic participants. 
We developed AI models using facial movement, acoustic, and text features extracted from videos obtained via a tool, incorporating four paradigms: conventional scale (CS), question and answering (Q&#x0026;A), mental imagery description (MID), and video watching (VW). Ablation experiments and 5-fold cross-validation were performed using two AI methods to ascertain the efficacy of paradigm combinations. Attention scores from the deep learning model were calculated and compared with correlation results to assess comprehensibility.</p></sec><sec sec-type="results"><title>Results</title><p>In video clip-based analyses, Q&#x0026;A outperformed MID with a mean binary sensitivity of 79.06% (95%CI 77.06%&#x2010;83.35%; <italic>P</italic>=.03) and an effect size of 1.0. Among individuals, the combination of Q&#x0026;A and MID outperformed MID alone with a mean extent accuracy of 80.00% (95%CI 65.88%&#x2010;88.24%; <italic>P</italic>= .01), with an effect size 0.61. The mean binary accuracy exceeded 76.25% for video clip predictions and 74.12% for individual-level predictions across the two AI methods, with top individual binary accuracy of 94.12%. The features exhibiting high attention scores demonstrated a significant overlap with those that were statistically correlated, including 18 features (all <italic>Ps</italic>&#x003C;.05), while also aligning with established nonverbal markers.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The Q&#x0026;A paradigm demonstrated higher efficacy than MID, both individually and in combination. Using AI to analyze audiovisual signals across multiple paradigms has the potential to be an effective tool for MDD screening.</p></sec></abstract><kwd-group><kwd>major depressive disorder</kwd><kwd>artificial intelligence</kwd><kwd>computational psychiatry</kwd><kwd>facial action unit</kwd><kwd>multimodal analysis</kwd><kwd>multiparadigm analysis</kwd><kwd>MDD</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Depressive disorder is a common mental disorder affecting approximately 322 million patients in the world, with major depressive disorder (MDD) as one of its two main subcategories, which can significantly affect all aspects of life, including performance at school, productivity at work, and relationships with family and friends [<xref ref-type="bibr" rid="ref1">1</xref>]. The primary methods to assess depression encompass mental status examinations and assessment scales. However, mental status examinations such as the Hamilton Rating Scale for Depression (HAMD) necessitate direct, in-person interviews conducted by clinicians, which can result in processes that are both time-consuming and labor-intensive [<xref ref-type="bibr" rid="ref2">2</xref>]. Self-Report Symptom Inventories (SRSI) such as the Beck Depression Inventory [<xref ref-type="bibr" rid="ref3">3</xref>] and the Patient Health Questionnaire-9 (PHQ-9) [<xref ref-type="bibr" rid="ref4">4</xref>] are time-efficient but can be influenced by subjective biases, which allows for individual variability [<xref ref-type="bibr" rid="ref5">5</xref>]. Therefore, the outcomes are susceptible to both intentional and unintentional subjective influences [<xref ref-type="bibr" rid="ref6">6</xref>] and more approaches are needed to improve efficiency and accuracy.</p><p>In recent years, artificial intelligence (AI) has garnered attention for its application in signal analysis across various modalities. 
For instance, support vector machines have been used to analyze functional magnetic resonance imaging (fMRI) data [<xref ref-type="bibr" rid="ref7">7</xref>] and a convolutional neural network (CNN) has been applied to an electroencephalogram (EEG) [<xref ref-type="bibr" rid="ref8">8</xref>] to detect depression. While physiological signals such as fMRI and EEG are unaffected by subjective factors and directly reflect the participants&#x2019; physical states, they involve complex procedures and high costs. In contrast, noncontact signals, including text, audio, visual content, and scale information are more accessible for analysis.</p><p>In the text modality, hidden Markov models and random forest models were developed to predict depression and posttraumatic stress disorder based on frequency of Twitter usage and content [<xref ref-type="bibr" rid="ref9">9</xref>]. By aggregating weighted words using lexicons, the sentiment score derived from text messages demonstrated a positive association with the severity of depression as measured by the self-rated Patient Health Questionnaire-8 (PHQ-8) [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Among the audio modalities, speech patterns such as a narrowed pitch range and reduced phonemes within the vowel space have emerged as important objective indicators for assessing depressive states [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. Along with prosodic features, mel frequency cepstral coefficients (MFCC) [<xref ref-type="bibr" rid="ref15">15</xref>], detailed spectral features [<xref ref-type="bibr" rid="ref16">16</xref>], and deep-learned acoustic characteristics [<xref ref-type="bibr" rid="ref17">17</xref>] have also been used to identify the presence of depressive symptoms, achieving binary accuracy of up to 79% or <italic>F</italic><sub>1</sub>-score of 0.890.</p><p>For the visual or multimodal domain, several open datasets are available. One notable example is the Audio/Visual Emotion Challenge and Workshop [<xref ref-type="bibr" rid="ref18">18</xref>], which focuses on the detection of depression and uses an audio-visual dataset that includes image features extracted from original images and audio recordings and transcribed text from Google Cloud, paired with the PHQ-8 scores. Facial action units (AU), as outlined in the Facial Action Coding System [<xref ref-type="bibr" rid="ref19">19</xref>], serve as the foundation for facial expressions and constitute essential image features in the Audio/Visual Emotion Challenge and Workshop. Commonly observed AUs correspond to a range of expressions such as smiling and frowning (see Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). A higher overall frowning (Action Unit 4, ie AU4) and head-down posture were identified in a study by Fiquer et al [<xref ref-type="bibr" rid="ref20">20</xref>], while a lower overall AU12 and a markedly higher overall AU14 were identified in a study by Girard et al [<xref ref-type="bibr" rid="ref21">21</xref>]. This indicated that the distribution of AUs differs significantly between depressed and nondepressed persons. 
Facial Action Coding System has also been employed in the analysis of stress [<xref ref-type="bibr" rid="ref22">22</xref>], anxiety [<xref ref-type="bibr" rid="ref23">23</xref>], and Parkinson&#x2019;s disease [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>There are several other existing datasets, including Mundt-35 [<xref ref-type="bibr" rid="ref25">25</xref>], BlackDog [<xref ref-type="bibr" rid="ref26">26</xref>], and MODMA [<xref ref-type="bibr" rid="ref27">27</xref>]. Most of these datasets contain a single paradigm, primarily relying on interviews such as HAMD, or targeting the scores of SRSIs such as the PHQ-8. Additionally, current multimodal AI methods mainly extract local features from utterances or sentences for video clip predictions [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. At the same time, we believe the screening and diagnosis of MDD should include the entire process, similarly to the process of clinical practice.</p><p>For other paradigm options, mental imagery description (MID) [<xref ref-type="bibr" rid="ref30">30</xref>] can manifest across different sensory modalities, encompassing visual [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>], auditory [<xref ref-type="bibr" rid="ref33">33</xref>], and textual information, and tends to evoke stronger emotional responses than verbal processing [<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>Thus, aiming to evaluate the efficacy of different paradigms, we aggregated them in a tool, namely the Electronic Tool for Depression (ETD), and used a state-of-the-art (SOTA) method using audiovisual signals to validate their efficacies. We propose the ETD to be a nonsubjective and easy-use MDD screening tool. The SOTA method generates predictions on video clips, and two of the four paradigms contain only visual signals; therefore, we implemented a voting mechanism for individual predictions and proposed a global feature method for the remaining vision-only paradigms. This pilot study underscores our primary contributions, which can be summarized as follows: (1) to validate the efficacy of the paradigms via AI on audiovisual signals and aggregate them within a tool for MDD screening and (2) to propose a global feature method and explore its efficacy and interpretability.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Design of the Task and Building the Tool</title><p>The ETD consists of four paradigms, aggregated into an application designed for an 11.5-inch tablet featuring an 8-MP front-facing camera and a 44.8 kHz sample rate microphone. Before using the ETD, clinicians adjusted the tablet to ensure that the participant&#x2019;s head is aligned with the device at an appropriate distance (approximately 50 centimeters) for effective face capture. The ETD structure and app design are depicted in <xref ref-type="fig" rid="figure1">Figure 1</xref>. Paradigm 1 uses a conventional self-rated scale, specifically the PHQ-9. Paradigm 2 encompasses a question-and-answering (Q&#x0026;A) paradigm simulating psychiatric examinations. Paradigm 3 requires participants to describe images with the hint words [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. 
Paradigm 4 presents three video clips of varying emotional sentiment scores [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>] in a positive, neutral, and negative sequence (41&#x2010;73 seconds, average 60.33 seconds). Participants sequentially respond to these components, with recordings capturing their reactions during both the viewing and responding phases, including the PHQ-9 selections; the entire process takes approximately 5 minutes. It is essential to clarify that the scale was only used to elicit reactions, and its scores did not contribute to the predictions.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Components of the Electronic Tool for Depression (ETD). PHQ-9: Patient Health Questionnaire-9; Q&#x0026;A: question-and-answer.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e56057_fig01.png"/></fig></sec><sec id="s2-2"><title>Recruitment</title><p>In this study, 89 participants were recruited from April 2022 to December 2022 in Beijing. Among these, 51 were recruited from the Beijing Anding Hospital Inpatient Department and were all diagnosed with MDD by experienced psychiatrists according to the International Code of Diseases, tenth revision (<italic>ICD-10</italic>) [<xref ref-type="bibr" rid="ref36">36</xref>]. All participants met the inclusion criteria, which were as follows: (1) age 18&#x2010;65 years, (2) proficiency in standard Chinese, (3) educational level of primary school or above, and (4) ability to understand and cooperate with the research protocol.</p><p>Exclusion criteria included (1) diagnosis of schizophrenia, schizoaffective disorder, or other mental disorders and (2) history of organic brain disease. The remaining 38 participants were recruited openly from the general population (employees and college students) who were not experiencing depression-related symptoms.</p><p>Participants from the hospital completed two steps: the first involved using the ETD app, and the second included assessment using the HAMD-17 scale by clinicians. Community participants only completed the ETD test, and all were confirmed to have no depressive symptoms based on the PHQ-9 assessment. Finally, the asymptomatic and the healthy control groups formed the nonMDD group (48 participants), while the mild group and the moderate or severe group were collectively referred to as the MDD group (41 participants). For ease of explanation, the mild group was designated as MDD-sub1, and the moderate or severe group was designated as MDD-sub2. Sex was compared using the <italic>&#x03A7;</italic><sup><italic>2</italic></sup> test; age and HAMD scores were compared using the Mann-Whitney <italic>U</italic> test. There were no significant differences in sex ratio or age between the groups, while the MDD group had significantly higher PHQ-9 scores than the nonMDD group.</p></sec><sec id="s2-3"><title>Ethical Considerations</title><p>This study was approved by the Ethics Committee of Beijing Anding Hospital Capital Medical University. No compensation fee was paid to participants, with written informed consent obtained for the data usage of research analysis. Data were deidentified and all analyses followed data privacy guidelines.</p></sec><sec id="s2-4"><title>Model Training</title><p>All recorded videos underwent a manual verification process to ensure that the image ratio of a complete head, face, and eyes exceeded the empirical 95% threshold. 
We adopted the MFCC-based recurrent neural network (RNN) [<xref ref-type="bibr" rid="ref29">29</xref>] as the validation model, which used a multimodal method that integrated MFCC and AU features and achieved a SOTA accuracy of 95.6% in binary classification of depression on the DAIC-WOZ (Distress Analysis Interview Corpus) dataset [<xref ref-type="bibr" rid="ref18">18</xref>]. We pretrained the RNN on RAVDESS (Ryerson Audio-Visual Database of Emotional Speech and Song dataset) [<xref ref-type="bibr" rid="ref37">37</xref>], aggregated the AU features, and fine-tuned the model on our dataset. We developed a CNN model for AU detection using EfficientNet [<xref ref-type="bibr" rid="ref38">38</xref>] on BP4D [<xref ref-type="bibr" rid="ref39">39</xref>], achieving a mean <italic>F</italic><sub>1</sub>-score of 0.76 on selected AUs. The sample size of video clips for the RNN was 11,075, comprising 6826 normal, 2933 mild, and 1316 moderate or severe instances. We established a clip-voting ratio to represent the individual results. While the RNN simultaneously processed local audio and visual data, it did not incorporate conventional scale and video watching. To address this limitation, we proposed a global feature extraction method (depicted in <xref ref-type="fig" rid="figure2">Figure 2</xref>) to derive global features and build AI models. For the vision modality, we used Gaze360 [<xref ref-type="bibr" rid="ref40">40</xref>] and Dlib [<xref ref-type="bibr" rid="ref41">41</xref>] along with AU features to estimate gaze and head orientation. For the audio modality, we extracted MFCC-based features and the pure audio duration of the human voice. For the text modality, we calculated sentiment scores using the <italic>pyltp</italic> package [<xref ref-type="bibr" rid="ref42">42</xref>]. The features were concatenated in the order of visual, audio, and text features. Additionally, we incorporated statistical characteristics such as mean and variance to enhance their global representation. Normalization and bias adjustments were applied to ensure that all the features were positive for later attention computation (complete feature list is provided in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The global feature extraction method architecture. MFCC: mel frequency cepstral coefficients; MLP: multilayer perceptron.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e56057_fig02.png"/></fig><p>We adopted a multilayer perceptron (MLP) as our classifier for global predictions, which is identical to the RNN. The MLP comprises layers with 512, 1024, 128, and 3 neurons, incorporating batch normalization and a 0.2 dropout rate to mitigate overfitting; a softmax layer was added as the prediction. The Adam optimizer was used with a base learning rate of 1e-3, &#x03B2;<sub>1</sub> of 0.9, &#x03B2;<sub>2</sub> of 0.999, and &#x03B5; of 1e-8. Given that deep learning methods are often considered &#x201C;dark magic,&#x201D; we sought to enhance comprehensibility by employing Grad-Cam [<xref ref-type="bibr" rid="ref43">43</xref>] to visualize the attention scores of the MLP&#x2019;s best-performing model across each feature. 
These results were then compared with Spearman and Kendall correlation coefficients computed using scikit-learn [<xref ref-type="bibr" rid="ref44">44</xref>].</p></sec><sec id="s2-5"><title>Statistical Analysis</title><p>Ablation experiments were conducted on various paradigm combinations. The models predicted three levels of severity, and binary performance was assessed to distinguish between depressed or nondepressed states. Sensitivity, specificity, accuracy, and area under the curve (AUC) were measured for binary results. Accuracy was specifically calculated for severity predictions. The five-fold performances underwent the Friedman test, followed by the posthoc Nemenyi test and Cliff &#x03B4; effect size. A 95% CI was computed using bootstrapping, with the exception of single fold clip prediction AUC, which used normal approximation.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Demographic characteristics are shown in <xref ref-type="table" rid="table1">Table 1</xref> and the findings of clip prediction and the Friedman test are presented in <xref ref-type="table" rid="table2">Table 2</xref>. The difference among Q&#x0026;A, MID, and QI (combination of Q&#x0026;A and MID) is significant in binary sensitivity (<italic>P</italic>=.02), with a large effect (&#x03B5;<sup>2</sup>=0.47). The differences in binary AUC and extent accuracy are close to significant (<italic>P</italic>=.07 and <italic>P=</italic>.09, respectively), with large effects (&#x03B5;<sup>2</sup>=0.26 and &#x03B5;<sup>2</sup>=0.23, respectively). The differences in binary specificity and binary accuracy were not significant and exhibited small effects. Posthoc Nemenyi test results for sensitivity are detailed in <xref ref-type="table" rid="table3">Table 3</xref>, revealing that Q&#x0026;A outperformed MID (<italic>P</italic>= .03) with a large effect size (Cliff &#x03B4;=1.0). The difference between QI and MID is close to significant (<italic>P</italic>=.06) with a large effect (Cliff &#x03B4;=1.0).</p><p>The results of individual prediction and the Friedman test are presented in <xref ref-type="table" rid="table4">Table 4</xref>. In the RNN voting analysis, the differences among Q&#x0026;A, MID, and QI were significant in terms of binary sensitivity (<italic>P</italic>&#x003C;.01) with a large effect (&#x03B5;<sup>2</sup>=0.61). The differences in binary accuracy and binary AUC were nonsignificant (<italic>P</italic> =.13 and <italic>P=</italic>.09, respectively) but showed large effects (&#x03B5;<sup>2</sup>=0.18 and &#x03B5;<sup>2</sup>=0.23, respectively). 
Posthoc Nemenyi test results on extent accuracy are presented in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Demographic characteristics.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Factors</td><td align="left" valign="bottom">MDD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">nonMDD<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Sex, n (%)</td><td align="left" valign="top">.13<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;Male</td><td align="left" valign="top">12 (29.3)</td><td align="left" valign="top">18 (37.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Female</td><td align="left" valign="top">29 (70.3)</td><td align="left" valign="top">30 (62.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Age (years), mean (SD)</td><td align="left" valign="top">38.41 (15.12)</td><td align="left" valign="top">35.98(12.37)</td><td align="left" valign="top">.50<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">HAM-D<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup>, mean (SD)</td><td align="left" valign="top">14.51 (4.66)</td><td align="left" valign="top">&#x2013;<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td><td align="left" valign="top">&#x2013;</td></tr><tr><td align="left" valign="top">PHQ-9<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup>, mean (SD)</td><td align="left" valign="top">13.05 (6.00)</td><td align="left" valign="top">4.17 (2.71)</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>MDD: major depressive disorder.</p></fn><fn id="table1fn2"><p><sup>b</sup>nonMDD: non-major depressive disorder.</p></fn><fn id="table1fn3"><p><sup>c</sup>Chi-square test was used to derive the <italic>P</italic> value.</p></fn><fn id="table1fn4"><p><sup>d</sup>Mann-Whitney U test was used to derive the <italic>P</italic> value.</p></fn><fn id="table1fn5"><p><sup>e</sup>HAM-D: Hamilton rating scale for Depression.</p></fn><fn id="table1fn6"><p><sup>f</sup>Not applicable.</p></fn><fn id="table1fn7"><p><sup>g</sup>PHQ-9: PHQ-9: Patient Health Questionnaire-9.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Clip prediction results of the MFCC-based<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> RNN<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> [<xref ref-type="bibr" rid="ref29">29</xref>] for paradigm combinations.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Paradigm/statistics/<break/>performance</td><td align="left" valign="bottom">Sensitivity (%), mean (95% CI)</td><td align="left" valign="bottom">Specificity (%), mean (95% CI)</td><td align="left" valign="bottom">SA<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> (%), mean (95% CI)</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> (%), mean (95% 
CI)</td><td align="left" valign="bottom">EA<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> (%), mean (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Q&#x0026;A<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top">79.06 (77.06&#x2010;83.35)</td><td align="left" valign="top">85.71 (73.30&#x2010;90.19)</td><td align="left" valign="top">83.01 (74.43&#x2010;86.10)</td><td align="left" valign="top">78.12 (66.11&#x2010;82.35)</td><td align="left" valign="top">88.70 (83.15&#x2010;91.33)</td></tr><tr><td align="left" valign="top">MID<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top">56.99 (41.78&#x2010;63.36)</td><td align="left" valign="top">85.10 (76.36&#x2010;89.81)</td><td align="left" valign="top">76.25 (69.90&#x2010;80.30)</td><td align="left" valign="top">70.40 (65.09&#x2010;73.60)</td><td align="left" valign="top">81.43 (76.42&#x2010;85.35)</td></tr><tr><td align="left" valign="top">QI<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="left" valign="top">80.22 (75.88&#x2010;84.72)</td><td align="left" valign="top">85.61 (77.06&#x2010;89.50)</td><td align="left" valign="top">84.41 (81.98&#x2010;86.44)</td><td align="left" valign="top">80.36 (76.45&#x2010;82.78)</td><td align="left" valign="top">90.37 (88.81&#x2010;91.64)</td></tr><tr><td align="left" valign="top"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table2fn9">i</xref></sup></td><td align="left" valign="top">.02</td><td align="left" valign="top">.55</td><td align="left" valign="top">.25</td><td align="left" valign="top">.07</td><td align="left" valign="top">.09</td></tr><tr><td align="left" valign="top">Effect size (&#x03B5;<sup>2</sup>)</td><td align="left" valign="top">0.47</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.07</td><td align="left" valign="top">0.27</td><td align="left" valign="top">0.23</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>MFCC: mel frequency cepstral coefficients.</p></fn><fn id="table2fn2"><p><sup>b</sup>RNN: recurrent neural network.</p></fn><fn id="table2fn3"><p><sup>c</sup>SA: screen accuracy.</p></fn><fn id="table2fn4"><p><sup>d</sup>AUC: area under the curve.</p></fn><fn id="table2fn5"><p><sup>e</sup>EA: extent accuracy.</p></fn><fn id="table2fn6"><p><sup>f</sup>Q&#x0026;A: question-and-answer.</p></fn><fn id="table2fn7"><p><sup>g</sup>MID: mental imagery description.</p></fn><fn id="table2fn8"><p><sup>h</sup>QI: combination of Q&#x0026;A and MID.</p></fn><fn id="table2fn9"><p><sup>i</sup>Friedman test was used to calculate the <italic>P</italic> value.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Posthoc test results of the MFCC-based<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> RNN<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> clip prediction sensitivity between pairs of Q&#x0026;A<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup>, MID<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>, and QI<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Paradigm statistic item</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="bottom">Cliff &#x03B4; (effect 
size)</td></tr></thead><tbody><tr><td align="left" valign="top">Q&#x0026;A-MID<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td><td align="left" valign="top">.03</td><td align="left" valign="top">1.0 (large)</td></tr><tr><td align="left" valign="top">Q&#x0026;A-QI<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">.90</td><td align="left" valign="top">&#x2212;0.04 (negligible)</td></tr><tr><td align="left" valign="top">QI-MID<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup></td><td align="left" valign="top">.06</td><td align="left" valign="top">1.0 (large)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>MFCC: mel frequency cepstral coefficients.</p></fn><fn id="table3fn2"><p><sup>b</sup>RNN: recurrent neural network.</p></fn><fn id="table3fn3"><p><sup>c</sup>Q&#x0026;A: question-and-answer.</p></fn><fn id="table3fn4"><p><sup>d</sup>MID: mental imagery description.</p></fn><fn id="table3fn5"><p><sup>e</sup>QI: combination of Q&#x0026;A and MID.</p></fn><fn id="table3fn6"><p><sup>f</sup>Nemenyi test.</p></fn><fn id="table3fn7"><p><sup>g</sup>Q&#x0026;A-MID: comparison between Q&#x0026;A and mental imagery description.</p></fn><fn id="table3fn8"><p><sup>h</sup>Q&#x0026;A-QI: comparison between Q&#x0026;A and combination of Q&#x0026;A and mental imagery description.</p></fn><fn id="table3fn9"><p><sup>i</sup>QI-MID: combination of Q&#x0026;A and mental imagery description, and single paradigm mental imagery description.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Individual prediction results of the MFCC-based<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> RNN<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> voting for paradigm combinations.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Paradigm performance</td><td align="left" valign="bottom">Sensitivity (%), mean (95% CI)</td><td align="left" valign="bottom">Specificity (%), mean (95% CI)</td><td align="left" valign="bottom">SA<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> (%), mean (95% CI)</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup> (%), mean (95% CI)</td><td align="left" valign="bottom">EA<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup> (%), mean (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Q&#x0026;A<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">75.00 (57.50&#x2010;87.50)</td><td align="left" valign="top">82.22 (68.89&#x2010;86.67)</td><td align="left" valign="top">78.82 (72.94&#x2010;83.53)</td><td align="left" valign="top">90.83 (84.17&#x2010;95.83)</td><td align="left" valign="top">70.59 (57.65&#x2010;77.65)</td></tr><tr><td align="left" valign="top">MID<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">60.00 (42.50&#x2010;67.50)</td><td align="left" valign="top">88.89 (77.78&#x2010;95.56)</td><td align="left" valign="top">75.29 (62.35&#x2010;80.00)</td><td align="left" valign="top">85.00 (81.94&#x2010;90.45)</td><td align="left" valign="top">65.88 (57.65&#x2010;71.77)</td></tr><tr><td align="left" valign="top">SQI<sup><xref ref-type="table-fn" rid="table4fn8">h</xref></sup></td><td align="left" valign="top">75.00 (50.00&#x2010;87.50)</td><td align="left" valign="top">88.89 
(80.00&#x2010;93.33)</td><td align="left" valign="top">82.36 (69.42&#x2010;89.42)</td><td align="left" valign="top">92.50 (86.11&#x2010;96.11)</td><td align="left" valign="top">80.00 (65.88&#x2010;88.24)</td></tr><tr><td align="left" valign="top"><italic>P</italic> value</td><td align="left" valign="top">.29</td><td align="left" valign="top">.26</td><td align="left" valign="top">.13</td><td align="left" valign="top">.09</td><td align="left" valign="top">.009</td></tr><tr><td align="left" valign="top">Effect size</td><td align="left" valign="top">0.04</td><td align="left" valign="top">0.06</td><td align="left" valign="top">0.18</td><td align="left" valign="top">0.23</td><td align="left" valign="top">0.61</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>MFCC: mel frequency cepstral coefficients.</p></fn><fn id="table4fn2"><p><sup>b</sup>RNN: recurrent neural network.</p></fn><fn id="table4fn3"><p><sup>c</sup>SA: screen accuracy.</p></fn><fn id="table4fn4"><p><sup>d</sup>AUC: area under the curve.</p></fn><fn id="table4fn5"><p><sup>e</sup>EA: extent accuracy.</p></fn><fn id="table4fn6"><p><sup>f</sup>Q&#x0026;A: question-and-answer.</p></fn><fn id="table4fn7"><p><sup>g</sup>MID: mental imagery description.</p></fn><fn id="table4fn8"><p><sup>h</sup>SQI: combination of conventional questionnaire, Q&#x0026;A, and mental imagery description.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table5">Table 5</xref>. QI outperformed MID (<italic>P</italic>&#x003C;.05) with a substantial effect (Cliff &#x03B4;=0.64). The difference between Q&#x0026;A and QI was nonsignificant (<italic>P</italic>=.14) but indicated a large effect (Cliff &#x03B4;=&#x2212;0.48). In the global feature MLP analysis, differences among the paradigms were insignificant and exhibited small effect sizes, with results in Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>The best fold performance is shown in <xref ref-type="table" rid="table6">Table 6</xref>. The global feature, SQIV (combination paradigm of CS, Q&#x0026;A, MID, and VW) MLP achieved a peak individual binary accuracy of 94.12%. 
Notably, the RNN voting SQI model also achieved a top accuracy of 94.12%, but with a higher extent accuracy of 94.12%, and an AUC of 0.99.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Post-hoc statistic test results of the RNN<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup> voting individual prediction extent accuracy between pairs of Q&#x0026;A,<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> MID<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup>, and QI<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Paradigm statistic item</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="bottom">Effect size (Cliff &#x03B4;)</td></tr></thead><tbody><tr><td align="left" valign="top">Q&#x0026;A-MID<sup><xref ref-type="table-fn" rid="table5fn6">f</xref></sup></td><td align="left" valign="top">.60</td><td align="left" valign="top">0.32 (small)</td></tr><tr><td align="left" valign="top">Q&#x0026;A-QI<sup><xref ref-type="table-fn" rid="table5fn7">g</xref></sup></td><td align="left" valign="top">.14</td><td align="left" valign="top">&#x2212;0.48 (large)</td></tr><tr><td align="left" valign="top">QI-MID<sup><xref ref-type="table-fn" rid="table5fn8">h</xref></sup></td><td align="left" valign="top">.01</td><td align="left" valign="top">0.64 (large)</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>RNN: recurrent neural network.</p></fn><fn id="table5fn2"><p><sup>b</sup>Q&#x0026;A: question-and-answer.</p></fn><fn id="table5fn3"><p><sup>c</sup>MID: mental imagery description.</p></fn><fn id="table5fn4"><p><sup>d</sup>QI: combination of Q&#x0026;A and MID.</p></fn><fn id="table5fn5"><p><sup>e</sup>Nemenyi test.</p></fn><fn id="table5fn6"><p><sup>f</sup>Q&#x0026;A-MID: comparison between Q&#x0026;A and mental imagery description.</p></fn><fn id="table5fn7"><p><sup>g</sup>Q&#x0026;A-QI: comparison between Q&#x0026;A and combination of Q&#x0026;A and mental imagery description.</p></fn><fn id="table5fn8"><p><sup>h</sup>QI-MID: combination of Q&#x0026;A and mental imagery description, and single paradigm mental imagery description.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Performance of the best fold of the global feature MLP<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup> SQIV<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> model, the MFCC-based<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup> RNN<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup> SQI<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup> clip model, and the MFCC-based RNN SQI<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup> voting model.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Paradigm performance</td><td align="left" valign="bottom">Sensitivity %, (95% CI)</td><td align="left" valign="bottom">Specificity %, (95% CI)</td><td align="left" valign="bottom">SA<sup><xref ref-type="table-fn" rid="table6fn6">f</xref></sup> %, (95% CI)</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table6fn7">g</xref></sup> (95% CI)</td><td align="left" valign="bottom">EA<sup><xref 
ref-type="table-fn" rid="table6fn8">h</xref></sup> %(95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">MLP<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup></td><td align="left" valign="top">SQIV<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td><td align="left" valign="top">100.0 (63.06&#x2010;100.0)</td><td align="left" valign="top">88.89 (51.75&#x2010;99.72)</td><td align="left" valign="top">94.12 (71.31&#x2010;99.85)</td><td align="left" valign="top">0.97 (0.87&#x2010;1.0)</td><td align="left" valign="top">76.47 (50.10&#x2010;93.19)</td></tr><tr><td align="left" valign="top">RNN<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">SQI<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup></td><td align="left" valign="top">80.69 (77.74&#x2010;83.64)</td><td align="left" valign="top">89.93 (88.60&#x2010;91.26)</td><td align="left" valign="top">87.36 (86.09&#x2010;88.63)</td><td align="left" valign="top">0.91 (0.90&#x2010;0.92)</td><td align="left" valign="top">83.03 (81.60&#x2010;84.46)</td></tr><tr><td align="left" valign="top">RNN<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup> voting</td><td align="left" valign="top">SQI<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup></td><td align="left" valign="top">100.0 (63.06&#x2010;100.0)</td><td align="left" valign="top">88.89 (51.75&#x2010;99.72)</td><td align="left" valign="top">94.12 (71.31&#x2010;99.85)</td><td align="left" valign="top">0.99 (0.91&#x2010;1.0)</td><td align="left" valign="top">94.12 (71.31&#x2010;99.85)</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>MLP: multilayer perceptron.</p></fn><fn id="table6fn2"><p><sup>b</sup>SQIV: combination of conventional scale, Q&#x0026;A, mental imagery description, and video watching.</p></fn><fn id="table6fn3"><p><sup>c</sup>MFCC: mel frequency cepstral coefficients.</p></fn><fn id="table6fn4"><p><sup>d</sup>RNN: recurrent neural network.</p></fn><fn id="table6fn5"><p><sup>e</sup>SQI: combination of conventional scale, Q&#x0026;A, and mental imagery description</p></fn><fn id="table6fn6"><p><sup>f</sup>SA: screen accuracy</p></fn><fn id="table6fn7"><p><sup>g</sup>AUC: area under the curve</p></fn><fn id="table6fn8"><p><sup>h</sup>EA: extent accuracy</p></fn></table-wrap-foot></table-wrap><p>The test results of comparison between the RNN voting and the proposed global feature method can be found in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The input data are the same in Q&#x0026;A and MID; for paradigm combinations, we compared RNN-voting QI and the global feature method (ie, combination paradigm of CS, Q&#x0026;A, and MID; SQI), as they use the most comparable input data. No statistically significant differences were identified between the two methods across all three paradigms. Figures S1-S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> illustrate the collective and individual learnings of the best MLP SQIV model. The mean attention scores of the features are sequenced by the nonMDD group, the MDD-sub1 group, and the MDD-sub2 group. The complete attention scores with Spearman correlation scores are mentioned in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, sorted in descending order for the MDD-sub2 group. 
Previously analyzed nonverbal markers in studies by Fiquer et al [<xref ref-type="bibr" rid="ref20">20</xref>] and Girard et al [<xref ref-type="bibr" rid="ref45">45</xref>] can be found in Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. As shown in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, the different groups exhibit varying levels of attention to specific features. The Spearman and Kendall correlation coefficients for each feature relative to the target extent of depression are available in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, where 18 features demonstrated a <italic>P</italic> value&#x003C;.05.</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>We aimed to evaluate the efficacy of different paradigms via AI on audiovisual signals. We aggregated the four paradigms within the ETD and held 5-fold cross-validation on the two AI models among the paradigm combinations. Our findings show that there are differences in paradigm efficacies, and the AI model learns knowledge consistent with prior human experience.</p><p>For the single paradigm with the MFCC-based RNN, Q&#x0026;A outperformed MID in identifying patients but performed equally in distinguishing extent levels. The difference between Q&#x0026;A and MID in clip sensitivity was significant, but nonsignificant in individual extent accuracy. This makes Q&#x0026;A more precise in identifying MDD patients.</p><p>For paradigm combinations with the MFCC-based RNN, integrating MID with Q&#x0026;A slightly decreased clip sensitivity significance but significantly improved individual extent accuracy significance compared with MID. Considering that the difference between QI and Q&#x0026;A was nonsignificant in either clip sensitivity and extent accuracy, and the differences among Q&#x0026;A, MID, and QI were nonsignificant in the other performance indexes, we conclude that Q&#x0026;A demonstrated higher efficacy than MID and suggest that paradigm combinations perform better than a single paradigm. As known, Q&#x0026;A is a simplified version of a clinical interview, and the questions are all symptom-related, which makes it the most relevant paradigm for MDD and the most important one.</p><p>In the individual prediction of the global feature MLP, no significant differences were observed across the paradigm combinations. When fixing the paradigms, no notable differences were found between the MFCC-based RNN voting and the global feature MLP, which validated the global features&#x2019; effectiveness. Some large effect sizes were noted, particularly in binary AUC and extent accuracy for Q&#x0026;A and QI, which may be attributable to feature granularity. The RNN feature integrates both local and global information, with local features benefiting from transfer learning, which enhances performance&#x2014;achieving a top binary accuracy of 94.12%, a top mean binary accuracy of 82.36%, and a mean extent accuracy of 80.00%. In contrast, the global features may be coarse at the granular level. Even so, the MLP still achieved a mean binary accuracy ranging from 74.12% to 85.88%, with a 95% CI spanning 69.41% to 91.77%, and a top binary accuracy of 94.12%. 
Overfitting might exist and could result in wide CIs.</p><p>Compared with support vector machine models [<xref ref-type="bibr" rid="ref7">7</xref>], which showed a mean binary accuracy of 78.95% on event-related fMRI [<xref ref-type="bibr" rid="ref46">46</xref>] and 85.00% on block-related fMRI [<xref ref-type="bibr" rid="ref47">47</xref>] and CNN [<xref ref-type="bibr" rid="ref8">8</xref>], which achieved a mean binary accuracy of 85.62% on EEG data, the ETD demonstrated equivalent performance while relying on much more readily accessible daily data. Compared to SRSIs, the audiovisual data are more objective and easier to use. Compared to interview-based assessments such as the HAMD [<xref ref-type="bibr" rid="ref2">2</xref>], the ETD required approximately 5 minutes, saving about 83% of the time. The ETD&#x2019;s performance and efficiency support its potential to objectively, accurately, and efficiently screen for MDD.</p><p>In the visualization, it is noteworthy that the high- and low-attention features did not intersect, particularly between the MDD group and the nonMDD groups, indicating that participants in different groups exhibited diverse behavioral patterns. Almost all 16 features mentioned by Fiquer et al [<xref ref-type="bibr" rid="ref20">20</xref>] and Girard et al [<xref ref-type="bibr" rid="ref21">21</xref>] exhibited high attention scores, with the lowest score being 0.68. Among these, 15 features ranked in the top 20%, except for &#x201C;head down&#x201D; which ranked 31st in MDD-sub1, demonstrating consistency with prior studies. Additionally, &#x201C;head motion velocity&#x201D; was not included in this study. When comparing with correlation results, 18 features emerged as having significant positive or negative relationships with MDD extent, both in feature items and in correlation trends observed via Spearman and Kendall methods&#x2014;differing only in specific weights. Of these, 11 aligned with the correlation trends; 5 showed patterns in which the attention scores for the MDD group were either higher or lower than those of the nonMDD group, and only 2 showed no clear trends. For instance, the mean attention score of AU4, interpreted as &#x201C;frown&#x201D; by Fiquer et al [<xref ref-type="bibr" rid="ref20">20</xref>], increased with the extent of MDD, and the Spearman correlation for AU4 was positive (<italic>P</italic>=.005). These high-attention score and correlation-consistent features may serve as urgently needed objective markers and should be further investigated.</p><p>The MLP leverages global features representing statistical values throughout the process, sacrificing some detail at the granular level while maintaining low model and computational complexity. Despite potential overfitting, the alignment of the visualization results with correlation findings indicates that the MLP has acquired knowledge consistent with medical prior knowledge, supporting its performance and underscores its potential as a valuable tool. For the inconsistent elements, the neural network introduces significant nonlinearity and captures relationships in high-dimensional spaces. In contrast, Spearman and Kendall correlations are limited to assessing relationships between single inputs and targets. We propose that a trained model can reveal complex multi-input&#x2013;target relationships that are difficult to define manually. 
Furthermore, results may vary with the accumulation of additional data.</p><p>The ETD&#x2019;s efficiency&#x2014;requiring less time and energy&#x2014; and its objectivity and accuracy make it a flexible and practical tool to be applied across diverse medical scenes that prioritize lightweight and quietness, particularly in screening and health monitoring scenes. Multimodal analysis may produce better results; for instance, AUC of binary depression status increased from 0.72 to 0.76 with networked smartphone sensors combining to text messages [<xref ref-type="bibr" rid="ref12">12</xref>], and the binary accuracy increased from 76.27% to 95.60% when AU features were added to acoustic features in the baseline MFCC-based RNN [<xref ref-type="bibr" rid="ref29">29</xref>]. As wearable devices continue to gain popularity, easily obtainable physical signals such as ECG and photoplethysmography can be integrated as additional modalities to enhance clinical outcomes. In our work, we currently use visual, acoustic, and text information jointly, which we believe may be a key point in the high performance observed and should receive more attention in future studies. As audiovisual features are also related to other conditions such as anxiety disorder and schizophrenia [<xref ref-type="bibr" rid="ref48">48</xref>], or to distinguish between MDD or bipolar depression [<xref ref-type="bibr" rid="ref49">49</xref>], aggregating multiple paradigms may further improve efficacy.</p></sec><sec id="s4-2"><title>Conclusions</title><p>The Q&#x0026;A method showed greater efficacy compared to MID, and combining paradigms may yield better results than using individual paradigms alone. Visualization interpretation showed that the AI method acquired knowledge that aligns with medical expertise and identified several potentially significant markers. By applying AI to multimodal audiovisual signals, these findings position the ETD as a valuable, objective tool for screening MDD and show potential for applications across a broader spectrum of psychiatric disorders with various data modalities.</p></sec><sec id="s4-3"><title>Limitations</title><p>The efficacy of the modalities remains inadequately explored. Automatically detected AUs may not achieve the reliability of human-labeled results. 
Additionally, conclusions drawn from current analyses may require revision as sample sizes increase, particularly in deep learning frameworks.</p></sec></sec></body><back><ack><p>The study was supported by Sci-Tech Innovation 2030 &#x2013; Major Project of Brain science and brain-inspired intelligence technology (2021ZD0200600) and the Beijing Municipal Administration of Hospitals Incubating Program (PX2019068).</p></ack><notes><sec><title>Data Availability</title><p>The datasets analyzed during this study are not publicly available as the agreement is only between the participants and researchers but are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AU</term><def><p>action unit</p></def></def-item><def-item><term id="abb3">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb4">CNN</term><def><p>conventional neural network</p></def></def-item><def-item><term id="abb5">CS</term><def><p>conventional scale</p></def></def-item><def-item><term id="abb6">DL</term><def><p>deep learning</p></def></def-item><def-item><term id="abb7">EEG</term><def><p>electroencephalogram</p></def></def-item><def-item><term id="abb8">ETD</term><def><p>electronic tool for depression</p></def></def-item><def-item><term id="abb9">fMRI</term><def><p>functional magnetic resonance imaging</p></def></def-item><def-item><term id="abb10">HAMD</term><def><p>Hamilton rating scale for depression</p></def></def-item><def-item><term id="abb11">MDD</term><def><p>major depressive disorder</p></def></def-item><def-item><term id="abb12">MFCC</term><def><p>mel frequency cepstral coefficients</p></def></def-item><def-item><term id="abb13">MID</term><def><p>mental imagery description</p></def></def-item><def-item><term id="abb14">PHQ-8</term><def><p>Patient Health Questionnaire-8</p></def></def-item><def-item><term id="abb15">PHQ-9</term><def><p>Patient Health Questionnaire-9</p></def></def-item><def-item><term id="abb16">Q&#x0026;A</term><def><p>question and answering</p></def></def-item><def-item><term id="abb17">QI</term><def><p>combination of Q&#x0026;A and MID</p></def></def-item><def-item><term id="abb18">RNN</term><def><p>recurrent neural network</p></def></def-item><def-item><term id="abb19">SOTA</term><def><p>state-of-the-art</p></def></def-item><def-item><term id="abb20">SQ</term><def><p>combination paradigm of CS and Q&#x0026;A</p></def></def-item><def-item><term id="abb21">SQIV</term><def><p>combination paradigm of CS, Q&#x0026;A, MID, and VW</p></def></def-item><def-item><term id="abb22">SRSI</term><def><p>Self-Report Scales and Inventories</p></def></def-item><def-item><term id="abb23">VW</term><def><p>video-watching paradigm</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Depression and other common mental disorders: global health estimates</article-title><source>World Health Organization</source><year>2017</year><access-date>2025-03-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/publications/i/item/depression-global-health-estimates">https://www.who.int/publications/i/item/depression-global-health-estimates</ext-link></comment></nlm-citation></ref><ref 
id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>HAMILTON</surname><given-names>M</given-names> </name></person-group><article-title>A rating scale for depression</article-title><source>J Neurol Neurosurg Psychiatry</source><year>1960</year><month>02</month><volume>23</volume><issue>1</issue><fpage>56</fpage><lpage>62</lpage><pub-id pub-id-type="doi">10.1136/jnnp.23.1.56</pub-id><pub-id pub-id-type="medline">14399272</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beck</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Steer</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Ball</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ranieri</surname><given-names>W</given-names> </name></person-group><article-title>Comparison of Beck Depression Inventories -IA and -II in psychiatric outpatients</article-title><source>J Pers Assess</source><year>1996</year><month>12</month><volume>67</volume><issue>3</issue><fpage>588</fpage><lpage>597</lpage><pub-id pub-id-type="doi">10.1207/s15327752jpa6703_13</pub-id><pub-id pub-id-type="medline">8991972</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kroenke</surname><given-names>K</given-names> </name><name name-style="western"><surname>Spitzer</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>JB</given-names> </name></person-group><article-title>The PHQ-9: validity of a brief depression severity measure</article-title><source>J Gen Intern Med</source><year>2001</year><month>09</month><volume>16</volume><issue>9</issue><fpage>606</fpage><lpage>613</lpage><pub-id pub-id-type="doi">10.1046/j.1525-1497.2001.016009606.x</pub-id><pub-id pub-id-type="medline">11556941</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Pichot</surname><given-names>P</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Hippius</surname><given-names>H</given-names> </name><name name-style="western"><surname>Klerman</surname><given-names>GL</given-names> </name><name name-style="western"><surname>Matussek</surname><given-names>N</given-names> </name></person-group><article-title>Self-report inventories in the study of depression</article-title><source>New Results in Depression Research</source><year>1986</year><fpage>53</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.1007/978-3-642-70702-5_7</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ben-Porath</surname><given-names>YS</given-names> </name></person-group><article-title>Assessing personality and psychopathology with self-report inventories</article-title><source>Handbook of Psychology</source><year>2003</year><fpage>553</fpage><lpage>577</lpage><pub-id pub-id-type="doi">10.1002/0471264385</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Rosa</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Portugal</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hahn</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Sparse network-based models for patient classification using fMRI</article-title><source>Neuroimage</source><year>2015</year><month>01</month><day>15</day><volume>105</volume><fpage>493</fpage><lpage>506</lpage><pub-id pub-id-type="doi">10.1016/j.neuroimage.2014.11.021</pub-id><pub-id pub-id-type="medline">25463459</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>La</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>EEG-based mild depression recognition using convolutional neural network</article-title><source>Med Biol Eng Comput</source><year>2019</year><month>06</month><volume>57</volume><issue>6</issue><fpage>1341</fpage><lpage>1352</lpage><pub-id pub-id-type="doi">10.1007/s11517-019-01959-2</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reece</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Reagan</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Lix</surname><given-names>KLM</given-names> </name><name name-style="western"><surname>Dodds</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Danforth</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Langer</surname><given-names>EJ</given-names> </name></person-group><article-title>Forecasting the onset and course of mental illness with Twitter data</article-title><source>Sci Rep</source><year>2017</year><month>10</month><day>11</day><volume>7</volume><issue>1</issue><fpage>13006</fpage><pub-id pub-id-type="doi">10.1038/s41598-017-12961-9</pub-id><pub-id pub-id-type="medline">29021528</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Schwartz</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Eichstaedt</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kern</surname><given-names>ML</given-names> </name><etal/></person-group><article-title>Towards assessing changes in degree of depression through facebook</article-title><conf-name>Proceedings of the Workshop on Computational Linguistics and Clinical Psychology</conf-name><conf-date>Jun 2014</conf-date><conf-loc>Baltimore, Maryland, USA</conf-loc><pub-id pub-id-type="doi">10.3115/v1/W14-3214</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chandra Guntuku</surname><given-names>S</given-names> </name><name name-style="western"><surname>Buffone</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jaidka</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Eichstaedt</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Ungar</surname><given-names>LH</given-names> </name></person-group><article-title>Understanding and measuring psychological stress using social media</article-title><source>ICWSM</source><year>2019</year><volume>13</volume><issue>1</issue><fpage>214</fpage><lpage>225</lpage><pub-id pub-id-type="doi">10.1609/icwsm.v13i01.3223</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Meyerhoff</surname><given-names>J</given-names> </name><name name-style="western"><surname>Eichstaedt</surname><given-names>JC</given-names> </name><etal/></person-group><article-title>The relationship between text message sentiment and self-reported depression</article-title><source>J Affect Disord</source><year>2022</year><month>04</month><day>1</day><volume>302</volume><fpage>7</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2021.12.048</pub-id><pub-id pub-id-type="medline">34963643</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cummins</surname><given-names>N</given-names> </name><name name-style="western"><surname>Scherer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Krajewski</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schnieder</surname><given-names>S</given-names> </name><name name-style="western"><surname>Epps</surname><given-names>J</given-names> </name><name name-style="western"><surname>Quatieri</surname><given-names>TF</given-names> </name></person-group><article-title>A review of depression and suicide risk assessment using speech analysis</article-title><source>Speech Commun</source><year>2015</year><month>07</month><volume>71</volume><fpage>10</fpage><lpage>49</lpage><pub-id pub-id-type="doi">10.1016/j.specom.2015.03.004</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Scherer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Morency</surname><given-names>LP</given-names> </name><name name-style="western"><surname>Gratch</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pestian</surname><given-names>J</given-names> </name></person-group><article-title>Reduced vowel space is a robust indicator of psychological distress: a cross-corpus analysis</article-title><conf-name>ICASSP 2015 - 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name><conf-date>Apr 19-24, 2025</conf-date><conf-loc>South Brisbane, Queensland, Australia</conf-loc><pub-id pub-id-type="doi">10.1109/ICASSP.2015.7178880</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Logan</surname><given-names>B</given-names> </name></person-group><article-title>Mel frequency cepstral coefficients for music modeling</article-title><source>Proc of Ismir</source><year>2000</year><access-date>2025-05-25</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://ismir2000.ismir.net/papers/logan_paper.pdf">https://ismir2000.ismir.net/papers/logan_paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cummins</surname><given-names>N</given-names> </name><name name-style="western"><surname>Epps</surname><given-names>J</given-names> </name><name name-style="western"><surname>Breakspear</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goecke</surname><given-names>R</given-names> </name></person-group><article-title>An investigation of depressed speech detection: features and normalization</article-title><conf-name>INTERSPEECH 2011</conf-name><conf-date>Aug 27-31, 2011</conf-date><conf-loc>Florence, Italy</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://www.isca-archive.org/interspeech_2011">https://www.isca-archive.org/interspeech_2011</ext-link></comment><pub-id pub-id-type="doi">10.21437/Interspeech.2011-750</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Jang</surname><given-names>EH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>KY</given-names> </name><name name-style="western"><surname>Park</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>HC</given-names> </name></person-group><article-title>Automatic depression detection using smartphone-based text-dependent speech signals: deep convolutional neural network approach</article-title><source>J Med Internet Res</source><year>2023</year><month>01</month><day>25</day><volume>25</volume><fpage>e34474</fpage><pub-id pub-id-type="doi">10.2196/34474</pub-id><pub-id pub-id-type="medline">36696160</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gratch</surname><given-names>J</given-names> </name><name name-style="western"><surname>Artstein</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lucas</surname><given-names>G</given-names> </name></person-group><article-title>The distress analysis interview corpus of human and computer interviews</article-title><year>2019</year><month>10</month><day>15</day><access-date>2025-03-21</access-date><conf-name>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC&#x2019;14)</conf-name><conf-loc>Reykjavik, Iceland</conf-loc><fpage>3123</fpage><lpage>3128</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://www.lrec-conf.org/proceedings/lrec2014/pdf/508_Paper.pdf">http://www.lrec-conf.org/proceedings/lrec2014/pdf/508_Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Ekman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Friesen</surname><given-names>WV</given-names> </name></person-group><article-title>Facial action coding system (FACS): a technique for the measurement of facial 
actions</article-title><source>APA PsycTests</source><access-date>2025-03-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1037/t27734-000">https://doi.org/10.1037/t27734-000</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fiquer</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Moreno</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Brunoni</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Barros</surname><given-names>VB</given-names> </name><name name-style="western"><surname>Fernandes</surname><given-names>F</given-names> </name><name name-style="western"><surname>Gorenstein</surname><given-names>C</given-names> </name></person-group><article-title>What is the nonverbal communication of depression? Assessing expressive differences between depressive patients and healthy volunteers during clinical interviews</article-title><source>J Affect Disord</source><year>2018</year><month>10</month><day>1</day><volume>238</volume><fpage>636</fpage><lpage>644</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2018.05.071</pub-id><pub-id pub-id-type="medline">29957481</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Girard</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Cohn</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Mahoor</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Mavadati</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rosenwald</surname><given-names>DP</given-names> </name></person-group><article-title>Social risk and depression: evidence from manual and automatic facial expression analysis</article-title><conf-name>2013 10th IEEE International Conference and Workshops on Automatic Face and Gesture Recognition (FG)</conf-name><conf-date>Apr 22-26, 2013</conf-date><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1109/FG.2013.6553748</pub-id><pub-id pub-id-type="medline">24598859</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gavrilescu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vizireanu</surname><given-names>N</given-names> </name></person-group><article-title>Predicting depression, anxiety, and stress levels from videos using the facial action coding system</article-title><source>Sensors (Basel)</source><year>2019</year><month>08</month><day>25</day><volume>19</volume><issue>17</issue><fpage>3693</fpage><pub-id pub-id-type="doi">10.3390/s19173693</pub-id><pub-id pub-id-type="medline">31450687</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Melfsen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Osterlow</surname><given-names>J</given-names> </name><name name-style="western"><surname>Florin</surname><given-names>I</given-names> </name></person-group><article-title>Deliberate emotional expressions of socially anxious children 
and their mothers</article-title><source>J Anxiety Disord</source><year>2000</year><volume>14</volume><issue>3</issue><fpage>249</fpage><lpage>261</lpage><pub-id pub-id-type="doi">10.1016/s0887-6185(99)00037-7</pub-id><pub-id pub-id-type="medline">10868983</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Ellgring</surname><given-names>H</given-names> </name></person-group><article-title>Spontaneous and posed facial expression in Parkinson&#x2019;s disease</article-title><source>J Int Neuropsychol Soc</source><year>1996</year><month>09</month><volume>2</volume><issue>5</issue><fpage>383</fpage><lpage>391</lpage><pub-id pub-id-type="doi">10.1017/s1355617700001454</pub-id><pub-id pub-id-type="medline">9375163</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mundt</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Snyder</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Cannizzaro</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Chappie</surname><given-names>K</given-names> </name><name name-style="western"><surname>Geralts</surname><given-names>DS</given-names> </name></person-group><article-title>Voice acoustic measures of depression severity and treatment response collected via interactive voice response (IVR) technology</article-title><source>J Neurolinguistics</source><year>2007</year><month>01</month><volume>20</volume><issue>1</issue><fpage>50</fpage><lpage>64</lpage><pub-id pub-id-type="doi">10.1016/j.jneuroling.2006.04.001</pub-id><pub-id pub-id-type="medline">21253440</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>McIntyre</surname><given-names>G</given-names> </name><name name-style="western"><surname>Goecke</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hyett</surname><given-names>M</given-names> </name><name name-style="western"><surname>Green</surname><given-names>M</given-names> </name><name name-style="western"><surname>Breakspear</surname><given-names>M</given-names> </name></person-group><article-title>An approach for automatically measuring facial activity in depressed subjects</article-title><conf-name>2009 3rd International Conference on Affective Computing and Intelligent Interaction and Workshops (ACII 2009)</conf-name><conf-date>Sep 10-12, 2009</conf-date><conf-loc>Amsterdam</conf-loc><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1109/ACII.2009.5349593</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cai</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A multi-modal open dataset for mental-disorder analysis</article-title><source>Sci
Data</source><year>2022</year><month>04</month><day>19</day><volume>9</volume><issue>1</issue><fpage>178</fpage><pub-id pub-id-type="doi">10.1038/s41597-022-01211-x</pub-id><pub-id pub-id-type="medline">35440583</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ray</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Reddy</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mukherjee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Garg</surname><given-names>R</given-names> </name></person-group><article-title>Multi-level attention network using text, audio and video for depression prediction</article-title><conf-name>AVEC &#x2019;19: Proceedings of the 9th International on Audio/Visual Emotion Challenge and Workshop</conf-name><conf-date>Oct 15, 2019</conf-date><conf-loc>Nice France</conf-loc><fpage>81</fpage><lpage>88</lpage><pub-id pub-id-type="doi">10.1145/3347320.3357697</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rejaibi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Komaty</surname><given-names>A</given-names> </name><name name-style="western"><surname>Meriaudeau</surname><given-names>F</given-names> </name><name name-style="western"><surname>Agrebi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Othmani</surname><given-names>A</given-names> </name></person-group><article-title>MFCC-based recurrent neural network for automatic clinical depression recognition and assessment from speech</article-title><source>Biomed Signal Process Control</source><year>2022</year><month>01</month><volume>71</volume><fpage>103107</fpage><pub-id pub-id-type="doi">10.1016/j.bspc.2021.103107</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Holmes</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Mathews</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mackintosh</surname><given-names>B</given-names> </name><name name-style="western"><surname>Dalgleish</surname><given-names>T</given-names> </name></person-group><article-title>The causal effect of mental imagery on emotion assessed using picture-word cues</article-title><source>Emotion</source><year>2008</year><month>06</month><volume>8</volume><issue>3</issue><fpage>395</fpage><lpage>409</lpage><pub-id pub-id-type="doi">10.1037/1528-3542.8.3.395</pub-id><pub-id pub-id-type="medline">18540755</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>We&#x00DF;lau</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cloos</surname><given-names>M</given-names> </name><name name-style="western"><surname>H&#x00F6;fling</surname><given-names>V</given-names> </name><name name-style="western"><surname>Steil</surname><given-names>R</given-names> </name></person-group><article-title>Visual mental imagery and symptoms of depression - 
results from a large-scale web-based study</article-title><source>BMC Psychiatry</source><year>2015</year><month>12</month><day>2</day><volume>15</volume><fpage>308</fpage><pub-id pub-id-type="doi">10.1186/s12888-015-0689-1</pub-id><pub-id pub-id-type="medline">26631081</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Andrade</surname><given-names>J</given-names> </name><name name-style="western"><surname>May</surname><given-names>J</given-names> </name><name name-style="western"><surname>Deeprose</surname><given-names>C</given-names> </name><name name-style="western"><surname>Baugh</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Ganis</surname><given-names>G</given-names> </name></person-group><article-title>Assessing vividness of mental imagery: The Plymouth Sensory Imagery Questionnaire</article-title><source>Br J Psychol</source><year>2014</year><month>11</month><volume>105</volume><issue>4</issue><fpage>547</fpage><lpage>563</lpage><pub-id pub-id-type="doi">10.1111/bjop.12050</pub-id><pub-id pub-id-type="medline">24117327</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tiggemann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kemps</surname><given-names>E</given-names> </name></person-group><article-title>The phenomenology of food cravings: the role of mental imagery</article-title><source>Appetite</source><year>2005</year><month>12</month><volume>45</volume><issue>3</issue><fpage>305</fpage><lpage>313</lpage><pub-id pub-id-type="doi">10.1016/j.appet.2005.06.004</pub-id><pub-id pub-id-type="medline">16112776</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ge</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Houston</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name></person-group><article-title>A standardised database of Chinese emotional film clips</article-title><source>Cogn Emot</source><year>2019</year><month>08</month><volume>33</volume><issue>5</issue><fpage>976</fpage><lpage>990</lpage><pub-id pub-id-type="doi">10.1080/02699931.2018.1530197</pub-id><pub-id pub-id-type="medline">30293475</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>G</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ge</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>Y</given-names> </name></person-group><article-title>Real-time movie-induced discrete emotion recognition from EEG signals</article-title><source>IEEE
Trans Affective Comput</source><year>2018</year><volume>9</volume><issue>4</issue><fpage>550</fpage><lpage>562</lpage><pub-id pub-id-type="doi">10.1109/TAFFC.2017.2660485</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>International Advisory Group for the Revision of ICD-10 Mental and Behavioural Disorders</collab></person-group><article-title>A conceptual framework for the revision of the ICD-10 classification of mental and behavioural disorders</article-title><source>World Psychiatry</source><year>2011</year><month>06</month><volume>10</volume><issue>2</issue><fpage>86</fpage><lpage>92</lpage><pub-id pub-id-type="doi">10.1002/j.2051-5545.2011.tb00022.x</pub-id><pub-id pub-id-type="medline">21633677</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Livingstone</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Russo</surname><given-names>FA</given-names> </name></person-group><article-title>The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English</article-title><source>PLoS ONE</source><year>2018</year><month>05</month><day>16</day><volume>13</volume><issue>5</issue><fpage>e0196391</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0196391</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Le</surname><given-names>Q</given-names> </name></person-group><article-title>EfficientNet: rethinking model scaling for convolutional neural networks</article-title><year>2019</year><access-date>2025-03-21</access-date><conf-name>Proceedings of the 36th International Conference on Machine Learning</conf-name><fpage>6105</fpage><lpage>6114</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v97/tan19a.html">https://proceedings.mlr.press/v97/tan19a.html</ext-link></comment></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Worm</surname><given-names>T</given-names> </name><name name-style="western"><surname>Reale</surname><given-names>M</given-names> </name></person-group><article-title>A high-resolution 3D dynamic facial expression database</article-title><conf-name>2008 8th IEEE International Conference on Automatic Face &#x0026; Gesture Recognition</conf-name><conf-date>Sep 17-19, 2008</conf-date><conf-loc>Amsterdam, Netherlands</conf-loc><pub-id pub-id-type="doi">10.1109/AFGR.2008.4813324</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kellnhofer</surname><given-names>P</given-names> </name><name name-style="western"><surname>Recasens</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Stent</surname><given-names>S</given-names> </name><name name-style="western"><surname>Matusik</surname><given-names>W</given-names> </name><name name-style="western"><surname>Torralba</surname><given-names>A</given-names> </name></person-group><article-title>Gaze360: physically unconstrained gaze estimation in the wild</article-title><conf-name>2019 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name><conf-date>Oct 27 to Nov 2, 2019</conf-date><conf-loc>Seoul, Korea (South</conf-loc><fpage>6912</fpage><lpage>6921</lpage><pub-id pub-id-type="doi">10.1109/ICCV.2019.00701</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>King</surname><given-names>DE</given-names> </name></person-group><article-title>Dlib-ml: A machine learning toolkit</article-title><source>J Mach Learn Res</source><year>2009</year><volume>10</volume><issue>3</issue><fpage>1755</fpage><lpage>1758</lpage><pub-id pub-id-type="doi">10.1145/1577069.1755843</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Che</surname><given-names>W</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>T</given-names> </name></person-group><article-title>N-LTP: an open-source neural language technology platform for chinese</article-title><conf-name>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 2021</conf-date><conf-loc>Online and Punta Cana, Dominican Republic</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2021.emnlp-demo.6</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Selvaraju</surname><given-names>RR</given-names> </name><name name-style="western"><surname>Cogswell</surname><given-names>M</given-names> </name><name name-style="western"><surname>Das</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vedantam</surname><given-names>R</given-names> </name><name name-style="western"><surname>Parikh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Batra</surname><given-names>D</given-names> </name></person-group><article-title>Grad-CAM: visual explanations from deep networks via gradient-based localization</article-title><conf-name>2017 IEEE International Conference on Computer Vision (ICCV)</conf-name><conf-date>Oct 22-29, 2017</conf-date><conf-loc>Venice</conf-loc><fpage>618</fpage><lpage>626</lpage><pub-id pub-id-type="doi">10.1109/ICCV.2017.74</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pedregosa</surname><given-names>F</given-names> </name><name name-style="western"><surname>Varoquaux</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gramfort</surname><given-names>A</given-names> </name></person-group><article-title>Scikit-learn: Machine learning in 
Python</article-title><source>JMLR</source><year>2011</year><access-date>2025-03-21</access-date><volume>12</volume><fpage>2825</fpage><lpage>2830</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf">https://www.jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Girard</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Cohn</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Mahoor</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Mavadati</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Hammal</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Rosenwald</surname><given-names>DP</given-names> </name></person-group><article-title>Nonverbal social withdrawal in depression: evidence from manual and automatic analysis</article-title><source>Image Vis Comput</source><year>2014</year><month>10</month><volume>32</volume><issue>10</issue><fpage>641</fpage><lpage>647</lpage><pub-id pub-id-type="doi">10.1016/j.imavis.2013.12.007</pub-id><pub-id pub-id-type="medline">25378765</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fu</surname><given-names>CHY</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>SCR</given-names> </name><name name-style="western"><surname>Cleare</surname><given-names>AJ</given-names> </name><etal/></person-group><article-title>Attenuation of the neural response to sad faces in major depression by antidepressant treatment: a prospective, event-related functional magnetic resonance imaging study</article-title><source>Arch Gen Psychiatry</source><year>2004</year><month>09</month><volume>61</volume><issue>9</issue><fpage>877</fpage><lpage>889</lpage><pub-id pub-id-type="doi">10.1001/archpsyc.61.9.877</pub-id><pub-id pub-id-type="medline">15351766</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hahn</surname><given-names>T</given-names> </name><name name-style="western"><surname>Marquand</surname><given-names>AF</given-names> </name><name name-style="western"><surname>Ehlis</surname><given-names>AC</given-names> </name><etal/></person-group><article-title>Integrating neurobiological markers of depression</article-title><source>Arch Gen Psychiatry</source><year>2011</year><month>04</month><volume>68</volume><issue>4</issue><fpage>361</fpage><lpage>368</lpage><pub-id pub-id-type="doi">10.1001/archgenpsychiatry.2010.178</pub-id><pub-id pub-id-type="medline">21135315</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abbas</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hansen</surname><given-names>BJ</given-names> </name><name name-style="western"><surname>Koesmahargyo</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Facial and vocal markers of schizophrenia measured using remote smartphone 
assessments: observational study</article-title><source>JMIR Form Res</source><year>2022</year><month>01</month><day>21</day><volume>6</volume><issue>1</issue><fpage>e26276</fpage><pub-id pub-id-type="doi">10.2196/26276</pub-id><pub-id pub-id-type="medline">35060906</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ruihua</surname><given-names>M</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Nan</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Differences in facial expression recognition between unipolar and bipolar depression</article-title><source>Front Psychol</source><year>2021</year><volume>12</volume><fpage>619368</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2021.619368</pub-id><pub-id pub-id-type="medline">34335353</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Definitions, individual prediction results, and attention scores.</p><media xlink:href="formative_v9i1e56057_app1.docx" xlink:title="DOCX File, 564 KB"/></supplementary-material></app-group></back></article>