<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e78082</article-id><article-id pub-id-type="doi">10.2196/78082</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Preprocessing Large-Scale Conversational Datasets: A Framework and Its Application to Behavioral Health Transcripts</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Naim</surname><given-names>Paz Mor</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sadeh-Sharvit</surname><given-names>Shiri</given-names></name><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jefroykin</surname><given-names>Samuel</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Silber</surname><given-names>Eddie</given-names></name><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" 
rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Morrison</surname><given-names>Dennis P</given-names></name><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Goldstein</surname><given-names>Ariel</given-names></name><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Cognitive and Brain Sciences, Hebrew University of Jerusalem</institution><addr-line>Mount Scopus</addr-line><addr-line>Jerusalem</addr-line><country>Israel</country></aff><aff id="aff2"><institution>Eleos Health</institution><addr-line>Waltham</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Palo Alto University</institution><addr-line>Palo Alto</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff4"><institution>Morrison Consulting</institution><addr-line>Bloomington</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff5"><institution>Business School, Hebrew University of Jerusalem</institution><addr-line>Jerusalem</addr-line><country>Israel</country></aff><aff id="aff6"><institution>Department of Psychology, Azrieli Israel Center for Addiction and Mental Health (Azrieli ICAMH), Hebrew University of Jerusalem</institution><addr-line>Jerusalem</addr-line><country>Israel</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Adepoju</surname><given-names>Adewumi</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Elbattah</surname><given-names>Mahmoud</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Biswas</surname><given-names>Sandipan</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Paz Mor Naim, Department of Cognitive and Brain Sciences, Hebrew University of Jerusalem, Mount Scopus, Jerusalem, 9190500, Israel, 972 025882888; <email>paz.naim@mail.huji.ac.il</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>24</day><month>10</month><year>2025</year></pub-date><volume>9</volume><elocation-id>e78082</elocation-id><history><date date-type="received"><day>27</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>03</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>09</day><month>09</month><year>2025</year></date></history><copyright-statement>&#x00A9; Paz Mor Naim, Shiri Sadeh-Sharvit, Samuel Jefroykin, Eddie Silber, Dennis P Morrison, Ariel Goldstein. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 24.10.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2025/1/e78082"/><abstract><sec><title>Background</title><p>The rise of artificial intelligence and accessible audio equipment has led to a proliferation of recorded conversation transcript datasets across various fields. However, automatic mass recording and transcription often produce noisy, unstructured data that contain unintended recordings such as hallway conversations, media (eg, TV, radio), or transcription inaccuracies such as speaker misattribution or misidentified words. As a result, large conversational transcript datasets require careful preprocessing and filtering to ensure their research utility. This challenge is particularly relevant in behavioral health contexts (eg, therapy, counseling) where deriving meaningful insights, specifically dynamic processes, depends on accurate conversation representation.</p></sec><sec><title>Objective</title><p>We present a framework for preprocessing large datasets of conversational transcripts and filtering out <italic>non-sessions</italic>&#x2014;transcripts that do not reflect a behavioral treatment session but instead capture unrelated conversations or background noise. This framework is applied to a large dataset of behavioral health transcripts from community mental health clinics across the United States.</p></sec><sec sec-type="methods"><title>Methods</title><p>Our approach integrated basic feature extraction, human annotation, and advanced applications of large language models (LLMs). We began by mapping transcription errors and assessing the number of non-sessions. Next, we extracted statistical and structural features to characterize transcripts and detect outliers. 
Notably, we used LLM perplexity as a measure of comprehensibility to assess transcript noise levels. Finally, we used zero-shot prompting with an LLM to classify transcripts as sessions or non-sessions, validating its output against expert annotations. Throughout, we prioritized data security by selecting tools that preserve anonymity and minimize the risk of data breaches.</p></sec><sec sec-type="results"><title>Results</title><p>Initial assessment revealed that transcription errors&#x2014;such as incomprehensible segments, unusually short transcripts, and speaker diarization issues&#x2014;were present in approximately one-third (n=36 out of 100) of a manually reviewed sample. Statistical outliers revealed that high speaking rate (&#x003E;3.5 words per second) was associated with short transcripts and answering machine messages, while short conversation duration (&#x003C;15 min) was an indicator for case management sessions. The 75th percentile of LLM perplexity scores was significantly higher in non-sessions than sessions (permutation test mean difference = &#x2212;258, <italic>P</italic> =.02), although this feature alone offered only moderate classification performance (precision =0.63, recall =0.23 after outlier removal). In contrast, zero-shot LLM prompting effectively distinguished sessions from non-sessions with high agreement to expert ratings (&#x03BA;=0.71) while also capturing the nature of the meeting.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study&#x2019;s hybrid approach effectively characterizes errors, evaluates content, and distinguishes text types within an unstructured conversational dataset. It provides a foundation for research on conversational data, key methods, and practical guidelines that serve as crucial first steps in ensuring data quality and usability, particularly in the context of mental health sessions. 
We highlight the importance of integrating clinical experts with artificial intelligence tools while prioritizing data security throughout the process.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>behavioral health</kwd><kwd>clinical documentation</kwd><kwd>clinical texts</kwd><kwd>conversational transcripts</kwd><kwd>data preprocessing</kwd><kwd>data quality assessment</kwd><kwd>health informatics</kwd><kwd>health information systems</kwd><kwd>large language models</kwd><kwd>natural language processing</kwd><kwd>psychotherapy</kwd><kwd>text classification</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>As one of our primary communication tools, conversations offer a window into human relationships. Speaking with each other is one of the fundamental aspects of our existence. Consequently, transcripts of conversations have garnered significant interest in research across various disciplines [<xref ref-type="bibr" rid="ref1">1</xref>]. Analyzing conversational data is a central research focus in fields such as health care [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>], law [<xref ref-type="bibr" rid="ref4">4</xref>], customer support [<xref ref-type="bibr" rid="ref5">5</xref>], negotiations [<xref ref-type="bibr" rid="ref6">6</xref>], education [<xref ref-type="bibr" rid="ref7">7</xref>], and behavioral treatment and psychotherapy [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Among these diverse domains, talk therapy, which relies on a conversation between 2 or more individuals, is a particularly intriguing case for conversation analysis. 
Analysis of treatment sessions can enhance our understanding of different therapeutic protocols [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]; improve clinical training [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]; and provide insights into the relationships between conversational elements, therapeutic outcomes, and the therapeutic alliances [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Conversation analysis research of behavioral health sessions can provide comprehensive insights into the intricate dynamics of therapeutic interactions [<xref ref-type="bibr" rid="ref2">2</xref>]. By meticulously examining communicative choices, researchers can understand how specific counseling strategies impact client engagement, identify successful intervention techniques, and reveal nonlinear patterns of change within therapy sessions [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. This research methodology allows professionals to study the dyadic or group psychotherapeutic processes, predict potential treatment outcomes, develop more effective interventions, and ultimately enhance the quality of behavioral health care [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>Historically, collecting, transcribing, and analyzing treatment conversations have required significant human effort. This is now changing, thanks to 2 major technological advances. First, recent developments in audio capturing technologies and the ubiquitousness of smart devices simplify the collection of conversation data. 
Second, the abundance of artificial intelligence (AI)&#x2013;based speech-to-text tools [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>] has automated transcription and speaker recognition tasks. These innovations have made it possible to capture high-quality human speech in diverse settings with less manual effort [<xref ref-type="bibr" rid="ref26">26</xref>]. Consequently, large-scale transcription of audio files is now feasible with unprecedented speed and precision. The automation of these processes introduces new challenges. Transcripts&#x2019; quality depends on several factors: the recording devices and their placement in the room, background noise, internet connection stability, and accurate speech-to-text and speaker diarization models. Additionally, characteristics of the conversation itself, such as slurred speech, dialects, rare languages, or use of slang, all pose further challenges for automatic speech recognition (ASR) models [<xref ref-type="bibr" rid="ref27">27</xref>]. Similarly, interrupted, rapid, or overlapping speech, unknown number of speakers as well as same-sex speakers [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>] each introduce complications for speaker recognition systems.</p><p>Moreover, the expected growth in large datasets accompanying this automation amplifies these challenges. Large and diverse datasets might include recordings of different quality levels and therefore are prone to various types of errors. Furthermore, when recordings are made routinely and automatically, unintentional recordings&#x2014;irrelevant conversations, phone calls, empty moments, or accidental noise&#x2014;may occur. This illustrates how large volumes of automatically generated transcripts are unstructured and susceptible to errors. 
Therefore, guidelines and methods for error handling, assessing, and filtering should be established.</p><p>These challenges are well-known in the broader field of health care, which has long grappled with extracting meaningful information from unstructured data such as clinical notes, discharge summaries, or patient-generated text [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. As the field has evolved, various preprocessing pipelines and methodological frameworks have been proposed for handling missing metadata, variable data quality, and inconsistent documentation practices [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref34">34</xref>]. However, most existing work focuses on clinical records, and less attention has been given to large-scale, conversational health care data.</p><p>Yeomans et al [<xref ref-type="bibr" rid="ref35">35</xref>] have recently proposed a methodological pipeline for building conversation-based research, starting with the planning and collection of conversations and going through editing and analyzing. However, with the availability of ambient AI and larger datasets, many researchers may be working with secondary data&#x2014;datasets they have not collected themselves. Therefore, they might not have access to the original recordings, to the speech-to-text models, nor to the transcripts editing process. Lacking a ground truth for verification necessitates expanding these guidelines to address such challenges.</p><p>In the case of behavioral treatment sessions, the noisiness of the transcript and its accurate representation of the conversation are key for detecting the interventions applied and extracting treatment insights [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. 
For instance, misidentifying speakers could compromise insights regarding the therapeutic alliance [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. Understanding which conversations are professional and which are accidental recordings is the first step in this endeavor.</p><p>In this paper the abovementioned challenges are addressed, and a methodological approach for preprocessing a large dataset of behavioral treatment transcripts without access to their respective original recordings is presented. First, a systematic approach for characterizing the data is outlined, followed by methods that allow its assessment for future analysis. This methodology is then illustrated by applying it to a large dataset of deidentified behavioral health sessions. Finally, the strengths and limitations of our approach are discussed. We hope to promote the integration of computational tools in traditional talk therapy and to offer relevant methods for any dataset of conversational transcripts.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data and Settings</title><p>We analyzed 22,337 behavioral treatment sessions from 50 behavioral health programs across the United States collected through the Eleos Health platform between June 2020 and January 2024. Eleos Health&#x2019;s digital platform is designed to promote behavioral treatment quality by providing intervention feedback, supporting clinical decision-making, and enabling progress note automation. Sessions were processed as part of the routine implementation of ambient AI tools within participating behavioral health programs. All sessions underwent transcription, deidentification, and anonymization using Eleos&#x2019; proprietary models before inclusion in the study. The research team had no access to audio recordings or transcripts prior to this process, and only deidentified data were analyzed. 
Processed transcripts were stored as comma-separated values files with 3 primary columns: deidentified speaker labels, timestamps, and content. Each content row contained a speech segment from either the therapist or the client&#x2014;typically up to several sentences&#x2014;segmented by the Eleos ASR model. Session metadata included random therapist and client ID numbers, organization name, session date, and treatment delivery method (phone, video conferencing, or in-person). This study was conducted in concert with the STROBE Checklist [<xref ref-type="bibr" rid="ref38">38</xref>].</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was determined exempt from review by the Sterling Institutional Review Board under the Department of Health and Human Services Exemption Category 4. This exemption permits secondary research using identifiable health information when the data are either publicly available or recorded in such a way that subjects cannot be reidentified, and there is no direct interaction with participants. All research procedures adhered to applicable ethical guidelines and regulations. Clients and therapists provided informed consent for the use of anonymized, deidentified session data in secondary research conducted by the company. Both parties retained the option to opt out of having their sessions processed. All data used in this study were deidentified prior to analysis, and no identifiable information was accessed by the research team. No compensation was provided for participation in this secondary analysis, and no images or supplementary materials contain identifiable individuals.</p></sec><sec id="s2-3"><title>Data Analysis Approach</title><p>Our data analysis approach consisted of several stages, each employing different methods for characterizing and filtering the dataset (see <xref ref-type="fig" rid="figure1">Figure 1</xref>). 
The initial stages were exploratory, beginning with a manual review to assess the data&#x2019;s content and quality. This process helped us identify categories of files and potential errors in the data, enhancing our understanding of the dataset and the features most suitable for characterization and filtering. Subsequent steps focused on directly classifying transcripts as &#x201C;sessions&#x201D; or &#x201C;non-sessions&#x201D; (eg, nonprofessional conversations, noise, mock sessions). Using available metadata, we filtered out mock sessions (non-sessions) that had known identifiers. Additionally, we applied zero-shot prompting with a large language model (LLM) to distinguish therapy sessions from irrelevant conversations. Automation played a critical initial role in this phase; however, human expertise&#x2014;specifically trained psychologists&#x2014;was incorporated to validate the automated classifications. At this stage, aligned with our primary goal of preparing the dataset for psychological research, various types of interventions were distinguished, and the sessions category was restricted only to formal treatments (excluding peer support, sponsorship, and case management). Notably, we did not prompt the model with these exclusion criteria, testing its ability to make such a distinction based solely on therapeutic elements.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Work pipeline&#x2014;starting with manual review of a dataset subset, followed by statistical feature extraction and outlier detection. Next, we apply zero-shot prompting with a large language model (LLM) to classify transcripts as sessions or non-sessions and validate the LLM&#x2019;s decisions against human annotators. 
Finally, we compare LLM&#x2019;s perplexity of different classes.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e78082_fig01.png"/></fig><sec id="s2-3-1"><title>Dataset Preprocessing</title><sec id="s2-3-1-1"><title>Dataset Characteristics</title><p>We identified potential duplicates in the dataset by analyzing the similarity between file names. This method is efficient in terms of processing time and computational resources, as it avoids comparing entire text contents. The underlying assumption is that identical files might be mistakenly saved under similar file names. To measure similarity, we used Python&#x2019;s SequenceMatcher function [<xref ref-type="bibr" rid="ref39">39</xref>]. File names were considered similar if the ratio of their longest matching segment to the total number of characters exceeded 75%. Files meeting this threshold were then manually reviewed to confirm duplication.</p><p>Additionally, some transcripts were recorded in languages other than English. To detect the languages, we used the langdetect library in Python [<xref ref-type="bibr" rid="ref40">40</xref>].</p></sec><sec id="s2-3-1-2"><title>Initial Assessment</title><p>A total of 100 randomly selected transcripts were manually reviewed by human raters, with 12 segments analyzed from each transcript to identify common errors and quantify their prevalence. At least 4 categories of errors were defined: (1) non-sessions: transcripts that clearly did not represent a session. At this stage, no distinction was made between types of intervention (eg, peer support, sponsorship, or formal treatment); (2) too short: transcripts with a total duration of less than 2700 seconds (45 min, which approximates the expected session length); (3) unreadable: transcripts with excessive missing words, duplicated segments, or incomprehensible text&#x2014;readers could not infer the missing content or the meaning of the segment. 
For example, a transcript with multiple repeated filler phrases (eg, &#x201C;uh-huh, uh-huh, uh-huh&#x2026;&#x201D;) was generally deemed unreadable; (4) speaker diarization errors: transcripts with substantial speaker attribution mistakes.</p><p>In some cases, annotators indicated that additional context was required to ascertain the presence of an error.</p></sec></sec></sec><sec id="s2-4"><title>Features Learning</title><p>We collected statistics about the whole dataset and each session which included (1) conversation length, (2) speaking rate, (3) frequent words, and (4) segment perplexity.</p><p>In addition to characterizing the dataset, these features serve as potential indicators for assessing the data&#x2019;s compatibility for further analysis. For instance, they can highlight recording errors or determine whether the content is session related. In the following section, we elaborate on some of these features and explain how they were calculated for each session.</p><sec id="s2-4-1"><title>Conversation Length</title><p>Conversation length was calculated by extracting the end timestamp of the final speech segment in the transcript. This method does not account for any time elapsed between when the recording device was activated and the conversation began. However, if the session starts with silence, this approach accurately reflects the length of the conversation.</p></sec><sec id="s2-4-2"><title>Speaking Rate</title><p>Speaking rate, defined as the average number of words spoken per second, has been employed to predict speaker characteristics [<xref ref-type="bibr" rid="ref41">41</xref>], detect speech anomalies, and identify changes in conversational dynamics or context [<xref ref-type="bibr" rid="ref42">42</xref>]. In American conversational speech, the reported average speaking rate ranges from 1.85 to 4.86 words per second (WPS) [<xref ref-type="bibr" rid="ref41">41</xref>]. 
Values outside this range may indicate abnormal recordings, such as background noise or transcription errors.</p><p>For each transcript, speaking rate was calculated by averaging across segment-level speaking rates. Because silent pauses within segments were not available, we could not adjust for them as is commonly done. However, segments were brief (typically a few seconds) and cut before long silences, making the absence of silent moments negligible and this measure a reasonable approximation. For comparison, we also calculated a global speaking rate by dividing the total word count by the overall conversation length, which incorporates silent periods.</p></sec><sec id="s2-4-3"><title>Word Frequency</title><p>We explored the content expressed in the transcripts by extracting the most common nouns in the dataset. First, we cleaned each text by removing stop words [<xref ref-type="bibr" rid="ref43">43</xref>] and technical terms (a list of which can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). These filtered words were typically used in bureaucratic contexts, such as scheduling meetings or filling out professional forms. Next, we separated each transcript into 2 speaker partitions: therapist and client. Using Python&#x2019;s library, Spacy [<xref ref-type="bibr" rid="ref44">44</xref>], we collected all the nouns spoken across all transcripts for each speaker. This content analysis revealed common topics discussed during conversations. Additionally, unexpected words that emerged may indicate common errors made by the ASR model.</p></sec><sec id="s2-4-4"><title>Perplexity</title><p>Perplexity is a measure of a text sequence, indicating its likelihood of being produced by a language model [<xref ref-type="bibr" rid="ref45">45</xref>]. 
It is the model&#x2019;s loss when given the words in the sequence and is conceptually similar to the cross-entropy between the model&#x2019;s and the sequence&#x2019;s probability distributions:</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>Perplexity</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>segment</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>exp</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mi>log</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2223;</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>w</italic><sub><italic>i</italic></sub> is a token in 
the segment&#x2019;s sequence. Due to the short length of segments, the number of tokens is typically smaller than the model&#x2019;s context length. Consequently, the probability of <italic>w</italic><sub><italic>i</italic></sub> is calculated considering all previous tokens in the sequence. High perplexity suggests that the model is less likely to generate such a segment.</p><p>We used OpenAI&#x2019;s <italic>GPT-2</italic> (locally, through the <italic>Hugging Face</italic> platform [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref47">47</xref>]) to compute the perplexity of approximately 5 million segments in 9067 transcripts. Perplexity was calculated at the segment level and then aggregated within each transcript. We hypothesized that higher perplexity would reflect higher semantic complexity, thereby acting as a marker for actual sessions. However, we also anticipated that extreme values of perplexity could indicate recording errors or corrupted transcripts that do not represent a therapeutic conversation. Thus, we propose that therapeutic conversations will generally have higher average perplexities but will not show the highest maximum values. To test this assumption, we extracted the average perplexity, standard deviation of perplexity, and maximal perplexity for each transcript. Additionally, to get an accurate representation of the upper-bound perplexity values without being over-influenced by outliers, we calculated the 75th percentile of perplexity values for each transcript.</p></sec></sec><sec id="s2-5"><title>Classification&#x2014;Distinguishing Sessions From Conversations</title><sec id="s2-5-1"><title>Model and Platform</title><p>To classify transcripts of behavioral treatment to sessions and non-sessions, we queried an LLM with a zero-shot approach. 
We used the Amazon Bedrock platform [<xref ref-type="bibr" rid="ref48">48</xref>] that enables running closed models through a third party, ensuring that the data were not shared or exposed to the model&#x2019;s provider. The model selection out of the available platform&#x2019;s options was based on criteria of cost, context length, and proven reliability. Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> presents the cost of different tokens at the time of writing. Anthropic&#x2019;s Claude [<xref ref-type="bibr" rid="ref49">49</xref>] was chosen because of its proficiency in human tasks and cost-efficiency [<xref ref-type="bibr" rid="ref49">49</xref>-<xref ref-type="bibr" rid="ref51">51</xref>]. Both the LLM and the platform were carefully selected to ensure data security. Amazon&#x2019;s Bedrock platform served as a third party between the model and the user, enabling us to use the model non-locally without exposing our data. Data were not saved on the platform&#x2019;s servers, nor were they used for the benefit of training the model [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref52">52</xref>].</p></sec><sec id="s2-5-2"><title>Filtering and Subsetting</title><p>Based on the metadata, we excluded files from organizations that are not clinical public entities, such as academic institutes conducting mock sessions and trials (25% filtered). From this subset, we randomly selected 850 transcripts for automatic classification.</p></sec><sec id="s2-5-3"><title>Classification With LLM</title><p>&#x201C;Claude-3-Sonnet&#x201D; was applied through Amazon&#x2019;s Bedrock platform (model ID: &#x201C;anthropic.claude-3-sonnet-20240229-v1:0&#x201D;) with the following parameters: maximal generated words (max_out): 350; temperature: 0.3; and the remaining parameters were set to default. 
A detailed prompt with guidelines specifying the criteria for classifying a transcript as a session, and a format for providing a well explained answer were created to ensure the model performed the task as expected. In collaboration with an expert clinical psychologist, we compiled a list of elements that define a behavioral treatment session. The prompt addressed the following 5 core elements of a behavioral treatment session:</p><p>Dynamics: a correspondence where one of the sides shares experiences and the other focuses on listening and responding.</p><p>Content: the conversation contains personal matters, emotions and experiences, discussions about personal goals, thoughts, or behaviors.</p><p>Therapeutic elements: demonstration of active listening and empathy by one side, and use of therapeutic techniques such as reframing, providing coping strategies.</p><p>Professional language: therapeutic terminology, references for treatment plans, or previous sessions.</p><p>Context clues: mentions of confidentiality, session time limit, or scheduling future appointments.</p><p>The prompt specifically instructed the model to look for these elements in its decision-making process, explain its reasoning, and rate its certainty on a scale of 1-5. Additionally, we asked the model to provide a brief summary of the conversation content and to indicate whether it identified nontherapeutic conversational dynamics. These instructions were designed to ensure that the model addressed the entire conversation and understood its content. To maintain consistency, we used XML tags (see <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p></sec><sec id="s2-5-4"><title>Validation by Clinical Expert and Interrater Reliability</title><p>To validate the model&#x2019;s classifications, 2 human raters independently classified 150 randomly selected transcripts. 
Both raters were graduate students in psychology or cognitive sciences, pursuing either clinical or theoretical training related to psychotherapy. Both were familiar with transcripts of behavioral health sessions prior to this task.</p><p>Raters were instructed to read transcript segments (approximately 12 turns) carefully and determine whether the segment belonged to an individual treatment session, excluding family or couple sessions, peer support, and case management calls. If a decision could not be made based on the provided segment, raters were encouraged to consult the full transcript.</p><p>These exclusion criteria were not part of the model&#x2019;s automated classification process and may therefore have introduced some discrepancies between human and model decisions. Nevertheless, our aim was to assess whether applying instructions focused on contextual and relational dynamics of the conversation would naturally result in the exclusion of nonprofessional conversations. Interrater agreement was assessed using Cohen kappa and percent agreement. Cohen kappa higher than 0.61 is generally interpreted as a substantial agreement [<xref ref-type="bibr" rid="ref53">53</xref>]. We then calculated the percent agreement of the human raters with the model for each category (session, non-session).</p></sec><sec id="s2-5-5"><title>Perplexity of Sessions Versus Non-Sessions</title><p>To measure whether the 2 perplexity distributions can be assigned to 2 different distributions&#x2014;sessions and non-sessions&#x2014;we conducted a permutation test. Out of the transcripts for which we calculated segment perplexity, 335 transcripts were also assigned a class by the LLM: 285 sessions and 50 non-sessions.</p><p>We did not control the length of segments and their number, both are expected to affect the perplexity of a file. 
To ensure that the results are robust under filtering of extremely short transcripts and short segments, we calculated the same results after filtering segments with fewer than 5 or 10 words, and transcripts with fewer than 10 or 20 segments. Finally, based on the results, we assessed the separability of sessions and non-sessions distributions by evaluating a perplexity-based classifier.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Dataset Characteristics</title><sec id="s3-1-1"><title>Duplicates Identification and Language Detection</title><p>We identified 1 duplicate file and 1 empty file in the dataset. After removing these, the dataset consisted of 22,335 transcripts. The analysis revealed 18 different languages. The most common was English (98%), followed by Hebrew (0.7%).</p></sec><sec id="s3-1-2"><title>Initial Assessment</title><p>The manual review of 100 transcripts yielded the following results: 46% of the transcripts were comprehensible and had <italic>little to no errors</italic>, 18% required a more thorough review for clear evaluation and 36% contained <italic>clear errors</italic>, which were categorized as follows (some transcripts had multiple error types): speaker identification errors (eg, confusing the therapist with the client, 42%); incomprehensible text (34%); too short to be indicative of the conversation type (22%); either non-session content or group sessions (11%); and <italic>missing words or duplicated segments</italic> (8%).</p><p>This preliminary evaluation found that the dataset was highly diverse, comprising the following transcript types:</p><list list-type="order"><list-item><p>Non-session content: The presence of non-session and group-session transcripts underscores the need for filtering mechanisms to exclude these from analysis.</p></list-item><list-item><p>Session quality: Based on this evaluation, we estimate that approximately half of the dataset contains sessions 
suitable for further analysis.</p></list-item><list-item><p>Speaker recognition errors: Given the frequency of speaker diarization errors, features that rely on speaker identification (eg, number of turns, spoken time by speaker) may be unreliable without corrections.</p></list-item><list-item><p>Incomprehensible text: This analysis underscores the importance of developing tools to improve text comprehensibility. With appropriate adjustments, more data could be rendered suitable for analysis.</p></list-item></list></sec></sec><sec id="s3-2"><title>Features Learning</title><sec id="s3-2-1"><title>Conversation Length</title><p>Conversations length ranged between 0.5 and 18,783 seconds, with an average of 2707 seconds and median of 3062 seconds. <xref ref-type="fig" rid="figure2">Figure 2</xref> shows the full distribution of conversation length (A) and the same distribution after omitting conversations shorter than 15 minutes long (B). A Gaussian mixture model best fitted 3 Gaussian distributions for the full distribution and 2 for the reduced distribution. <xref ref-type="fig" rid="figure3">Figure 3</xref> illustrates a comparison of the model&#x2019;s full distribution fit for different numbers of components, and <xref ref-type="table" rid="table1">Table 1</xref> presents the values of the Akaike information criterion and the Bayesian information criterion under each parameter.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Comparison of transcript duration distributions before and after data cleaning. (<bold>A) </bold>Distribution of durations for all transcripts, including trial sessions and experimental recordings. (<bold>B) </bold>Distribution after excluding transcripts based on organization category metadata. 
The comparison highlights that many short-duration transcripts originated from irrelevant or excluded organizations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e78082_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Comparison of Gaussian mixtures models (GMMs) with 2, 3, and 4 components fitted to the distribution of session durations (in seconds) based on transcript timestamps (<xref ref-type="fig" rid="figure2">Figure 2A</xref>). The negative log-likelihood is reported for each model, with lower values indicating better fit.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e78082_fig03.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Akaike information criterion and Bayesian information criterion for different number of components computed for the Gaussian mixture models presented in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Components</td><td align="left" valign="bottom">AIC<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">BIC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">39,3727</td><td align="char" char="." valign="top">39,3767</td></tr><tr><td align="char" char="." valign="top">3</td><td align="char" char="." valign="top">38,4168</td><td align="char" char="." valign="top">38,4232</td></tr><tr><td align="char" char="." valign="top">4</td><td align="char" char="." valign="top">38,3384</td><td align="char" char="." 
valign="top">38,3473</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>AIC: Akaike information criterion.</p></fn><fn id="table1fn2"><p><sup>b</sup>BIC: Bayesian information criterion.</p></fn></table-wrap-foot></table-wrap><p>To support our findings, we examined the metadata to determine whether different conversation types (based on organization) were associated with different transcript length. Among the short transcripts (&#x003C;15 min) that included conversational content, we found brief case management phone calls. The long transcripts (&#x003E;1.2 h) included 2 sessions in a row, or a recording extended before or after a session. These patterns suggest the presence of 2 conversation populations, each with a different duration distribution.</p></sec><sec id="s3-2-2"><title>Speaking Rate</title><p>Speaking rate ranged from 0 to 4.4 WPS with an average of 2.17. The net speaking rate ranged from 0.04 to 6.63 WPS with an average of 2.9 WPS (<xref ref-type="fig" rid="figure4">Figure 4B</xref>). Both averages fall within the reported range of the American average speaking rate in conversation [<xref ref-type="bibr" rid="ref41">41</xref>] (1.85&#x2010;4.86 WPS).</p><p>The highest speaking rate was observed in a short transcript of an automatic answering machine. Only 30% (n=9) of transcripts with speaking rate above 3.5 WPS (<xref ref-type="fig" rid="figure4">Figure 4</xref>A) were longer than 20 minutes, suggesting that high speaking rate often reflected accidental recordings. For instance, of the 9 transcripts with rates greater than 4 WPS, 8 were identified as automatic voice answering machines.</p><p>Examining the ratio of overall session time to speech duration, or equivalently the ratio of speaking rate (A) to net speaking rate (B), shows that transcripts with high values (where total conversation time far exceeds spoken time) often reflect timestamp errors or short transcripts with very few words. 
In either case, such transcripts are not suitable for further analysis.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Distributions of average speaking rates extracted from transcripts with 2 methods: (<bold>A) </bold>Average number of words per second (WPS) over the whole session (including silences). (<bold>B) </bold>Average number of words per second for the net spoken time (without silences between segments). The red arrow&#x2019;s height represents the number of values exceeding the 95th percentile.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e78082_fig04.png"/></fig></sec><sec id="s3-2-3"><title>Frequent Noun Words</title><p>After filtering words according to <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> , we identified 8,393,775 nouns in the clients&#x2019; speech and 32,591 nouns in the therapists&#x2019; speech. <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> displays the 60 most common words before and after filtering. 
Therapists&#x2019; 5 most common words were &#x201C;thing,&#x201D; &#x201C;time,&#x201D; &#x201C;people,&#x201D; &#x201C;know,&#x201D; and &#x201C;way,&#x201D; and clients&#x2019; most common nouns were similar: &#x201C;thing,&#x201D; &#x201C;time,&#x201D; &#x201C;know,&#x201D; &#x201C;date,&#x201D; and &#x201C;lot&#x201D; followed by &#x201C;people.&#x201D; Among the 15 most common words for clients were also &#x201C;friend,&#x201D; &#x201C;mom,&#x201D; &#x201C;work,&#x201D; &#x201C;talk,&#x201D; and &#x201C;life,&#x201D; and therapists shared some of these nouns (&#x201C;work,&#x201D; &#x201C;talk,&#x201D; and &#x201C;date&#x201D;) but also frequently used &#x201C;help&#x201D; and &#x201C;guy.&#x201D;</p></sec></sec><sec id="s3-3"><title>Classification</title><sec id="s3-3-1"><title>Zero-Shot Prompting</title><p>Of 850 transcripts that were given to the LLM, it identified 737 as sessions (86.7%, n=737), including 56 transcripts classified as couple&#x2019;s sessions. The remaining 113 transcripts were classified as non-sessions (13.3%, n=113). The model&#x2019;s certainty ratings were skewed toward the higher end of the scale, with 55 transcripts rated at 4 (7.9%) and 645 transcripts rated at 5 (92.1%, highest certainty).</p></sec><sec id="s3-3-2"><title>Validation and Interrater Reliability</title><p>The raters agreed on 86.3% (44 of 51; see Figure 5A in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>) of the test set transcripts, predominantly classifying transcripts as therapy sessions rather than nontherapy sessions (Rater 1: 61%; Rater 2: 63%). Cohen kappa score was 0.71, indicating substantial agreement [<xref ref-type="bibr" rid="ref53">53</xref>]. Disagreements primarily revolved around identifying the session type&#x2014;distinguishing between individual, couple, or family therapy&#x2014;rather than determining whether a therapy session took place at all. 
In 43% (3 of 7) of the disagreement cases, both raters classified the transcript as a therapy session but differed on the type. In the remaining 57% (n=4), the disagreement was about whether the conversation was professional or casual. The full distribution of classifications by the raters and the model is shown in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p></sec></sec><sec id="s3-4"><title>LLM Versus Raters</title><p>Unlike the raters, who were instructed to mark non-individual sessions (eg, family or couple therapy) as non-sessions, the LLM was instructed to identify any kind of treatment session, including family and couple therapy. This difference naturally led to disagreements between raters and the model. However, when the model explicitly identified the session type (eg, family or couples) and indicated it clearly in its explanation or summary, we considered the classification an agreement. This verification process can be automated by searching for specific keywords (eg, &#x201C;couple session&#x201D;) in the summary text or by using another language model to interpret the model&#x2019;s output. <xref ref-type="table" rid="table2">Table 2</xref> shows the distribution of rater-model agreement for sessions and non-sessions.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Agreement between the model and the raters for transcripts agreed over by raters<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Raters: yes, n</td><td align="left" valign="bottom">Raters: no, n</td><td align="left" valign="bottom">Total, n</td></tr></thead><tbody><tr><td align="left" valign="top">Model: yes, n</td><td align="char" char="." valign="top">26</td><td align="char" char="." valign="top">7</td><td align="char" char="." 
valign="top">33</td></tr><tr><td align="left" valign="top">Model: no, n</td><td align="char" char="." valign="top">2</td><td align="char" char="." valign="top">9</td><td align="char" char="." valign="top">11</td></tr><tr><td align="left" valign="top">Total, n</td><td align="char" char="." valign="top">28</td><td align="char" char="." valign="top">16</td><td align="left" valign="top">44</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>The table features the number of transcripts in each category.</p></fn></table-wrap-foot></table-wrap><p>Overall, the model identified 57 sessions as couples or family therapy, demonstrating its ability not only to detect therapy sessions but also to categorize them by type.</p><p>Among the 9 disagreements between the model and the human raters, most involved the model misclassifying case management conversations as therapy. Conversely, in cases where the model misidentified conversations tagged as therapy sessions by human raters, its reasoning was because of &#x201C;lack of therapeutic techniques.&#x201D;</p></sec><sec id="s3-5"><title>Perplexity of Different Classes</title><p><xref ref-type="fig" rid="figure5">Figure 5</xref> shows the distributions of different statistics of perplexity of transcripts for different classes.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Results of permutation tests between distributions of perplexity metrics of sessions (&#x201C;Yes&#x201D;) and non-sessions (&#x201C;No&#x201D;). Perplexity metrics&#x2014;maximum, average, 75th percentile, and standard deviation&#x2014;were calculated over the distribution of segment perplexity for each transcript. 
The 75th percentile showed a significant result (mean difference=&#x2212;258, <italic>P</italic>=.01) with sessions having lower values, while the maximum perplexity was higher for sessions (mean difference=73,888, <italic>P</italic>=.007).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e78082_fig05.png"/></fig><p>Among the transcripts with high perplexities, 1 transcript had exceptionally high values (mean perplexity of more than 11,000, which was approximately 11 standard deviations more than the average mean perplexity). This 4-segment transcript contained nonwords and backchannels that seemed to be transcribed noises (mostly the sound &#x201C;Mhm&#x201D;). Another high-perplexity file was a recorded rap song.</p><p>Of the 4 statistics&#x2014;max, mean, standard deviation, and 75th percentile&#x2014;a permutation test for means showed a significant result for the 75th percentile (<italic>P</italic>=.01), with sessions having a lower mean 75th percentile than non-sessions (mean difference=&#x2212;258). In contrast, the permutation test showed that the sessions group had higher maximal values (<italic>P</italic>=.007).</p><p>To check the robustness of these results, we repeated the test after limiting the calculation to segments with minimal number of words (minimum words per segment [MWPS]=5, 10) and omitting transcripts with fewer than a minimal number of relevant segments (minimal number of segments [MS]=10, 20). 
These tests showed that the 75th percentile measure (<xref ref-type="fig" rid="figure6">Figure 6A</xref>) remained significant under these parameter changes, whereas the max perplexity measure (<xref ref-type="fig" rid="figure6">Figure 6B</xref>) lost significance when limitations were applied (MS&#x003E;0 or MWPS &#x003E;0).</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p><italic>P</italic> values of permutation tests comparing sessions and non-sessions perplexity measures across parameter settings&#x2014;minimum number of segments and minimal number of words per segment. (<bold>A) </bold>Maximum perplexity shows no significant values across parameters except when no restrictions are applied and (<bold>B) </bold>75th percentile of perplexity is significant for most parameter combinations. Bright colors indicate lower <italic>P</italic> values, *<italic>P</italic>&#x003C;.05, **<italic>P</italic>&#x003C;.01.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v9i1e78082_fig06.png"/></fig><p>Furthermore, we evaluated the classification performance of a 75th percentile perplexity&#x2013;based classifier by analyzing receiver operating characteristic (ROC) curves and precision-recall metrics across parameters (MWPS, MS). To align with perplexity values, in this analysis, higher perplexity values correspond to the positive class&#x2014;non-sessions&#x2014;while sessions constitute the negative class. The optimal threshold was determined by maximizing the difference between true positive rate and false positive rate, prioritizing accurate classification of non-sessions. The results (<xref ref-type="table" rid="table3">Table 3</xref>) indicate moderate discriminative ability, with ROC area under the curve values ranging from approximately 0.62 to 0.73 and precision increasing alongside minimal number of segments. 
Precision tends to increase with larger segment sizes (up to ~0.63), while recall decreases (~0.26).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Classification performance of the 75th percentile perplexity&#x2013;based filter across varying thresholds for minimal words per segment (MWPS) and minimal number of segments (MS).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">MWPS</td><td align="left" valign="bottom">MS</td><td align="left" valign="bottom">Sessions</td><td align="left" valign="bottom">Non-sessions</td><td align="left" valign="bottom">ROC AUC</td><td align="left" valign="bottom">PR AUC</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td></tr></thead><tbody><tr><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="char" char="." valign="top">285</td><td align="char" char="." valign="top">50</td><td align="char" char="." valign="top">0.618</td><td align="char" char="." valign="top">0.240</td><td align="char" char="." valign="top">0.200</td><td align="char" char="." valign="top">0.740</td></tr><tr><td align="left" valign="top">0</td><td align="left" valign="top">5</td><td align="char" char="." valign="top">285</td><td align="char" char="." valign="top">43</td><td align="char" char="." valign="top">0.636</td><td align="char" char="." valign="top">0.198</td><td align="char" char="." valign="top">0.182</td><td align="char" char="." valign="top">0.767</td></tr><tr><td align="left" valign="top">0</td><td align="left" valign="top">10</td><td align="char" char="." valign="top">284</td><td align="char" char="." valign="top">42</td><td align="char" char="." valign="top">0.633</td><td align="char" char="." valign="top">0.196</td><td align="char" char="." valign="top">0.178</td><td align="char" char="." 
valign="top">0.762</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">0</td><td align="char" char="." valign="top">285</td><td align="char" char="." valign="top">47</td><td align="char" char="." valign="top">0.664</td><td align="char" char="." valign="top">0.354</td><td align="char" char="." valign="top">0.353</td><td align="char" char="." valign="top">0.255</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">5</td><td align="char" char="." valign="top">284</td><td align="char" char="." valign="top">41</td><td align="char" char="." valign="top">0.657</td><td align="char" char="." valign="top">0.331</td><td align="char" char="." valign="top">0.333</td><td align="char" char="." valign="top">0.268</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">10</td><td align="char" char="." valign="top">284</td><td align="char" char="." valign="top">40</td><td align="char" char="." valign="top">0.665</td><td align="char" char="." valign="top">0.335</td><td align="char" char="." valign="top">0.333</td><td align="char" char="." valign="top">0.275</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">0</td><td align="char" char="." valign="top">285</td><td align="char" char="." valign="top">44</td><td align="char" char="." valign="top">0.687</td><td align="char" char="." valign="top">0.378</td><td align="char" char="." valign="top">0.625</td><td align="char" char="." valign="top">0.227</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">5</td><td align="char" char="." valign="top">284</td><td align="char" char="." valign="top">40</td><td align="char" char="." valign="top">0.701</td><td align="char" char="." valign="top">0.380</td><td align="char" char="." valign="top">0.625</td><td align="char" char="." valign="top">0.250</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">10</td><td align="char" char="." 
valign="top">284</td><td align="char" char="." valign="top">38</td><td align="char" char="." valign="top">0.726</td><td align="char" char="." valign="top">0.392</td><td align="char" char="." valign="top">0.625</td><td align="char" char="." valign="top">0.263</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>The table reports the number of transcripts remaining in each class after filtering outliers, the receiver operating characteristic area under the curve (ROC AUC), precision-recall AUC (PR AUC), precision, recall, and F1 scores. Increasing the minimal words and segments thresholds improves ROC AUC and precision but reduces recall.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>With a dataset of over 22,000 unfiltered transcripts of recordings associated with behavioral treatment sessions, our primary objective in this methodological paper was to illustrate a systematic approach for characterizing the data and implementing a filtering process for subsequent analysis in academic research. Prior research on leveraging machine learning and natural language processing methods for classifying large text datasets, both conversational [<xref ref-type="bibr" rid="ref54">54</xref>] and nonconversational [<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>], has focused mostly on content-based classification rather than the contextual framework of conversations. Our proposed methodology integrated human evaluation, statistical analysis, and automated tools, including LLMs, emphasizing the importance of contextual and relational features, which are especially critical for distinguishing therapy sessions from non-sessions.</p><p>The preliminary analysis identified a significant proportion of non-session transcripts within the dataset. 
These non-sessions encompassed brief case management encounters, informal conversations, mock sessions, and incidental recordings captured between sessions. Additionally, some transcripts included errors such as incorrect speaker identification, text duplication, and incomprehensible content [<xref ref-type="bibr" rid="ref25">25</xref>]. These findings highlight the necessity for systematic filtering mechanisms to exclude low-quality and non-session data. Given that only approximately half of the dataset was found suitable for analysis, leveraging automated classification and quality scoring can significantly enhance the dataset&#x2019;s utility for research purposes.</p><p>Recent works have listed features for characterizing texts in corpora [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. In this paper, we have focused on 3 basic ways to extract relevant statistics and examined if outliers could be indicative of problematic transcripts:</p><p>Conversation length&#x2014;Analysis of conversation length distributions revealed that shorter transcripts often represented non-sessions, such as case management encounters, as expected for this type of service, phone calls, or noise. In contrast, unexpectedly long conversations may result from sessions being concatenated or recordings extending beyond the session&#x2019;s actual duration, thus requiring careful preprocessing.</p><p>Speaking rate&#x2014;Speaking rate has been evaluated in many contexts, and it has been shown that its values can predict speakers&#x2019; features [<xref ref-type="bibr" rid="ref41">41</xref>], detect speech anomalies, and identify change in conversation dynamics or context [<xref ref-type="bibr" rid="ref42">42</xref>]. We found that high speaking rate was associated with shorter transcripts, particularly erroneous recordings such as answering machine messages. This finding supports the use of speaking rate as a potential indicator of non-sessions. 
Additionally, the ratio between speech duration and overall recording duration may serve as a marker for time-labeling errors.</p><p>Content analysis&#x2014;Bag-of-Words is a basic tool for evaluating semantic content. Through word count (mostly nouns), themes and topics can be discovered and ultimately enhance text classification [<xref ref-type="bibr" rid="ref58">58</xref>-<xref ref-type="bibr" rid="ref60">60</xref>]. In this paper, we used it as a &#x201C;reality check&#x201D; to reveal the contents of the transcripts at hand. Our content analysis provided insights into the vocabulary and topics typical of therapeutic conversations. The most prevalent words were related to everyday life (&#x201C;relationship,&#x201D; &#x201C;house,&#x201D; &#x201C;job,&#x201D; &#x201C;today,&#x201D; &#x201C;school,&#x201D; &#x201C;sleep,&#x201D; and &#x201C;car&#x201D;), clients' inner world (&#x201C;thought,&#x201D; &#x201C;want,&#x201D; &#x201C;feel,&#x201D; and &#x201C;need&#x201D;), resolving issues (&#x201C;situation,&#x201D; &#x201C;problem,&#x201D; &#x201C;help,&#x201D; and &#x201C;care&#x201D;), and relationships (&#x201C;kid,&#x201D; &#x201C;love,&#x201D; &#x201C;date,&#x201D; &#x201C;sister,&#x201D; &#x201C;couple,&#x201D; &#x201C;dad,&#x201D; and &#x201C;guy&#x201D;), emphasizing the centrality of these themes in these conversations. Comparing individual transcript themes against the topics that were found in this analysis could be an indicator for being a genuine session. However, it is of note that this analysis ignores the context in which words appear.</p><sec id="s4-1-1"><title>Perplexity Analysis</title><p>Perplexity is traditionally used to evaluate language models [<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>]. 
In this study, however, we used it to evaluate the comprehensibility and uniqueness of text segments [<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>] in order to gain insight into potential differences between sessions and non-sessions. Transcripts with higher perplexity scores often contained transcription errors or nonverbal content (eg, noise and backchannels).</p><p>Sessions initially exhibited higher maximal transcript perplexity than non-sessions; however, this result was no longer significant when outliers, namely, transcripts with few segments or short segments, were omitted. This suggests that the higher maximal perplexity values in sessions are not necessarily indicative of more verbally complex content but rather stem from transcription errors, backchanneling, discourse markers, or interrupted recordings, which tend to produce short segments and short transcripts. These findings indicate that sessions, in general, do not have higher maximal perplexity than non-sessions. This interpretation is further supported by the finding that the 75th percentile perplexity score for sessions was lower than non-sessions after removing outliers. This may suggest that sessions contain more structured and predictable language, particularly when compared to background noise or accidentally captured conversation fragments. This distinction is particularly useful because it suggests that 75th percentile perplexity could serve as a reliable feature in a non-session filtering model. Unlike maximal perplexity, which was influenced by errors and outliers, the 75th percentile measure remains stable across different parameter settings, reinforcing its robustness as an indicator of session-like structure.</p><p>To support this claim and further evaluate its ability to differentiate sessions from non-sessions, we assessed a 75th percentile perplexity&#x2013;based classifier across varying thresholds for MS and MWPS. 
The optimal threshold remained consistent across conditions, suggesting its practical applicability as a filtering criterion. However, the results indicate moderate-to-low discriminative performance, with precision improving under stricter outlier exclusion. For instance, removing transcripts with fewer than 10 segments of more than 10 words each yielded the highest precision (~0.63) but substantially reduced recall (~0.26). This pattern likely reflects the dataset imbalance&#x2014;where non-sessions are the minority&#x2014;and the observation that outliers were predominantly non-sessions, suggesting they tend to exhibit relatively high 75th percentile perplexity. These findings reinforce the conclusion that this metric is particularly useful when filtering aims to prioritize precision, though caution is needed due to the class imbalance impacting these metrics.</p><p>Overall, our results suggest that perplexity can be used to identify flawed, incomprehensible, or highly improbable text segments and may serve as a useful tool for detecting low-quality or non-session transcripts, but not as a standalone classifier. To validate our results and extend this work, we propose 2 key next steps: (1) to distinguish nonprofessional conversations from flawed transcripts to determine which group is more clearly separable from sessions through perplexity measures. This distinction is critical for understanding the sources of high- and low-perplexity segments. (2) To replicate these analyses with a larger set of labeled transcripts to overcome data imbalance.</p></sec><sec id="s4-1-2"><title>Large Language Model Classification</title><p>Research on LLMs has explored their use for both text classification and mental health text analysis as separate tasks. 
Prior work has demonstrated these models&#x2019; ability to extract key concepts, analyze text dynamics, and identify psychological concepts [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref65">65</xref>-<xref ref-type="bibr" rid="ref69">69</xref>]. In this study, we integrated both tasks, leveraging zero-shot prompting to analyze mental health conversations with the goal of classification. Our findings indicate that with zero-shot prompting, the model can classify transcripts effectively, showing high agreement with human coders while demonstrating robustness in handling transcript errors. For example, even when speaker identification referenced only a therapist and a patient, the model correctly identified couple or family sessions by interpreting contextual and semantic cues. Additionally, the LLM successfully corrected speaker identification errors, highlighting its potential as an automated error-correction tool.</p><p>Instances of disagreement between human coders and the LLM often stemmed from the model&#x2019;s sensitivity to therapeutic techniques. One of the classification guidelines was the presence of such techniques, and the model appeared to weigh them heavily when the conversational framing was ambiguous. Furthermore, in some cases, we found discrepancies between the binary classification and the LLM&#x2019;s explanation. For instance, despite classifying a conversation as a session, the model noted: <italic>The therapist and psychologist discuss updates on multiple client cases, including challenges with clients&#x2019; behaviors</italic>, recognizing both speakers as therapists. This example underscores the importance of including model explanations as part of the classification process. 
Future work should investigate these discrepancies and refine classification guidelines accordingly.</p><p>Finally, while research on LLMs&#x2019; ability to analyze conversations through prompting is still evolving, existing studies have yielded inconsistent findings regarding their effectiveness in analyzing conversational contexts [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref65">65</xref>-<xref ref-type="bibr" rid="ref69">69</xref>]. Our results suggest that when provided with full conversation transcripts, LLMs can capture nuanced textual and relational dynamics, offering valuable insights into participants&#x2019; interactions. While recent studies have used these capabilities in text-based applications, we demonstrated their applicability to conversations, where relational and dynamic elements are crucial in distinguishing sessions from non-sessions. Thus, although few-shot learning or fine-tuning may further enhance classification accuracy [<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>], our findings suggest that these techniques may not be strictly necessary for effective classification.</p></sec></sec><sec id="s4-2"><title>Limitations</title><p>Defining when a transcript reflects a treatment session presents a challenge: sessions share many aspects with nontherapeutic conversations, and some therapeutic techniques might appear as trivial exchanges. In this paper, we proposed both statistical heuristics and explicit guidelines to evaluate conversations in light of this question.</p><p>While we offer a methodological examination of a transcript dataset and highlight considerations for careful filtering, defining a filtering model is beyond the scope of this study. 
Since ground-truth labels were unavailable for supervised classification, we address this by identifying features that provide valuable insights into the classification process and by demonstrating how LLMs can assist experts when guided by well-designed prompt engineering. Hence, the approach presented here may serve as a foundation for developing a comprehensive classification model.</p><p>This study delineates a general framework and provides guidelines for working with conversational datasets in the absence of prior knowledge or structured labels. However, variation exists across datasets in format, content, quality, and the proportion of non-session files and transcription errors. While this paper applies its suggested framework on a large and varied dataset, it still features working with a single data platform. Variation in platforms and datasets calls for researchers to (1) detect any peculiarities relevant to their datasets and acknowledge them before referring to the relevant steps in the framework and (2) cultivate content matter expertise for datasets they analyze or better&#x2014;conduct studies in multidisciplinary teams and keep a human (expert) in the loop.</p></sec><sec id="s4-3"><title>Future Research</title><p>Our findings suggest that while statistical features can provide broad insights into conversational structure and flag irregularities that may correlate with non-sessions, LLMs add a complementary layer by capturing the essence, dynamics, and nature of conversations. As a proof of concept, this study demonstrates the potential of LLM-based classification; however, further validation, including human annotation on larger samples, is needed. In a treatment setting, we envision a semiautomated pipeline that integrates statistical and semantic features with LLMs. 
Transcripts would first be screened using metrics such as perplexity, duration, and speaking rate; those within a normal range would then be analyzed by an LLM to classify the conversation type&#x2014;treatment session or unrelated. Periodic human review of selected transcripts would ensure model validity, with reviewer feedback used to refine prompts or feature thresholds dynamically. This &#x201C;human-in-the-loop&#x201D; workflow enables continuous processing while limiting manual review to a small subset of cases. Future work should evaluate different LLMs across diverse behavioral health datasets to determine their ability to capture nuances in treatment styles.</p><p>Regarding statistical feature&#x2013;based outlier filtering, additional research could enhance its efficiency and interpretability. Incorporating more informative linguistic features&#x2014;such as the frequency of backchanneling cues, discourse markers, and silent pauses&#x2014;into interpretable models such as decision trees may improve classification while remaining computationally efficient compared to fine-tuning large-scale models. Moreover, content analysis could reveal how the themes extracted from an individual session differ from those found across the dataset, highlighting deviations that might indicate non-session transcripts. Examining bigrams and multi-word expressions could also refine differentiation between informal conversations and structured therapeutic sessions, while helping to detect common transcription errors, such as misinterpretations caused by background noise.</p><p>Perplexity analyses could examine the relationship between different segment error types, such as misspelling, and perplexity by categorizing segments accordingly. Additionally, a mixed-model analysis could be applied to account for the statistical dependencies of perplexities within individual transcripts. 
Moreover, the interaction between perplexity and factors such as speaker familiarity or amorphous nature of the conversation remains an open question and could help explain cases where high perplexity signals meaningful conversational ambiguity rather than transcription errors. Finally, improving transcript quality remains a critical avenue for future research. Existing methods for correcting speaker diarization errors and ASR mistakes without access to the original audio [<xref ref-type="bibr" rid="ref70">70</xref>,<xref ref-type="bibr" rid="ref71">71</xref>] could be integrated into the preprocessing pipeline to enhance transcript reliability before filtering. Implementing these techniques could enhance the preprocessing stage, improve the accuracy of extracted features, and ultimately enhance the filtering process and its success rates.</p></sec><sec id="s4-4"><title>Conclusion</title><p>This study demonstrated the importance of integrating human judgment with automated tools when processing large, unstructured datasets. We assess secondary data&#x2014;data collected independently of current research&#x2014;where initial human evaluation is critical for understanding dataset characteristics such as readability, content, and diarization quality. This foundational knowledge can inform the development of effective filtering strategies. While basic statistics, perplexity, and LLM prompting facilitate automated filtering, preliminary human review remains essential for understanding dataset variability and refining classification features. This hybrid approach ensures adaptable and accurate filtering processes, even in the presence of transcription errors.</p></sec></sec></body><back><notes><sec><title>Disclaimer</title><p>This manuscript was partially edited for language and style with the assistance of the AI language models ChatGPT (OpenAI) and Claude (Anthropic). 
The authors reviewed and edited all AI-generated content and take full responsibility for the final text.</p></sec><sec><title>Data Availability</title><p>The dataset generated and analyzed during the current study is not publicly available due to privacy and confidentiality reasons.</p></sec></notes><fn-group><fn fn-type="conflict"><p>SSS, SJ, and ES are employees of Eleos Health whose artificial intelligence platform was used to generate the data analyzed in this study. DPM is the Founder of Morrison Consulting, which provides consulting services to Eleos Health, including his role as Chief Clinical Officer; he is affiliated with Eleos Health in a consulting capacity, but not as an employee. PMN and AG declare no conflicts of interest.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">ASR</term><def><p>automatic speech recognition</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">MS</term><def><p>minimal number of segments</p></def></def-item><def-item><term id="abb5">MWPS</term><def><p>minimum words per segment</p></def></def-item><def-item><term id="abb6">ROC</term><def><p>receiver operating characteristic</p></def></def-item><def-item><term id="abb7">WPS</term><def><p>words per second</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Whalen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Raymond</surname><given-names>GT</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Borgatta</surname><given-names>EF</given-names> </name><name 
name-style="western"><surname>Montgomery</surname><given-names>RJV</given-names> </name></person-group><article-title>Conversation analysis</article-title><source>Encyclopedia of Sociology</source><year>2000</year><edition>2</edition><publisher-name>Macmillan Reference USA</publisher-name><pub-id pub-id-type="other">9780028648507</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Drew</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chatwin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Collins</surname><given-names>S</given-names> </name></person-group><article-title>Conversation analysis: a method for research into interactions between patients and health-care professionals</article-title><source>Health Expect</source><year>2001</year><month>03</month><volume>4</volume><issue>1</issue><fpage>58</fpage><lpage>70</lpage><pub-id pub-id-type="doi">10.1046/j.1369-6513.2001.00125.x</pub-id><pub-id pub-id-type="medline">11286600</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kotzias</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kuo</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Detecting conversation topics in primary care office visits from transcripts of patient-provider interactions</article-title><source>J Am Med Inform Assoc</source><year>2019</year><month>12</month><day>1</day><volume>26</volume><issue>12</issue><fpage>1493</fpage><lpage>1504</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocz140</pub-id><pub-id pub-id-type="medline">31532490</pub-id></nlm-citation></ref><ref 
id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fraser</surname><given-names>H</given-names> </name></person-group><article-title>A framework for deciding how to create and evaluate transcripts for forensic and other purposes</article-title><source>Front Commun</source><year>2022</year><volume>7</volume><fpage>898410</fpage><pub-id pub-id-type="doi">10.3389/fcomm.2022.898410</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>F&#x00F8;lstad</surname><given-names>A</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>C</given-names> </name></person-group><article-title>Investigating the user experience of customer service chatbot interaction: a framework for qualitative analysis of chatbot dialogues</article-title><source>Qual User Exp</source><year>2021</year><month>12</month><volume>6</volume><issue>1</issue><fpage>6</fpage><pub-id pub-id-type="doi">10.1007/s41233-021-00046-5</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeong</surname><given-names>M</given-names> </name><name name-style="western"><surname>Minson</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yeomans</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gino</surname><given-names>F</given-names> </name></person-group><article-title>Communicating with warmth in distributive negotiations is surprisingly counterproductive</article-title><source>Manage Sci</source><year>2019</year><month>12</month><volume>65</volume><issue>12</issue><fpage>5813</fpage><lpage>5837</lpage><pub-id pub-id-type="doi">10.1287/mnsc.2018.3199</pub-id></nlm-citation></ref><ref 
id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hennessy</surname><given-names>S</given-names> </name><name name-style="western"><surname>Calcagni</surname><given-names>E</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mercer</surname><given-names>N</given-names> </name></person-group><article-title>An analysis of the forms of teacher-student dialogue that are most productive for learning</article-title><source>Language and Education</source><year>2023</year><month>03</month><day>4</day><volume>37</volume><issue>2</issue><fpage>186</fpage><lpage>211</lpage><pub-id pub-id-type="doi">10.1080/09500782.2021.1956943</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goldberg</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Flemotomos</surname><given-names>N</given-names> </name><name name-style="western"><surname>Martinez</surname><given-names>VR</given-names> </name><etal/></person-group><article-title>Machine learning and natural language processing in psychotherapy research: alliance as example use case</article-title><source>J Couns Psychol</source><year>2020</year><month>07</month><volume>67</volume><issue>4</issue><fpage>438</fpage><lpage>448</lpage><pub-id pub-id-type="doi">10.1037/cou0000382</pub-id><pub-id pub-id-type="medline">32614225</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spinrad</surname><given-names>A</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>CB</given-names> </name><name 
name-style="western"><surname>Ruzek</surname><given-names>JI</given-names> </name><etal/></person-group><article-title>Action recommendations review in community-based therapy and depression and anxiety outcomes: a machine learning approach</article-title><source>BMC Psychiatry</source><year>2024</year><month>02</month><day>16</day><volume>24</volume><issue>1</issue><fpage>133</fpage><pub-id pub-id-type="doi">10.1186/s12888-024-05570-0</pub-id><pub-id pub-id-type="medline">38365635</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atzil-Slonim</surname><given-names>D</given-names> </name><name name-style="western"><surname>Eliassaf</surname><given-names>A</given-names> </name><name name-style="western"><surname>Warikoo</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Leveraging natural language processing to study emotional coherence in psychotherapy</article-title><source>Psychotherapy (Chic)</source><year>2024</year><month>03</month><volume>61</volume><issue>1</issue><fpage>82</fpage><lpage>92</lpage><pub-id pub-id-type="doi">10.1037/pst0000517</pub-id><pub-id pub-id-type="medline">38236227</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sadeh-Sharvit</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hollon</surname><given-names>SD</given-names> </name></person-group><article-title>Leveraging the power of nondisruptive technologies to optimize mental health treatment: case study</article-title><source>JMIR Ment Health</source><year>2020</year><month>11</month><day>26</day><volume>7</volume><issue>11</issue><fpage>e20646</fpage><pub-id pub-id-type="doi">10.2196/20646</pub-id><pub-id pub-id-type="medline">33242025</pub-id></nlm-citation></ref><ref 
id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sadeh-Sharvit</surname><given-names>S</given-names> </name><name name-style="western"><surname>Camp</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Horton</surname><given-names>SE</given-names> </name><etal/></person-group><article-title>Effects of an artificial intelligence platform for behavioral interventions on depression and anxiety symptoms: randomized clinical trial</article-title><source>J Med Internet Res</source><year>2023</year><month>07</month><day>10</day><volume>25</volume><fpage>e46781</fpage><pub-id pub-id-type="doi">10.2196/46781</pub-id><pub-id pub-id-type="medline">37428547</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Imel</surname><given-names>ZE</given-names> </name><name name-style="western"><surname>Pace</surname><given-names>BT</given-names> </name><name name-style="western"><surname>Soma</surname><given-names>CS</given-names> </name><etal/></person-group><article-title>Design feasibility of an automated, machine-learning based feedback system for motivational interviewing</article-title><source>Psychotherapy (Chic)</source><year>2019</year><month>06</month><volume>56</volume><issue>2</issue><fpage>318</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.1037/pst0000221</pub-id><pub-id pub-id-type="medline">30958018</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Abdou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sahi</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Hull</surname><given-names>TD</given-names> </name><name 
name-style="western"><surname>Nook</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Daw</surname><given-names>ND</given-names> </name></person-group><article-title>Leveraging large language models to estimate clinically relevant psychological constructs in psychotherapy transcripts</article-title><comment>Preprint posted online on 2025</comment><pub-id pub-id-type="doi">10.1101/2025.03.04.25323338</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ruzek</surname><given-names>JI</given-names> </name><name name-style="western"><surname>Sadeh-Sharvit</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bunge</surname><given-names>EL</given-names> </name><etal/></person-group><article-title>Training the psychologist of the future in the use of digital mental health technologies</article-title><source>Prof Psychol: Res Pract</source><year>2024</year><volume>55</volume><issue>5</issue><fpage>395</fpage><lpage>404</lpage><pub-id pub-id-type="doi">10.1037/pro0000567</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flaherty</surname><given-names>HB</given-names> </name></person-group><article-title>Teaching note&#x2014;using technology to enhance experiential learning through simulated role plays</article-title><source>J Soc Work Educ</source><year>2023</year><month>10</month><day>2</day><volume>59</volume><issue>4</issue><fpage>1294</fpage><lpage>1300</lpage><pub-id pub-id-type="doi">10.1080/10437797.2022.2050869</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sadeh-Sharvit</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Rego</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Jefroykin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Peretz</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kupershmidt</surname><given-names>T</given-names> </name></person-group><article-title>A comparison between clinical guidelines and real-world treatment data in examining the use of session summaries: retrospective study</article-title><source>JMIR Form Res</source><year>2022</year><month>08</month><day>16</day><volume>6</volume><issue>8</issue><fpage>e39846</fpage><pub-id pub-id-type="doi">10.2196/39846</pub-id><pub-id pub-id-type="medline">35972782</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shapira</surname><given-names>N</given-names> </name><name name-style="western"><surname>Atzil-Slonim</surname><given-names>D</given-names> </name><name name-style="western"><surname>Tuval Mashiach</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shapira</surname><given-names>O</given-names> </name></person-group><article-title>Measuring linguistic synchrony in psychotherapy</article-title><conf-name>Proceedings of the Eighth Workshop on Computational Linguistics and Clinical Psychology</conf-name><conf-date>Jul 14-15, 2022</conf-date><conf-loc>Seattle, USA</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2022.clpsych-1.14</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atzil-Slonim</surname><given-names>D</given-names> </name><name name-style="western"><surname>Soma</surname><given-names>CS</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> 
</name><name name-style="western"><surname>Paz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Imel</surname><given-names>ZE</given-names> </name></person-group><article-title>Facilitating dyadic synchrony in psychotherapy sessions: systematic review and meta-analysis</article-title><source>Psychother Res</source><year>2023</year><month>09</month><volume>33</volume><issue>7</issue><fpage>898</fpage><lpage>917</lpage><pub-id pub-id-type="doi">10.1080/10503307.2023.2191803</pub-id><pub-id pub-id-type="medline">37001119</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yonatan-Leus</surname><given-names>R</given-names> </name><name name-style="western"><surname>Gwertzman</surname><given-names>G</given-names> </name><name name-style="western"><surname>Tishby</surname><given-names>O</given-names> </name></person-group><article-title>Using machine learning methods to identify trajectories of change and predict responders and non-responders to short-term dynamic therapy</article-title><source>Psychother Res</source><year>2025</year><month>09</month><volume>35</volume><issue>7</issue><fpage>1070</fpage><lpage>1086</lpage><pub-id pub-id-type="doi">10.1080/10503307.2024.2420725</pub-id><pub-id pub-id-type="medline">39461002</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atzil-Slonim</surname><given-names>D</given-names> </name><name name-style="western"><surname>Juravski</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bar-Kalifa</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Using topic models to identify clients&#x2019; functioning levels and alliance ruptures in psychotherapy</article-title><source>Psychotherapy 
(Chic)</source><year>2021</year><month>06</month><volume>58</volume><issue>2</issue><fpage>324</fpage><lpage>339</lpage><pub-id pub-id-type="doi">10.1037/pst0000362</pub-id><pub-id pub-id-type="medline">33734743</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Althoff</surname><given-names>T</given-names> </name><name name-style="western"><surname>Clark</surname><given-names>K</given-names> </name><name name-style="western"><surname>Leskovec</surname><given-names>J</given-names> </name></person-group><article-title>Large-scale analysis of counseling conversations: an application of natural language processing to mental health</article-title><source>Trans Assoc Comput Linguist</source><year>2016</year><volume>4</volume><fpage>463</fpage><lpage>476</lpage><pub-id pub-id-type="medline">28344978</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Brockman</surname><given-names>G</given-names> </name><name name-style="western"><surname>Mcleavey</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Krause</surname><given-names>A</given-names> </name><name name-style="western"><surname>Brunskill</surname><given-names>E</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Engelhardt</surname><given-names>B</given-names> </name><name name-style="western"><surname>Sabato</surname><given-names>S</given-names> </name><name name-style="western"><surname>Scarlett</surname><given-names>J</given-names> </name></person-group><article-title>Robust speech recognition via large-scale weak supervision</article-title><conf-name>Proceedings of the 40th International Conference on Machine Learning</conf-name><conf-date>Jul 23-29, 2023</conf-date><fpage>28492</fpage><lpage>28518</lpage></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bain</surname><given-names>M</given-names> </name><name name-style="western"><surname>Huh</surname><given-names>J</given-names> </name><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zisserman</surname><given-names>A</given-names> </name></person-group><article-title>WhisperX: time-accurate speech transcription of long-form audio</article-title><conf-name>INTERSPEECH 2023</conf-name><conf-date>Aug 20-24, 2023</conf-date><conf-loc>Dublin, Ireland</conf-loc><fpage>4489</fpage><lpage>4493</lpage><pub-id pub-id-type="doi">10.21437/Interspeech.2023-78</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miner</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Haque</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fries</surname><given-names>JA</given-names> </name><etal/></person-group><article-title>Assessing the accuracy of automatic speech recognition for psychotherapy</article-title><source>NPJ Digit Med</source><year>2020</year><volume>3</volume><issue>1</issue><fpage>82</fpage><pub-id 
pub-id-type="doi">10.1038/s41746-020-0285-8</pub-id><pub-id pub-id-type="medline">32550644</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vergez</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kostic</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Audio recording patient-nurse verbal communications in home health care settings: pilot feasibility and usability study</article-title><source>JMIR Hum Factors</source><year>2022</year><month>05</month><day>11</day><volume>9</volume><issue>2</issue><fpage>e35325</fpage><pub-id pub-id-type="doi">10.2196/35325</pub-id><pub-id pub-id-type="medline">35544296</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Graham</surname><given-names>C</given-names> </name><name name-style="western"><surname>Roll</surname><given-names>N</given-names> </name></person-group><article-title>Evaluating OpenAI&#x2019;s Whisper ASR: performance analysis across diverse accents and speaker traits</article-title><source>JASA Express Lett</source><year>2024</year><month>02</month><day>1</day><volume>4</volume><issue>2</issue><fpage>025206</fpage><pub-id pub-id-type="doi">10.1121/10.0024876</pub-id><pub-id pub-id-type="medline">38391582</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Kanda</surname><given-names>N</given-names> </name><name name-style="western"><surname>Dimitriadis</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Han</surname><given-names>KJ</given-names> </name><name name-style="western"><surname>Watanabe</surname><given-names>S</given-names> </name><name name-style="western"><surname>Narayanan</surname><given-names>S</given-names> </name></person-group><article-title>A review of speaker diarization: recent advances with deep learning</article-title><source>Comput Speech Lang</source><year>2022</year><month>03</month><volume>72</volume><fpage>101317</fpage><pub-id pub-id-type="doi">10.1016/j.csl.2021.101317</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Church</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Vopicka</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pelecanos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dimitriadis</surname><given-names>D</given-names> </name><name name-style="western"><surname>Fousek</surname><given-names>P</given-names> </name></person-group><article-title>Speaker diarization: a perspective on challenges and opportunities from theory to practice</article-title><conf-name>2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name><conf-date>Mar 5-9, 2017</conf-date><conf-loc>New Orleans, LA</conf-loc><fpage>4950</fpage><lpage>4954</lpage><pub-id pub-id-type="doi">10.1109/ICASSP.2017.7953098</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Liang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>B</given-names> </name></person-group><article-title>Health natural language processing: methodology development and applications</article-title><source>JMIR Med Inform</source><year>2021</year><month>10</month><day>21</day><volume>9</volume><issue>10</issue><fpage>e23898</fpage><pub-id pub-id-type="doi">10.2196/23898</pub-id><pub-id pub-id-type="medline">34673533</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Elbattah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Arnaud</surname><given-names>&#x00C9;</given-names> </name><name name-style="western"><surname>Gignon</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dequen</surname><given-names>G</given-names> </name></person-group><article-title>The role of text analytics in healthcare: a review of recent developments and applications</article-title><conf-name>Proceedings of the 14th International Joint Conference on Biomedical Engineering Systems and Technologies</conf-name><conf-date>Feb 11-13, 2021</conf-date><conf-loc>Vienna, Austria</conf-loc><pub-id pub-id-type="doi">10.5220/0010414508250832</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bazoge</surname><given-names>A</given-names> </name><name name-style="western"><surname>Morin</surname><given-names>E</given-names> </name><name name-style="western"><surname>Daille</surname><given-names>B</given-names> </name><name name-style="western"><surname>Gourraud</surname><given-names>PA</given-names> 
</name></person-group><article-title>Applying natural language processing to textual data from clinical data warehouses: systematic review</article-title><source>JMIR Med Inform</source><year>2023</year><month>12</month><day>15</day><volume>11</volume><fpage>e42477</fpage><pub-id pub-id-type="doi">10.2196/42477</pub-id><pub-id pub-id-type="medline">38100200</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patel</surname><given-names>KN</given-names> </name><name name-style="western"><surname>Kiran</surname><given-names>P</given-names> </name></person-group><article-title>Preprocessing methods for unstructured healthcare text data</article-title><source>IJITEE</source><year>2019</year><volume>9</volume><issue>2S</issue><fpage>715</fpage><lpage>719</lpage><pub-id pub-id-type="doi">10.35940/ijitee.B1024.1292S19</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sedlakova</surname><given-names>J</given-names> </name><name name-style="western"><surname>Daniore</surname><given-names>P</given-names> </name><name name-style="western"><surname>Horn Wintsch</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Challenges and best practices for digital unstructured data enrichment in health research: a systematic narrative review</article-title><source>PLOS Digit Health</source><year>2023</year><month>10</month><volume>2</volume><issue>10</issue><fpage>e0000347</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000347</pub-id><pub-id pub-id-type="medline">37819910</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yeomans</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Boland</surname><given-names>FK</given-names> </name><name name-style="western"><surname>Collins</surname><given-names>HK</given-names> </name><name name-style="western"><surname>Abi-Esber</surname><given-names>N</given-names> </name><name name-style="western"><surname>Brooks</surname><given-names>AW</given-names> </name></person-group><article-title>A practical guide to conversation research: how to study what people say to each other</article-title><source>Adv Meth Pract Psychol Sci</source><year>2023</year><month>10</month><volume>6</volume><issue>4</issue><pub-id pub-id-type="doi">10.1177/25152459231183919</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atkins</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Steyvers</surname><given-names>M</given-names> </name><name name-style="western"><surname>Imel</surname><given-names>ZE</given-names> </name><name name-style="western"><surname>Smyth</surname><given-names>P</given-names> </name></person-group><article-title>Scaling up the evaluation of psychotherapy: evaluating motivational interviewing fidelity via statistical text classification</article-title><source>Implement Sci</source><year>2014</year><month>04</month><day>24</day><volume>9</volume><issue>1</issue><fpage>49</fpage><pub-id pub-id-type="doi">10.1186/1748-5908-9-49</pub-id><pub-id pub-id-type="medline">24758152</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koole</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Tschacher</surname><given-names>W</given-names> </name></person-group><article-title>Synchrony in psychotherapy: a review and an integrative framework for the therapeutic 
alliance</article-title><source>Front Psychol</source><year>2016</year><volume>7</volume><fpage>862</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2016.00862</pub-id><pub-id pub-id-type="medline">27378968</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>von Elm</surname><given-names>E</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Egger</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pocock</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>G&#x00F8;tzsche</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Vandenbroucke</surname><given-names>JP</given-names> </name></person-group><article-title>The Strengthening the Reporting of Observational Studies in Epidemiology (STROBE) statement: guidelines for reporting observational studies</article-title><source>J Clin Epidemiol</source><year>2008</year><month>04</month><volume>61</volume><issue>4</issue><fpage>344</fpage><lpage>349</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2007.11.008</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="web"><source>difflib&#x2014;helpers for computing deltas</source><access-date>2024-12-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.python.org/3/library/difflib.html">https://docs.python.org/3/library/difflib.html</ext-link></comment></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="web"><source>Langdetect</source><year>2021</year><month>12</month><access-date>2024-12-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://pypi.org/project/langdetect/">https://pypi.org/project/langdetect/</ext-link></comment></nlm-citation></ref><ref 
id="ref41"><label>41</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yuan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liberman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cieri</surname><given-names>C</given-names> </name></person-group><article-title>Towards an integrated understanding of speaking rate in conversation</article-title><conf-name>INTERSPEECH 2006</conf-name><conf-date>Sep 17-21, 2006</conf-date><conf-loc>Pittsburgh, PA, USA</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://www.isca-archive.org/interspeech_2006">https://www.isca-archive.org/interspeech_2006</ext-link></comment><pub-id pub-id-type="doi">10.21437/Interspeech.2006-204</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wardle</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cederbaum</surname><given-names>K</given-names> </name><name name-style="western"><surname>de Wit</surname><given-names>H</given-names> </name></person-group><article-title>Quantifying talk: developing reliable measures of verbal productivity</article-title><source>Behav Res Methods</source><year>2011</year><month>03</month><volume>43</volume><issue>1</issue><fpage>168</fpage><lpage>178</lpage><pub-id pub-id-type="doi">10.3758/s13428-010-0019-y</pub-id><pub-id pub-id-type="medline">21287128</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bird</surname><given-names>S</given-names> </name></person-group><article-title>NLTK: the natural language toolkit</article-title><access-date>2025-01-05</access-date><conf-name>Proceedings of the COLING/ACL on Interactive Presentation 
Sessions</conf-name><conf-date>Jul 17-18, 2006</conf-date><conf-loc>Sydney, Australia</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/P06-4000/">https://aclanthology.org/P06-4000/</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Honnibal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Montani</surname><given-names>I</given-names> </name><name name-style="western"><surname>Landeghem</surname><given-names>S</given-names> </name><name name-style="western"><surname>Boyd</surname><given-names>A</given-names> </name></person-group><article-title>SpaCy: industrial-strength natural language processing in python</article-title><pub-id pub-id-type="doi">10.5281/zenodo.1212303</pub-id><pub-id pub-id-type="medline">7069443</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Colla</surname><given-names>D</given-names> </name><name name-style="western"><surname>Delsanto</surname><given-names>M</given-names> </name><name name-style="western"><surname>Agosto</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vitiello</surname><given-names>B</given-names> </name><name name-style="western"><surname>Radicioni</surname><given-names>DP</given-names> </name></person-group><article-title>Semantic coherence markers: the contribution of perplexity metrics</article-title><source>Artif Intell Med</source><year>2022</year><month>12</month><volume>134</volume><fpage>102393</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2022.102393</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Wolf</surname><given-names>T</given-names> </name><name name-style="western"><surname>Debut</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sanh</surname><given-names>V</given-names> </name><etal/></person-group><article-title>HuggingFace&#x2019;s transformers: state-of-the-art natural language processing</article-title><access-date>2024-12-26</access-date><comment>Preprint posted online in 2019</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1910.03771">https://arxiv.org/abs/1910.03771</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="web"><source>HuggingFace GPT2</source><access-date>2024-12-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/gpt2">https://huggingface.co/gpt2</ext-link></comment></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="web"><source>Amazon&#x2019;s Bedrock docs</source><access-date>2024-12-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.aws.amazon.com/bedrock">https://docs.aws.amazon.com/bedrock</ext-link></comment></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="web"><article-title>Introducing Claude 3.5 Sonnet</article-title><source>Anthropic News</source><year>2024</year><month>06</month><day>21</day><access-date>2025-03-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/news/claude-3-5-sonnet">https://www.anthropic.com/news/claude-3-5-sonnet</ext-link></comment></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="report"><article-title>Anthropic. 
Claude 3.5 Sonnet model card addendum</article-title><year>2024</year><access-date>2025-03-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf">https://www-cdn.anthropic.com/fed9cc193a14b84131812372d8d5857f8f304c52/Model_Card_Claude_3_Addendum.pdf</ext-link></comment></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>H</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name></person-group><article-title>Comparative study of Claude 3.5-Sonnet and human physicians in generating discharge summaries for patients with renal insufficiency: assessment of efficiency, accuracy, and quality</article-title><source>Front Digit Health</source><year>2024</year><volume>6</volume><fpage>1456911</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2024.1456911</pub-id><pub-id pub-id-type="medline">39703756</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="web"><source>Amazon&#x2019;s Bedrock Data Security</source><access-date>2025-02-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.aws.amazon.com/bedrock/latest/userguide/data-protection.html">https://docs.aws.amazon.com/bedrock/latest/userguide/data-protection.html</ext-link></comment></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>The measurement of observer agreement for categorical data</article-title><source>Biometrics</source><year>1977</year><month>03</month><volume>33</volume><issue>1</issue><fpage>159</fpage><pub-id pub-id-type="doi">10.2307/2529310</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Rathor</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jadon</surname><given-names>RS</given-names> </name></person-group><article-title>Domain classification of textual conversation using machine learning approach</article-title><conf-name>2018 9th International Conference on Computing, Communication and Networking Technologies (ICCCNT)</conf-name><conf-date>Jul 10-12, 2018</conf-date><conf-loc>Bangalore</conf-loc><fpage>1</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1109/ICCCNT.2018.8494197</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hopkins</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kalm</surname><given-names>K</given-names> </name></person-group><article-title>Classifying complex documents: comparing bespoke solutions to large language models</article-title><access-date>2025-01-07</access-date><comment>Preprint posted online on  Dec 12, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2312.07182">https://arxiv.org/abs/2312.07182</ext-link></comment></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Edwards</surname><given-names>A</given-names> </name><name name-style="western"><surname>Camacho-Collados</surname><given-names>J</given-names> </name></person-group><article-title>Language models for text classification: is in-context learning enough</article-title><year>2024</year><month>03</month><day>26</day><access-date>2025-03-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.lrec-main.879/">https://aclanthology.org/2024.lrec-main.879/</ext-link></comment></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Guthrie</surname><given-names>D</given-names> </name><name name-style="western"><surname>Guthrie</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wilks</surname><given-names>Y</given-names> </name></person-group><article-title>An unsupervised probabilistic approach for the detection of outliers in corpora</article-title><access-date>2025-03-04</access-date><conf-name>Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC&#x2019;08)</conf-name><conf-date>Mar 28-30, 2008</conf-date><conf-loc>Marrakech, Morocco</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/L08-1109/">https://aclanthology.org/L08-1109/</ext-link></comment></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name></person-group><article-title>Network-based Bag-of-Words model for text 
classification</article-title><source>IEEE Access</source><year>2020</year><volume>8</volume><fpage>82641</fpage><lpage>82652</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2020.2991074</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lubis</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Sawaluddin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Candra</surname><given-names>A</given-names> </name></person-group><article-title>Machine learning model for language classification: Bag-of-Words and multilayer perceptron</article-title><source>JITE</source><year>2023</year><volume>7</volume><issue>1</issue><fpage>356</fpage><lpage>365</lpage><pub-id pub-id-type="doi">10.31289/jite.v7i1.10114</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Juluru</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shih</surname><given-names>HH</given-names> </name><name name-style="western"><surname>Keshava Murthy</surname><given-names>KN</given-names> </name><name name-style="western"><surname>Elnajjar</surname><given-names>P</given-names> </name></person-group><article-title>Bag-of-Words technique in natural language processing: a primer for radiologists</article-title><source>Radiographics</source><year>2021</year><volume>41</volume><issue>5</issue><fpage>1420</fpage><lpage>1426</lpage><pub-id pub-id-type="doi">10.1148/rg.2021210025</pub-id><pub-id pub-id-type="medline">34388050</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chelba</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name><name name-style="western"><surname>Schuster</surname><given-names>M</given-names> </name><etal/></person-group><article-title>One billion word benchmark for measuring progress in statistical language modeling</article-title><conf-name>Interspeech 2014</conf-name><conf-date>Sep 14-18, 2014</conf-date><conf-loc>Singapore</conf-loc><fpage>2635</fpage><lpage>2639</lpage><pub-id pub-id-type="doi">10.21437/Interspeech.2014-564</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jozefowicz</surname><given-names>R</given-names> </name><name name-style="western"><surname>Vinyals</surname><given-names>O</given-names> </name><name name-style="western"><surname>Schuster</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name></person-group><article-title>Exploring the limits of language modeling</article-title><access-date>2025-01-12</access-date><comment>Preprint posted online on  Feb 7, 2016</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/1602.02410">https://arxiv.org/pdf/1602.02410</ext-link></comment></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Miaschi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alzetta</surname><given-names>C</given-names> </name><name name-style="western"><surname>Brunato</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dell&#x2019;Orletta</surname><given-names>F</given-names> </name><name name-style="western"><surname>Venturi</surname><given-names>G</given-names> 
</name></person-group><article-title>Is neural language model perplexity related to readability</article-title><access-date>2025-01-12</access-date><conf-name>Seventh Italian Conference on Computational Linguistics</conf-name><conf-date>Mar 1-3, 2021</conf-date><conf-loc>Milan, Italy</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://ceur-ws.org/Vol-2769/paper_57.pdf">https://ceur-ws.org/Vol-2769/paper_57.pdf</ext-link></comment></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>August</surname><given-names>T</given-names> </name><name name-style="western"><surname>Leroy</surname><given-names>G</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>T</given-names> </name></person-group><article-title>APPLS: evaluating evaluation metrics for plain language summarization</article-title><access-date>2025-01-12</access-date><comment>Preprint posted online on  May 23, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2305.14341">https://arxiv.org/abs/2305.14341</ext-link></comment></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>J</given-names> </name><name name-style="western"><surname>Na</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Rambow</surname><given-names>O</given-names> </name><name name-style="western"><surname>Wanner</surname><given-names>L</given-names> </name><name name-style="western"><surname>Apidianaki</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Al-Khalifa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Di Eugenio</surname><given-names>B</given-names> </name><name name-style="western"><surname>Schockaert</surname><given-names>S</given-names> </name></person-group><article-title>Detecting conversational mental manipulation with intent-aware prompting</article-title><access-date>2025-03-05</access-date><conf-name>Proceedings of the 31st International Conference on Computational Linguistics, Association for Computational Linguistics</conf-name><conf-date>Jan 19-24, 2025</conf-date><conf-loc>Abu Dhabi, United Arab Emirates</conf-loc><fpage>9176</fpage><lpage>9183</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.coling-main.616/">https://aclanthology.org/2025.coling-main.616/</ext-link></comment></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Kuang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ananiadou</surname><given-names>S</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Bouamor</surname><given-names>H</given-names> </name><name name-style="western"><surname>Pino</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bali</surname><given-names>K</given-names> </name></person-group><article-title>Towards interpretable mental health analysis with large language models</article-title><conf-name>Proceedings of the 2023 Conference 
on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 6-10, 2023</conf-date><conf-loc>Singapore</conf-loc><fpage>6056</fpage><lpage>6077</lpage><pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-main.370</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>L</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>F</given-names> </name></person-group><article-title>The imperative of conversation analysis in the era of LLMs: a survey of tasks, techniques, and trends</article-title><access-date>2025-3-5</access-date><comment>Preprint posted online on Sep 21, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2409.14195">https://arxiv.org/abs/2409.14195</ext-link></comment></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Goldwasser</surname><given-names>D</given-names> </name><name name-style="western"><surname>Reese</surname><given-names>LS</given-names> </name></person-group><article-title>Towards understanding counseling conversations: domain knowledge and large language models</article-title><access-date>2025-3-5</access-date><comment>Preprint posted online on Feb 21, 2024</comment><comment><ext-link ext-link-type="uri" 
xlink:href="https://arxiv.org/abs/2402.14200">https://arxiv.org/abs/2402.14200</ext-link></comment></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Long</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>Evaluating large language models in analysing classroom dialogue</article-title><source>NPJ Sci Learn</source><year>2024</year><month>10</month><day>3</day><volume>9</volume><issue>1</issue><fpage>60</fpage><pub-id pub-id-type="doi">10.1038/s41539-024-00273-3</pub-id><pub-id pub-id-type="medline">39358390</pub-id></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Si</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cobas</surname><given-names>O</given-names> </name><name name-style="western"><surname>Fababeir</surname><given-names>M</given-names> </name></person-group><article-title>Lexical error guard: leveraging large language models for enhanced ASR error correction</article-title><source>MAKE</source><year>2024</year><volume>6</volume><issue>4</issue><fpage>2435</fpage><lpage>2446</lpage><pub-id pub-id-type="doi">10.3390/make6040120</pub-id></nlm-citation></ref><ref id="ref71"><label>71</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Cheng</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Improving speaker diarization using semantic 
information: joint pairwise constraints propagation</article-title><access-date>2025-1-7</access-date><comment>Preprint posted online on Sep 19, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2309.10456">https://arxiv.org/abs/2309.10456</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Clean-up words.</p><media xlink:href="formative_v9i1e78082_app1.png" xlink:title="PNG File, 122 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Amazon Bedrock model prices.</p><media xlink:href="formative_v9i1e78082_app2.png" xlink:title="PNG File, 36 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Prompt.</p><media xlink:href="formative_v9i1e78082_app3.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Most common words: before and after filtering.</p><media xlink:href="formative_v9i1e78082_app4.png" xlink:title="PNG File, 403 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Session versus non-session classification results: (A) inter-rater answers distribution and (B) distribution of agreement between the model and the human raters.</p><media xlink:href="formative_v9i1e78082_app5.png" xlink:title="PNG File, 101 KB"/></supplementary-material></app-group></back></article>