<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e85230</article-id><article-id pub-id-type="doi">10.2196/85230</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Automatic Speech Recognition and Acoustic Analysis for Dysarthria Assessment in Telerehabilitation: User-Centered Design and Usability Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Vinet</surname><given-names>Pierre</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dillenbourg</surname><given-names>Pierre</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Slot</surname><given-names>Amelieke</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Selvanayakam</surname><given-names>Sharmila</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Giovanoli</surname><given-names>Sandra</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Du</surname><given-names>Elisa</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cardoso</surname><given-names>Julia</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Branscheidt</surname><given-names>Meret</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Easthope Awai</surname><given-names>Chris</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Bauer</surname><given-names>Christoph Michael</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Therapy Science Lab, Health, Lake Lucerne Institute</institution><addr-line>Rubistrasse 9</addr-line><addr-line>Vitznau</addr-line><country>Switzerland</country></aff><aff id="aff2"><institution>School of Engineering, Section of Microtechniques, &#x00C9;cole Polytechnique F&#x00E9;d&#x00E9;rale de Lausanne</institution><addr-line>Lausanne</addr-line><country>Switzerland</country></aff><aff id="aff3"><institution>School of Computer and Communication Sciences, Computer-Human Interaction Lab for 
Learning &#x0026; Instruction, &#x00C9;cole Polytechnique F&#x00E9;d&#x00E9;rale de Lausanne</institution><addr-line>Lausanne</addr-line><country>Switzerland</country></aff><aff id="aff4"><institution>Center for Neurorehabilitation, Cereneo</institution><addr-line>Vitznau</addr-line><country>Switzerland</country></aff><aff id="aff5"><institution>Data Analytics &#x0026; Rehabilitation Technology (DART), Health, Lake Lucerne Institute</institution><addr-line>Vitznau</addr-line><country>Switzerland</country></aff><aff id="aff6"><institution>Lehre HEST, D-HEST, ETH Zurich</institution><addr-line>Z&#x00FC;rich</addr-line><country>Switzerland</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Stone</surname><given-names>Alicia</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Mendes</surname><given-names>Clarion</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Irfan</surname><given-names>Rizwana</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Christoph Michael Bauer, Prof Dr, Therapy Science Lab, Health, Lake Lucerne Institute, Rubistrasse 9, Vitznau, 6354, Switzerland, 41 79 5272449; <email>christoph.bauer@llui.org</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>3</day><month>7</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e85230</elocation-id><history><date date-type="received"><day>03</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>08</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>15</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Pierre Vinet, Pierre Dillenbourg, Amelieke Slot, Sharmila Selvanayakam, Sandra Giovanoli, Elisa Du, Julia Cardoso, Meret Branscheidt, 
Chris Easthope Awai, Christoph Michael Bauer. Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 3.7.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e85230"/><abstract><sec><title>Background</title><p>Dysarthria is a frequent motor speech disorder following a stroke, affecting up to 42% of survivors and resulting in reduced speech intelligibility and diminished quality of life. Clinical assessments, such as the Frenchay Dysarthria Assessment, Second Edition (FDA-2), rely heavily on the subjective judgment of speech-language pathologists (SLPs), which limits comparability and scalability. Telepractice solutions have the potential to extend access to care, but validated digital tools that combine automatic analysis with clinically usable interfaces remain scarce.</p></sec><sec><title>Objective</title><p>This study aimed to develop and evaluate a web-based application that integrates automatic speech recognition (ASR) and acoustic analysis into a user-centered dashboard for SLPs. 
Specifically, we investigated: (1) whether ASR can provide intelligibility scores comparable to those of human listeners; (2) the usability of the system in 2 iterative cycles with SLPs; and (3) the feasibility of presenting clinically relevant acoustic features to support telerehabilitation.</p></sec><sec sec-type="methods"><title>Methods</title><p>A user-centered design process was followed, involving contextual inquiry, requirements gathering, prototype development, and iterative testing with SLPs. The analytical core of the prototype included an ASR module (Whisper Large-v3) to compute intelligibility scores, combining word error rate&#x2013;based accuracy with sentence-level and word-level alignment. Phoneme-level error highlighting was implemented to identify frequent substitution or deletion patterns. In parallel, an acoustic module extracted clinically relevant measures, including fundamental frequency (mean and range), intensity (mean and variability), and vowel formants (F1&#x2013;F2 space), supplemented by sustained phonation duration. A pilot validation compared ASR-based intelligibility scores with transcriptions from 8 lay listeners for 3 patients with dysarthria performing the Frenchay Dysarthria Assessment&#x2013;2 word and sentence tasks. Usability was evaluated in 2 cycles with 8 and 4 SLPs, respectively, using the System Usability Scale and structured questionnaires.</p></sec><sec sec-type="results"><title>Results</title><p>In the pilot validation, ASR performance was comparable to, and in some cases better than, untrained human listeners for individuals with mild and moderate dysarthria, though performance declined with severe cases. Both usability cycles yielded excellent System Usability Scale scores (cycle 1: mean 88.4, SD 4.6; cycle 2: mean 91.7, SD 4.1). Core workflow elements, including navigation, session upload, and intelligibility score presentation, were consistently rated highly. 
Feedback evolved from bug reports and requests for clearer terminology in cycle 1 to suggestions for advanced analytic features in cycle 2, such as additional voice-quality indices and integrated note-taking.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The prototype demonstrates that automatic intelligibility scoring and acoustic analysis can be integrated into a clinically usable, web-based dashboard. While current limitations include reliance on English-only phoneme analysis, limited advanced acoustic features, and lack of regulatory compliance, the application achieved excellent usability and shows promise for scalable telerehabilitation. Future work should expand multilingual support, incorporate additional acoustic measures, and validate the tool in larger clinical cohorts.</p></sec></abstract><kwd-group><kwd>speech and language therapy</kwd><kwd>user-centered design</kwd><kwd>telerehabilitation</kwd><kwd>dysarthria</kwd><kwd>web application</kwd><kwd>automatic speech recognition</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Dysarthria is a neuromotor speech disorder resulting from neurological damage and is present across many neurological diseases, including stroke, cerebral palsy, and amyotrophic lateral sclerosis [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. It affects the speed, strength, accuracy, range, tone, or duration of the movements required for speech control and commonly reduces speech intelligibility [<xref ref-type="bibr" rid="ref1">1</xref>], substantially impacting participation, psychosocial well-being, and quality of life [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. 
Clinical assessment tools, such as the Frenchay Dysarthria Assessment, Second Edition (FDA-2), one of the most widely used assessments across different clinical systems [<xref ref-type="bibr" rid="ref5">5</xref>], rely heavily on subjective judgment, leading to interrater and intrarater variability in some items [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. This limits comparability across raters and over time.</p><p>Speech-language pathologists (SLPs) play a vital role in diagnosing and treating dysarthria [<xref ref-type="bibr" rid="ref9">9</xref>], yet their reach is constrained by time, geography, and patient accessibility. Telerehabilitation, which involves delivering rehabilitation services remotely, has emerged as a scalable, clinically endorsed solution to bridge this gap, with professional guidance and reimbursement pathways increasingly being established [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. By enabling remote assessment and treatment, telerehabilitation can extend therapeutic support to underserved regions and improve continuity of care for individuals who experienced stroke.</p><p>Integrating acoustic analysis based on automatic speech recognition (ASR) into telerehabilitation platforms opens the possibility of continuous, noninvasive speech monitoring in naturalistic settings and timely feedback to patients. Advancements in signal processing and large self-supervised &#x201C;audio or language&#x201D; models now enable robust extraction of acoustic features (eg, fundamental frequency, intensity range, and speaking rate) and interpretation of speech. 
Such measures serve as quantifiable indicators of speech quality and progression, specifically the severity of dysarthria, providing data-driven support to SLPs and researchers during evaluation [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. Modern representation-learning approaches [<xref ref-type="bibr" rid="ref16">16</xref>] further strengthen automated analyses and downstream assessment pipelines [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. However, the integration of these advances into clinical practice has not kept pace, creating a mismatch between technological potential and clinical uptake that motivates this study. Despite these advances, adoption in routine care remains limited. Many available tools require installing desktop software and substantial technical training, have limited usability, fit poorly within clinical workflows, and lack standardized, shareable digital outcome measures. Furthermore, patient privacy needs and data protection concerns must be addressed. As a result, human perceptual judgment continues to serve as the de facto reference standard [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>This study addresses this gap by developing a web-based tool that integrates ASR and acoustic analysis. While initially designed with stroke-related dysarthria in mind [<xref ref-type="bibr" rid="ref22">22</xref>], the underlying analytical framework is intended to be generalizable across dysarthria etiologies. 
Specifically, we examined: (1) whether ASR can achieve comparable performance to human listeners in intelligibility assessment; (2) the usability of the tool for SLPs in a formative evaluation; and (3) implications for telerehabilitation and long-term monitoring.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>A user-centered design approach guided the project from early requirements gathering to final prototype validation [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Initial requirements were collected before the frontend design to match the SLPs&#x2019; requirements for a web-based digitized speech and language assessment [<xref ref-type="bibr" rid="ref22">22</xref>]. This study consisted of a three-step approach: (1) needs and context analysis, including specifying the context of use and gathering user requirements; (2) preliminary testing; and (3) end-user testing and refinement. The 2 empirical components&#x2014;prototype development and preliminary testing, and end-user testing and refinement&#x2014;are reported in the &#x201C;Results&#x201D; section. Two iterative usability testing cycles were conducted to identify and address initial usability issues and to evaluate the effectiveness of implemented improvements in a refined prototype.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Patient data were obtained from patients with stroke and from the publicly available TORGO database, which includes individuals with cerebral palsy and amyotrophic lateral sclerosis and permits academic use [<xref ref-type="bibr" rid="ref25">25</xref>]. All procedures adhered to ethical guidelines and regulations. Approvals were obtained from the Ethics Committee of Northwestern and Central Switzerland (Req-2024&#x2010;00103 for usability testing, R-2025&#x2010;00538 for voice recordings). Written informed consent was obtained from all participants. 
The participants did not receive compensation.</p></sec><sec id="s2-3"><title>User-Centered Design</title><sec id="s2-3-1"><title>Needs and Context Analysis</title><sec id="s2-3-1-1"><title>Needs Assessment and Context Analysis</title><p>The iSpeak system was initially developed for SLPs conducting assessments of dysarthria in individuals who experienced stroke [<xref ref-type="bibr" rid="ref22">22</xref>]. To capture dysarthria across neurological conditions more broadly, this study used the TORGO database [<xref ref-type="bibr" rid="ref25">25</xref>]. Nine SLPs were recruited, and participant demographics are summarized in <xref ref-type="table" rid="table1">Table 1</xref>. A purposive sampling method was used, based on the assumption that each participant would offer unique insights [<xref ref-type="bibr" rid="ref26">26</xref>]. Because the participants&#x2019; roles were not interchangeable, the sample size was guided by data saturation rather than statistical power analysis [<xref ref-type="bibr" rid="ref26">26</xref>]. The SLPs contributed through preliminary meetings, focus groups, prototype testing, and feedback.</p><p>The following 2 methods informed contextual understanding:</p><list list-type="bullet"><list-item><p>Work shadowing sessions (n=6, 30&#x2010;45 min each), including observation of live telerehabilitation sessions and mock sessions where 1 researcher acted as the patient, revealed workflow challenges and opportunities for automation. Field notes from the work shadowing sessions were summarized descriptively and reviewed to identify recurring workflow steps, contextual constraints, and opportunities for automation using Microsoft Excel.</p></list-item><list-item><p>Two focus group interviews with a total of 6 SLPs, who practiced telerehabilitation with a mean experience of 4.6 years, were conducted to explore assessment practices, workflow bottlenecks, and requirements for automated tools. 
Focus group responses were summarized descriptively by the research team and reviewed to identify recurring needs, workflow barriers, and potential requirements for the prototype; no dedicated qualitative analysis software was used for this step. During the focus groups, the SLPs emphasized the need for asynchronous use. Typical practice involved conducting a live therapy session via videoconferencing, followed by post hoc review and analysis using the application.</p></list-item></list><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Overview of participants in study phases.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Phase</td><td align="left" valign="bottom">Participants</td><td align="left" valign="bottom">Region</td><td align="left" valign="bottom">Sex (female), n</td><td align="left" valign="bottom">Work experience (y), mean (SD; range)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Needs and context analysis</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Work shadowing</td><td align="left" valign="top">3 patients and 3 mock sessions</td><td align="left" valign="top">Europe (4)&#x2013;Arabic Peninsula (1)</td><td align="left" valign="top">3</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Focus group interviews</td><td align="left" valign="top">6 SLPs<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">Europe (4)&#x2013;Arabic Peninsula (1)</td><td align="left" valign="top">5</td><td align="left" valign="top">4.6 (2.1; 3-6)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>User requirements</td><td align="left" valign="top">6 SLPs</td><td align="left" valign="top">Europe (4)&#x2013;Arabic Peninsula (1)</td><td align="left" valign="top">5</td><td align="left" valign="top">4.6 (2.1; 3-6)</td></tr><tr><td align="left" valign="top" colspan="5">Preliminary testing</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Pilot validation</td><td align="left" valign="top">8 lay users</td><td align="left" valign="top">Europe (8)</td><td align="left" valign="top">5</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5">End-user testing and refinement</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Usability study cycle 1</td><td align="left" valign="top">8 SLPs</td><td align="left" valign="top">Europe (7)&#x2013;United States (1)</td><td align="left" valign="top">8</td><td align="left" valign="top">7.56 (3.2; 3-26)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Usability study cycle 2</td><td align="left" valign="top">5 SLPs</td><td align="left" valign="top">Europe (4)&#x2013;Arabic Peninsula (1)</td><td align="left" valign="top">5</td><td align="left" valign="top">4.3 (2.0; 3-6)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Not applicable.</p></fn><fn id="table1fn2"><p><sup>b</sup>SLP: speech and language pathologist.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3-1-2"><title>User Requirements</title><p>Requirements were identified from observations and focus groups, summarized by the research team, and classified into functional and nonfunctional categories (Tables S6-S8 
in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The requirements were then prioritized according to feasibility and perceived importance for SLPs using Microsoft Excel; functional requirements described expected features (eg, review of recordings, intelligibility scoring, and progress tracking), while nonfunctional requirements addressed qualities such as speed, reliability, and security. Prioritization considered both feasibility and user importance. These requirements informed prototype design and evaluation (Tables S6-S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-3-1-3"><title>Design and Prototype Development</title><p>The prototype design was created in Figma using Shadcn component libraries to ensure consistency and rapid iteration. Designs were validated with SLPs before implementation. The final prototype comprised a backend (Python, Python Software Foundation; FastAPI, Sebasti&#x00E1;n Ram&#x00ED;rez [known online as @tiangolo]; PostgreSQL, PostgreSQL Global Development Group) hosted on a Swiss VPS, and a frontend (Next.js, Vercel Inc; Tailwind CSS, Tailwind Labs Inc) deployed via Vercel, Vercel Inc. These implementation details are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. <xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the prototype's backend modules for the analysis end point. The prototype's style guide is illustrated in Figure S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the backend modules for the analysis end point. 
ASR: automatic speech recognition.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e85230_fig01.png"/></fig></sec></sec></sec><sec id="s2-4"><title>Evaluation of the Prototype</title><p>The prototype was evaluated in 2 stages: (1) prototype development and preliminary testing with lay users to detect usability issues and bugs and to assess the technical validity of ASR-based intelligibility scoring; and (2) end-user testing and refinement with SLPs across 2 iterative cycles, during which SLPs evaluated usability, workflow integration, and clinical relevance.</p></sec><sec id="s2-5"><title>Preliminary Testing</title><p>Local community members familiar with laptops explored the prototype freely, uploading recordings and navigating the dashboard. A mock session ensured comparability. Minor issues (eg, broken links and unclear navigation) were corrected before expert testing.</p><sec id="s2-5-1"><title>Participants</title><p>Eight lay listeners were recruited from the faculty staff. None had training in speech-language pathology, but all possessed advanced English proficiency. They were selected as proxies for na&#x00EF;ve intelligibility judgments, which are commonly used in dysarthria research to approximate real-world listener understanding rather than relying on expert clinical ratings [<xref ref-type="bibr" rid="ref27">27</xref>]. Three patients with stroke with varying dysarthria severity were recruited from a neurorehabilitation hospital and provided speech material. Each had completed the FDA-2 reading tasks (10 words and 10 sentences) [<xref ref-type="bibr" rid="ref5">5</xref>].</p></sec><sec id="s2-5-2"><title>Materials and Procedure</title><p>Recordings were processed using OpenAI Whisper (Large-v3), configured for English with a temperature of 0.0 [<xref ref-type="bibr" rid="ref28">28</xref>]. Participants listened via a laptop and transcribed the speech without access to reference texts. 
Both lay transcriptions and ASR output were normalized using the same procedures and compared against reference texts. Performance was quantified by word error rate (WER), with lower WER indicating higher intelligibility. The results were visualized using boxplots.</p></sec></sec><sec id="s2-6"><title>End-User Testing and Refinement</title></sec><sec id="s2-7"><title>Usability Testing and Refinement</title><sec id="s2-7-1"><title>Overview</title><p>Two structured cycles followed a standardized protocol. Each participant received access credentials, a user manual with screenshots, and sample patient data (from TORGO). Usability was assessed through a questionnaire that included open-ended questions, Likert scales, and the System Usability Scale (SUS) [<xref ref-type="bibr" rid="ref29">29</xref>]. Anonymous web analytics complemented self-reports by capturing interaction patterns and navigation behavior. Likert-scale questionnaire items and SUS scores were analyzed descriptively by calculating item-level and overall mean scores. Open-ended questionnaire responses were summarized and categorized according to their content using Microsoft Excel. Anonymous web analytics were inspected descriptively to identify navigation patterns, task completion issues, and dead clicks. After each cycle, open-ended feedback and observed usability issues were categorized into bugs, improvements, and feature requests, which were used to guide the next refinement step. Critical issues were fixed before the next cycle. 
The analytical pipeline was designed to be condition-agnostic, operating on speech signal characteristics rather than disease-specific features, thereby supporting potential generalization across different dysarthria etiologies.</p><p>Results of the 2 cycles, including SUS scores and qualitative feedback, are presented in the &#x201C;Results&#x201D; section.</p></sec><sec id="s2-7-2"><title>Participants</title><p>Eight SLPs who practiced telerehabilitation were recruited through an established professional network to participate in 2 usability cycles, with a mean of 7.57 years of clinical practice. The SLPs used the application at their workplaces or homes. The SLPs received audio recordings and reference transcripts from the TORGO database [<xref ref-type="bibr" rid="ref25">25</xref>] to ensure compliance with health care patient data regulations while providing realistic clinical testing scenarios.</p></sec></sec><sec id="s2-8"><title>The Prototype</title><sec id="s2-8-1"><title>Preprocessing</title><p>Recordings were preprocessed by extracting audio, applying a band-pass filter (80 Hz-8 kHz), normalizing the signal amplitude, and resampling to 16 kHz mono. These steps ensured consistency across sessions and compliance with model requirements [<xref ref-type="bibr" rid="ref28">28</xref>].</p></sec><sec id="s2-8-2"><title>Transcription and Normalization</title><p>Whisper Large-v3 produced transcriptions, which, together with reference texts, were normalized (lowercased, punctuation removed, numbers expanded, and contractions expanded) to ensure scoring reflected content rather than formatting [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>].</p></sec><sec id="s2-8-3"><title>Error Metrics</title><p>WER and character error rate were calculated using the standard edit-distance (Levenshtein) algorithm. WER is defined as the sum of substitutions, deletions, and insertions divided by the total number of reference words. 
Character error rate follows the same logic at the character level. Detailed equations are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref31">31</xref>].</p></sec><sec id="s2-8-4"><title>Alignment</title><p>To compute sentence-level scores, a global alignment between concatenated reference and transcription strings was performed. Insertions, deletions, and substitutions were marked, and then boundaries were adjusted back to individual sentences. This allowed for per-sentence WER and alignment visualizations highlighting correctly and incorrectly recognized words.</p></sec><sec id="s2-8-5"><title>Intelligibility Score</title><p>Intelligibility refers to how much of a speaker&#x2019;s intended message is understood by a listener and is commonly operationalized in dysarthria research through listener transcription accuracy at the word or sentence level [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. In FDA-2 scoring, each item is judged as correct or incorrect. 
We implemented two scoring modes:</p><list list-type="bullet"><list-item><p>Binary scoring (FDA-2 standard) [<xref ref-type="bibr" rid="ref5">5</xref>]: words and sentences are scored as correct or incorrect.</p></list-item><list-item><p>Word-level scoring: sentence scores are calculated from the average word-level accuracy.</p></list-item></list><p>The final intelligibility score (IS) was defined as follows:</p><p><inline-formula><mml:math id="ieqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>I</mml:mi><mml:mi>S</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>W</mml:mi><mml:mi>E</mml:mi><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mtext>avg</mml:mtext></mml:mrow></mml:msub></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>(1)</p><p>where WER<sub>avg</sub> is the average WER across tasks. Global scores combine word and sentence results as follows:</p><p><inline-formula><mml:math id="ieqn2"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>I</mml:mi><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>&#x2217;</mml:mo><mml:mi>I</mml:mi><mml:msub><mml:mi>S</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>N</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mo>&#x2217;</mml:mo><mml:mi>I</mml:mi><mml:msub><mml:mi>S</mml:mi><mml:mi>s</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>N</mml:mi><mml:mi>s</mml:mi></mml:msub></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>(2)</p><p>where <italic>IS<sub>w</sub></italic> and <italic>IS<sub>s</sub></italic> are word and sentence intelligibility scores, 
and <italic>N<sub>w</sub></italic> and <italic>N<sub>s</sub></italic> are their respective counts.</p></sec><sec id="s2-8-6"><title>Phoneme-Level Analysis</title><p>Phonemes are the smallest contrastive speech sound units that can distinguish meaning within a language; phoneme-level analysis, therefore, focuses on the sound structure of words rather than on the listener&#x2019;s global understanding of the message [<xref ref-type="bibr" rid="ref34">34</xref>]. Reference words were converted to phoneme sequences using CMUdict and mapped to the International Phonetic Alphabet. Errors were classified as substitutions or deletions, aggregated, and ranked by frequency and word position. This analysis provided SLPs with clinically relevant insights for tailoring therapy.</p></sec><sec id="s2-8-7"><title>Acoustic Feature Extraction</title><p>Acoustic features were extracted to provide objective, interpretable measures of speech. Based on clinical input, 3 families were prioritized as follows [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>]:</p><list list-type="bullet"><list-item><p>Fundamental frequency (F0): mean, range, and variability.</p></list-item><list-item><p>Intensity: mean and SD.</p></list-item><list-item><p>Formants (F1, F2): vowel-space measures that indicate articulatory precision.</p></list-item></list><p>Maximum phonation time was derived from sustained /a/ tasks. These features capture prosodic control, respiratory support, and articulation. Sex-related differences in vocal tract anatomy, which influence formant frequencies (F1 and F2), were not controlled for in this study. Extraction was performed using Parselmouth, a Python interface to Praat, integrated into the backend pipeline.</p></sec><sec id="s2-8-8"><title>Statistical Methods Used for Analyzing Acoustic Features</title><p>The extracted acoustic features were computed using Parselmouth and summarized descriptively to support clinical interpretation. 
No inferential statistical analyses were performed on these features, as the study focused on the feasibility and usability of the analytical pipeline rather than a hypothesis-driven evaluation of acoustic measures.</p></sec><sec id="s2-8-9"><title>Dashboard and Visualization</title><p>The web application dashboard was designed to present results clearly to SLPs. Key components included (1) circular progress indicators showing intelligibility percentages; (2) bar charts of phoneme error frequencies, color-coded by word position; and (3) side-by-side alignment displays highlighting correctly recognized (green) versus incorrect (red) words.</p><p><xref ref-type="fig" rid="figure2">Figures 2</xref><xref ref-type="fig" rid="figure3"/>-<xref ref-type="fig" rid="figure4">4</xref> illustrate the interface components and analytics dashboards.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Analysis and segment selection (of 1 TORGO database patient). FDA-2: Frenchay Dysarthria Assessment, Second Edition.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e85230_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Intelligibility score and phoneme analysis (left) and detailed phoneme analysis page (right) of a patient from the TORGO database.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e85230_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Acoustic analysis: acoustic base frequency (top), acoustic intensity (middle), and acoustic formants (bottom) of a patient from the TORGO database. 
F0: acoustic base frequency, F1-F5: acoustic formants.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e85230_fig04.png"/></fig></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p><xref ref-type="table" rid="table1">Table 1</xref> provides an overview of the study phases and participant demographics. All SLPs who took part in the study practice telerehabilitation.</p><sec id="s3-1"><title>Preliminary Testing</title><p>Eight lay users (<xref ref-type="table" rid="table1">Table 1</xref>) transcribed audio recordings from 3 patients with dysarthria to validate the accuracy of the automatic intelligibility assessment. The recordings included the FDA-2 words and sentences tasks. Performance was quantified using WER, with lower values indicating greater accuracy.</p><p>The ASR system (Whisper Large-v3) performed comparably to, and in some cases better than, untrained listeners. For mild dysarthria, ASR achieved lower median WERs than most participants. For moderate and severe dysarthria, ASR errors increased but remained within the variability range of human listeners. 
Boxplots of WER distributions are shown in <xref ref-type="fig" rid="figure5">Figure 5</xref>.</p><p>These findings confirm that ASR can approximate human perceptual judgments of intelligibility and support its integration into the prototype.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Word error rate (WER) comparison between lay user transcriptions and ASR transcriptions for the word (A) and sentences (B) tasks; ASR: automatic speech recognition system.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="formative_v10i1e85230_fig05.png"/></fig></sec><sec id="s3-2"><title>End-User Testing and Refinement-Cycle 1</title><p>Eight SLPs participated in the first usability cycle (mean clinical experience 7.56, SD 3.2 y; <xref ref-type="table" rid="table1">Table 1</xref>). Most accessed the application at the workplace, while 2 participated from home, using a mix of Chrome, Firefox, and Edge.</p><p>The mean SUS score was 88.4 (SD 4.6), which is considered excellent [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. Average ratings across survey items were 4.26/5 (<xref ref-type="table" rid="table2">Table 2</xref>). Core workflow elements&#x2014;including navigation, uploading sessions, segment selection, and processing time&#x2014;scored between 4.50 and 4.63. Visual design received the highest rating (mean 4.88, SD 0.35). Understanding the intelligibility outputs scored slightly lower: the side-by-side transcription view was rated 4.50, and the average intelligibility score clarity was 4.13 (SD 0.99). The lowest ratings were for the perceived accuracy of the automatic intelligibility score (mean 3.13, SD 1.25) and phoneme-level error highlighting (mean 3.88, SD 1.25). 
Qualitative feedback (<xref ref-type="table" rid="table3">Table 3</xref>; Table S9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) clustered into 3 categories:</p><list list-type="bullet"><list-item><p>Fixes or bugs: reliability and alignment issues (failed uploads, shifting regions, and missed words).</p></list-item><list-item><p>Usability improvements: clearer wording in the user interface, simultaneous listening and text entry, and time markers.</p></list-item><list-item><p>Feature requests: phonetic equivalence in intelligibility scoring, and pause analysis.</p></list-item></list><p>These results confirmed high usability while identifying areas for refinement before the second cycle.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>End-user testing and refinement&#x2014;cycle 1.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Survey question</td><td align="left" valign="bottom">SLP<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> score (out of 5), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">The web application met my expectations.</td><td align="left" valign="top">3.88 (1.13)</td></tr><tr><td align="left" valign="top">The navigation within the app was easy and logical.</td><td align="left" valign="top">4.50 (0.76)</td></tr><tr><td align="left" valign="top">The design of the app (colors, fonts, layout) was appealing and professional.</td><td align="left" valign="top">4.88 (0.35)</td></tr><tr><td align="left" valign="top">The session upload process was straightforward</td><td align="left" valign="top">4.63 (0.74)</td></tr><tr><td align="left" valign="top">Selecting audio segments was intuitive</td><td align="left" valign="top">4.63 (0.52)</td></tr><tr><td align="left" valign="top">The time required for upload and selection was acceptable</td><td align="left" valign="top">4.50 (0.53)</td></tr><tr><td 
align="left" valign="top">How easy was it to select audio segments and upload your video or audio session?</td><td align="left" valign="top">4.50 (1.41)</td></tr><tr><td align="left" valign="top">The automatic intelligibility score seemed accurate</td><td align="left" valign="top">3.13 (1.25)</td></tr><tr><td align="left" valign="top">The intelligibility score was easy to understand</td><td align="left" valign="top">4.13 (0.99)</td></tr><tr><td align="left" valign="top">The side-by-side view of patients&#x2019; transcription versus reference was clear</td><td align="left" valign="top">4.50 (0.76)</td></tr><tr><td align="left" valign="top">The phoneme-level error highlighting was intuitive</td><td align="left" valign="top">3.88 (1.25)</td></tr><tr><td align="left" valign="top">How easy was the intelligibility score to understand?</td><td align="left" valign="top">4.00 (2.51)</td></tr><tr><td align="left" valign="top">Overall mean across questions</td><td align="left" valign="top">4.26 (0.58)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>SLP: speech and language pathologist.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>End-user testing and refinement&#x2014;cycle 1.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ID<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">Representative SLP<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> feedback</td><td align="left" valign="bottom">Category</td></tr></thead><tbody><tr><td align="left" valign="top">I01-002</td><td align="left" valign="top">First files did not upload.</td><td align="left" valign="top">Fix or bug</td></tr><tr><td align="left" valign="top">I01-013</td><td align="left" valign="top">The selected region shifted right when I extended the clip.</td><td align="left" valign="top">Fix or bug</td></tr><tr><td 
align="left" valign="top">I01-004</td><td align="left" valign="top">The term &#x201C;misspelled&#x201D; is confusing because it suggests the SLP made the error.</td><td align="left" valign="top">Improvement</td></tr><tr><td align="left" valign="top">I01-005</td><td align="left" valign="top">I want to listen to the segment while I enter the reference words.</td><td align="left" valign="top">Improvement</td></tr><tr><td align="left" valign="top">I01-007</td><td align="left" valign="top">Show minutes/seconds under the clip to guide word and sentence selection.</td><td align="left" valign="top">Improvement</td></tr><tr><td align="left" valign="top">I01-019</td><td align="left" valign="top">Count phonetic equivalents as correct in the intelligibility calculation.</td><td align="left" valign="top">Feature</td></tr><tr><td align="left" valign="top">I01-018</td><td align="left" valign="top">Analyze the duration of pauses between words to reflect prosody issues.</td><td align="left" valign="top">Feature</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>ID: feedback identifier.</p></fn><fn id="table3fn2"><p><sup>b</sup>SLP: speech and language pathologist.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>End-User Testing and Refinement-Cycle 2</title><p>Five SLPs participated in the second usability cycle (mean clinical experience 4.3, SD 2.0 y; <xref ref-type="table" rid="table1">Table 1</xref>). The testing environments and browsers were the same as those used in cycle 1.</p><p>The mean SUS score increased to 91.7 (SD 4.1), which is again considered excellent [<xref ref-type="bibr" rid="ref38">38</xref>]. Survey results (<xref ref-type="table" rid="table4">Table 4</xref>) showed consistently high ratings. Navigation and the new acoustic analysis page both received a score of 4.75, indicating that participants found the workflow intuitive and the additional analysis features useful. 
Frequency and intensity graphs were rated positively (4.33 each), while the formants graph was rated lower (mean 3.66, SD 0.65), suggesting limited clarity or perceived clinical utility. Overall expectations were rated at a mean of 4.20 (SD 0.42).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>End-user testing and refinement&#x2013;cycle 2.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Survey question</td><td align="left" valign="bottom">SLP<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> score (out of 5), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">I found the fundamental frequency graph useful.</td><td align="left" valign="top">4.33 (0.57)</td></tr><tr><td align="left" valign="top">I found the intensity graph useful.</td><td align="left" valign="top">4.33 (0.89)</td></tr><tr><td align="left" valign="top">I found the formants graph useful.</td><td align="left" valign="top">3.66 (0.65)</td></tr><tr><td align="left" valign="top">How easy was the acoustic analysis page to understand?</td><td align="left" valign="top">4.75 (0.41)</td></tr><tr><td align="left" valign="top">The web application met my expectations</td><td align="left" valign="top">4.20 (0.42)</td></tr><tr><td align="left" valign="top">The navigation within the app was easy and logical.</td><td align="left" valign="top">4.75 (0.78)</td></tr><tr><td align="left" valign="top">Overall mean across questions</td><td align="left" valign="top">4.34 (0.40)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>SLP: speech and language pathologist.</p></fn></table-wrap-foot></table-wrap><p>Representative feedback (<xref ref-type="table" rid="table5">Table 5</xref>) emphasized advanced feature requests. 
Suggestions included the option to add notes to assessment results, integration of advanced voice-quality indices (eg, Cepstral Peak Prominence [CPPS] and Acoustic Voice Quality Index [AVQI]), and audio playback directly from result pages. These comments indicate readiness for clinical refinement, with priorities shifting toward advanced analysis rather than fundamental usability.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>End-user testing and refinement&#x2013;cycle 2.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ID<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="bottom">Representative SLP<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> feedback</td><td align="left" valign="bottom">Category</td></tr></thead><tbody><tr><td align="left" valign="top">I02-002</td><td align="left" valign="top">Ability to add notes to an assessment result. It should be possible to comment on patient behavior (eg, &#x201C;struggled with breath here&#x201D;).</td><td align="left" valign="top">Feature</td></tr><tr><td align="left" valign="top">I01-013</td><td align="left" valign="top">Inclusion of voice-quality indices (eg, CPPS<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup> and AVQI<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup>) was suggested to provide additional clinical insight.</td><td align="left" valign="top">Feature</td></tr><tr><td align="left" valign="top">I01-004</td><td align="left" valign="top">Playback of the analyzed audio directly from the Results page was requested to allow comparison between perceived audio and computed metrics.</td><td align="left" valign="top">Feature</td></tr><tr><td align="left" valign="top">I01-005</td><td align="left" valign="top">Ability to listen to the uploaded audio on the Results page was requested, especially to review the alignment between the reference text and the 
transcription.</td><td align="left" valign="top">Feature</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>ID: feedback identifier.</p></fn><fn id="table5fn2"><p><sup>b</sup>SLP: speech and language pathologist.</p></fn><fn id="table5fn3"><p><sup>c</sup>CPPS: Cepstral Peak Prominence.</p></fn><fn id="table5fn4"><p><sup>d</sup>AVQI: Acoustic Voice Quality Index.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Summary of Results</title><p>Across all 3 substudies, the iSpeak prototype demonstrated promising performance and usability. The pilot validation confirmed that ASR achieved accuracy comparable to or greater than that of untrained human listeners across varying severities of dysarthria. Both usability cycles with SLPs yielded excellent SUS scores (&#x003E;88), with cycle 2 reaching a mean of 91.7 (SD 4.1) after refinement. Core workflows were consistently rated highly, and feedback evolved from identifying technical issues in cycle 1 to requesting advanced analytical features in cycle 2. Together, these results indicate that the system is both usable and potentially clinically relevant, with future development focused on enhanced analytic capabilities and broader validation.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study evaluated a web-based application designed to support SLPs in assessing dysarthric speech in patients after stroke. Developed using a user-centered design framework in accordance with the International Organization for Standardization (ISO) 9241&#x2010;210, the prototype integrates ASR and acoustic analysis into a clinical dashboard. A pilot validation compared ASR with lay listeners, followed by 2 iterative usability cycles with SLPs. 
The findings demonstrate that ASR can approximate human performance in intelligibility assessment and that the application was consistently rated as highly usable, with SUS scores above 85 in both cycles. Feedback confirmed the value of automatic intelligibility scoring and acoustic analysis while identifying priorities for further development.</p></sec><sec id="s4-2"><title>Preliminary Testing</title><p>The pilot validation provided initial evidence of how ASR compares with human listeners on dysarthric speech. Across tasks, word lists produced higher WERs than sentence lists, reflecting the contextual advantage of sentences [<xref ref-type="bibr" rid="ref27">27</xref>]. Importantly, lay listeners were not native English speakers, which may have introduced bias, as subtle dysarthric articulations could be more difficult to interpret for nonnative listeners [<xref ref-type="bibr" rid="ref40">40</xref>]. This may have led to an underestimation of intelligibility relative to trained or native listeners [<xref ref-type="bibr" rid="ref41">41</xref>].</p><p>Patient-level differences were evident. For &#x201C;Patient 001,&#x201D; ASR outperformed most lay listeners across tasks, suggesting that the system could handle this speech relatively well. For &#x201C;Patient 003,&#x201D; ASR performance was competitive, especially in the sentences task. &#x201C;Patient 002&#x201D; posed the greatest challenge: lay listeners showed high variability in WERs for the word task, and ASR performed worse than all listeners in the sentences task. These findings highlight how severity and individual variability in dysarthria strongly affect ASR performance [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>].</p><p>Although Whisper achieves state-of-the-art performance on typical speech, its accuracy declines with dysarthric input. 
Stroke-related dysarthria alters pronunciation, speech rate, and voice quality in diverse ways. Because such data are underrepresented in training corpora, ASR robustness remains limited [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. Nonetheless, pilot results indicate that Whisper can sometimes match or surpass untrained human listeners, suggesting promise as a baseline for automatic intelligibility scoring. Similar findings are reported in recent validation efforts of automatic intelligibility measures for motor speech disorders [<xref ref-type="bibr" rid="ref43">43</xref>]. Future validation with larger patient cohorts and SLP raters is needed to confirm these observations.</p></sec><sec id="s4-3"><title>End-User Testing and Refinement&#x2013;Cycle 1</title><p>The first usability cycle demonstrated that the prototype was already perceived as highly usable. The mean SUS score of 88.4 (SD 4.6) falls in the &#x201C;excellent&#x201D; range, confirming that the design was well-adapted to SLPs&#x2019; needs [<xref ref-type="bibr" rid="ref44">44</xref>]. Importantly, the application was tested both in clinical and home environments, as well as across different browsers, indicating technical robustness in varied contexts of use.</p><p>Survey ratings averaged 4.26/5. Core workflow elements&#x2014;navigation, uploading, segment selection, and processing time&#x2014;scored 4.50 or higher, and visual design was rated highest (mean 4.88, SD 0.35). 
These results align with reviews of existing speech therapy apps, which similarly show that visual design and general usability are often rated highly while output clarity (such as the accuracy of automated scores or error highlighting) tends to receive more mixed feedback [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref45">45</xref>].</p><p>Three items scored lower: perceived accuracy of the intelligibility score (mean 3.13, SD 1.25), phoneme-level error highlighting (mean 3.88, SD 1.25), and meeting expectations (mean 3.88, SD 1.13). These reflected a technical alignment bug in this prototype version, which caused intelligibility scores and error highlights to miss clearly spoken words. This limitation directly influenced perceptions of accuracy, a pattern also reported in empirical studies of online SLP tools, where users frequently note misalignments or ambiguous output explanations [<xref ref-type="bibr" rid="ref39">39</xref>].</p><p>Open feedback aligned with these quantitative findings. Fixes focused on reliability (eg, failed uploads and region shifting). Usability improvements included clearer terminology, simultaneous listening and transcription, and visible time markers. Feature requests suggested phonetic equivalence in scoring and pause analysis. Similar issues of clinician-perceived usability and workflow integration have been noted in broader reviews of automated speech therapy tools [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Fourteen priority fixes and improvements were implemented for cycle 2, balancing feasibility with the need to proceed rapidly to the next evaluation.</p></sec><sec id="s4-4"><title>End-User Testing and Refinement&#x2013;Cycle 2</title><p>The second cycle reinforced and extended these findings. 
The mean SUS score increased to 91.7 (SD 4.1), again in the &#x201C;excellent&#x201D; range, suggesting that refinements made after cycle 1 successfully improved usability, as SLP tools evolve from basic usability to richer features [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]. As before, testing across home and clinical settings and multiple browsers confirmed robustness. This pattern mirrors findings in reviews of eHealth speech-language therapy applications, where iterative improvements based on clinician feedback tend to yield measurable increases in satisfaction and functionality [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref45">45</xref>].</p><p>Survey ratings averaged 4.34/5. Navigation and the new acoustic analysis page received the highest scores (both mean 4.75; SD 0.78 and 0.41, respectively). Frequency and intensity graphs were positively rated (4.33 each), confirming clinical relevance. The formants graph was rated lower (mean 3.66, SD 0.65), indicating that its value was less evident in practice. The &#x201C;meeting expectations&#x201D; item improved to a mean of 4.20 (SD 0.42). These improvements in visual clarity and interactive analytics reflect the user experience findings from telepractice tools, where dashboards and visualization features are increasingly valued as maturity grows [<xref ref-type="bibr" rid="ref47">47</xref>].</p><p>Qualitative feedback shifted from identifying bugs to requesting advanced features. Suggestions included adding notes to assessment results, incorporating voice-quality indices such as CPPS and AVQI, and enabling audio playback directly from results pages. This shift is consistent with empirical studies of speech therapy platforms, where, once baseline usability is achieved, user requests focus more on analytic depth and interactivity [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. 
Broader surveys of online and AI-enabled speech therapy systems echo this pattern, with early iterations focusing on usability and later development emphasizing advanced functionality [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>].</p></sec><sec id="s4-5"><title>The Prototype</title><p>The final prototype can be considered a simple, efficient, and potentially clinically relevant dashboard for assessing speech clarity and phonetic accuracy. Usage analytics confirmed the intuitive design: task times decreased after initial use, and only 3 dead clicks were recorded. The modular architecture enables extension to multilingual contexts, although current phoneme segmentation is restricted to English. While Whisper is multilingual, the phoneme module relies on English-only resources, limiting its applicability across languages.</p><p>Occasional ASR hallucinations were observed in cases of severe dysarthria or poor microphone quality. These issues were mitigated through design decisions (eg, suppressing insertions in alignment), ensuring that phoneme analysis was not distorted. The current implementation also assumes one speaker per session; while speaker diarization could be added to handle overlapping speech, it introduces additional complexity.</p></sec><sec id="s4-6"><title>Future Work</title><p>Several directions for development emerged. First, multilingual phoneme support is required to reflect the diversity of clinical populations. Second, advanced acoustic indices (eg, jitter, shimmer, CPPS, and AVQI) should be integrated to capture additional aspects of dysarthria severity [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>]. Third, diarization would allow for the analysis of multispeaker sessions, accounting for overlaps or caregiver contributions. Finally, larger-scale validation with patients and practicing SLPs is essential to confirm accuracy and usability across broader contexts. 
An important next step is to systematically evaluate the generalizability of the system across different neurological populations, as initial testing across mixed etiologies suggests feasibility but does not yet establish condition-specific validity. Future work should also consider implementation within the broader framework of telerehabilitation, where evidence is growing for the effectiveness of remote interventions in speech-language therapy [<xref ref-type="bibr" rid="ref49">49</xref>].</p></sec><sec id="s4-7"><title>Limitations</title><p>At present, the system is not compliant with data protection regulations such as the Health Insurance Portability and Accountability Act (HIPAA) and the Health Information Technology for Economic and Clinical Health Act (HITECH), which poses a barrier to clinical deployment over the internet. Use within closed clinical networks is feasible, but full compliance will be essential for wider adoption. A limitation of this study is the small sample size in the usability testing cycles, which may limit the generalizability of the usability findings and warrants further testing in larger cohorts.</p></sec><sec id="s4-8"><title>Conclusion</title><p>This study demonstrates the feasibility of integrating ASR and acoustic analysis into a web-based application to support SLPs in assessing dysarthric speech. A pilot validation confirmed that ASR performance was comparable to that of untrained human listeners, while 2 usability cycles with SLPs yielded consistently excellent SUS scores, indicating that the system was perceived as highly usable and clinically relevant. Iterative refinements improved navigation and workflow, and feedback evolved from bug reports to requests for advanced analytic features, underscoring both the robustness of the core design and the demand for deeper functionality. 
Although current limitations include reliance on English-only phoneme segmentation, the prototype establishes a solid foundation for scalable digital assessment. Future work should extend validation to larger and more diverse patient groups, expand multilingual support, and integrate additional advanced outcome measures to further enhance clinical adoption and impact.</p></sec></sec></body><back><ack><p>Generative artificial intelligence (ChatGPT 5.5 high) assistance was limited to the wording and language editing of the manuscript; all scientific content, analysis, and interpretation were developed by the authors.</p></ack><notes><sec><title>Funding</title><p>This project did not receive external funding.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are not publicly available due to data privacy considerations but are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: PV, PD, AS, SG, MB, CEA, CMB</p><p>Data curation: PV, CMB</p><p>Formal analysis: PV</p><p>Funding acquisition: CMB</p><p>Investigation: PV, JC</p><p>Methodology: PV, PD, AS, CMB</p><p>Project administration: CMB</p><p>Resources: MB, CMB</p><p>Software: PV, SS, ED</p><p>Supervision: PD, CMB</p><p>Validation: PV, PD, CMB</p><p>Visualization: PV</p><p>Writing&#x2013;original draft: PV, CMB</p><p>Writing&#x2013;review &#x0026; editing: PD, AS, SS, SG, ED, JC, MB, CEA</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ASR</term><def><p> automatic speech recognition</p></def></def-item><def-item><term id="abb2">AVQI</term><def><p>Acoustic Voice Quality Index</p></def></def-item><def-item><term id="abb3">CPPS</term><def><p>Cepstral Peak Prominence</p></def></def-item><def-item><term id="abb4">FDA-2</term><def><p>Frenchay Dysarthria Assessment, Second 
Edition</p></def></def-item><def-item><term id="abb5">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb6">HITECH</term><def><p>Health Information Technology for Economic and Clinical Health Act</p></def></def-item><def-item><term id="abb7">ISO</term><def><p>International Organization for Standardization</p></def></def-item><def-item><term id="abb8">SLP</term><def><p> speech and language pathologist</p></def></def-item><def-item><term id="abb9">SUS</term><def><p>System Usability Scale</p></def></def-item><def-item><term id="abb10">WER</term><def><p>word error rate</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Jayaraman</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Das</surname><given-names>JM</given-names> </name></person-group><article-title>Dysarthria</article-title><source>StatPearls</source><year>2023</year><access-date>2026-06-06</access-date><publisher-name>StatPearls Publishing</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/books/NBK592453/">https://www.ncbi.nlm.nih.gov/books/NBK592453/</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Dysarthria in adults</article-title><source>American Speech-Language-Hearing Association</source><access-date>2026-06-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.asha.org/practice-portal/clinical-topics/dysarthria-in-adults/">https://www.asha.org/practice-portal/clinical-topics/dysarthria-in-adults/</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Vogel</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Graf</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wei&#x00DF;</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>CSJ</given-names> </name><name name-style="western"><surname>Hepworth</surname><given-names>G</given-names> </name><name name-style="western"><surname>Synofzik</surname><given-names>M</given-names> </name></person-group><article-title>Development and validation of the dysarthria impact scale: a patient-reported outcome for motor speech disorders</article-title><source>J Neurol</source><year>2026</year><month>03</month><day>10</day><volume>273</volume><issue>3</issue><fpage>195</fpage><pub-id pub-id-type="doi">10.1007/s00415-026-13740-1</pub-id><pub-id pub-id-type="medline">41805906</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atkinson-Clement</surname><given-names>C</given-names> </name><name name-style="western"><surname>Letanneux</surname><given-names>A</given-names> </name><name name-style="western"><surname>Baille</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Psychosocial impact of dysarthria: the patient-reported outcome as part of the clinical management</article-title><source>Neurodegener Dis</source><year>2019</year><volume>19</volume><issue>1</issue><fpage>12</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.1159/000499627</pub-id><pub-id pub-id-type="medline">31112944</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Enderby</surname><given-names>P</given-names> </name></person-group><article-title>Frenchay Dysarthria 
Assessment</article-title><source>Int J Lang Commun Disord</source><year>1980</year><month>01</month><volume>15</volume><issue>3</issue><fpage>165</fpage><lpage>173</lpage><pub-id pub-id-type="doi">10.3109/13682828009112541</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Riolo</surname><given-names>V</given-names> </name><name name-style="western"><surname>Pizzorni</surname><given-names>N</given-names> </name><name name-style="western"><surname>Guanziroli</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Cross-cultural adaptation into Italian and validation of the Frenchay Dysarthria Assessment - 2</article-title><source>Eur J Phys Rehabil Med</source><year>2022</year><month>06</month><volume>58</volume><issue>3</issue><fpage>342</fpage><lpage>351</lpage><pub-id pub-id-type="doi">10.23736/S1973-9087.21.07029-5</pub-id><pub-id pub-id-type="medline">34498832</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cardoso</surname><given-names>R</given-names> </name><name name-style="western"><surname>Guimar&#x00E3;es</surname><given-names>I</given-names> </name><name name-style="western"><surname>Santos</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Frenchay Dysarthria Assessment (FDA-2) in Parkinson&#x2019;s disease: cross-cultural adaptation and psychometric properties of the European Portuguese version</article-title><source>J Neurol</source><year>2017</year><month>01</month><volume>264</volume><issue>1</issue><fpage>21</fpage><lpage>31</lpage><pub-id pub-id-type="doi">10.1007/s00415-016-8298-6</pub-id><pub-id pub-id-type="medline">27747392</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Icht</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bergerzon-Bitton</surname><given-names>O</given-names> </name><name name-style="western"><surname>Ben-David</surname><given-names>BM</given-names> </name></person-group><article-title>Validation and cross-linguistic adaptation of the Frenchay Dysarthria Assessment (FDA-2) speech intelligibility tests: Hebrew version</article-title><source>Int J Lang Commun Disord</source><year>2022</year><month>09</month><volume>57</volume><issue>5</issue><fpage>1023</fpage><lpage>1049</lpage><pub-id pub-id-type="doi">10.1111/1460-6984.12737</pub-id><pub-id pub-id-type="medline">35714104</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><article-title>Telepractice</article-title><source>American Speech-Language-Hearing Association</source><access-date>2025-02-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.asha.org/practice-portal/professional-issues/telepractice/#collapse_3">https://www.asha.org/practice-portal/professional-issues/telepractice/#collapse_3</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><article-title>Telehealth guidance</article-title><source>Royal College of Speech and Language Therapists</source><access-date>2026-06-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.rcslt.org/members/delivering-quality-services/telehealth-guidance/">https://www.rcslt.org/members/delivering-quality-services/telehealth-guidance/</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>What is speech? 
What is language?</article-title><source>American Speech-Language-Hearing Association</source><access-date>2026-06-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.asha.org/public/speech/development/speech-and-language/">https://www.asha.org/public/speech/development/speech-and-language/</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scott</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Clark</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cardona</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Telehealth versus face-to-face delivery of speech language pathology services: a systematic review and meta-analysis</article-title><source>J Telemed Telecare</source><year>2025</year><month>10</month><volume>31</volume><issue>9</issue><fpage>1203</fpage><lpage>1215</lpage><pub-id pub-id-type="doi">10.1177/1357633X241272976</pub-id><pub-id pub-id-type="medline">39387166</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yorkston</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Strand</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Kennedy</surname><given-names>MRT</given-names> </name></person-group><article-title>Comprehensibility of dysarthric speech: implications for assessment and treatment planning</article-title><source>Am J Speech Lang Pathol</source><year>1996</year><month>02</month><volume>5</volume><issue>1</issue><fpage>55</fpage><lpage>66</lpage><pub-id pub-id-type="doi">10.1044/1058-0360.0501.55</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kent</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>YJ</given-names> </name></person-group><article-title>Toward an acoustic typology of motor speech disorders</article-title><source>Clin Linguist Phon</source><year>2003</year><month>09</month><volume>17</volume><issue>6</issue><fpage>427</fpage><lpage>445</lpage><pub-id pub-id-type="doi">10.1080/0269920031000086248</pub-id><pub-id pub-id-type="medline">14564830</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Song</surname><given-names>TJ</given-names> </name></person-group><article-title>Efficacy and feasibility of a digital speech therapy for post-stroke dysarthria: protocol for a randomized controlled trial</article-title><source>Front Neurol</source><year>2024</year><volume>15</volume><fpage>1305297</fpage><pub-id pub-id-type="doi">10.3389/fneur.2024.1305297</pub-id><pub-id pub-id-type="medline">38356882</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baevski</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>A</given-names> </name><name name-style="western"><surname>Auli</surname><given-names>M</given-names> </name></person-group><article-title>Wav2vec 2.0: a framework for self-supervised learning of 
speech representations</article-title><source>NIPS&#x2019;20: Proceedings of the 34th International Conference on Neural Information Processing Systems</source><year>2020</year><access-date>2026-06-06</access-date><fpage>12449</fpage><lpage>12460</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.5555/3495724.3496768">https://dl.acm.org/doi/abs/10.5555/3495724.3496768</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tucker</surname><given-names>JK</given-names> </name></person-group><article-title>Perspectives of speech-language pathologists on the use of telepractice in schools: the qualitative view</article-title><source>Int J Telerehabil</source><year>2012</year><volume>4</volume><issue>2</issue><fpage>47</fpage><lpage>60</lpage><pub-id pub-id-type="doi">10.5195/ijt.2012.6102</pub-id><pub-id pub-id-type="medline">25945203</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hsu</surname><given-names>WN</given-names> </name><name name-style="western"><surname>Bolte</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>YHH</given-names> </name><name name-style="western"><surname>Lakhotia</surname><given-names>K</given-names> </name><name name-style="western"><surname>Salakhutdinov</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>A</given-names> </name></person-group><article-title>HuBERT: self-supervised speech representation learning by masked prediction of hidden units</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2021</year><volume>29</volume><fpage>3451</fpage><lpage>3460</lpage><pub-id 
pub-id-type="doi">10.1109/TASLP.2021.3122291</pub-id><pub-id pub-id-type="medline">25079929</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>WavLM: large-scale self-supervised pre-training for full stack speech processing</article-title><source>IEEE J Sel Top Signal Process</source><year>2022</year><month>10</month><volume>16</volume><issue>6</issue><fpage>1505</fpage><lpage>1518</lpage><pub-id pub-id-type="doi">10.1109/JSTSP.2022.3188113</pub-id><pub-id pub-id-type="medline">25079929</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Molini-Avejonas</surname><given-names>DR</given-names> </name><name name-style="western"><surname>Rondon-Melo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Amato</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Samelli</surname><given-names>AG</given-names> </name></person-group><article-title>A systematic review of the use of telehealth in speech, language and hearing sciences</article-title><source>J Telemed Telecare</source><year>2015</year><month>10</month><volume>21</volume><issue>7</issue><fpage>367</fpage><lpage>376</lpage><pub-id pub-id-type="doi">10.1177/1357633X15583215</pub-id><pub-id pub-id-type="medline">26026181</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mitchell</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Shirota</surname><given-names>C</given-names> </name><name name-style="western"><surname>Clanchy</surname><given-names>K</given-names> </name></person-group><article-title>Factors that influence the adoption of rehabilitation technologies: a multi-disciplinary qualitative exploration</article-title><source>J Neuroeng Rehabil</source><year>2023</year><month>06</month><day>20</day><volume>20</volume><issue>1</issue><fpage>80</fpage><pub-id pub-id-type="doi">10.1186/s12984-023-01194-9</pub-id><pub-id pub-id-type="medline">37340496</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Selvanayakam</surname><given-names>S</given-names> </name><name name-style="western"><surname>Giovanoli</surname><given-names>S</given-names> </name><name name-style="western"><surname>Slot</surname><given-names>A</given-names> </name><etal/></person-group><article-title>I speak Tele outlines the design of a digitized dysarthria assessment</article-title><source>Sci Rep</source><year>2025</year><volume>15</volume><issue>1</issue><fpage>35903</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-19726-9</pub-id><pub-id pub-id-type="medline">41087405</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chandran</surname><given-names>S</given-names> </name><name name-style="western"><surname>Al-Sa&#x2019;di</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ahmad</surname><given-names>E</given-names> </name></person-group><article-title>Exploring user centered design in healthcare: a literature review</article-title><conf-name>2020 4th International Symposium on Multidisciplinary Studies and Innovative Technologies (ISMSIT)</conf-name><conf-date>Oct 22-24, 2020</conf-date><pub-id 
pub-id-type="doi">10.1109/ISMSIT50672.2020.9255313</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Good</surname><given-names>A</given-names> </name><name name-style="western"><surname>Omisade</surname><given-names>O</given-names> </name></person-group><article-title>Linking activity theory with user centred design: a human computer interaction framework for the design and evaluation of mHealth interventions</article-title><source>Stud Health Technol Inform</source><year>2019</year><month>07</month><day>30</day><volume>263</volume><fpage>49</fpage><lpage>63</lpage><pub-id pub-id-type="doi">10.3233/SHTI190110</pub-id><pub-id pub-id-type="medline">31411152</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rudzicz</surname><given-names>F</given-names> </name><name name-style="western"><surname>Namasivayam</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Wolff</surname><given-names>T</given-names> </name></person-group><article-title>The TORGO database of acoustic and articulatory speech from speakers with dysarthria</article-title><source>Lang Resour Eval</source><year>2012</year><month>12</month><volume>46</volume><issue>4</issue><fpage>523</fpage><lpage>541</lpage><pub-id pub-id-type="doi">10.1007/s10579-011-9145-0</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Etikan</surname><given-names>I</given-names> </name><name name-style="western"><surname>Musa</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Alkassim</surname><given-names>RS</given-names> </name></person-group><article-title>Comparison of 
convenience sampling and purposive sampling</article-title><source>Am J Theor Appl Stat</source><year>2016</year><volume>5</volume><issue>1</issue><fpage>1</fpage><lpage>4</lpage><pub-id pub-id-type="doi">10.11648/j.ajtas.20160501.11</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hustad</surname><given-names>KC</given-names> </name></person-group><article-title>Effects of speech stimuli and dysarthria severity on intelligibility scores and listener confidence ratings for speakers with cerebral palsy</article-title><source>Folia Phoniatr Logop</source><year>2007</year><volume>59</volume><issue>6</issue><fpage>306</fpage><lpage>317</lpage><pub-id pub-id-type="doi">10.1159/000108337</pub-id><pub-id pub-id-type="medline">17965573</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><article-title>ggml-org/whisper.cpp</article-title><source>GitHub</source><access-date>2026-06-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/ggml-org/whisper.cpp">https://github.com/ggml-org/whisper.cpp</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grier</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Bangor</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kortum</surname><given-names>P</given-names> </name><name name-style="western"><surname>Peres</surname><given-names>SC</given-names> </name></person-group><article-title>The system usability scale: beyond standard usability testing</article-title><source>Proceedings of the Human Factors and Ergonomics Society Annual Meeting</source><year>2013</year><volume>57</volume><issue>1</issue><fpage>187</fpage><lpage>191</lpage><pub-id 
pub-id-type="doi">10.1177/1541931213571042</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Konstantinidis</surname><given-names>S</given-names> </name></person-group><article-title>Computing the edit distance of a regular language</article-title><source>Inf Comput</source><year>2007</year><month>09</month><volume>205</volume><issue>9</issue><fpage>1307</fpage><lpage>1316</lpage><pub-id pub-id-type="doi">10.1016/j.ic.2007.06.001</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Levenshtein</surname><given-names>VI</given-names> </name></person-group><article-title>Binary codes capable of correcting deletions, insertions, and reversals</article-title><source>Sov Phys Dokl</source><year>1966</year><access-date>2026-06-06</access-date><volume>10</volume><fpage>707</fpage><lpage>710</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://nymity.ch/sybilhunting/pdf/Levenshtein1966a.pdf">https://nymity.ch/sybilhunting/pdf/Levenshtein1966a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xue</surname><given-names>W</given-names> </name><name name-style="western"><surname>van Hout</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cucchiarini</surname><given-names>C</given-names> </name><name name-style="western"><surname>Strik</surname><given-names>H</given-names> </name></person-group><article-title>Assessing speech intelligibility of pathological speech in sentences and word lists: the contribution of phoneme-level measures</article-title><source>J Commun Disord</source><year>2023</year><volume>102</volume><fpage>106301</fpage><pub-id 
pub-id-type="doi">10.1016/j.jcomdis.2023.106301</pub-id><pub-id pub-id-type="medline">36709701</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kent</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Weismer</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kent</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Rosenbek</surname><given-names>JC</given-names> </name></person-group><article-title>Toward phonetic intelligibility testing in dysarthria</article-title><source>J Speech Hear Disord</source><year>1989</year><month>11</month><volume>54</volume><issue>4</issue><fpage>482</fpage><lpage>499</lpage><pub-id pub-id-type="doi">10.1044/jshd.5404.482</pub-id><pub-id pub-id-type="medline">2811329</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weismer</surname><given-names>G</given-names> </name><name name-style="western"><surname>Jeng</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Laures</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Kent</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Kent</surname><given-names>JF</given-names> </name></person-group><article-title>Acoustic and intelligibility characteristics of sentence production in neurogenic speech disorders</article-title><source>Folia Phoniatr Logop</source><year>2001</year><volume>53</volume><issue>1</issue><fpage>1</fpage><lpage>18</lpage><pub-id pub-id-type="doi">10.1159/000052649</pub-id><pub-id pub-id-type="medline">11125256</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Nyl&#x00E9;n</surname><given-names>F</given-names> </name></person-group><article-title>An acoustic model of speech dysprosody in patients with Parkinson&#x2019;s disease</article-title><source>Front Hum Neurosci</source><year>2025</year><volume>19</volume><fpage>1566274</fpage><pub-id pub-id-type="doi">10.3389/fnhum.2025.1566274</pub-id><pub-id pub-id-type="medline">40356883</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Villain</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cosin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Glize</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Affective prosody and depression after stroke: a pilot study</article-title><source>Stroke</source><year>2016</year><month>09</month><volume>47</volume><issue>9</issue><fpage>2397</fpage><lpage>2400</lpage><pub-id pub-id-type="doi">10.1161/STROKEAHA.116.013852</pub-id><pub-id pub-id-type="medline">27507865</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ross</surname><given-names>ED</given-names> </name></person-group><article-title>Affective prosody and its impact on the neurology of language, depression, memory and emotions</article-title><source>Brain Sci</source><year>2023</year><month>11</month><day>9</day><volume>13</volume><issue>11</issue><fpage>1572</fpage><pub-id pub-id-type="doi">10.3390/brainsci13111572</pub-id><pub-id pub-id-type="medline">38002532</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Vaezipour</surname><given-names>A</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Theodoros</surname><given-names>D</given-names> </name><name name-style="western"><surname>Russell</surname><given-names>T</given-names> </name></person-group><article-title>Mobile apps for speech-language therapy in adults with communication disorders: review of content and quality</article-title><source>JMIR Mhealth Uhealth</source><year>2020</year><month>10</month><day>29</day><volume>8</volume><issue>10</issue><fpage>e18858</fpage><pub-id pub-id-type="doi">10.2196/18858</pub-id><pub-id pub-id-type="medline">33118953</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wouda</surname><given-names>L</given-names> </name><name name-style="western"><surname>Boerma</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gerrits</surname><given-names>E</given-names> </name><name name-style="western"><surname>Blom</surname><given-names>E</given-names> </name></person-group><article-title>First steps toward implementation of the online test battery LITMUS-NL: a usability and feasibility study</article-title><source>Perspect ASHA Spec Interest Groups</source><year>2024</year><month>10</month><day>3</day><volume>9</volume><issue>5</issue><fpage>1439</fpage><lpage>1455</lpage><pub-id pub-id-type="doi">10.1044/2024_PERSP-23-00308</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SJ</given-names> 
</name></person-group><article-title>Does native language matter in perceptual ratings of dysarthria?</article-title><source>J Speech Lang Hear Res</source><year>2024</year><month>09</month><day>12</day><volume>67</volume><issue>9</issue><fpage>2842</fpage><lpage>2855</lpage><pub-id pub-id-type="doi">10.1044/2024_JSLHR-23-00668</pub-id><pub-id pub-id-type="medline">38662924</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qian</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>C</given-names> </name></person-group><article-title>A survey of technologies for automatic dysarthric speech recognition</article-title><source>EURASIP J Audio Speech Music Process</source><year>2023</year><month>11</month><day>11</day><volume>2023</volume><issue>1</issue><fpage>48</fpage><pub-id pub-id-type="doi">10.1186/s13636-023-00318-2</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qian</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>K</given-names> </name></person-group><article-title>A survey of automatic speech recognition for dysarthric speech</article-title><source>Electronics (Basel)</source><year>2023</year><month>10</month><day>16</day><volume>12</volume><issue>20</issue><fpage>4278</fpage><pub-id pub-id-type="doi">10.3390/electronics12204278</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tr&#x00F6;ger</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>D&#x00F6;rr</surname><given-names>F</given-names> </name><name name-style="western"><surname>Schwed</surname><given-names>L</given-names> </name><etal/></person-group><article-title>An automatic measure for speech intelligibility in dysarthrias&#x2014;validation across multiple languages and neurological disorders</article-title><source>Front Digit Health</source><year>2024</year><volume>6</volume><fpage>1440986</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2024.1440986</pub-id><pub-id pub-id-type="medline">39108340</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bangor</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kortum</surname><given-names>PT</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>JT</given-names> </name></person-group><article-title>An empirical evaluation of the system usability scale</article-title><source>Int J Hum-Comput Interact</source><year>2008</year><month>07</month><volume>24</volume><issue>6</issue><fpage>574</fpage><lpage>594</lpage><pub-id pub-id-type="doi">10.1080/10447310802205776</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Attwell</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Bennin</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Tekinerdogan</surname><given-names>B</given-names> </name></person-group><article-title>A systematic review of online speech therapy systems for intervention in childhood speech communication disorders</article-title><source>Sensors (Basel)</source><year>2022</year><month>12</month><day>11</day><volume>22</volume><issue>24</issue><fpage>9713</fpage><pub-id 
pub-id-type="doi">10.3390/s22249713</pub-id><pub-id pub-id-type="medline">36560082</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Green</surname><given-names>JR</given-names> </name></person-group><article-title>Artificial intelligence in communication sciences and disorders: introduction to the forum</article-title><source>J Speech Lang Hear Res</source><year>2024</year><month>11</month><day>7</day><volume>67</volume><issue>11</issue><fpage>4157</fpage><lpage>4161</lpage><pub-id pub-id-type="doi">10.1044/2024_JSLHR-24-00594</pub-id><pub-id pub-id-type="medline">39418586</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shankar</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ramkumar</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>S</given-names> </name></person-group><article-title>Understanding the implementation of telepractice in speech and language services using a mixed-methods approach</article-title><source>Wellcome Open Res</source><year>2022</year><volume>7</volume><fpage>46</fpage><pub-id pub-id-type="doi">10.12688/wellcomeopenres.17622.2</pub-id><pub-id pub-id-type="medline">36158869</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weidner</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lowman</surname><given-names>J</given-names> </name></person-group><article-title>Telepractice for adult speech-language pathology services: a systematic review</article-title><source>Perspect ASHA Spec Interest 
Groups</source><year>2020</year><month>02</month><day>21</day><volume>5</volume><issue>1</issue><fpage>326</fpage><lpage>338</lpage><pub-id pub-id-type="doi">10.1044/2019_PERSP-19-00146</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cetinkaya</surname><given-names>B</given-names> </name><name name-style="western"><surname>Twomey</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bullard</surname><given-names>B</given-names> </name><name name-style="western"><surname>El Kouaissi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Conroy</surname><given-names>P</given-names> </name></person-group><article-title>Telerehabilitation of aphasia: a systematic review of the literature</article-title><source>Aphasiology</source><year>2024</year><volume>38</volume><issue>7</issue><fpage>1271</fpage><lpage>1302</lpage><pub-id pub-id-type="doi">10.1080/02687038.2023.2274621</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Needs, requirements, feedback, and style guide.</p><media xlink:href="formative_v10i1e85230_app1.docx" xlink:title="DOCX File, 234 KB"/></supplementary-material></app-group></back></article>