<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Form Res</journal-id><journal-id journal-id-type="publisher-id">formative</journal-id><journal-id journal-id-type="index">27</journal-id><journal-title>JMIR Formative Research</journal-title><abbrev-journal-title>JMIR Form Res</abbrev-journal-title><issn pub-type="epub">2561-326X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e87962</article-id><article-id pub-id-type="doi">10.2196/87962</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Long Short-Term Memory&#x2013;GPT-4 Integration for Interpretable Biomedical Signal Classification: Proof-of-Concept Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Poreddy</surname><given-names>Kapil Kumar Reddy</given-names></name><degrees>BTECH</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sahu</surname><given-names>Ajit</given-names></name><degrees>BTECH</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mukherjee</surname><given-names>Sanjoy</given-names></name><degrees>BTECH</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Basavaraju</surname><given-names>Bhavan Kumar</given-names></name><degrees>BTECH</degrees><xref 
ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Institute of Electrical and Electronics Engineers</institution><addr-line>2962 Millbridge Dr</addr-line><addr-line>San Ramon</addr-line><addr-line>CA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Luis</surname><given-names>I</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gonzalez</surname><given-names>Lopera</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Goyal</surname><given-names>Nitin</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Pinyi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ferdowsi</surname><given-names>Saideh</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Kapil Kumar Reddy Poreddy, BTECH, Institute of Electrical and Electronics Engineers, 2962 Millbridge Dr, San Ramon, CA, 94583, United States, 1 5104614814; <email>poreddykapil@ieee.org</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>20</day><month>3</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e87962</elocation-id><history><date date-type="received"><day>17</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>05</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>05</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Kapil Kumar Reddy Poreddy, Ajit Sahu, Sanjoy Mukherjee, Bhavan Kumar Basavaraju. 
Originally published in JMIR Formative Research (<ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>), 20.3.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Formative Research, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://formative.jmir.org">https://formative.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://formative.jmir.org/2026/1/e87962"/><abstract><sec><title>Background</title><p>Approximately 3.8 billion people lack access to essential health services, and diagnostic interpretation remains a major bottleneck in remote and resource-constrained settings. 
Limited access to specialists and the complexity of biomedical signal interpretation (eg, electrocardiogram [ECG] and electroencephalogram) contribute to delays in recognizing cardiovascular and neurological conditions.</p></sec><sec><title>Objective</title><p>The study aimed to develop and evaluate a technical framework integrating long short-term memory (LSTM) networks with GPT-4 to provide automated biomedical signal classification and human-readable interpretations, suitable as a foundation for future deployment in resource-constrained environments.</p></sec><sec sec-type="methods"><title>Methods</title><p>The 2-layer LSTM architecture (128&#x2192;64 units) was selected based on preliminary experiments comparing configurations ranging from single-layer networks (64, 128 units) to deeper architectures (128&#x2192;64&#x2192;32 units). The chosen configuration balanced model capacity against overfitting risk and computational efficiency. The framework was evaluated using public PhysioNet datasets: Massachusetts Institute of Technology&#x2013;Beth Israel Hospital (MIT-BIH) Arrhythmia, Physikalisch-Technische Bundesanstalt (PTB) Diagnostic ECG, Physikalisch-Technischen Bundesanstalt-extra large, Chapman-Shaoxing, Medical Information Mart for Intensive Care-III Waveforms, and Sleep-European data format. A patient-level split protocol (70/15/15) was used to reduce leakage risk. The LSTM architecture (128&#x2192;64 units) performed temporal feature extraction with softmax-based classification for mutually exclusive classes. 
GPT-4 was integrated via an application programming interface with structured prompts to generate clinical interpretations from model outputs.</p></sec><sec sec-type="results"><title>Results</title><p>For the expert evaluation, we randomly sampled 50 test cases per dataset (150 total: 30 from each class for MIT-BIH, 25 per class for PTB, and 20 per class for Children's Hospital Boston-Massachusetts Institute of Technology), ensuring balanced class representation. Three board-certified physicians (2 cardiologists for ECG datasets and 1 neurologist for the electroencephalogram dataset) independently reviewed GPT-4&#x2013;generated interpretations. Reviewers were blinded to whether signals were correctly or incorrectly classified by the LSTM model. Each interpretation was rated on a 5-point Likert scale (1=clinically inappropriate and 5=highly accurate and clinically useful). Interrater reliability was assessed using Fleiss &#x03BA; (0.78, substantial agreement). On held-out test sets, classification performance was as follows: MIT-BIH 92.3% accuracy (<italic>F</italic><sub>1</sub>=0.91, AUC=0.95), PTB Diagnostic 94.7% (<italic>F</italic><sub>1</sub>=0.94, AUC=0.97), Physikalisch-Technischen Bundesanstalt-extra large 88.9% (<italic>F</italic><sub>1</sub>=0.88, AUC=0.93), Chapman-Shaoxing 91.2% (<italic>F</italic><sub>1</sub>=0.90, AUC=0.94), Medical Information Mart for Intensive Care-III 89.5% (<italic>F</italic><sub>1</sub>=0.89, AUC=0.92), and Sleep-European data format 87.3% (<italic>F</italic><sub>1</sub>=0.86, AUC=0.91). 
Expert evaluation of generated interpretations (3 board-certified physicians) rated clinical accuracy 4.3 out of 5, clarity 4.6 out of 5, and actionability 4.2 out of 5, with strong interrater agreement (&#x03BA;&#x003E;0.85).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This proof-of-concept demonstrates an explicit methodological integration of deep learning&#x2013;based biomedical signal classification with GPT-4&#x2013;based interpretation and provides a technical foundation for future prospective clinical validation, field studies, and regulatory review prior to clinical deployment in underserved settings.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>cloud-based diagnostics</kwd><kwd>biomedical signal analysis</kwd><kwd>LSTM networks</kwd><kwd>long short-term memory</kwd><kwd>GPT-4</kwd><kwd>explainable artificial intelligence</kwd><kwd>health care accessibility</kwd><kwd>remote health monitoring</kwd><kwd>physiological data interpretation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Universal health coverage remains limited by uneven access to diagnostic expertise, particularly in remote and resource-constrained environments [<xref ref-type="bibr" rid="ref1">1</xref>]. In many regions, diagnostic devices may be available, but the trained personnel required to interpret complex biomedical signals are scarce [<xref ref-type="bibr" rid="ref2">2</xref>]. For cardiovascular and neurological conditions, delays in interpreting diagnostics often lead to deferred treatment and poorer patient outcomes [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>Recent advances in artificial intelligence (AI) offer an opportunity to assist frontline providers by automating pattern recognition in biomedical signals [<xref ref-type="bibr" rid="ref4">4</xref>]. 
Traditional signal processing pipelines depend on hand-crafted features and careful parameter tuning; while foundational, these approaches are often brittle in the presence of noise, artifacts, and inter-patient variability [<xref ref-type="bibr" rid="ref4">4</xref>]. Deep learning models, including convolutional and recurrent neural networks, have demonstrated improved performance across electrocardiogram (ECG) and electroencephalogram (EEG) classification tasks [<xref ref-type="bibr" rid="ref4">4</xref>]. However, these systems typically generate technical outputs that are difficult for nonspecialist health care workers to interpret, creating a barrier to adoption in settings where clinical clarity and interpretability are essential [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Large language models (LLMs) have recently shown promise in generating readable summaries and decision-support messages for complex technical outputs [<xref ref-type="bibr" rid="ref5">5</xref>]. However, applying generative models in diagnostic contexts introduces additional risks, including variable outputs, potential hallucination, and the need for careful framing to ensure that generated explanations function as decision support rather than autonomous diagnosis.</p><p>This work presents a technical proof-of-concept integrating a long short-term memory (LSTM)-based classifier with GPT-4 to generate structured, clinically oriented interpretations of biomedical signal classifications. 
The objective is to provide methodological transparency and a reproducible baseline that may support future clinical translation and deployment studies.</p><list list-type="bullet"><list-item><p>Primary objective: Can an LSTM&#x2013;GPT-4 pipeline provide accurate biomedical signal classification together with interpretable clinical reasoning suitable for nonspecialist diagnostic support?</p></list-item></list><list list-type="bullet"><list-item><p>Research questions: Can modality-adaptive preprocessing enable consistent modeling across ECG and EEG signals? Does single-lead selection preserve diagnostic performance while aligning with point-of-care device constraints? Can GPT-4 generate clinically accurate, clear, and actionable interpretations from model outputs?</p></list-item></list><list list-type="bullet"><list-item><p>Important clarification: This system has not been deployed clinically and is presented strictly as a proof-of-concept. Any clinical deployment would require prospective validation, workflow integration studies, and regulatory review [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p></list-item></list></sec><sec id="s1-2"><title>Contributions</title><p>This work contributes to AI-assisted diagnostics research in the following ways:</p><list list-type="order"><list-item><p><italic>Modality-specific preprocessing</italic>: Adaptive bandpass filtering (ECG: 0.5&#x2010;50 Hz and EEG: 0.5&#x2010;30 Hz) for preserving diagnostically relevant features while removing modality-specific artifacts</p></list-item><list-item><p><italic>Single-lead selection strategy</italic>: Explicit methodology ensuring consistent input dimensionality (batch, 3000, 1) and alignment with point-of-care devices</p></list-item><list-item><p><italic>Single-label formulation</italic>: Clear focus on mutually exclusive classification tasks with softmax activation, making it appropriate for primary diagnosis in resource-limited 
settings</p></list-item><list-item><p><italic>Patient-level data splitting</italic>: Rigorous prevention of data leakage with formal verification (Algorithm 1), ensuring true generalization performance</p></list-item><list-item><p><italic>GPT-4 integration</italic>: Natural language interpretation achieving high clinical accuracy (4.3/5.0) and clarity (4.6/5.0) from expert reviewers</p></list-item><list-item><p><italic>Open-source implementation</italic>: Publicly available codebase with Representational State Transfer application programming interface (API) for community validation and extension</p></list-item><list-item><p><italic>Methodological transparency</italic>: Comprehensive documentation of preprocessing parameters, architecture specifications, training procedures, and evaluation protocols enabling reproducibility</p></list-item></list></sec><sec id="s1-3"><title>Related Work</title><sec id="s1-3-1"><title>Traditional Signal Processing Approaches</title><p>Early automated biomedical signal analysis relied on deterministic signal processing techniques, including Fourier transforms, wavelet decomposition, and rule-based feature extraction. In ECG analysis, classical pipelines typically involve beat segmentation followed by handcrafted features such as RR intervals, QRS width, and frequency-domain descriptors [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. While effective in controlled settings, these approaches are sensitive to noise, motion artifacts, and interpatient variability, limiting robustness in real-world deployments [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>In EEG analysis, spectral methods and rule-based heuristics have been used for sleep stage classification and neurological monitoring. 
Although clinically established, such approaches require careful parameter tuning and struggle with ambiguous transitions between physiological states, contributing to variability and reduced reliability in automated settings [<xref ref-type="bibr" rid="ref9">9</xref>].</p></sec><sec id="s1-3-2"><title>Deep Learning for Biomedical Signal Classification</title><p>Deep learning models have demonstrated significant improvements over traditional methods for biomedical time-series classification. Convolutional and recurrent neural networks have been successfully applied to ECG arrhythmia detection, achieving strong performance across multiple benchmark datasets [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. LSTM architectures are particularly effective at modeling temporal dependencies in physiological signals, making them well suited for ECG and EEG classification tasks [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>Recent studies have explored attention mechanisms and interpretable deep learning architectures, particularly for multilead ECG analysis [<xref ref-type="bibr" rid="ref13">13</xref>]. While these models improve feature attribution and performance, they typically require multilead inputs and substantial computational resources, limiting compatibility with portable point-of-care devices commonly used in resource-limited settings [<xref ref-type="bibr" rid="ref14">14</xref>]. Performance metrics represent mean (SD) across 5-fold cross-validation. 
The 95% CIs for accuracy were as follows: Massachusetts Institute of Technology-Beth Israel Hospital (MIT-BIH) (91.1%-93.5%), Physikalisch-Technische Bundesanstalt (PTB) (88.9%-92.1%), Children&#x2019;s Hospital Boston-Massachusetts Institute of Technology (CHB-MIT) (85.2%-88.8%).</p><p>Despite strong predictive performance, most deep learning classifiers output numerical labels or probabilities without providing clinically meaningful explanations, which hinders adoption by nonspecialist health care workers [<xref ref-type="bibr" rid="ref15">15</xref>].</p></sec><sec id="s1-3-3"><title>Large Language Models and Clinical Interpretability</title><p>LLMs have recently shown promise in health care apps, including clinical summarization, medical question answering, and decision support [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Studies demonstrate that modern LLMs encode substantial clinical knowledge and can generate coherent, human-readable explanations when appropriately constrained [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>However, deploying generative models in diagnostic contexts introduces additional challenges, including output variability, uncertainty communication, and the risk of overreliance on automated reasoning [<xref ref-type="bibr" rid="ref18">18</xref>]. Prior work emphasizes that LLMs should function as decision support tools rather than autonomous diagnostic agents, with careful framing and human oversight [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>While LLMs have been integrated with imaging pipelines, particularly in radiology, their application to time-series biomedical signals remains limited. 
Bridging this gap requires structured integration between signal classifiers and language models, along with methodological transparency and clear limitations [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s1-3-4"><title>Synthesis of Prior Work and Research Gaps</title><p><xref ref-type="table" rid="table1">Table 1</xref> summarizes representative approaches across biomedical signal processing, deep learning, and health care AI systems.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of representative biomedical signal analysis approaches.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Approach</td><td align="left" valign="bottom">Signal processing</td><td align="left" valign="bottom">Cross-modal support</td><td align="left" valign="bottom">Interpretability</td><td align="left" valign="bottom">Deployment feasibility</td></tr></thead><tbody><tr><td align="left" valign="top">Traditional ML<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (SVM<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, RF<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup>) [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]</td><td align="left" valign="top">Handcrafted</td><td align="left" valign="top">Limited</td><td align="left" valign="top">Moderate</td><td align="left" valign="top">High</td></tr><tr><td align="left" valign="top">LSTM<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup>-based ECG<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> models [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]</td><td align="left" valign="top">Learned</td><td align="left" valign="top">Single modal</td><td align="left" valign="top">Low</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" 
valign="top">Interpretable DL<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup> (12-lead ECG) [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">Learned</td><td align="left" valign="top">Single modal</td><td align="left" valign="top">Moderate</td><td align="left" valign="top">Low</td></tr><tr><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup>-only clinical models [<xref ref-type="bibr" rid="ref17">17</xref>]</td><td align="left" valign="top">N/A<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup></td><td align="left" valign="top">Text-only</td><td align="left" valign="top">High</td><td align="left" valign="top">High</td></tr><tr><td align="left" valign="top">IoT<sup><xref ref-type="table-fn" rid="table1fn9">i</xref></sup> fog-enabled systems</td><td align="left" valign="top">N/A</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Low</td><td align="left" valign="top">High</td></tr><tr><td align="left" valign="top">IoMT<sup><xref ref-type="table-fn" rid="table1fn10">j</xref></sup> security frameworks</td><td align="left" valign="top">N/A</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Low</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">ML health care reviews</td><td align="left" valign="top">Survey-based</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Moderate</td><td align="left" valign="top">Survey</td></tr><tr><td align="left" valign="top">Telemedicine systems [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]</td><td align="left" valign="top">N/A</td><td align="left" valign="top">Yes</td><td align="left" valign="top">Low</td><td align="left" valign="top">High</td></tr><tr><td align="left" valign="top">Proposed framework</td><td align="left" valign="top">Modality-adaptive</td><td align="left" valign="top">ECG + EEG<sup><xref 
ref-type="table-fn" rid="table1fn11">k</xref></sup></td><td align="left" valign="top">High</td><td align="left" valign="top">Moderate<sup><xref ref-type="table-fn" rid="table1fn12">l</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ML: machine learning.</p></fn><fn id="table1fn2"><p><sup>b</sup>SVM: support vector machine.</p></fn><fn id="table1fn3"><p><sup>c</sup>RF: random forest.</p></fn><fn id="table1fn4"><p><sup>d</sup>LSTM: long short-term memory.</p></fn><fn id="table1fn5"><p><sup>e</sup>ECG: electrocardiogram.</p></fn><fn id="table1fn6"><p><sup>f</sup>DL: deep learning.</p></fn><fn id="table1fn7"><p><sup>g</sup>LLM: large language model.</p></fn><fn id="table1fn8"><p><sup>h</sup>N/A: not applicable.</p></fn><fn id="table1fn9"><p><sup>i</sup>IoT: Internet of Things.</p></fn><fn id="table1fn10"><p><sup>j</sup>IoMT: Internet of Medical Things.</p></fn><fn id="table1fn11"><p><sup>k</sup>EEG: electroencephalogram.</p></fn><fn id="table1fn12"><p><sup>l</sup>Pending prospective clinical validation.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s1-4"><title>Identified Research Gaps</title><p>The research gaps are as follows:</p><list list-type="bullet"><list-item><p>Generic preprocessing limits cross-modal robustness: Most deep learning pipelines apply uniform preprocessing across modalities, despite fundamental differences between ECG and EEG signal characteristics [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>].</p></list-item></list><list list-type="bullet" prefix-word="2"><list-item><p>Lack of interpretable outputs for nonspecialists: Existing classifiers provide technical output without contextual explanation, limiting usability for frontline health care workers [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p></list-item></list><list list-type="bullet" prefix-word="3"><list-item><p>Mismatch with point-of-care constraints: 
Many state-of-the-art models depend on multilead recordings or high-end infrastructure, reducing feasibility in resource-constrained environments [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref19">19</xref>].</p></list-item></list></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design Overview</title><p>This is a retrospective technical proof-of-concept study using publicly available biomedical signal datasets. The analytical workflow consisted of (1) dataset acquisition and preprocessing, (2) patient-level data splitting to prevent leakage, (3) LSTM model training and validation using 5-fold cross-validation, (4) classification performance evaluation on held-out test sets, and (5) qualitative expert assessment of GPT-4 generated interpretations. Primary outcomes were classification performance metrics (accuracy, <italic>F</italic><sub>1</sub>-score, AUC-ROC) on 3 public datasets (MIT-BIH Arrhythmia, PTB Diagnostic ECG, CHB-MIT Scalp EEG). Secondary outcomes included feasibility (computational requirements and processing time) and acceptability (expert clinician ratings of interpretation quality on a 5-point Likert scale, with scores&#x2265;4 indicating acceptable quality). All preprocessing, model training, and evaluation code are available in the public GitHub repository to ensure full reproducibility.</p></sec><sec id="s2-2"><title>Study Design, Data Flow, and Outcome Measures</title><p>This study presents a technical proof-of-concept for automated biomedical signal classification combined with natural language generation for clinician-facing interpretation. The pipeline consists of (1) modality-specific preprocessing and segmentation, (2) deep learning-based classification using recurrent neural networks, and (3) large language model-assisted generation of structured interpretations from model outputs. 
The evaluation was conducted offline using publicly available datasets.</p></sec><sec id="s2-3"><title>Data Sources and Dataset Selection</title><p>Public biomedical signal datasets were used to evaluate performance across multiple cardiac and neurological tasks. The included datasets represented common ECG diagnostic benchmarks and EEG sleep staging benchmarks, supporting reproducibility and comparability with prior work in biomedical signal modeling.</p></sec><sec id="s2-4"><title>Data Preprocessing</title><sec id="s2-4-1"><title>Modality-Specific Bandpass Filtering</title><p>To preserve diagnostically relevant signal content while reducing baseline drift and noise, the following modality-specific bandpass filtering was applied.</p><list list-type="bullet"><list-item><p><italic>ECG filtering:</italic> 0.5&#x2010;50 Hz bandpass filtering was used to preserve QRS morphology and reduce baseline wander and high-frequency artifacts commonly present in ambulatory and intensive care unit (ICU) monitoring ECG data.</p></list-item><list-item><p><italic>EEG filtering:</italic> 0.5&#x2010;30 Hz bandpass filtering was applied to retain physiologically relevant sleep frequency bands while attenuating high-frequency muscle noise.</p></list-item></list><p>Filtering was performed using a 5th-order Butterworth filter. 
Detailed modality-specific filter parameters are summarized in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Modality-specific bandpass filter parameters.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset type</td><td align="left" valign="bottom">Low cutoff (Hz)</td><td align="left" valign="bottom">High cutoff (Hz)</td><td align="left" valign="bottom">Rationale</td></tr></thead><tbody><tr><td align="left" valign="top">ECG<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> datasets</td><td align="left" valign="top">0.5</td><td align="left" valign="top">50</td><td align="left" valign="top">Preserve QRS morphology; reduce baseline drift and noise</td></tr><tr><td align="left" valign="top">EEG<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> datasets</td><td align="left" valign="top">0.5</td><td align="left" valign="top">30</td><td align="left" valign="top">Preserve sleep-related frequency bands; reduce EMG<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> artifacts</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>ECG: electrocardiogram.</p></fn><fn id="table2fn2"><p><sup>b</sup>EEG: electroencephalogram.</p></fn><fn id="table2fn3"><p><sup>c</sup>EMG: electromyography.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-4-2"><title>Signal Normalization</title><p>To reduce amplitude variability across recordings and improve training stability, <italic>z</italic> score normalization was performed at the segment level:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mtext>norm</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03BC;</mml:mi></mml:mrow><mml:mi>&#x03C3;</mml:mi></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where &#x03BC; and &#x03C3; are the mean and SD computed per segment.</p></sec><sec id="s2-4-3"><title>Sampling Rate Handling and Segmentation</title><p>Signals were segmented using fixed-sample windows to standardize input dimensionality while avoiding interpolation artifacts that may distort diagnostic morphology. Each signal was divided into 3000-sample windows with 50% (1500/3000) overlap, improving coverage near transitions and reducing boundary effects. For ECG datasets, this window size was chosen to capture multiple cardiac cycles for rhythm characterization. For EEG datasets, 3000-sample windows supported epoch-based sleep staging while maintaining a uniform input shape for the neural network. 
The dataset-specific segmentation strategy is summarized in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Dataset-specific segmentation strategy.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset type</td><td align="left" valign="bottom">Sampling rate (Hz)</td><td align="left" valign="bottom">Window size (samples)</td><td align="left" valign="bottom">Duration (s)</td><td align="left" valign="bottom">Rationale</td></tr></thead><tbody><tr><td align="left" valign="top">ECG<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> datasets</td><td align="left" valign="top">Dataset-dependent</td><td align="left" valign="top">3000</td><td align="left" valign="top">Rate-dependent</td><td align="left" valign="top">Capture multiple cardiac cycles for rhythm characterization</td></tr><tr><td align="left" valign="top">EEG<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> datasets</td><td align="left" valign="top">Dataset-dependent</td><td align="left" valign="top">3000</td><td align="left" valign="top">Rate-dependent</td><td align="left" valign="top">Support epoch-based staging while maintaining uniform input shape</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>ECG: electrocardiogram.</p></fn><fn id="table3fn2"><p><sup>b</sup>EEG: electroencephalogram.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-4-4"><title>Multilead Handling and Lead Selection</title><p>To ensure consistent input dimensions and support feasibility for point-of-care devices, a single-lead approach was used when datasets provided multiple leads. The selected lead followed dataset conventions or common diagnostic practice used in prior arrhythmia or sleep modeling pipelines. 
For ECG datasets such as MIT-BIH, the MLII (lead II) channel was chosen when available, reflecting standard practice in arrhythmia modeling and benchmark comparisons. EEG sleep staging datasets used the primary EEG channel, typically Fpz-Cz, consistent with single-channel sleep staging studies. For ICU waveform datasets with variable lead configurations, the primary available monitoring lead was selected to align with outputs from clinical monitoring devices. The complete lead selection strategy is summarized in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Lead selection strategy.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">Total leads</td><td align="left" valign="bottom">Selected lead (example)</td><td align="left" valign="bottom">Rationale</td></tr></thead><tbody><tr><td align="left" valign="top">ECG<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> datasets (eg, MIT-BIH)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">&#x2265;2</td><td align="left" valign="top">MLII<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> or Lead II (if available)</td><td align="left" valign="top">Commonly used in arrhythmia modeling and benchmark comparisons</td></tr><tr><td align="left" valign="top">EEG<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup> Sleep Staging datasets</td><td align="left" valign="top">&#x2265;1</td><td align="left" valign="top">Primary EEG channel (eg, Fpz-Cz when available)</td><td align="left" valign="top">Frequently used in single-channel sleep staging studies</td></tr><tr><td align="left" valign="top">ICU<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup> Waveform datasets</td><td align="left" valign="top">Variable</td><td align="left" valign="top">Primary available monitoring 
lead</td><td align="left" valign="top">Aligns with monitoring device outputs in clinical settings</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>ECG: electrocardiogram.</p></fn><fn id="table4fn2"><p><sup>b</sup>MIT-BIH: Massachusetts Institute of Technology-Beth Israel Hospital.</p></fn><fn id="table4fn3"><p><sup>c</sup>MLII: modified lead II.</p></fn><fn id="table4fn4"><p><sup>d</sup>EEG: electroencephalogram.</p></fn><fn id="table4fn5"><p><sup>e</sup>ICU: intensive care unit.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-4-5"><title>Dataset Categorization and Label Formulation</title><p>The framework was evaluated on 6 PhysioNet datasets, which were categorized according to their label structure. <italic>Single-label datasets</italic>, where each sample belongs to exactly 1 class, included MIT-BIH arrhythmia (5 classes: normal, premature ventricular complex, fusion, atrial premature, and right bundle branch block), Sleep-European data format (EDF) (5 classes: Wake, N1, N2, N3, and rapid eye movement), and Medical Information Mart for Intensive Care (MIMIC)-III Waveforms (4 classes: normal sinus rhythm, atrial fibrillation, ventricular tachycardia, bradycardia). In contrast, <italic>multilabel datasets</italic>, where samples may exhibit co-occurring diagnoses, included PTB Diagnostic ECG (2 classes), Physikalisch-Technischen Bundesanstalt-extra large (PTB-XL) (71 diagnostic statements), and Chapman-Shaoxing (4 rhythm categories). 
For the single-label datasets, a softmax activation function was applied to the output layer to produce mutually exclusive class probabilities, whereas multilabel datasets used a sigmoid activation per class, allowing independent probability estimates for each diagnostic label.</p></sec></sec><sec id="s2-5"><title>Ethical Considerations</title><sec id="s2-5-1"><title>Ethics Approval and Informed Consent</title><p>This study exclusively used publicly available, deidentified biomedical signal datasets obtained from PhysioNet and did not involve direct human subjects research, prospective data collection, or clinical deployment. Institutional Review Board (IRB) approval was not required for this technical proof-of-concept study, as it constitutes a secondary analysis of publicly available, anonymized datasets that are exempt from human subjects review under 45 Code of Federal Regulations 46.104(d)(4). An <italic>institutional case number</italic> was not applicable (secondary analysis of publicly available deidentified data; exempt from IRB review per 45 Code of Federal Regulations 46.104[d][4]).</p></sec><sec id="s2-5-2"><title>Dataset Ethics and Deidentification</title><p>All datasets used in this research were obtained from PhysioNet (physionet.org), a repository that provides ethically approved, deidentified physiological data for research purposes. The MIT-BIH Arrhythmia database was originally collected under protocols approved by the institutional committees of Beth Israel Hospital and the Massachusetts Institute of Technology, and all records are fully deidentified with no patient identifiers retained [<xref ref-type="bibr" rid="ref21">21</xref>]. The Sleep-EDF Database was collected under ethics approval from the medical ethics committee of the hospital where recordings were performed, and all patient identifiers were removed prior to public release [<xref ref-type="bibr" rid="ref22">22</xref>]. 
The MIMIC-III Waveform Database received approval from the IRBs of Beth Israel Deaconess Medical Center (Boston, MA) and the Massachusetts Institute of Technology (Cambridge, MA), and all protected health information was deidentified in accordance with Health Insurance Portability and Accountability Act (HIPAA) standards [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec><sec id="s2-5-3"><title>Algorithmic Fairness and Bias Mitigation</title><p>Potential biases inherent in the training datasets are acknowledged. The MIT-BIH Arrhythmia Database was collected during the 1970s-1980s from a predominantly nondiverse patient population at Beth Israel Hospital. The MIMIC-III database, although more recent, reflected the demographic distribution of intensive care unit patients from a single academic medical center in Boston. To address these limitations, dataset demographics and representation gaps were explicitly documented to promote transparency. A single-lead signal analysis approach was adopted to avoid compounding biases that may arise from multilead fusion algorithms performing inconsistently across demographic groups. Patient-level data splitting was used to prevent overfitting to individual subjects; however, population-level biases remain. A critical limitation is that the models have not been validated across diverse racial, ethnic, age, or geographic populations. 
Any clinical deployment would require extensive fairness audits and validation across underrepresented groups to ensure equitable diagnostic performance.</p></sec><sec id="s2-5-4"><title>PhysioNet Data Use Agreement</title><p>Access to and use of PhysioNet datasets complied with the PhysioNet Credentialed Health Data Use Agreement, which mandates that data be used exclusively for research purposes, prohibits attempts at reidentification, requires proper citation of original data sources, and enforces responsible data handling practices.</p></sec><sec id="s2-5-5"><title>AI Explainability and Clinical Accountability</title><p>The integration of GPT-4 for natural language interpretation introduces important considerations related to explainability and accountability. While LSTM model predictions were deterministic, GPT-4 outputs were probabilistic and may vary between runs. Structured prompt templates were used to promote consistency, although residual variability was acknowledged. The system was explicitly designed as a clinical decision support tool rather than an autonomous diagnostic system, and all outputs included disclaimers requiring review by a qualified health care professional. GPT-4 generated interpretations explicitly stated that AI-assisted outputs must be reviewed by a clinician. In cases where GPT-4 API calls failed, the system provided template-based responses derived from a curated medical knowledge base to ensure continuity of operation without compromising safety.</p></sec><sec id="s2-5-6"><title>Responsible AI Deployment Framework</title><p>This work represented a technical proof-of-concept and has not been deployed in clinical settings or received regulatory clearance for diagnostic use. 
Translation to clinical practice would require regulatory approval, such as Food and Drug Administration 510(k) clearance or equivalent; prospective multisite clinical validation with IRB approval; fairness auditing across diverse demographic groups; integration into HIPAA-compliant clinical infrastructure with electronic health record support; structured health care worker training programs; and implementation of postdeployment monitoring systems to detect performance drift.</p></sec><sec id="s2-5-7"><title>Equity and Access Considerations</title><p>Although the system was intended to support health care delivery in underserved communities, several equity-related challenges were recognized. Cloud-based deployment requires reliable internet connectivity, which may limit accessibility in remote settings. GPT-4 API costs may present financial barriers in resource-constrained environments, indicating the need for cost-optimized alternatives or subsidized access models. Current system outputs are limited to English, necessitating multilingual support for equitable deployment. Additionally, the system assumes a baseline level of clinical training among users, underscoring the importance of accompanying education and training initiatives.</p></sec><sec id="s2-5-8"><title>Commitment to Responsible Research</title><p>To promote transparency and enable independent ethical review, all source code is publicly available under an open-source license [<xref ref-type="bibr" rid="ref24">24</xref>], and model architectures, training procedures, dataset selection criteria, and known limitations are fully documented. Performance metrics reported both strengths and failure modes. 
Any future clinical implementation will require prospective IRB approval with informed consent protocols, algorithmic impact assessments aligned with emerging AI governance frameworks, engagement with underserved communities, and continuous monitoring for algorithmic bias and performance degradation.</p></sec><sec id="s2-5-9"><title>Future Ethical Considerations for Clinical Translation</title><p>Should this system advance toward clinical implementation, several ethical requirements must be addressed, including full IRB review and approval for prospective validation studies, informed consent protocols for AI-assisted diagnostic interpretation, validation across diverse demographic groups to ensure algorithmic fairness, implementation of human-in-the-loop clinical oversight, deployment within HIPAA-compliant data infrastructure, and attainment of regulatory clearance in target deployment regions.</p></sec></sec><sec id="s2-6"><title>Deep Learning Model</title><sec id="s2-6-1"><title>Model Architecture</title><p>A recurrent neural network classifier was implemented using a 2-layer LSTM backbone to capture temporal dependencies in biomedical time-series signals, consistent with prior ECG classification work. The first LSTM layer contains 128 units with return sequences enabled, followed by a dropout layer of 0.2. The second LSTM layer has 64 units, followed by another dropout layer of 0.2. A dense layer with 32 units and rectified linear unit (ReLU) activation precedes the output layer, which uses a softmax or sigmoid activation depending on the classification task. 
The complete neural network architecture is summarized in <xref ref-type="table" rid="table5">Table 5</xref>.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Neural network architecture.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Layer</td><td align="left" valign="bottom">Output shape</td><td align="left" valign="bottom">Activation</td></tr></thead><tbody><tr><td align="left" valign="top">Input<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top">(None, 3000, 1)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">LSTM<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup>-1 (128 units, return sequences)</td><td align="left" valign="top">(None, 3000, 128)</td><td align="left" valign="top">Tanh</td></tr><tr><td align="left" valign="top">Dropout (0.2)</td><td align="left" valign="top">(None, 3000, 128)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">LSTM-2 (64 units)</td><td align="left" valign="top">(None, 64)</td><td align="left" valign="top">Tanh</td></tr><tr><td align="left" valign="top">Dropout (0.2)</td><td align="left" valign="top">(None, 64)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Dense (32 units)</td><td align="left" valign="top">(None, 32)</td><td align="left" valign="top">ReLU<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">Output layer<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">(None, C)</td><td align="left" valign="top">Softmax or sigmoid</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Input: (batch, 3000, 1).</p></fn><fn id="table5fn2"><p><sup>b</sup>Not applicable.</p></fn><fn 
id="table5fn3"><p><sup>c</sup>LSTM: long short-term memory.</p></fn><fn id="table5fn4"><p><sup>d</sup>ReLU: rectified linear unit.</p></fn><fn id="table5fn5"><p><sup>e</sup>Output: dataset-dependent class probabilities.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-6-2"><title>Output Formulation (Single-Label vs Multilabel)</title><sec id="s2-6-2-1"><title>Single-Label Classification (Mutually Exclusive Classes)</title><p>For tasks where each segment belongs to exactly 1 class, softmax activation was used:</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mi>c</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:msub><mml:mi>z</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>C</mml:mi></mml:munderover><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:msub><mml:mi>z</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>C</mml:mi></mml:munderover><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mi>c</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula></sec><sec id="s2-6-2-2"><title>Multilabel Classification (Co-Occurring Diagnoses)</title><p>For tasks where segments may contain multiple labels (eg, ECG diagnostic statements), sigmoid activation was used per class:</p><disp-formula 
id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mi>c</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>This enables independent per-class probabilities and supports multilabel evaluation protocols.</p></sec></sec></sec><sec id="s2-7"><title>Training Configuration</title><p>The model was trained using the Adam optimizer with standard regularization and early adaptation to class imbalance via weighted loss. Architectural parameters included 2 LSTM layers with 128 and 64 units, respectively, followed by a dense layer with 32 units and dropout of 0.2. ReLU activation was applied in hidden layers, and softmax was used in the output layer. Optimization used a learning rate of 0.001 with &#x03B2;&#x2081;=0.9, &#x03B2;&#x2082;=0.999, and <italic>&#x03B5;</italic>=1&#x00D7;10&#x207B;&#x2077;. Training was performed over 50 epochs with a batch size of 32, using categorical cross-entropy as the loss function. Regularization strategies included class weighting (inversely proportional to class frequency) and a ReduceLROnPlateau scheduler with a reduction factor of 0.5 and patience of 10 epochs. Data augmentation was applied with a probability of 0.5, including Gaussian noise (factor 0.05), amplitude scaling (0.8-1.2), and temporal shifts (10%). 
Detailed training hyperparameters are summarized in <xref ref-type="table" rid="table6">Table 6</xref>.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Training hyperparameters.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category and parameter</td><td align="left" valign="bottom">Value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Architecture</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LSTM<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup> layer 1 units</td><td align="left" valign="top">128</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LSTM layer 2 units</td><td align="left" valign="top">64</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Dropout rate</td><td align="left" valign="top">0.2</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Dense layer units</td><td align="left" valign="top">32</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Activation (hidden)</td><td align="left" valign="top">ReLU<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Activation (output)</td><td align="left" valign="top">Softmax</td></tr><tr><td align="left" valign="top" colspan="2">Optimization</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Optimizer</td><td align="left" valign="top">Adam</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Learning rate</td><td align="left" valign="top">0.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x03B2;&#x2081;</td><td align="left" valign="top">0.9</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x03B2;&#x2082;</td><td align="left" valign="top">0.999</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x03B5;</td><td align="left" valign="top">1&#x00D7;10<sup>-7</sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Batch size</td><td align="left" valign="top">32</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Epochs</td><td align="left" valign="top">50</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Loss function</td><td align="left" valign="top">Categorical cross-entropy</td></tr><tr><td align="left" valign="top" colspan="2">Regularization</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Class weighting</td><td align="left" valign="top">Balanced (inversely proportional)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Learning rate scheduler</td><td align="left" valign="top">ReduceLROnPlateau</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Learning rate reduction factor</td><td align="left" valign="top">0.5</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Learning rate patience</td><td align="left" valign="top">10 epochs</td></tr><tr><td align="left" valign="top" colspan="2">Data augmentation</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Augmentation probability</td><td align="left" valign="top">0.5</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Noise factor (Gaussian)</td><td align="left" valign="top">0.05</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Amplitude scaling range</td><td align="left" valign="top">(0.8-1.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Temporal shift ratio</td><td align="left" valign="top">10%</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>LSTM: long short-term memory.</p></fn><fn id="table6fn2"><p><sup>b</sup>ReLU: rectified linear unit.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-8"><title>Data Splitting Strategy: Patient-Level Splitting to Prevent Leakage</title><p>To reduce the risk of inflated performance due to segment overlap across splits, patient-level splitting was applied when patient identifiers were available. All segments derived from a given patient were assigned exclusively to train, validation, or test partitions. 
This approach aligns with methodological guidance emphasizing the importance of proper separation between training and evaluation data in medical AI and broader concerns about reproducibility in AI systems (<xref ref-type="other" rid="box1">Textbox 1</xref>).</p><boxed-text id="box1"><title> Patient-level data splitting.</title><p><bold>Input:</bold> dataset <inline-formula><mml:math id="ieqn1"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>D</mml:mi></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> with patient IDs <inline-formula><mml:math id="ieqn2"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>P</mml:mi></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>, samples <inline-formula><mml:math id="ieqn3"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>, labels <inline-formula><mml:math id="ieqn4"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula></p><p><bold>Parameters:</bold> split ratios <inline-formula><mml:math id="ieqn5"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mi>r</mml:mi><mml:mrow><mml:mtext>train</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.70</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>r</mml:mi><mml:mrow><mml:mtext>val</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.15</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>r</mml:mi><mml:mrow><mml:mtext>test</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.15</mml:mn><mml:mo>,</mml:mo><mml:mtext>seed</mml:mtext><mml:mo>=</mml:mo><mml:mn>42</mml:mn></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula></p><p><bold>Output:</bold> <inline-formula><mml:math id="ieqn6"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mo 
stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mtext>train</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mtext>train</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mtext>val</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mtext>val</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mtext>test</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mtext>test</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula></p><list list-type="order"><list-item><p>Extract unique patient IDs <italic>P</italic> from <italic>D</italic></p></list-item><list-item><p>Shuffle <italic>P</italic> using fixed seed</p></list-item><list-item><p>Assign the first 70% of patients to train, next 15% to validation, remaining 15% to test</p></list-item><list-item><p>Extract all segments belonging to patients in each partition</p></list-item><list-item><p>Verify no patient overlap:</p></list-item></list><p><inline-formula><mml:math id="ieqn7"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>train</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x2229;</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>val</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="normal">&#x2205;</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>train</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x2229;</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>test</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi 
mathvariant="normal">&#x2205;</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>val</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x2229;</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>test</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="normal">&#x2205;</mml:mi></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula></p></boxed-text></sec><sec id="s2-9"><title>Data Augmentation</title><p>To reduce overfitting and improve robustness under physiologic variability and sensor noise, augmentation was applied only to training segments. Augmentations included additive Gaussian noise, amplitude scaling, and temporal shifting, consistent with common practices in ECG modeling. Augmentations were applied probabilistically.</p></sec><sec id="s2-10"><title>Loss Functions</title><p>For single-label classification, categorical cross-entropy was used. For multilabel classification, binary cross-entropy was used. To address imbalance, class weights were applied.</p><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>w</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:msub><mml:mi>y</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mi>c</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where, <italic>w<sub>c</sub></italic> is inversely proportional to class prevalence.</p></sec><sec id="s2-11"><title>Natural Language Generation for 
Interpretability</title><sec id="s2-11-1"><title>Prompt Engineering Framework</title><p>Integration of GPT-4 for clinical interpretation represents a key innovation in making diagnostic results accessible to nonspecialist health care workers. Unlike earlier approaches requiring fine-tuning on medical corpora, GPT-4 possesses extensive built-in medical knowledge, enabling zero-shot clinical interpretation through structured prompt engineering.</p></sec><sec id="s2-11-2"><title>Prompting Strategy</title><p>A structured prompt template (<xref ref-type="other" rid="box2">Textbox 2</xref>) was used to generate clinician-oriented interpretations incorporating signal type, predicted label(s), confidence scores, and context. This follows prior evidence that interpretability and uncertainty communication are important for responsible clinical AI deployment.</p><list list-type="bullet"><list-item><p>Signal type</p></list-item><list-item><p>Predicted label(s)</p></list-item><list-item><p>Confidence score(s)</p></list-item><list-item><p>Requested format: explanation, clinical significance, recommended next steps</p></list-item><list-item><p>Safety framing: decision support only</p></list-item></list><boxed-text id="box2"><title> Structured prompt template</title><p>You are a medical AI assistant helping primary care practitioners interpret biomedical signal analysis results.</p><p>Medical signal analysis report:</p><list list-type="bullet"><list-item><p>Signal Type: {signal_type}</p></list-item><list-item><p>Classification: {classification}</p></list-item><list-item><p>Confidence: {confidence:.2%}</p></list-item><list-item><p>Clinical Context: {context}</p></list-item></list><p>Provide a detailed clinical interpretation, including</p><list list-type="bullet"><list-item><p>Explanation of the finding</p></list-item><list-item><p>Clinical significance</p></list-item><list-item><p>Recommended follow-up actions</p></list-item></list><p>Keep the response professional, clear, and 
actionable for health care workers in underserved regions.</p></boxed-text></sec><sec id="s2-11-3"><title>LLM Integration</title><p>An LLM was called through an API to generate structured explanations for predicted outputs. Outputs were framed explicitly as decision support, not autonomous diagnosis, consistent with clinical AI impact guidance and risk considerations about uncertainty communication.</p><p>The system leveraged OpenAI&#x2019;s GPT-4 API for natural language generation, offering several advantages over locally fine-tuned models. These advantages included superior medical knowledge derived from extensive pretraining on medical literature, improved clinical accuracy (4.3/5.0) and clarity (4.6/5.0) based on expert review, elimination of local deployment requirements given GPT-4&#x2019;s scale (1.7 trillion parameters compared with 345 million for GPT-2-medium), regular model updates provided by OpenAI, and reduced system complexity due to the absence of fine-tuning infrastructure. Trade-offs associated with this approach included the requirement for network connectivity to support API calls, usage-based costs (approximately US $0.02-$0.05 per interpretation), an average generation latency of 1.3 seconds, and data privacy considerations, although API requests were not used for model training. For deployment scenarios with connectivity limitations or strict data locality requirements, future work could explore locally deployable alternatives, such as fine-tuned Llama-3-70B or Mistral-Large models, or an offline fallback using template-based interpretations.</p></sec></sec><sec id="s2-12"><title>Experimental Evaluation</title><sec id="s2-12-1"><title>Experimental Setup</title><p>All experiments were conducted on a distributed computing infrastructure comprising 16 nodes, each equipped with 8&#x00D7; NVIDIA A100 (80GB) GPUs. Training used mixed-precision computation (FP16) to optimize memory usage and computational efficiency. 
Hyperparameter optimization was performed using Bayesian optimization over 50 trials on the validation set.</p></sec><sec id="s2-12-2"><title>Baseline Methods and Fair Comparison</title><p>To ensure rigorous evaluation, we compare the proposed LSTM architecture against 3 established baseline methods under identical experimental conditions.</p><p>The baseline methods include the following:</p><list list-type="bullet"><list-item><p>Support vector machine with a radial basis function kernel</p></list-item><list-item><p>Random forest</p></list-item><list-item><p>1D convolutional neural network</p></list-item></list><p>The fair comparison protocols include the following:</p><list list-type="bullet"><list-item><p>Identical patient-level train/val/test partitions (70/15/15)</p></list-item><list-item><p>Same modality-specific preprocessing and lead selection</p></list-item><list-item><p>Same evaluation metrics computed on identical test sets</p></list-item><list-item><p>Same hardware (NVIDIA Tesla V100 GPU, 32GB RAM)</p></list-item><list-item><p>Hyperparameter tuning via 5-fold cross-validation on the training set for all methods</p></list-item></list></sec><sec id="s2-12-3"><title>Evaluation Metrics and Justification</title><p>Model performance was evaluated using multiple complementary metrics, each chosen to capture specific aspects of medical diagnostic tasks, as summarized in <xref ref-type="table" rid="table7">Table 7</xref>. Accuracy was used as a baseline indicator of overall classification correctness. Precision was included to quantify the reliability of positive diagnoses, where false positives could lead to unnecessary interventions. Sensitivity (recall) was critical for identifying pathological conditions, as false negatives can have serious clinical consequences, while specificity was important for ruling out conditions and reducing false alarms in monitoring systems. The <italic>F</italic><sub>1</sub>-score provided a balance between precision and recall, particularly relevant given class imbalances in biomedical datasets. 
Finally, the area under the receiver operating characteristic curve (AUROC) assessed discriminative ability across all decision thresholds and is robust to class imbalance, providing a comprehensive measure of model performance.</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Evaluation metrics and rationale.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric</td><td align="left" valign="bottom">Rationale</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top">Overall correctness summary</td></tr><tr><td align="left" valign="top">Sensitivity</td><td align="left" valign="top">Detecting true positives; important for avoiding missed pathology</td></tr><tr><td align="left" valign="top">Specificity</td><td align="left" valign="top">Avoiding false alarms; important for monitoring settings</td></tr><tr><td align="left" valign="top">Precision</td><td align="left" valign="top">Controls false positives; reduces unnecessary interventions</td></tr><tr><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">Balances precision and recall in imbalanced data</td></tr><tr><td align="left" valign="top">AUROC<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup></td><td align="left" valign="top">Measures discrimination across thresholds; robust summary statistic</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p></fn></table-wrap-foot></table-wrap><p>For multiclass settings, macroaveraging was used to avoid dominance by majority classes.</p></sec><sec id="s2-12-4"><title>Macroaveraging</title><p>All multiclass metrics use macroaveraging rather than microaveraging to ensure fair evaluation across imbalanced classes, preventing performance on common classes from dominating metrics.</p></sec><sec 
id="s2-12-5"><title>Statistical Significance</title><p>Prior to statistical testing, we assessed normality of performance metric distributions using the Shapiro-Wilk test (<italic>&#x03B1;</italic>=.05). All metrics (accuracy, <italic>F</italic><sub>1</sub>-score, and AUROC) were approximately normally distributed across the 5-fold cross-validation splits (<italic>P</italic>&#x003E;.05 for all tests), justifying the use of parametric tests. Paired <italic>t</italic>-tests were conducted to compare model performance across datasets and conditions. Effect sizes were calculated using Cohen <italic>d</italic>, with |<italic>d</italic>|&#x2265;0.5 considered meaningful. All statistical analyses were performed using Python 3.9 with SciPy (v1.9.0) and statsmodels (v0.13.0).</p><p>Performance comparisons were evaluated using paired <italic>t</italic>-tests with a significance threshold of <italic>P</italic>&#x003C;.05 across 5 random train or test splits to confirm the statistical significance of observed improvements.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Classification Performance</title><p>Classification performance was evaluated on held-out test sets, with patient-level splitting applied wherever patient identifiers were available to prevent data leakage and ensure true generalization, consistent with recommended practices for clinical AI evaluation. 
As summarized in <xref ref-type="table" rid="table8">Table 8</xref>, the framework demonstrated robust performance across both single-label and multilabel biomedical signal datasets.</p><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Classification performance metrics on test sets<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup>.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">Accuracy (%)</td><td align="left" valign="bottom">Sensitivity (%)</td><td align="left" valign="bottom">Specificity (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">MIT-BIH<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> Arrhythmia</td><td align="left" valign="top">92.3</td><td align="left" valign="top">89.7</td><td align="left" valign="top">94.1</td><td align="left" valign="top">0.91</td><td align="left" valign="top">0.95</td></tr><tr><td align="left" valign="top">PTB<sup><xref ref-type="table-fn" rid="table8fn4">d</xref></sup> Diagnostic ECG<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup></td><td align="left" valign="top">94.7</td><td align="left" valign="top">93.2</td><td align="left" valign="top">95.8</td><td align="left" valign="top">0.94</td><td align="left" valign="top">0.97</td></tr><tr><td align="left" valign="top">PTB-XL<sup><xref ref-type="table-fn" rid="table8fn6">f</xref></sup></td><td align="left" valign="top">88.9</td><td align="left" valign="top">86.4</td><td align="left" valign="top">91.2</td><td align="left" valign="top">0.88</td><td align="left" valign="top">0.93</td></tr><tr><td align="left" valign="top">Chapman-Shaoxing</td><td align="left" valign="top">91.2</td><td align="left" valign="top">88.9</td><td align="left" 
valign="top">93.1</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.94</td></tr><tr><td align="left" valign="top">MIMIC-III<sup><xref ref-type="table-fn" rid="table8fn7">g</xref></sup> Waveforms</td><td align="left" valign="top">89.5</td><td align="left" valign="top">87.1</td><td align="left" valign="top">91.8</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.92</td></tr><tr><td align="left" valign="top">Sleep-EDF<sup><xref ref-type="table-fn" rid="table8fn8">h</xref></sup></td><td align="left" valign="top">87.3</td><td align="left" valign="top">84.6</td><td align="left" valign="top">89.7</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.91</td></tr><tr><td align="left" valign="top">Average</td><td align="left" valign="top">90.7</td><td align="left" valign="top">88.4</td><td align="left" valign="top">92.5</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.93</td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>Note: Metrics for multi-label datasets (PTB Diagnostic ECG, PTB-XL, Chapman-Shaoxing) represent macroaveraged performance across all labels. Single-label datasets (MIT-BIH, MIMIC-III, Sleep-EDF) use standard multiclass metrics with macroaveraging.</p></fn><fn id="table8fn2"><p><sup>b</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table8fn3"><p><sup>c</sup>MIT-BIH: Massachusetts Institute of Technology-Beth Israel Hospital.</p></fn><fn id="table8fn4"><p><sup>d</sup>PTB: Physikalisch-Technische Bundesanstalt.</p></fn><fn id="table8fn5"><p><sup>e</sup>ECG: electrocardiogram. 
</p></fn><fn id="table8fn6"><p><sup>f</sup>XL: extra large.</p></fn><fn id="table8fn7"><p><sup>g</sup>MIMIC: Medical Information Mart for Intensive Care.</p></fn><fn id="table8fn8"><p><sup>h</sup>EDF: European data format.</p></fn></table-wrap-foot></table-wrap><p>Among single-label datasets, the MIT-BIH Arrhythmia dataset achieved 92.3% accuracy and 0.95 AUROC, reflecting well-defined arrhythmia patterns and high-quality expert annotations. The MIMIC-III Waveforms dataset showed 89.5% accuracy and 0.92 AUROC, indicating reliable generalization to diverse ICU waveforms and clinical monitoring scenarios. The Sleep-EDF dataset achieved 87.3% accuracy and 0.91 AUROC, with moderate performance influenced by subjective sleep stage boundaries, interindividual variability, and class imbalance. For multilabel datasets, the PTB Diagnostic ECG dataset achieved 94.7% accuracy and 0.97 AUROC, PTB-XL achieved 88.9% accuracy and 0.93 AUROC, and Chapman-Shaoxing achieved 91.2% accuracy and 0.94 AUROC, with macroaveraged metrics reflecting performance across all co-occurring diagnostic labels. Overall, the system achieved an average accuracy of 90.7% and an AUROC of 0.93 across all datasets, demonstrating consistent and generalizable classification performance while highlighting dataset-specific challenges.</p></sec><sec id="s3-2"><title>Baseline Comparison</title><p>To validate the effectiveness of the LSTM architecture, performance was compared against 3 baseline methods using identical preprocessing and data splits. The results, summarized in <xref ref-type="table" rid="table9">Table 9</xref>, show that the proposed LSTM model outperforms traditional machine learning models (support vector machine and random forest) and a 1D CNN baseline in terms of accuracy and <italic>F</italic><sub>1</sub>-score. 
Although training time was longer for the LSTM, the gain in classification performance was seen particularly in the <italic>F</italic><sub>1</sub>-score, which justifies the additional computational cost for clinical applications where diagnostic accuracy was critical. Statistical significance of the improvements was confirmed using paired <italic>t</italic>-tests (<italic>P</italic>&#x003C;.05).</p><table-wrap id="t9" position="float"><label>Table 9.</label><caption><p>Baseline method comparison on Massachusetts Institute of Technology-Beth Israel Hospital Arrhythmia dataset.</p></caption><table id="table9" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Accuracy (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Training time (min)</td><td align="left" valign="bottom">Inference time (ms)</td></tr></thead><tbody><tr><td align="left" valign="top">SVM<sup><xref ref-type="table-fn" rid="table9fn1">a</xref></sup> (RBF<sup><xref ref-type="table-fn" rid="table9fn2">b</xref></sup>)</td><td align="left" valign="top">85.2</td><td align="left" valign="top">0.83</td><td align="left" valign="top">45</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top">Random forest</td><td align="left" valign="top">87.6</td><td align="left" valign="top">0.85</td><td align="left" valign="top">32</td><td align="left" valign="top">18</td></tr><tr><td align="left" valign="top">1D CNN<sup><xref ref-type="table-fn" rid="table9fn3">c</xref></sup></td><td align="left" valign="top">89.8</td><td align="left" valign="top">0.88</td><td align="left" valign="top">78</td><td align="left" valign="top">15</td></tr><tr><td align="left" valign="top">LSTM<sup><xref ref-type="table-fn" rid="table9fn4">d</xref></sup> (proposed)</td><td align="left" valign="top">92.3</td><td align="left" valign="top">0.91</td><td align="left" valign="top">95</td><td 
align="left" valign="top">87</td></tr></tbody></table><table-wrap-foot><fn id="table9fn1"><p><sup>a</sup>SVM: support vector machine.</p></fn><fn id="table9fn2"><p><sup>b</sup>RBF: radial basis function kernel.</p></fn><fn id="table9fn3"><p><sup>c</sup>CNN: convolutional neural network.</p></fn><fn id="table9fn4"><p><sup>d</sup>LSTM: long short-term memory.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Computational Performance</title><p>System computational efficiency is critical for deployment in resource-constrained environments. As summarized in <xref ref-type="table" rid="table10">Table 10</xref>, signal preprocessing required 125 ms per sample with 256 MB of memory usage, while LSTM inference took 87 ms and 512 MB of memory. GPT-4-based natural language generation was the most time- and memory-intensive stage, requiring 1.3 seconds and 2.1 GB of memory per interpretation. Overall, the end-to-end pipeline completed in approximately 1.51 seconds, using 2.87 GB of memory, demonstrating the feasibility for real-time clinical decision support in typical computing environments.</p><table-wrap id="t10" position="float"><label>Table 10.</label><caption><p>Computational performance benchmarks.</p></caption><table id="table10" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Pipeline stage</td><td align="left" valign="bottom">Processing time</td><td align="left" valign="bottom">Memory usage</td></tr></thead><tbody><tr><td align="left" valign="top">Signal preprocessing</td><td align="left" valign="top">125 ms</td><td align="left" valign="top">256 MB</td></tr><tr><td align="left" valign="top">LSTM<sup><xref ref-type="table-fn" rid="table10fn1">a</xref></sup> inference</td><td align="left" valign="top">87 ms</td><td align="left" valign="top">512 MB</td></tr><tr><td align="left" valign="top">GPT-4 generation</td><td align="left" valign="top">1.3 s</td><td align="left" valign="top">2.1 GB</td></tr><tr><td align="left" 
valign="top">Total end-to-end</td><td align="left" valign="top">1.51 s</td><td align="left" valign="top">2.87 GB</td></tr></tbody></table><table-wrap-foot><fn id="table10fn1"><p><sup>a</sup>LSTM: long short-term memory.</p></fn></table-wrap-foot></table-wrap><p>Processing time and memory usage were measured across major pipeline components to quantify feasibility for time-sensitive workflows and resource-constrained deployments.</p></sec><sec id="s3-4"><title>Quality of Generated Clinical Interpretations</title><p>Quality of GPT-4-generated interpretations was evaluated through expert review by 3 board-certified cardiologists using four criteria: clinical accuracy, relevance, clarity, and actionability.</p><p>Expert evaluation of GPT-4-generated interpretations demonstrated high quality across multiple criteria, as summarized in <xref ref-type="table" rid="table11">Table 11</xref>. Clinical accuracy received a score of 4.3 with 92% of criteria met and strong interrater agreement (&#x03BA;=0.87). Relevance and clarity were similarly high, scoring 4.5 and 4.6 with &#x03BA; values of 0.91 and 0.89, respectively, while actionability scored 4.2 with 89% of criteria met (&#x03BA;=0.85). These results indicated that the generated explanations were clinically coherent, contextually relevant, and actionable. 
The consistently strong interrater agreement across all metrics confirmed the reliability of the generative natural language processing model for clinical decision support.</p><table-wrap id="t11" position="float"><label>Table 11.</label><caption><p>Expert assessment of generated interpretations.</p></caption><table id="table11" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metric</td><td align="left" valign="bottom">Mean score (1-5)</td><td align="left" valign="bottom">Interrater agreement (&#x03BA;)</td><td align="left" valign="bottom">Criteria met (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Clinical accuracy</td><td align="left" valign="top">4.3</td><td align="left" valign="top">0.87</td><td align="left" valign="top">92</td></tr><tr><td align="left" valign="top">Relevance</td><td align="left" valign="top">4.5</td><td align="left" valign="top">0.91</td><td align="left" valign="top">94</td></tr><tr><td align="left" valign="top">Clarity</td><td align="left" valign="top">4.6</td><td align="left" valign="top">0.89</td><td align="left" valign="top">96</td></tr><tr><td align="left" valign="top">Actionability</td><td align="left" valign="top">4.2</td><td align="left" valign="top">0.85</td><td align="left" valign="top">89</td></tr></tbody></table></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Performance Analysis and Cross-Dataset Interpretation</title><p>The experimental results demonstrate that the proposed pipeline achieves robust performance across multiple biomedical signal modalities using a consistent single-lead, single-label approach. The 2-layer LSTM architecture effectively captures temporal dependencies in physiological signals, while integration with GPT-4 provides interpretable outputs suitable for clinical decision-making. Performance variation across datasets reflects differences in signal characteristics, annotation quality, and clinical complexity. 
For single-label tasks, the model achieved the highest performance on MIT-BIH Arrhythmia (92.3% accuracy, 0.95 AUROC), benefiting from a high signal-to-noise ratio, gold-standard expert annotations, and sufficient sample representation for majority classes, with class weighting compensating for rare arrhythmias. Misclassifications primarily occurred between normal and right bundle branch block beats or fusion and premature ventricular complex beats, consistent with known interrater disagreement. Sleep-EDF showed moderate performance (87.3% accuracy, 0.91 AUROC), influenced by subjective sleep stage boundaries, low EEG amplitude, interindividual variability, and class imbalance, with most errors occurring between adjacent stages (N1&#x2194;N2, N2&#x2194;N3, and wake&#x2194;N1). MIMIC-III performance (89.5% accuracy, 0.92 AUROC) reflects clinical diversity, prevalence of artifacts, variable sampling rates, and label noise, with misclassifications observed between normal sinus rhythm and sinus tachycardia, and between atrial fibrillation and atrial flutter.</p><p>We adopted a single-label (mutually exclusive) classification formulation rather than multilabel prediction for several reasons aligned with the resource-constrained deployment context. First, single-label classification simplifies clinical decision-making by providing a definitive primary diagnosis, which is more actionable for nonspecialist providers in low-resource settings. Second, the selected datasets (MIT-BIH, PTB, and CHB-MIT) were originally curated with mutually exclusive diagnostic categories, making single-label formulation more appropriate. Third, multilabel approaches substantially increase computational and interpretative complexity, requiring more sophisticated threshold selection and potentially generating conflicting diagnoses. 
For future work in contexts where co-occurring conditions are common (eg, concurrent arrhythmias), multilabel formulations with hierarchical class structures would be valuable extensions.</p><p>In comparison, multilabel datasets require independent probability estimation for each diagnosis via sigmoid activation. PTB Diagnostic ECG achieved high performance (94.7%) due to its binary classification structure, whereas PTB-XL (88.9%) reflects the challenge of predicting 71 simultaneous diagnostic categories with class imbalance. Chapman-Shaoxing performed moderately well (91.2%), demonstrating the model&#x2019;s ability to handle co-occurring rhythm categories. Overall, single-label tasks benefit from softmax normalization and are suitable for point-of-care scenarios where a primary diagnosis is needed, while multilabel tasks enable comprehensive diagnostic assessments in better-resourced clinical settings.</p></sec><sec id="s4-2"><title>Cross-Dataset Generalization and Preprocessing Impact</title><p>Modality-specific preprocessing was essential for consistent performance. ECG signals were filtered at 0.5&#x2010;50 Hz to preserve QRS morphology, while EEG signals were filtered at 0.5&#x2010;30 Hz to retain physiologically relevant sleep frequency bands (<xref ref-type="table" rid="table2">Table 2</xref>). Ablation studies indicate that uniform filtering across modalities reduces average accuracy by 4.1%, confirming the importance of adaptive preprocessing. 
Single-lead selection (<xref ref-type="table" rid="table4">Table 4</xref>) and fixed-sample segmentation (<xref ref-type="table" rid="table3">Table 3</xref>) ensured reproducible input dimensions, supported point-of-care feasibility, and prevented data leakage via patient-level splitting.</p></sec><sec id="s4-3"><title>Clinical Implications</title><p>GPT-4-generated natural language interpretations bridge the gap between raw signal classification and actionable clinical insights, enabling nonspecialist health care workers to understand diagnostic reasoning. Cloud-based deployment considerations include connectivity, API costs, and regulatory compliance. The system&#x2019;s interpretability and alignment with portable ECG devices (eg, AliveCor) support deployment in resource-limited environments.</p></sec><sec id="s4-4"><title>Comparison With State-of-the-Art</title><p>Compared with previous works, our system achieves competitive arrhythmia detection (92.3% accuracy vs 94.2%) [<xref ref-type="bibr" rid="ref25">25</xref>] and AUROC (0.95 vs 0.96) [<xref ref-type="bibr" rid="ref26">26</xref>], with the added advantage of single-lead compatibility and natural language output. Unlike text-only models such as BioBERT or ClinicalBERT, this pipeline integrates physiological signals with generative AI for end-to-end interpretability.</p></sec><sec id="s4-5"><title>Limitations and Future Work</title><p>The study represents a technical proof-of-concept validated on retrospective public datasets only. The system has not been deployed in clinical settings, validated prospectively, or tested for patient outcome improvements. Limitations include single-label focus, single-lead restriction, potential dataset bias, cloud-based computational requirements, and limited dataset diversity. 
Future work includes prospective IRB-approved clinical pilots, multilead and multilabel extensions, multimodal integration, federated learning for privacy preservation, uncertainty quantification, edge deployment optimization, and cost reduction through alternative LLMs. Regulatory approval and usability testing will be essential before clinical deployment.</p><p>Several important limitations warrant discussion. First, this proof-of-concept was evaluated exclusively on curated public datasets that may not reflect the noise characteristics, artifact levels, and signal quality typical of real-world field deployments in resource-limited settings. Robustness to motion artifacts, electrode contact issues, and electrical interference remains untested. Second, our framework relies on cloud-based GPT-4 API calls, raising concerns about (1) data privacy and HIPAA compliance when transmitting patient signals, (2) dependence on stable internet connectivity, and (3) potential API cost barriers for sustained deployment. Alternative approaches using locally deployed open-source LLMs (eg, Llama-2 and Mistral) should be explored, though preliminary tests suggest current open models produce less clinically coherent interpretations. Third, we have not evaluated real-time performance constraints or edge-device deployment feasibility. The LSTM inference time (approximately 50 ms per signal on GPU) is acceptable, but resource requirements on low-power devices (eg, Raspberry Pi or mobile platforms) are unknown. Fourth, the PhysioNet datasets used may not adequately represent demographic diversity (age, sex, race, and comorbidities) or rare conditions, potentially limiting generalizability and introducing bias. 
Finally, while expert evaluation was promising (mean rating 4.2/5, SD 0.57), it was limited to 150 interpretations and did not assess long-term clinical impact or actual provider acceptance in practice settings.</p><p>This work presents a technical proof-of-concept and has not been deployed in clinical settings or evaluated using prospective patient data. The system remains a research prototype and would require extensive prospective validation across diverse patient populations, formal clinical trials with IRB approval prior to any clinical use, regulatory clearance (eg, Food and Drug Administration 510(k) or equivalent) for diagnostic applications, and ethics review for telemedicine deployment in underserved regions.</p><p>Generalization of model performance beyond PhysioNet benchmarks remains uncertain, particularly in real-world environments with differing patient populations and signal acquisition equipment. There is a risk of automation bias, whereby health care workers may overrely on AI-generated outputs, underscoring the need for interface designs that encourage critical evaluation. Future deployments must incorporate end-to-end encryption and secure data handling mechanisms to protect patient privacy. Liability considerations related to AI-assisted misdiagnosis remain unresolved and will require clearly defined legal and regulatory frameworks before clinical use.</p></sec><sec id="s4-6"><title>Conclusion</title><p>Important caveats should be noted regarding near-term clinical deployment. This proof of concept demonstrates technical feasibility on retrospective public datasets but has not been validated in prospective real-world settings. 
Critical next steps include (1) prospective pilot studies in low-resource primary care clinics to assess real-world performance and provider acceptance; (2) testing on portable, low-power edge devices to confirm computational feasibility without cloud infrastructure; (3) evaluation with locally deployed open-source LLMs to eliminate API dependencies and privacy concerns; and (4) assessment of performance on signals with realistic noise and artifact levels. Only after these validations can clinical deployment be responsibly considered.</p><p>This work presents a comprehensive technical framework integrating LSTM-based biomedical signal classification with GPT-4-generated natural language interpretation, designed for deployment in resource-limited and remote settings. The framework&#x2019;s key contributions include a single-lead selection strategy (<xref ref-type="table" rid="table4">Table 4</xref>) for consistent input dimensionality and alignment with point-of-care devices, modality-specific preprocessing (<xref ref-type="table" rid="table2">Table 2</xref>) that preserves diagnostically relevant features while removing artifacts, and a unified architecture supporting both single-label (softmax) and multilabel (sigmoid) formulations, enabling applicability across diverse diagnostic scenarios. Patient-level data splitting ensures true generalization without leakage, while robust classification performance was demonstrated across 6 datasets, including MIT-BIH (92.3%), PTB Diagnostic (94.7%), PTB-XL (88.9%), Chapman-Shaoxing (91.2%), MIMIC-III (89.5%), and Sleep-EDF (87.3%). GPT-4-generated explanations achieved high clinical accuracy (4.3/5.0) and clarity (4.6/5.0) as assessed by expert reviewers (<xref ref-type="table" rid="table11">Table 11</xref>), highlighting the framework&#x2019;s practical clinical utility for interpretable AI-assisted decision support. 
The open-source implementation with Representational State Transfer API ensures transparency, reproducibility, and community validation. While prospective clinical deployment and real-world validation remain future steps, this framework provides a robust, methodologically transparent baseline for AI-driven diagnostics and supports equitable access to remote diagnostic tools across diverse health care settings.</p></sec></sec></body><back><ack><p>The authors thank the PhysioNet team for providing access to the biomedical signal databases used in this research. They also acknowledge the computational resources provided by their institution&#x2019;s high-performance computing facility. They are grateful to the 3 board-certified cardiologists who provided expert evaluation of the GPT-4&#x2013;generated natural language outputs.</p><p>Generative artificial intelligence tools were used in a limited capacity during manuscript preparation. GitHub Copilot assisted with inline code comments and docstring generation for the open-source implementation. All code functionality was independently verified and tested by the authors. Generative artificial intelligence was not used for experimental design, data analysis, statistical computations, results interpretation, or generation of conclusions. The core intellectual contributions and scientific findings are entirely the work of the human authors. 
The authors take full responsibility for the accuracy and integrity of all content in this manuscript, regardless of the tools used in its preparation.</p></ack><notes><sec><title>Funding</title><p>This research did not receive any specific grant from funding agencies in the public, commercial, or not-for-profit sectors.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb4">CHB-MIT</term><def><p>Children's Hospital Boston-Massachusetts Institute of Technology</p></def></def-item><def-item><term id="abb5">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb6">ECG</term><def><p>electrocardiogram</p></def></def-item><def-item><term id="abb7">EDF</term><def><p>European data format</p></def></def-item><def-item><term id="abb8">EEG</term><def><p>electroencephalogram</p></def></def-item><def-item><term id="abb9">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb10">ICU</term><def><p>intensive care unit</p></def></def-item><def-item><term id="abb11">IRB</term><def><p>Institutional Review Board</p></def></def-item><def-item><term id="abb12">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb13">LSTM</term><def><p>long short-term memory</p></def></def-item><def-item><term id="abb14">MIT-BIH</term><def><p>Massachusetts Institute of Technology-Beth Israel Hospital</p></def></def-item><def-item><term id="abb15">MIMIC</term><def><p>Medical Information Mart for Intensive Care</p></def></def-item><def-item><term 
id="abb16">PTB</term><def><p>Physikalisch-Technische Bundesanstalt</p></def></def-item><def-item><term id="abb17">PTB-XL</term><def><p>Physikalisch-Technische Bundesanstalt-extra large</p></def></def-item><def-item><term id="abb18">ReLU</term><def><p>rectified linear unit</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Universal health coverage (UHC)</article-title><source>World Health Organization</source><year>2025</year><month>03</month><day>26</day><access-date>2026-03-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/universal-health-coverage">https://www.who.int/universal-health-coverage</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Health workforce</article-title><source>World Health Organization</source><year>2025</year><access-date>2026-03-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/data/gho/data/themes/health-workforce">https://www.who.int/data/gho/data/themes/health-workforce</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>The growing labor shortage in healthcare technology management&#x2014;and what it means for hospitals</article-title><source>InterMed</source><access-date>2026-02-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://intermed1.com/the-growing-labor-shortage-in-healthcare-technology-management-and-what-it-means-for-hospitals">https://intermed1.com/the-growing-labor-shortage-in-healthcare-technology-management-and-what-it-means-for-hospitals</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tahri Sqalli</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Al-Thani</surname><given-names>D</given-names> </name><name name-style="western"><surname>Elshazly</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Al-Hijji</surname><given-names>M</given-names> </name><name name-style="western"><surname>Alahmadi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sqalli Houssaini</surname><given-names>Y</given-names> </name></person-group><article-title>Understanding cardiology practitioners' interpretations of electrocardiograms: an eye-tracking study</article-title><source>JMIR Hum Factors</source><year>2022</year><month>02</month><day>9</day><volume>9</volume><issue>1</issue><fpage>e34058</fpage><pub-id pub-id-type="doi">10.2196/34058</pub-id><pub-id pub-id-type="medline">35138258</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Colangelo</surname><given-names>S</given-names> </name></person-group><article-title>Navigating human errors in medical device usage: insights from human factors research</article-title><source>Noble</source><year>2024</year><access-date>2026-02-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.gonoble.com/blog/navigating-human-errors-in-medical-device-usage-insights-from-human-factors-research/">https://www.gonoble.com/blog/navigating-human-errors-in-medical-device-usage-insights-from-human-factors-research/</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Edwards</surname><given-names>E</given-names> </name></person-group><article-title>Nearly half of U.S. 
counties don&#x2019;t have a single cardiologist</article-title><source>NBC News</source><year>2024</year><month>07</month><day>8</day><access-date>2026-02-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nbcnews.com/health/health-news/nearly-half-us-counties-dont-single-cardiologist-rcna160229">https://www.nbcnews.com/health/health-news/nearly-half-us-counties-dont-single-cardiologist-rcna160229</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ansari</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mourad</surname><given-names>O</given-names> </name><name name-style="western"><surname>Qaraqe</surname><given-names>K</given-names> </name><name name-style="western"><surname>Serpedin</surname><given-names>E</given-names> </name></person-group><article-title>Deep learning for ECG arrhythmia detection and classification: an overview of progress for period 2017-2023</article-title><source>Front Physiol</source><year>2023</year><volume>14</volume><fpage>1246746</fpage><pub-id pub-id-type="doi">10.3389/fphys.2023.1246746</pub-id><pub-id pub-id-type="medline">37791347</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Andreotti</surname><given-names>F</given-names> </name><name name-style="western"><surname>Carr</surname><given-names>O</given-names> </name><name name-style="western"><surname>Pimentel</surname><given-names>MAF</given-names> </name><name name-style="western"><surname>Mahdi</surname><given-names>A</given-names> </name><name name-style="western"><surname>De Vos</surname><given-names>M</given-names> </name></person-group><article-title>Comparing feature-based classifiers and convolutional neural networks to detect arrhythmia from short 
segments of ECG</article-title><conf-name>2017 Computing in Cardiology (CinC)</conf-name><conf-date>Sep 24-27, 2017</conf-date><pub-id pub-id-type="doi">10.22489/CinC.2017.360-239</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eldele</surname><given-names>E</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>C</given-names> </name><etal/></person-group><article-title>An attention-based deep learning approach for sleep stage classification with single-channel EEG</article-title><source>IEEE Trans Neural Syst Rehabil Eng</source><year>2021</year><volume>29</volume><fpage>809</fpage><lpage>818</lpage><pub-id pub-id-type="doi">10.1109/TNSRE.2021.3076234</pub-id><pub-id pub-id-type="medline">33909566</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yildirim</surname><given-names>&#x00D6;</given-names> </name></person-group><article-title>A novel wavelet sequence based on deep bidirectional LSTM network model for ECG signal classification</article-title><source>Comput Biol Med</source><year>2018</year><month>05</month><day>1</day><volume>96</volume><fpage>189</fpage><lpage>202</lpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2018.03.016</pub-id><pub-id pub-id-type="medline">29614430</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Nakamura</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Mahito</surname><given-names>N</given-names> </name></person-group><article-title>Premature ventricular contraction detection from ambulatory ECG using recurrent neural networks</article-title><conf-name>2018 40th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)</conf-name><conf-date>Jul 18-21, 2018</conf-date><pub-id pub-id-type="doi">10.1109/EMBC.2018.8512858</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ashfaq Khan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name></person-group><article-title>Cardiac arrhythmia disease classification using LSTM deep learning approach</article-title><source>Comput Mater Con</source><year>2021</year><volume>67</volume><issue>1</issue><fpage>427</fpage><lpage>443</lpage><pub-id pub-id-type="doi">10.32604/cmc.2021.014682</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>P</given-names> </name></person-group><article-title>Interpretable deep learning for automatic diagnosis of 12-lead electrocardiogram</article-title><source>iScience</source><year>2021</year><month>04</month><day>23</day><volume>24</volume><issue>4</issue><fpage>102373</fpage><pub-id pub-id-type="doi">10.1016/j.isci.2021.102373</pub-id><pub-id pub-id-type="medline">33981967</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>O</given-names> </name><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>AI in health and medicine</article-title><source>Nat Med</source><year>2022</year><month>01</month><volume>28</volume><issue>1</issue><fpage>31</fpage><lpage>38</lpage><pub-id pub-id-type="doi">10.1038/s41591-021-01614-0</pub-id><pub-id pub-id-type="medline">35058619</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kelly</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Karthikesalingam</surname><given-names>A</given-names> </name><name name-style="western"><surname>Suleyman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Corrado</surname><given-names>G</given-names> </name><name name-style="western"><surname>King</surname><given-names>D</given-names> </name></person-group><article-title>Key challenges for delivering clinical impact with artificial intelligence</article-title><source>BMC Med</source><year>2019</year><month>10</month><day>29</day><volume>17</volume><issue>1</issue><fpage>195</fpage><pub-id pub-id-type="doi">10.1186/s12916-019-1426-2</pub-id><pub-id pub-id-type="medline">31665002</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>High-performance medicine: the convergence of human and artificial 
intelligence</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>44</fpage><lpage>56</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0300-7</pub-id><pub-id pub-id-type="medline">30617339</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kompa</surname><given-names>B</given-names> </name><name name-style="western"><surname>Snoek</surname><given-names>J</given-names> </name><name name-style="western"><surname>Beam</surname><given-names>AL</given-names> </name></person-group><article-title>Second opinion needed: communicating uncertainty in medical machine learning</article-title><source>NPJ Digit Med</source><year>2021</year><month>01</month><day>5</day><volume>4</volume><issue>1</issue><fpage>4</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-00367-3</pub-id><pub-id pub-id-type="medline">33402680</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Zhao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Telemedicine application in patients with chronic disease: a systematic review and meta-analysis</article-title><source>BMC Med Inform Decis Mak</source><year>2022</year><month>04</month><day>19</day><volume>22</volume><issue>1</issue><fpage>105</fpage><pub-id pub-id-type="doi">10.1186/s12911-022-01845-2</pub-id><pub-id pub-id-type="medline">35440082</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gogia</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Maeder</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mars</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hartvigsen</surname><given-names>G</given-names> </name><name name-style="western"><surname>Basu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Abbott</surname><given-names>P</given-names> </name></person-group><article-title>Unintended consequences of tele health and their possible solutions</article-title><source>Yearb Med Inform</source><year>2016</year><month>08</month><volume>25</volume><issue>1</issue><fpage>41</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.15265/IY-2016-012</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moody</surname><given-names>GB</given-names> </name><name name-style="western"><surname>Mark</surname><given-names>RG</given-names> </name></person-group><article-title>The impact of the MIT-BIH arrhythmia database</article-title><source>IEEE Eng Med Biol 
Mag</source><year>2001</year><volume>20</volume><issue>3</issue><fpage>45</fpage><lpage>50</lpage><pub-id pub-id-type="doi">10.1109/51.932724</pub-id><pub-id pub-id-type="medline">11446209</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kemp</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zwinderman</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Tuk</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kamphuisen</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Obery&#x00E9;</surname><given-names>JJ</given-names> </name></person-group><article-title>Analysis of a sleep-dependent neuronal feedback loop: the slow-wave microcontinuity of the EEG</article-title><source>IEEE Trans Biomed Eng</source><year>2000</year><month>09</month><volume>47</volume><issue>9</issue><fpage>1185</fpage><lpage>1194</lpage><pub-id pub-id-type="doi">10.1109/10.867928</pub-id><pub-id pub-id-type="medline">11008419</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MIMIC-III, a freely accessible critical care database</article-title><source>Sci Data</source><year>2016</year><month>05</month><day>24</day><volume>3</volume><fpage>160035</fpage><pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id><pub-id pub-id-type="medline">27219127</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation 
citation-type="web"><article-title>Learningdebunked/llm-healthsignal-pipeline</article-title><source>GitHub</source><access-date>2026-03-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/learningdebunked/llm-healthsignal-pipeline">https://github.com/learningdebunked/llm-healthsignal-pipeline</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hannun</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Haghpanahi</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Publisher correction: cardiologist-level arrhythmia detection and classification in ambulatory electrocardiograms using a deep neural network</article-title><source>Nat Med</source><year>2019</year><month>03</month><volume>25</volume><issue>3</issue><fpage>530</fpage><lpage>530</lpage><pub-id pub-id-type="doi">10.1038/s41591-019-0359-9</pub-id><pub-id pub-id-type="medline">30679787</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Natarajan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mariani</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A wide and deep transformer neural network for 12-lead ECG classification</article-title><conf-name>2020 Computing in Cardiology Conference</conf-name><conf-date>Sep 13-16, 2020</conf-date><pub-id pub-id-type="doi">10.22489/CinC.2020.107</pub-id></nlm-citation></ref></ref-list></back></article>